# Scalable Analysis of Bayesian Networks

We run experiments on some sample topologies.

In [3]:
import gc
import time
import numpy as np
import pandas as pd
import causalnex
from sklearn.preprocessing import LabelEncoder
from main import labeler

## Notation for Adjacency Matrix
Below, the adjacency matrix describes the causal DAG $G = (V, E)$, where for any row index $i \in V$ and column index $j \in V$,
the tuple $(i,j) = 1$ denotes a directed edge from vertex $i$ to vertex $j$.

## Chain

In [42]:
# 0 -> 1 -> 2
start_time = time.perf_counter()
chain_adj = np.array([[0, 1, 0], [0, 0, 1], [0, 0, 0]])
end_time = time.perf_counter()
duration = (end_time - start_time) * 10e5
print("Completes in {}us".format(duration))
print(labeler(chain_adj))

Completes in 48.207999498117715us
[0: {collider = False, chain = False, fork = False}, 1: {collider = False, chain = True, fork = False}, 2: {collider = False, chain = False, fork = False}]


As expected, we see from the output above that node $1$ is correctly marked as being in a chain.

## Collider

In [40]:
# 0 -> 1 <- 2
start_time = time.perf_counter()
collider_adj = np.array([[0, 1, 0], [0, 0, 0], [0, 1, 0]])
end_time = time.perf_counter()
duration = (end_time - start_time) * 10e5
print("Completes in {}us".format(duration))
print(labeler(collider_adj))

Completes in 53.250005294103175us
[0: {collider = False, chain = False, fork = False}, 1: {collider = True, chain = False, fork = False}, 2: {collider = False, chain = False, fork = False}]


## Fork

In [38]:
# 0 -> 1
# 0 -> 2
start_time = time.perf_counter()
fork_adj = np.array([[0, 1, 1], [0, 0, 0], [0, 0, 0]])
end_time = time.perf_counter()
duration = (end_time - start_time) * 10e5
print("Completes in {}us".format(duration))
print(labeler(fork_adj))

Completes in 47.74999979417771us
[0: {collider = False, chain = False, fork = True}, 1: {collider = False, chain = False, fork = False}, 2: {collider = False, chain = False, fork = False}]


## CSuite large_backdoor_binary_t
The below is adapted from the CSuite dataset by [Geffner et. al](https://github.com/microsoft/csuite?tab=readme-ov-file)
![alt text](figs/sab.png "CSuite large_backdoor")

In [44]:
start_time = time.perf_counter()
backdoor_adj = np.array([[0, 1, 0, 1, 0], [0, 0, 1, 0, 0], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 0, 0]])
end_time = time.perf_counter()
duration = (end_time - start_time) * 10e5
print("Completes in {}us".format(duration))
print(labeler(backdoor_adj))

Completes in 63.041996327228844us
[0: {collider = False, chain = False, fork = True}, 1: {collider = False, chain = True, fork = False}, 2: {collider = False, chain = True, fork = False}, 3: {collider = False, chain = True, fork = False}, 4: {collider = True, chain = False, fork = False}]


## Pipeline

In [4]:
data = pd.read_csv('datasets/student-por.csv')
drop_col = ['school','sex','age','Mjob', 'Fjob','reason','guardian']
data = data.drop(columns=drop_col)
data.head(5)

Unnamed: 0,address,famsize,Pstatus,Medu,Fedu,traveltime,studytime,failures,schoolsup,famsup,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,U,GT3,A,4,4,2,2,0,yes,no,...,4,3,4,1,1,3,4,0,11,11
1,U,GT3,T,1,1,1,2,0,no,yes,...,5,3,3,1,1,3,2,9,11,11
2,U,LE3,T,1,1,1,2,0,yes,no,...,4,3,2,2,3,3,6,12,13,12
3,U,GT3,T,4,2,1,3,0,no,yes,...,3,2,2,1,1,5,0,14,14,14
4,U,GT3,T,3,3,1,2,0,no,yes,...,4,3,2,1,2,5,0,11,13,13


In [5]:
struct_data = data.copy()
non_numeric_columns = list(struct_data.select_dtypes(exclude=[np.number]).columns)

print(non_numeric_columns)

['address', 'famsize', 'Pstatus', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']


In [6]:
le = LabelEncoder()

for col in non_numeric_columns:
    struct_data[col] = le.fit_transform(struct_data[col])

struct_data.head(5)

Unnamed: 0,address,famsize,Pstatus,Medu,Fedu,traveltime,studytime,failures,schoolsup,famsup,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,1,0,0,4,4,2,2,0,1,0,...,4,3,4,1,1,3,4,0,11,11
1,1,0,1,1,1,1,2,0,0,1,...,5,3,3,1,1,3,2,9,11,11
2,1,1,1,1,1,1,2,0,1,0,...,4,3,2,2,3,3,6,12,13,12
3,1,0,1,4,2,1,3,0,0,1,...,3,2,2,1,1,5,0,14,14,14
4,1,0,1,3,3,1,2,0,0,1,...,4,3,2,1,2,5,0,11,13,13


In [None]:
from causalnex.structure.notears import from_pandas
from causalnex.plots import plot_structure, NODE_STYLE, EDGE_STYLE
sm = from_pandas(struct_data, w_threshold=0.8)

In [29]:
for n in sm.edges:
    print(n)

address
famsize
Pstatus
Medu
Fedu
traveltime
studytime
failures
schoolsup
famsup
paid
activities
nursery
higher
internet
romantic
famrel
freetime
goout
Dalc
Walc
health
absences
G1
G2
G3
