# Install causalnex, pydotplus and required libraries

In [None]:
!pip install causalnex


In [None]:
pip install "causalnex[all]"

In [None]:
!pip install pydotplus

In [None]:
!pip install -q pydot

In [None]:
!apt install libgraphviz-dev
!pip install pygraphviz

In [None]:
!pip install pygraphviz

In [None]:
#sudo apt-get install python-pip python-virtualenv

In [None]:
#!pip install graphviz !apt-get install graphviz


# Import data

In [None]:
import pandas as pd

In [None]:
# Read the semicolon-delimited CSV into a DataFrame and preview the first rows.
data = pd.read_csv('student-por.csv', sep=';')
data.head()

In [None]:
# Columns removed from the frame before any further processing.
drop_col = ['school', 'sex', 'age', 'Mjob', 'Fjob', 'reason', 'guardian']
data = data.drop(drop_col, axis=1)
data.head(5)

In [None]:
import numpy as np

# Independent copy of `data` that the structure-learning steps operate on.
struct_data = data.copy()

# Names of every column whose dtype is not a NumPy numeric type.
non_numeric_columns = struct_data.select_dtypes(exclude=[np.number]).columns.tolist()
print(non_numeric_columns)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace every non-numeric column of `struct_data` with integer codes.
# A single encoder instance is re-fitted for each column, so once the loop
# finishes `le` only reflects the last column it was fitted on.
le = LabelEncoder()
for col in non_numeric_columns:
    struct_data[col] = le.fit_transform(struct_data[col])

# Preview the encoded frame.
struct_data.head(5)

# Applying the NOTEARS algorithm to learn the structure.

In [None]:

# Suppress all warnings from here on.
import warnings
warnings.simplefilter("ignore")


# Learn a StructureModel from the fully numeric frame with NOTEARS.
from causalnex.structure.notears import from_pandas
sm = from_pandas(struct_data)

#### visualise the learned StructureModel using the plot function.

In [None]:
from IPython.display import Image
from causalnex.plots import plot_structure, NODE_STYLE, EDGE_STYLE

# viz = plot_structure(
#     sm,
#     graph_attributes={"scale": "0.8"},
#     all_node_attributes=NODE_STYLE.WEAK,
#     all_edge_attributes=EDGE_STYLE.WEAK)
# Image(viz.draw(format='jpg'))

#### The reason we have a fully connected graph here is that we haven’t applied thresholding to the weaker edges. Thresholding can be applied either by specifying a value for the parameter w_threshold in from_pandas, or by removing the weak edges afterwards with the structure model function remove_edges_below_threshold.

In [None]:
# Prune edges whose weight is below 0.8, then render the remaining graph.
sm.remove_edges_below_threshold(0.8)
viz = plot_structure(
    sm,
    all_edge_attributes=EDGE_STYLE.WEAK,
    all_node_attributes=NODE_STYLE.WEAK,
    graph_attributes=dict(scale="0.5"),
)
Image(viz.draw(format='png'))

# Modifying the Structure
To correct erroneous relationships, we can incorporate domain knowledge into the model after structure learning. We can modify the structure model through adding and deleting the edges. For example, we can add and remove edges as:

In [None]:
# Re-learn the structure with thresholding applied during learning, passing
# ("higher", "Medu") as a tabu edge, then render the result.
sm = from_pandas(
    struct_data,
    w_threshold=0.8,
    tabu_edges=[("higher", "Medu")],
)
viz = plot_structure(
    sm,
    all_edge_attributes=EDGE_STYLE.WEAK,
    all_node_attributes=NODE_STYLE.WEAK,
    graph_attributes=dict(scale="0.5"),
)
Image(viz.draw(format='png'))

In [None]:
# NOTE(review): the two commented-out calls below use node names that are not
# referenced anywhere else in this notebook; they look like leftovers from a
# different dataset -- confirm and remove.
# sm.add_edge("breast-quad", "Class")
# sm.add_edge('menopause','Class')

# Manual adjustments to the learned structure: add failures -> G1 and remove
# the Pstatus -> G1 and address -> G1 edges.
sm.add_edge("failures", "G1")
sm.remove_edge("Pstatus", "G1")
sm.remove_edge("address", "G1")


In [None]:
# Render the structure after the manual edge edits.
viz = plot_structure(
    sm,
    all_edge_attributes=EDGE_STYLE.WEAK,
    all_node_attributes=NODE_STYLE.WEAK,
    graph_attributes=dict(scale="0.5"),
)
Image(viz.draw(format='png'))

#### We can now visualise our updated structure to confirm it looks reasonable.

#### We can see there are two separate subgraphs here in the visualisation plot: Dalc->Walc and the other big subgraph. We can retrieve the largest subgraph easily by calling the StructureModel function get_largest_subgraph().

In [None]:
# Replace the structure with its largest subgraph and render it.
sm = sm.get_largest_subgraph()

viz = plot_structure(
    sm,
    all_edge_attributes=EDGE_STYLE.WEAK,
    all_node_attributes=NODE_STYLE.WEAK,
    graph_attributes=dict(scale="0.5"),
)
Image(viz.draw(format='png'))

## After deciding on how the final structure model should look, we can instantiate a BayesianNetwork.

In [None]:
from causalnex.network import BayesianNetwork

# Build the Bayesian Network on top of the finalised structure model `sm`.
bn = BayesianNetwork(sm)

##### We are now ready to move on to learning the conditional probability distribution of different features in the BayesianNetwork.

In [None]:
# Work on a copy so the `data` frame itself is not modified.
discretised_data = data.copy()

# Distinct values observed in each column; the label maps below are built from these.
data_vals = {col: data[col].unique() for col in data.columns}

# Collapse `failures` to two labels. The comparison is against the scalar 0:
# comparing with the list `[0]` only behaves correctly for NumPy scalars (via
# broadcasting) and is always False for a plain Python int.
failures_map = {
    v: 'no-failure' if v == 0 else 'have-failure'
    for v in data_vals['failures']
}

# Collapse `studytime` to two labels: values 1 and 2 are "short", everything else "long".
studytime_map = {
    v: 'short-studytime' if v in (1, 2) else 'long-studytime'
    for v in data_vals['studytime']
}

# Replace the numeric values with their labels.
discretised_data["failures"] = discretised_data["failures"].map(failures_map)
discretised_data["studytime"] = discretised_data["studytime"].map(studytime_map)

In [None]:
from causalnex.discretiser import Discretiser

# `absences` is bucketed with fixed split points at 1 and 10.
discretised_data["absences"] = Discretiser(
    method="fixed",
    numeric_split_points=[1, 10],
).transform(discretised_data["absences"].values)

# Each of the three grade columns is bucketed with a single fixed split point at 10.
for grade_col in ("G1", "G2", "G3"):
    discretised_data[grade_col] = Discretiser(
        method="fixed",
        numeric_split_points=[10],
    ).transform(discretised_data[grade_col].values)

# Fitting the Conditional Distribution of the Bayesian Network
### Preparing the Discretised Data
Bayesian Networks in CausalNex support only discrete distributions. Any continuous features, or features with a large number of categories, should be discretised prior to fitting the Bayesian Network. Models containing variables with many possible values will typically be badly fit, and exhibit poor performance.

For example, consider P(G2 | G1), where G1 and G2 have possible values 0 to 20. The discrete conditional probability distribution is therefore specified using 21x21 (441) possible combinations - most of which we will be unlikely to observe.

CausalNex provides a few helper methods to make discretisation easier. Let’s start by reducing the number of categories in some of the categorical features by combining similar values. We will make numeric features categorical by discretisation, and then give the buckets meaningful labels.

# Cardinality of Categorical Features

In [None]:
# Human-readable labels for the integer bucket codes.
absences_map = {0: "No-absence", 1: "Low-absence", 2: "High-absence"}
G1_map = {0: "Fail", 1: "Pass"}
G2_map = {0: "Fail", 1: "Pass"}
G3_map = {0: "Fail", 1: "Pass"}

# Apply each map to its column.
for column, labels in (
    ("absences", absences_map),
    ("G1", G1_map),
    ("G2", G2_map),
    ("G3", G3_map),
):
    discretised_data[column] = discretised_data[column].map(labels)

## Train-Test split

In [None]:
# Hold out 10% of the discretised rows for testing; the seed is fixed at 7.
from sklearn.model_selection import train_test_split

train, test = train_test_split(
    discretised_data,
    test_size=0.1,
    train_size=0.9,
    random_state=7,
)

# Model Probability
With the learnt structure model from earlier and the discretised data, we can now fit the probability distribution of the Bayesian Network. The first step in this is specifying all of the states that each node can take. This can be done either from data, or by providing a dictionary of node values. We use the full dataset here to avoid cases where states in our test set do not exist in the training set. For real-world applications, these states may need to be provided using the dictionary method.

In [None]:
# Register every node's possible states from the full *discretised* dataset.
# The CPDs are fitted on a split of `discretised_data`, so the states must come
# from the same label space; the raw `data` frame holds the pre-discretisation
# values. The full frame (not just the training split) is used so that states
# appearing only in the test split are still known to the network.
bn = bn.fit_node_states(discretised_data)

# Fit Conditional Probability Distributions
The fit_cpds method of BayesianNetwork accepts a dataset to learn the conditional probability distributions (CPDs) of each node, along with the method to use for the fit.

In [None]:
# Learn the conditional probability distributions from the training split only,
# using the "BayesianEstimator" method with a "K2" prior.
bn = bn.fit_cpds(train, method="BayesianEstimator", bayes_prior="K2")


#### once we have the CPDs, we can inspect them through the cpds property, which is a dictionary of node->cpd.

In [None]:
# Display the fitted CPD of the G1 node.
bn.cpds["G1"]

#### The CPD dictionaries are multi-indexed, and so the loc function can be a useful way to interact with them:

# Predict the State given the Input Data
The predict method of BayesianNetwork allows us to make predictions based on the data using the learnt Bayesian Network.

In [None]:
# Show the row labelled 18 with every column except the G1 target.
discretised_data.loc[18, discretised_data.columns != 'G1']


In [None]:
# Predict the G1 state for every row of the discretised frame.
# NOTE(review): this frame includes the rows used to fit the CPDs, so a
# single-row check against it is not necessarily a held-out prediction.
predictions = bn.predict(discretised_data, "G1")

In [None]:

# Report the predicted G1 state for the row labelled 18.
print(f"The prediction is '{predictions.loc[18, 'G1_prediction']}'")

#### The prediction by the Bayesian Network turns out to be a Fail. Let’s compare this to the ground truth:

In [None]:
# Report the actual G1 label of the row labelled 18 for comparison.
print(f"The ground truth is '{discretised_data.loc[18, 'G1']}'")

##### which turns out to be the same.

# Model Quality
To evaluate the quality of the model that has been learned, CausalNex supports two main approaches: Classification Report and Receiver Operating Characteristics (ROC) / Area Under the ROC Curve (AUC). In this section each will be discussed.

# Classification Report
To obtain a classification report using a BN, we need to provide a test set, and the node we are trying to classify. The report will predict the target node for all rows in the test set, and evaluate how well those predictions are made.

In [None]:
from causalnex.evaluation import classification_report
# Classification report for the G1 node, evaluated on the held-out test split.
classification_report(bn, test, "G1")

#### This report shows how well the model we have defined is able to classify whether a student passes or fails G1.


# ROC / AUC
Receiver Operating Characteristics (ROC), and the Area Under the ROC Curve (AUC) can be obtained using the roc_auc method within the CausalNex evaluation module. Again, a test set and target node must be provided. The ROC curve is computed by micro-averaging predictions made across all states (classes) of the target node.

In [None]:
from causalnex.evaluation import roc_auc
# Compute the ROC curve and AUC for the G1 node on the test split; only the AUC is printed.
roc, auc = roc_auc(bn, test, "G1")
print(auc)

#### The AUC value for our model is good enough, giving us confidence in the performance.