# Install causalnex, pydotplus and required libraries

In [None]:
!pip install causalnex


In [None]:
pip install "causalnex[all]"

In [None]:
!pip install pydotplus

In [None]:
!pip install -q pydot

In [None]:
!apt install libgraphviz-dev
!pip install pygraphviz

In [None]:
!pip install pygraphviz

In [None]:
#sudo apt-get install python-pip python-virtualenv

In [None]:
#!pip install graphviz !apt-get install graphviz


# Import data

In [None]:
import pandas as pd

In [None]:
data=pd.read_csv('breast-cancer_csv (1).csv')
data.head()

In [None]:
data.rename(columns={'tumor-size' : 'tumor_size','inv-nodes':'inv_nodes','node-caps':'node_caps','deg-malig':'deg_malig','breast-quad':'breast_quad'},inplace=True)

In [None]:
data.head()

In [None]:
data.dropna(inplace=True)

In [None]:
data.isna().sum()

In [None]:
data.Class.replace({'recurrence-events':1,'no-recurrence-events':0},inplace=True)

In [None]:
data.irradiat.replace({'no':0,'yes':1},inplace=True)

In [None]:
data['node_caps'].replace({'no':0,'yes':1},inplace=True)

In [None]:
data['breast'].replace({'right':0,'left':1},inplace=True)

In [None]:
data['menopause'].replace({'premeno':0,'ge40':1,'lt40':2},inplace=True)

In [None]:
data['breast_quad'].replace({'left_low':0,'left_up':1,'central':2,'right_low':3,'right_up':4},inplace=True)

In [None]:
data['menopause'].unique()

In [None]:
data.head()

In [None]:
data['age'].unique()

In [None]:
data['age'].replace({'20-29':0,'30-39':1,'40-49':1,'50-59':2,'60-69':2,'70-79':3},inplace=True)

In [None]:
data['inv_nodes'].replace({'0-2':0,'3-5':1,'6-8':2,'9-11':3,'12-14':4,'15-17':5,'24-26':6},inplace=True)

In [None]:
data['tumor_size'].replace({'0-4':0,'5-9':0,'10-14':1,'15-19':1,'20-24':2,'25-29':2,'30-34':3,'35-39':3,'40-44':4,'45-49':4,'50-54':5},inplace=True)

In [None]:
data['breast_quad'].unique()

In [None]:
data.tumor_size.unique()


In [None]:
data.menopause.unique()

In [None]:
data.info()

In [None]:
data.head()

# Applying the NOTEARS algorithm to learn the structure.

In [None]:

import warnings

# Mute every warning before CausalNex is imported, so warnings raised at import
# time are hidden as well.
warnings.simplefilter("ignore")

from causalnex.structure.notears import from_pandas

# Learn a structure model from the encoded frame.
sm = from_pandas(data)

#### visualise the learned StructureModel using the plot function.

In [None]:
from IPython.display import Image
from causalnex.plots import plot_structure, NODE_STYLE, EDGE_STYLE

# Draw the learned structure as a PNG and show it inline.
plot_options = {
    "graph_attributes": {"scale": "0.5"},
    "all_node_attributes": NODE_STYLE.WEAK,
    "all_edge_attributes": EDGE_STYLE.WEAK,
}
viz = plot_structure(sm, **plot_options)
png_output = viz.draw(format='png')
Image(png_output)

#### The reason why we have a fully connected graph here is that we haven’t applied thresholding to the weaker edges. Thresholding can be applied either by specifying the value for the parameter w_threshold in from_pandas, or by removing the edges afterwards with the structure model function remove_edges_below_threshold.

In [None]:
# Drop every edge whose weight falls below the threshold, then redraw.
edge_weight_threshold = 0.3
sm.remove_edges_below_threshold(edge_weight_threshold)
viz = plot_structure(
    sm,
    graph_attributes=dict(scale="0.5"),
    all_node_attributes=NODE_STYLE.WEAK,
    all_edge_attributes=EDGE_STYLE.WEAK,
)
Image(viz.draw(format="png"))

# Modifying the Structure
To correct erroneous relationships, we can incorporate domain knowledge into the model after structure learning. We can modify the structure model through adding and deleting the edges. For example, we can add and remove edges as:

In [None]:
# Example of injecting domain knowledge by adding edges (left disabled).
# Node names must match the renamed columns, i.e. "breast_quad", not "breast-quad".
# sm.add_edge("breast_quad", "Class")
# sm.add_edge('menopause','Class')

In [None]:
# Redraw the structure after the (optional) manual edits above.
viz = plot_structure(sm, graph_attributes={"scale": "0.5"},
                     all_node_attributes=NODE_STYLE.WEAK,
                     all_edge_attributes=EDGE_STYLE.WEAK)
Image(viz.draw(format='png'))

#### We can now visualise our updated structure to confirm it looks reasonable.

In [None]:
# Same rendering as before, used to eyeball the current structure.
weak_style = dict(all_node_attributes=NODE_STYLE.WEAK, all_edge_attributes=EDGE_STYLE.WEAK)
viz = plot_structure(sm, graph_attributes={"scale": "0.5"}, **weak_style)
rendered = viz.draw(format='png')
Image(rendered)

#### Thresholding can leave the graph split into separate subgraphs. We can retrieve the largest subgraph easily by calling the StructureModel function get_largest_subgraph().

In [None]:
# Keep only the largest connected part of the structure, then draw it.
sm = sm.get_largest_subgraph()

scale_attrs = {"scale": "0.5"}
viz = plot_structure(
    sm,
    all_edge_attributes=EDGE_STYLE.WEAK,
    all_node_attributes=NODE_STYLE.WEAK,
    graph_attributes=scale_attrs,
)
Image(viz.draw(format='png'))

## After deciding on how the final structure model should look, we can instantiate a BayesianNetwork.

In [None]:
from causalnex.network import BayesianNetwork

# Wrap the finalised structure model in a BayesianNetwork; node states and CPDs
# are fitted in later cells.
bn = BayesianNetwork(sm)

##### We are now ready to move on to learning the conditional probability distribution of different features in the BayesianNetwork.

# Fitting the Conditional Distribution of the Bayesian Network
### Preparing the Discretised Data
Bayesian Networks in CausalNex support only discrete distributions. Any continuous features, or features with a large number of categories, should be discretised prior to fitting the Bayesian Network. Models containing variables with many possible values will typically be badly fit, and exhibit poor performance.

For example, consider P(G2 | G1), where G1 and G2 are two variables that each take integer values from 0 to 20. The discrete conditional probability distribution is therefore specified using 21x21 (441) possible combinations - most of which we will be unlikely to observe.

CausalNex provides a few helper methods to make discretisation easier. Let’s start by reducing the number of categories in some of the categorical features by combining similar values. We will make numeric features categorical by discretisation, and then give the buckets meaningful labels.

# Cardinality of Categorical Features

In [None]:
data.head()

In [None]:
data.irradiat.replace({0:'no',1:'yes'},inplace=True)

In [None]:
data['node_caps'].replace({0:'no',1:'yes'},inplace=True)

In [None]:
data.head()

In [None]:
Class_map = {0: "healthy", 1: "breast cancer"}

data["Class"] = data["Class"].map(Class_map)
data.head()

## Train-Test split

In [None]:
from sklearn.model_selection import train_test_split

# 90/10 split with a fixed seed so the partition is reproducible.
split_options = dict(train_size=0.9, test_size=0.1, random_state=7)
train, test = train_test_split(data, **split_options)

# Model Probability
With the learnt structure model from earlier and the discretised data, we can now fit the probability distribution of the Bayesian Network. The first step in this is specifying all of the states that each node can take. This can be done either from data, or providing a dictionary of node values. We use the full dataset here to avoid cases where states in our test set do not exist in the training set. For real-world applications, these states may need to be provided using the dictionary method.

In [None]:
# Register the states each node can take from the full dataset, so that values
# appearing only in the test split are still known to the network.
bn = bn.fit_node_states(data)

# Fit Conditional Probability Distributions
The fit_cpds method of BayesianNetwork accepts a dataset to learn the conditional probability distributions (CPDs) of each node, along with a method of how to do this fit.

In [None]:
# Learn the CPDs from the training split only; the test split is used for
# evaluation further down.
bn = bn.fit_cpds(train, method="BayesianEstimator", bayes_prior="K2")


#### once we have the CPDs, we can inspect them through the cpds property, which is a dictionary of node->cpd.

In [None]:
# Show the CPD learned for the target node.
bn.cpds["Class"]

#### The CPD dictionaries are multi-indexed, and so the loc function can be a useful way to interact with them:

# Predict the State given the Input Data
The predict method of BayesianNetwork allows us to make predictions based on the data using the learnt Bayesian Network.

In [None]:
data.loc[16, data.columns != 'Class']

In [None]:
predictions = bn.predict(data, "Class")

In [None]:
print('The prediction is \'{prediction}\''.format(prediction=predictions.loc[18, 'Class_prediction']))

#### The Bayesian Network has now predicted a Class label for this row. Let’s compare this to the ground truth:

In [None]:
# Actual Class label recorded for row 18.
print(f"The ground truth is '{data.loc[18, 'Class']}'")

##### which turns out to be the same.

# Model Quality
To evaluate the quality of the model that has been learned, CausalNex supports two main approaches: Classification Report and Receiver Operating Characteristics (ROC) / Area Under the ROC Curve (AUC). In this section each will be discussed.

# Classification Report
To obtain a classification report using a BN, we need to provide a test set, and the node we are trying to classify. The report will predict the target node for all rows in the test set, and evaluate how well those predictions are made.

In [None]:
from causalnex.evaluation import classification_report
# Evaluate predictions of the "Class" node on the held-out test split.
classification_report(bn, test, "Class")

#### This report shows that the model we have defined is able to classify whether a person has breast cancer or not.


# ROC / AUC
Receiver Operating Characteristics (ROC), and the Area Under the ROC Curve (AUC) can be obtained using the roc_auc method within the CausalNex metrics module. Again, a test set and target node must be provided. The ROC curve is computed by micro-averaging predictions made across all states (classes) of the target node.

In [None]:
from causalnex.evaluation import roc_auc
# Compute the ROC curve and its AUC for the "Class" node on the test split;
# only the AUC is printed.
roc, auc = roc_auc(bn, test, "Class")
print(auc)

#### The AUC value for our model is good enough, giving us confidence in the performance.

# Querying Marginals
After iterating over our model structure, CPDs, and validating our model quality, we can query our model under different observations to gain insights.

# Baseline Marginals
To query the model for baseline marginals that reflect the population as a whole, a query method can be used. First let’s update our model using the complete dataset, since the one we currently have was only built from training data.

In [None]:
# Refit the CPDs on the complete dataset (train and test rows together) before
# querying marginals; refitting on `train` again would leave the model unchanged.
bn = bn.fit_cpds(data, method="BayesianEstimator", bayes_prior="K2")

#### For inference, we must create a new InferenceEngine from our BayesianNetwork, which lets us query the model. The query method will compute the marginal likelihood of all states for all nodes.

In [None]:
from causalnex.inference import InferenceEngine

# Build an inference engine on top of the fitted network.
ie = InferenceEngine(bn)
# Called without observations, query() gives the baseline marginals of all nodes.
marginals = ie.query()
marginals["Class"]

#### The output observed tells us that P(Class=Healthy) = 0.704, and the P(Class=Breast cancer) = 0.2959. As a quick sanity check, we can compute what proportion of our dataset are healthy, which should be approximately the same.



In [None]:
import numpy as np

# Tally how many training rows carry each Class label, as (label, count) pairs.
labels, counts = np.unique(train["Class"], return_counts=True)
[(label, count) for label, count in zip(labels, counts)]

#### The proportion of the patients who have breast cancer is 73 / (73+176) = 0.293 - which is close to our computed marginal likelihood.

## Marginals after Observations
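We can also query the marginal likelihood of states in our network given some observations. Below we compare the marginals of Class when node_caps is observed to be "no" and when it is observed to be "yes".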

In [None]:
# Class marginals conditioned on each observed value of node_caps.
observe_no = {"node_caps": "no"}
observe_yes = {"node_caps": "yes"}
marginals_no = ie.query(observe_no)
marginals_yes = ie.query(observe_yes)
print(f"Marginal Class | No {marginals_no['Class']}")
print(f"Marginal Class | Yes {marginals_yes['Class']}")

## Do Calculus
CausalNex also supports simple Do-Calculus, allowing us to specify interventions. In this section we will take a look at the supported methods, and what they mean.

### Updating a Node Distribution
We can apply an intervention to any node in our data, updating its distribution using a do operator. This can be thought of as asking our model “What if” something were different. For example, we could ask what would happen if 100% of patients received no irradiation (irradiat = 'no').

In [None]:
# Force irradiat to 'no' with probability 1 and compare its marginal before/after.
forced_distribution = {'yes': 0.0, 'no': 1.0}
print(f"distribution before do {ie.query()['irradiat']}")
ie.do_intervention("irradiat", forced_distribution)
print(f"distribution after do {ie.query()['irradiat']}")

#### Resetting a Node Distribution
We can reset any interventions that we make by using the reset_do method, and providing the node that we want to reset.


In [None]:
# Undo the intervention on "irradiat" before running further queries.
ie.reset_do("irradiat")


## Effect of Do on Marginals

We can again use query to examine the effect that an intervention has on our marginal likelihoods.

In [None]:
# Compare the Class marginal before and after forcing irradiat to 'no'.
class_before = ie.query()["Class"]
print("marginal Class", class_before)
all_no_irradiat = {'no': 1.0, 'yes': 0.0}
ie.do_intervention("irradiat", all_no_irradiat)
class_after = ie.query()["Class"]
print("updated marginal Class", class_after)

#### In this case, we can see that if 100% of people have no irradiat, then we estimate that the healthy rate would increase from 70.3% to 76.12%.