# Learning Tree-augmented Naive Bayes (TAN) Structure from Data

## First, create a Naive Bayes graph

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

from pgmpy.models.BayesianModel import BayesianModel

# Naive Bayes structure: the class variable A is the only parent of every
# feature variable (R, B, C, D and E).
naive_bayes_edges = [('A', feature) for feature in ('R', 'B', 'C', 'D', 'E')]
model = BayesianModel(naive_bayes_edges)

# Show the graph with its nodes laid out on a circle.
nx.draw_circular(
    model,
    with_labels=True,
    arrowsize=30,
    node_size=700,
    alpha=0.3,
)
plt.show()


## Second, add interaction between the features

In [None]:
# Augment the Naive Bayes graph: feature R becomes an additional parent of
# each of the remaining features B, C, D and E.
feature_edges = [('R', child) for child in ('B', 'C', 'D', 'E')]
model.add_edges_from(feature_edges)

# Redraw the graph so the new feature-to-feature edges are visible.
nx.draw_circular(
    model,
    with_labels=True,
    arrowsize=30,
    node_size=700,
    alpha=0.3,
)
plt.show()


## Then, parameterize our graph to create a Bayesian network

In [None]:
from pgmpy.factors.discrete import TabularCPD

# Attach one CPD to each node (CPDs belong to nodes, not to edges).
# Each table has one row per state of the variable and one column per
# configuration of its parents, so every column must sum to 1.

# A is the class variable: 2 states, no parents.
cpd_a = TabularCPD('A', 2, [[0.7], [0.3]])

# R has 3 states and a single parent A (2 states) -> 2 columns.
cpd_r = TabularCPD(
    'R', 3,
    [[0.6, 0.2],
     [0.3, 0.5],
     [0.1, 0.3]],
    evidence=['A'], evidence_card=[2],
)

# B, C, D and E each have parents A (2 states) and R (3 states)
# -> 2 * 3 = 6 columns per table.
cpd_b = TabularCPD(
    'B', 3,
    [[0.1, 0.1, 0.2, 0.2, 0.7, 0.1],
     [0.1, 0.3, 0.1, 0.2, 0.1, 0.2],
     [0.8, 0.6, 0.7, 0.6, 0.2, 0.7]],
    evidence=['A', 'R'], evidence_card=[2, 3],
)
cpd_c = TabularCPD(
    'C', 2,
    [[0.7, 0.2, 0.2, 0.5, 0.1, 0.3],
     [0.3, 0.8, 0.8, 0.5, 0.9, 0.7]],
    evidence=['A', 'R'], evidence_card=[2, 3],
)
cpd_d = TabularCPD(
    'D', 3,
    [[0.3, 0.8, 0.2, 0.8, 0.4, 0.7],
     [0.4, 0.1, 0.4, 0.1, 0.1, 0.1],
     [0.3, 0.1, 0.4, 0.1, 0.5, 0.2]],
    evidence=['A', 'R'], evidence_card=[2, 3],
)
cpd_e = TabularCPD(
    'E', 2,
    [[0.5, 0.6, 0.6, 0.5, 0.5, 0.4],
     [0.5, 0.4, 0.4, 0.5, 0.5, 0.6]],
    evidence=['A', 'R'], evidence_card=[2, 3],
)
model.add_cpds(cpd_a, cpd_r, cpd_b, cpd_c, cpd_d, cpd_e)

# Fail fast on a typo in the hand-written tables: per the pgmpy docs,
# check_model() verifies that each CPD matches its node's parents in the
# graph and that the probabilities are normalised, raising an error otherwise.
model.check_model()


## Next, generate sample data from our Bayesian network

In [None]:
from pgmpy.sampling import BayesianModelSampling

# Draw 10000 samples from the parameterized network by forward sampling,
# requesting the result as a dataframe, then print it.
sampling_options = {'size': 10000, 'return_type': 'dataframe'}
inference = BayesianModelSampling(model)
df_data = inference.forward_sample(**sampling_options)
print(df_data)


## Now we are ready to learn the TAN structure from sample data

In [None]:
from pgmpy.estimators import TreeSearch

# Learn a TAN structure from the sampled data, using R as the root node and
# A as the class node.
tan_options = {'root_node': 'R', 'return_type': 'tan', 'class_node': 'A'}
est = TreeSearch(df_data, **tan_options)
dag = est.estimate()

# Plot the learned graph with the same circular layout used for the
# original model.
nx.draw_circular(
    dag,
    with_labels=True,
    arrowsize=30,
    node_size=700,
    alpha=0.3,
)
plt.show()


## Finally, parameterize the learned graph from data (see the other tutorials for more options)

In [None]:
from pgmpy.estimators import BayesianEstimator

# Parameter learning can be done in many ways; this is one example.
# Build a new model on the learned edges and fit its CPDs to the sampled
# data with a Bayesian estimator using a Dirichlet prior.
learned_edges = dag.edges()
model = BayesianModel(learned_edges)
model.fit(
    df_data,
    estimator=BayesianEstimator,
    prior_type='dirichlet',
    pseudo_counts=0.1,
)

