In this notebook, we show an example of learning the parameters (CPDs) of a Discrete Bayesian Network given the data and the model structure. pgmpy has two main methods for learning the parameters:
1. Maximum Likelihood Estimator (pgmpy.estimators.MaximumLikelihoodEstimator)
2. Bayesian Estimator (pgmpy.estimators.BayesianEstimator)

In the examples below, we generate some data from a given model and then try to learn the model parameters back from the generated data.

### Step 1: Generate some data

In [1]:
# Draw a synthetic dataset from the example "alarm" network. The later cells
# try to learn this network's CPDs back from these samples.

from pgmpy.utils import get_example_model
from pgmpy.sampling import BayesianModelSampling

# Number of rows to sample from the network.
N_SAMPLES = 100000

alarm_model = get_example_model('alarm')
sampler = BayesianModelSampling(alarm_model)
samples = sampler.forward_sample(size=N_SAMPLES)
samples.head()

  "Found unknown state name. Trying to switch to using all state names as state numbers"
Generating for node: CVP: 100%|██████████| 37/37 [01:23<00:00,  2.27s/it]         


Unnamed: 0,MINVOLSET,VENTMACH,DISCONNECT,VENTTUBE,INTUBATION,PULMEMBOLUS,SHUNT,PAP,FIO2,KINKEDTUBE,...,HRBP,LVFAILURE,HISTORY,HYPOVOLEMIA,STROKEVOLUME,CO,BP,LVEDVOLUME,PCWP,CVP
0,NORMAL,NORMAL,False,LOW,ESOPHAGEAL,False,NORMAL,NORMAL,NORMAL,False,...,LOW,False,False,True,NORMAL,LOW,LOW,NORMAL,NORMAL,NORMAL
1,NORMAL,NORMAL,False,LOW,NORMAL,False,NORMAL,NORMAL,NORMAL,False,...,LOW,False,False,False,NORMAL,NORMAL,LOW,NORMAL,NORMAL,NORMAL
2,NORMAL,NORMAL,False,LOW,ESOPHAGEAL,False,NORMAL,NORMAL,NORMAL,False,...,NORMAL,False,False,False,NORMAL,HIGH,HIGH,NORMAL,NORMAL,NORMAL
3,NORMAL,NORMAL,False,LOW,NORMAL,False,NORMAL,NORMAL,NORMAL,False,...,HIGH,False,False,False,NORMAL,HIGH,HIGH,NORMAL,NORMAL,NORMAL
4,NORMAL,NORMAL,False,LOW,NORMAL,False,NORMAL,HIGH,NORMAL,False,...,HIGH,False,False,False,NORMAL,HIGH,HIGH,NORMAL,NORMAL,NORMAL


### Step 2: Define a model structure

In this case, since we are trying to learn the model parameters back, we will use the same model structure that was used to generate the data.

In [2]:
# Build a Bayesian Model from only the edges (structure) of the alarm model;
# the parameters for this structure are learned from the samples further down.

from pgmpy.models import BayesianModel

alarm_edges = alarm_model.edges()
model_struct = BayesianModel(ebunch=alarm_edges)
model_struct.nodes()

NodeView(('HYPOVOLEMIA', 'LVEDVOLUME', 'STROKEVOLUME', 'CVP', 'PCWP', 'LVFAILURE', 'HISTORY', 'CO', 'ERRLOWOUTPUT', 'HRBP', 'ERRCAUTER', 'HREKG', 'HRSAT', 'INSUFFANESTH', 'CATECHOL', 'ANAPHYLAXIS', 'TPR', 'BP', 'KINKEDTUBE', 'PRESS', 'VENTLUNG', 'FIO2', 'PVSAT', 'SAO2', 'PULMEMBOLUS', 'PAP', 'SHUNT', 'INTUBATION', 'MINVOL', 'VENTALV', 'DISCONNECT', 'VENTTUBE', 'MINVOLSET', 'VENTMACH', 'EXPCO2', 'ARTCO2', 'HR'))

### Step 3: Learning the model parameters 

In [3]:
# Learn the CPDs from the sampled data with the Maximum Likelihood Estimator.

from pgmpy.estimators import MaximumLikelihoodEstimator

mle = MaximumLikelihoodEstimator(model=model_struct, data=samples)

# CPDs can be estimated one node at a time ...
for node_name in ('FIO2', 'CVP'):
    print(mle.estimate_cpd(node=node_name))

# ... or for all the nodes in the model in a single call.
all_mle_cpds = mle.get_parameters()
all_mle_cpds[:10]  # Show just the first 10 CPDs in the output

+--------------+---------+
| FIO2(LOW)    | 0.05005 |
+--------------+---------+
| FIO2(NORMAL) | 0.94995 |
+--------------+---------+
+-------------+----------------------+----------------------+----------------------+
| LVEDVOLUME  | LVEDVOLUME(HIGH)     | LVEDVOLUME(LOW)      | LVEDVOLUME(NORMAL)   |
+-------------+----------------------+----------------------+----------------------+
| CVP(HIGH)   | 0.6989774078478003   | 0.011237205162438807 | 0.00951605298126795  |
+-------------+----------------------+----------------------+----------------------+
| CVP(LOW)    | 0.009845422116527943 | 0.9506008010680908   | 0.038807207052738366 |
+-------------+----------------------+----------------------+----------------------+
| CVP(NORMAL) | 0.29117717003567184  | 0.0381619937694704   | 0.9516767399659937   |
+-------------+----------------------+----------------------+----------------------+


[<TabularCPD representing P(ANAPHYLAXIS:2) at 0x7f14727362e8>,
 <TabularCPD representing P(ARTCO2:3 | VENTALV:4) at 0x7f1471306048>,
 <TabularCPD representing P(BP:3 | CO:3, TPR:3) at 0x7f1471306f28>,
 <TabularCPD representing P(CATECHOL:2 | ARTCO2:3, INSUFFANESTH:2, SAO2:3, TPR:3) at 0x7f1471306a20>,
 <TabularCPD representing P(CO:3 | HR:3, STROKEVOLUME:3) at 0x7f1471306fd0>,
 <TabularCPD representing P(CVP:3 | LVEDVOLUME:3) at 0x7f1471306630>,
 <TabularCPD representing P(DISCONNECT:2) at 0x7f1471306e48>,
 <TabularCPD representing P(ERRCAUTER:2) at 0x7f1471306dd8>,
 <TabularCPD representing P(ERRLOWOUTPUT:2) at 0x7f1471306320>,
 <TabularCPD representing P(EXPCO2:4 | ARTCO2:3, VENTLUNG:4) at 0x7f14712b8160>]

In [4]:
# Verify that the CPD learned for 'FIO2' is almost equal to the CPD of the
# model the data was sampled from (absolute tolerance of 0.01 per entry).

# No earlier cell shown imports numpy, so without this import the cell raises
# NameError on a fresh kernel (Restart & Run All).
import numpy as np

np.allclose(alarm_model.get_cpds('FIO2').values, mle.estimate_cpd('FIO2').values, atol=0.01)

True

In [5]:
# Learn the CPDs from the sampled data with the Bayesian Estimator.
from pgmpy.estimators import BayesianEstimator

best = BayesianEstimator(model=model_struct, data=samples)

# Single node with a BDeu prior, configured through equivalent_sample_size.
fio2_cpd = best.estimate_cpd(node='FIO2', prior_type="BDeu", equivalent_sample_size=1000)
print(fio2_cpd)

# Single node with a dirichlet prior: a uniform pseudo count for each state.
# Can also accept an array of the size of the CPD.
cvp_cpd = best.estimate_cpd(node='CVP', prior_type="dirichlet", pseudo_counts=100)
print(cvp_cpd)

# Learning CPDs for all the nodes in the model. With the BDeu prior only
# equivalent_sample_size is passed. Per the pgmpy documentation, it is the
# "dirichlet" prior that takes pseudo_counts (a single number or a per-node
# dict) when learning all parameters at once.
all_bayes_cpds = best.get_parameters(prior_type="BDeu", equivalent_sample_size=1000)
all_bayes_cpds[:10]

+--------------+----------+
| FIO2(LOW)    | 0.054505 |
+--------------+----------+
| FIO2(NORMAL) | 0.945495 |
+--------------+----------+
+-------------+----------------------+----------------------+----------------------+
| LVEDVOLUME  | LVEDVOLUME(HIGH)     | LVEDVOLUME(LOW)      | LVEDVOLUME(NORMAL)   |
+-------------+----------------------+----------------------+----------------------+
| CVP(HIGH)   | 0.6938335287221571   | 0.021640826873385012 | 0.010898174626886907 |
+-------------+----------------------+----------------------+----------------------+
| CVP(LOW)    | 0.014396248534583822 | 0.9306632213608957   | 0.04006430776672784  |
+-------------+----------------------+----------------------+----------------------+
| CVP(NORMAL) | 0.2917702227432591   | 0.04769595176571921  | 0.9490375176063852   |
+-------------+----------------------+----------------------+----------------------+


[<TabularCPD representing P(HYPOVOLEMIA:2) at 0x7f1472736e10>,
 <TabularCPD representing P(LVEDVOLUME:3 | HYPOVOLEMIA:2, LVFAILURE:2) at 0x7f147128b898>,
 <TabularCPD representing P(STROKEVOLUME:3 | HYPOVOLEMIA:2, LVFAILURE:2) at 0x7f147128bb70>,
 <TabularCPD representing P(CVP:3 | LVEDVOLUME:3) at 0x7f147128bbe0>,
 <TabularCPD representing P(PCWP:3 | LVEDVOLUME:3) at 0x7f147128b9e8>,
 <TabularCPD representing P(LVFAILURE:2) at 0x7f147128bc18>,
 <TabularCPD representing P(HISTORY:2 | LVFAILURE:2) at 0x7f147128bfd0>,
 <TabularCPD representing P(CO:3 | HR:3, STROKEVOLUME:3) at 0x7f147128bef0>,
 <TabularCPD representing P(ERRLOWOUTPUT:2) at 0x7f147128b6a0>,
 <TabularCPD representing P(HRBP:3 | ERRLOWOUTPUT:2, HR:3) at 0x7f147128b5f8>]

In [7]:
# Shortcut: model.fit() learns all the parameters with the chosen estimator
# and adds the resulting CPDs to the model in one call.

# Each entry is (estimator class, extra keyword arguments forwarded to fit()).
fit_configs = [
    (MaximumLikelihoodEstimator, {}),
    (BayesianEstimator, {'prior_type': 'BDeu', 'equivalent_sample_size': 1000}),
]

for estimator_cls, fit_kwargs in fit_configs:
    # Start from a fresh structure each time so the two fits stay independent.
    model_struct = BayesianModel(ebunch=alarm_model.edges())
    model_struct.fit(data=samples, estimator=estimator_cls, **fit_kwargs)
    print(model_struct.get_cpds('FIO2'))

+--------------+---------+
| FIO2(LOW)    | 0.05005 |
+--------------+---------+
| FIO2(NORMAL) | 0.94995 |
+--------------+---------+
+--------------+----------+
| FIO2(LOW)    | 0.054505 |
+--------------+----------+
| FIO2(NORMAL) | 0.945495 |
+--------------+----------+
