# Expectation Maximization

Expectation Maximization is a technique for estimating the parameters of a model from data that may have missing observations. It is closely related to the Maximum Likelihood Estimator in pgmpy, with one key distinction: rather than calling `df.dropna` to remove samples that contain `np.nan`, the distribution of the missing variables is estimated from the prior, given the variables that are observed in the same sample.

The algorithm in pgmpy is described in *Probabilistic Graphical Models: Principles and Techniques* by Daphne Koller and Nir Friedman, and one of the examples is the worked-out model from the book.

In [None]:
import numpy as np
import pandas as pd
import daft
from daft import PGM

from pgmpy.factors.discrete import TabularCPD
from pgmpy.models.BayesianModel import BayesianModel
from pgmpy.inference import VariableElimination
from pgmpy.estimators import MaximumLikelihoodEstimator, ExpectationMaximization

import matplotlib.pyplot as plt

In [None]:
def convert_pgm_to_pgmpy(pgm):
    """Build a pgmpy BayesianModel that mirrors the edges of a Daft PGM.

    Parameters
    ----------
    pgm
        A Daft ``PGM`` object. Its ``_edges`` collection is read directly;
        every edge is expected to expose ``node1`` and ``node2``, each with
        a ``name`` attribute.

    Returns
    -------
    BayesianModel
        A model constructed from ``(node1.name, node2.name)`` pairs, one per
        Daft edge. Only edges are transferred.
    """
    name_pairs = []
    for link in pgm._edges:
        # Each pair is ordered node1 -> node2, as stored on the Daft edge.
        name_pairs.append((link.node1.name, link.node2.name))
    return BayesianModel(name_pairs)

In [None]:
# Draw the network A -> C <- B, C -> D with Daft.
pgm = PGM(shape=[3, 4])

# (name, x, y) for every node; the label shown is the same as the name.
for node_name, col, row in [("A", 1, 4), ("B", 3, 4), ("C", 2, 3), ("D", 2, 2)]:
    pgm.add_node(daft.Node(node_name, node_name, col, row))

# Directed edges, listed as (from, to).
for parent, child in [("A", "C"), ("B", "C"), ("C", "D")]:
    pgm.add_edge(parent, child)

pgm.render()
plt.show()

In [None]:
# Turn the Daft graph built above into a pgmpy model (edges only; the CPDs
# are attached in a later cell).
model = convert_pgm_to_pgmpy(pgm)

In [None]:
# Root nodes A and B: binary variables with no evidence, one row per state.
cpd_a = TabularCPD("A", 2, [[0.7], [0.3]])
cpd_b = TabularCPD("B", 2, [[0.1], [0.9]])

# C is conditioned on A and B (both binary), hence four columns.
cpd_c = TabularCPD(
    "C",
    2,
    [
        [0.17, 0.91, 0.4, 0.8],
        [0.83, 0.09, 0.6, 0.2],
    ],
    evidence=["A", "B"],
    evidence_card=[2, 2],
)

# D is conditioned on C only, hence two columns.
cpd_d = TabularCPD(
    "D",
    2,
    [
        [0.9, 0.2],
        [0.1, 0.8],
    ],
    evidence=["C"],
    evidence_card=[2],
)

# Attach all four CPDs to the model, then validate it.
model.add_cpds(cpd_a, cpd_b, cpd_c, cpd_d)
model.check_model()

### Run some basic inference

In [None]:
# Inference engine over the fully parameterised model.
ve = VariableElimination(model)

In [None]:
# Query the distribution of D given the observations A=1 and B=0, with the
# progress bar turned off, and print the result.
q = ve.query(show_progress=False, evidence=dict(A=1, B=0), variables=["D"])
print(q)

In [None]:
# Two partially observed samples; np.nan marks a missing value.
#   sample 0: A=1 and D=0 observed, B and C missing
#   sample 1: B=1 and D=1 observed, A and C missing
data = pd.DataFrame({
    "A": [1, np.nan],
    "B": [np.nan, 1],
    "C": [np.nan, np.nan],
    "D": [0, 1],
})
print(data)

In [None]:
# Set up the EM estimator with the parameterised model and the partially
# observed samples defined above.
em = ExpectationMaximization(model, data)

In [None]:
# Print the result of estimate_cpd for node D with n_iter=1.
# NOTE(review): presumably this is the CPD of D after a single EM iteration;
# confirm against the ExpectationMaximization API in the installed pgmpy.
print(em.estimate_cpd(node="D",n_iter=1))

In [None]:
# Print every item returned by get_parameters(), one after another.
for cpd in em.get_parameters():
    print(cpd)

In [None]:
# Degenerate case: two samples in which every variable is missing.
# Broadcasting the scalar np.nan gives a 2x4 float frame of NaNs.
data = pd.DataFrame(np.nan, index=range(2), columns=["A", "B", "C", "D"])
print(data)

In [None]:
# Re-create the estimator on the data frame in which every value is missing.
em = ExpectationMaximization(model,data)

# Print every item returned by get_parameters() for this fully missing data.
for cpd in em.get_parameters():
    print(cpd)