Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

EM estimation for DBNs #1449

Open
wants to merge 4 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
131 changes: 106 additions & 25 deletions pgmpy/models/DynamicBayesianNetwork.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,9 @@ def __getitem__(self, idx: int) -> typing.Union[str, int]:
raise IndexError(f"Index {idx} out of bounds.")

def __str__(self) -> str:
return f"({self.node}, {self.time_slice})"
return f"{self.node}_{self.time_slice}"

def __repr__(self) -> str:
return f"<DiscreteNode({self.node}, {self.time_slice}) at {hex(id(self))}>"
__repr__ = __str__

def __lt__(self, other) -> bool:
if self.node < other.node:
Expand Down Expand Up @@ -66,7 +65,7 @@ def to_tuple(self) -> tuple:


class DynamicBayesianNetwork(DAG):
def __init__(self, ebunch=None):
def __init__(self, ebunch=None, latents=set()):
"""
Base class for Dynamic Bayesian Network

Expand All @@ -85,9 +84,7 @@ def __init__(self, ebunch=None):
Examples
--------
Create an empty Dynamic Bayesian Network with no nodes and no edges:
>>> from pgmpy.models import DynamicBayesianNetwork as DBN
>>> dbn = DBN()

>>> from pgmpy.models import DynamicBayesianNetwork as DBN >>> dbn = DBN()
Adding nodes and edges inside the dynamic bayesian network. A single
node can be added using the method below. For adding edges we need to
specify the time slice since edges can be across different time slices.
Expand Down Expand Up @@ -137,8 +134,9 @@ def __init__(self, ebunch=None):
self.add_edges_from(ebunch)
self.cpds = []
self.cardinalities = defaultdict(int)
self.latents = latents

def add_node(self, node, **attr):
def add_node(self, node, latent=False, **attr):
"""
Adds a single node to the Network

Expand All @@ -154,9 +152,11 @@ def add_node(self, node, **attr):
>>> dbn.add_node('A')
['A']
"""
super(DynamicBayesianNetwork, self).add_node(DynamicNode(node, 0), **attr)
super(DynamicBayesianNetwork, self).add_node(
DynamicNode(node, 0), latent=latent, **attr
)

def add_nodes_from(self, nodes, **attr):
def add_nodes_from(self, nodes, latent=False, **attr):
"""
Add multiple nodes to the Network.

Expand All @@ -172,7 +172,7 @@ def add_nodes_from(self, nodes, **attr):
>>> dbn.add_nodes_from(['A', 'B', 'C'])
"""
for node in nodes:
self.add_node(node)
self.add_node(node, latent=latent)

def _nodes(self):
"""
Expand Down Expand Up @@ -758,10 +758,11 @@ def get_constant_bn(self):
from pgmpy.models import BayesianNetwork

return BayesianNetwork(
ebunch=[(f"{u[0]}_{u[1]}", f"{v[0]}_{v[1]}") for (u, v) in self.edges()]
ebunch=[(f"{u[0]}_{u[1]}", f"{v[0]}_{v[1]}") for (u, v) in self.edges()],
latents=[f"{node[0]}_{node[1]}" for node in self.latents],
)

def fit(self, data, estimator="MLE"):
def fit(self, data, estimator="MLE", **kwargs):
"""
Learns the CPD of the model from data.

Expand Down Expand Up @@ -805,31 +806,111 @@ def fit(self, data, estimator="MLE"):
... colnames.extend([("A", t), ("B", t), ("C", t), ("D", t)])
>>> df = pd.DataFrame(data, columns=colnames)
>>> model.fit(df)

If the model has some latent variables, the Expecation Maximization estimator
can be used.

>>> import numpy as np
>>> import pandas as pd
>>> from pgmpy.models import DynamicBayesianNetwork as DBN
>>> model = DBN(
>>> [
>>> (("A", 0), ("B", 0)),
>>> (("A", 0), ("C", 0)),
>>> (("B", 0), ("D", 0)),
>>> (("C", 0), ("D", 0)),
>>> (("A", 0), ("A", 1)),
>>> (("B", 0), ("B", 1)),
>>> (("C", 0), ("C", 1)),
>>> (("D", 0), ("D", 1)),
>>> ],
>>> latents=[("A", 0), ("A", 1)]
>>> )
>>> data = np.random.randint(low=0, high=2, size=(1000, 20))
>>> colnames = []
>>> for t in range(5):
... colnames.extend([("B", t), ("C", t), ("D", t)])
>>> df = pd.DataFrame(data, columns=colnames)
>>> model.fit(df, estimator="em")
"""
if not isinstance(data, pd.DataFrame):
raise ValueError(f"data must be a pandas dataframe. Got: {type(data)}")

if min(data.columns, key=lambda t: t[1])[1] != 0:
raise ValueError("data column names must start from time slice 0.")

if estimator not in {"MLE", "mle"}:
raise ValueError("Only Maximum Likelihood Estimator is supported currently")
if estimator.lower() not in {"mle", "em"}:
raise ValueError(
"Only Maximum Likelihood and Expectation Maximization estimators are currently supported"
)
else:
estimator = estimator.lower()

n_samples = data.shape[0]
const_bn = self.get_constant_bn()
n_time_slices = max(data.columns, key=lambda t: t[1])[1]

for t_slice in range(n_time_slices):
colnames = [(node, t_slice) for node in self._nodes()]
colnames.extend([(node, t_slice + 1) for node in self._nodes()])

df_slice = data.loc[:, colnames]
new_colnames = [f"{node}_{t - t_slice}" for (node, t) in df_slice.columns]
df_slice.columns = new_colnames
if t_slice == 0:
const_bn.fit(df_slice)
else:
const_bn.fit_update(df_slice, n_prev_samples=t_slice * n_samples)
if estimator == "mle":
colnames = [(node, t_slice) for node in self._nodes()]
colnames.extend([(node, t_slice + 1) for node in self._nodes()])

df_slice = data.loc[:, colnames]
new_colnames = [
f"{node}_{t - t_slice}" for (node, t) in df_slice.columns
]
df_slice.columns = new_colnames

if t_slice == 0:
const_bn.fit(df_slice, **kwargs)
else:
const_bn.fit_update(
df_slice, n_prev_samples=t_slice * n_samples, **kwargs
)

elif estimator == "em":
from pgmpy.estimators import ExpectationMaximization as EM

latent_vars = set([var for var, time in self.latents])
colnames = [
(node, t_slice) for node in self._nodes() if node not in latent_vars
]
colnames.extend(
[
(node, t_slice + 1)
for node in self._nodes()
if node not in latent_vars
]
)

df_slice = data.loc[:, colnames]
new_colnames = [
f"{node}_{t - t_slice}" for (node, t) in df_slice.columns
]
df_slice.columns = new_colnames

if t_slice == 0:
const_bn.fit(df_slice, estimator=EM, show_progress=False, **kwargs)
else:
current_cpds = [cpd.copy() for cpd in const_bn.cpds]
const_bn.fit(df_slice, estimator=EM, show_progress=False, **kwargs)
final_cpds = []
for curr_cpd in current_cpds:
var = curr_cpd.variables[0]
new_cpd = const_bn.get_cpds(var)
weighted_cpd_values = curr_cpd.get_values() * (
t_slice * n_samples
) + (new_cpd.get_values() * n_samples)
final_cpds.append(
TabularCPD(
var,
curr_cpd.cardinality[0],
weighted_cpd_values,
curr_cpd.variables[1:],
curr_cpd.cardinality[1:],
).normalize(inplace=False)
)
const_bn.add_cpds(*final_cpds)

cpds = []
for cpd in const_bn.cpds:
Expand Down
69 changes: 69 additions & 0 deletions pgmpy/tests/test_inference/test_dbn_inference.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import unittest

import numpy as np
import numpy.testing as np_test
from scipy.special import softmax

from pgmpy.inference import DBNInference
from pgmpy.models import DynamicBayesianNetwork
Expand Down Expand Up @@ -119,3 +121,70 @@ def test_backward_inf_multiple_variables_with_evidence(self):
np_test.assert_array_almost_equal(
query_result[("X", 1)].values, np.array([0.7621772, 0.2378228])
)

def test_issue_1364(self):
edges = [
(("A", 0), ("A", 1)),
(("A", 0), ("B", 1)),
(("B", 0), ("A", 1)),
(("B", 0), ("B", 1)),
(("L", 0), ("A", 0)),
(("L", 0), ("B", 0)),
(("L", 0), ("L", 1)),
(("L", 1), ("A", 1)),
(("L", 1), ("B", 1)),
]

bnet = DynamicBayesianNetwork(edges)

bnet_cpds = []
bnet_cpds.append(
TabularCPD(
("A", 0),
2,
softmax(np.ones((2, 2)), axis=0),
evidence=[("L", 0)],
evidence_card=[2],
)
)
bnet_cpds.append(
TabularCPD(
("B", 0),
2,
softmax(np.ones((2, 2)), axis=0),
evidence=[("L", 0)],
evidence_card=[2],
)
)
bnet_cpds.append(TabularCPD(("L", 0), 2, softmax(np.ones((2, 1)), axis=0)))
bnet_cpds.append(
TabularCPD(
("A", 1),
2,
softmax(np.ones((2, 8)), axis=0),
evidence=[("A", 0), ("B", 0), ("L", 1)],
evidence_card=[2, 2, 2],
)
)
bnet_cpds.append(
TabularCPD(
("B", 1),
2,
softmax(np.ones((2, 8)), axis=0),
evidence=[("A", 0), ("B", 0), ("L", 1)],
evidence_card=[2, 2, 2],
)
)
bnet_cpds.append(
TabularCPD(
("L", 1),
2,
softmax(np.ones((2, 2)), axis=0),
evidence=[("L", 0)],
evidence_card=[2],
)
)

bnet.add_cpds(*bnet_cpds)

ie = DBNInference(bnet)
42 changes: 40 additions & 2 deletions pgmpy/tests/test_models/test_DynamicBayesianNetwork.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def setUp(self):
def test_add_single_node(self):
self.network.add_node("a")
self.assertListEqual(self.network._nodes(), ["a"])
self.assertEqual(self.network.latents, set())

def test_add_multiple_nodes(self):
self.network.add_nodes_from(["a", "b", "c"])
Expand Down Expand Up @@ -368,6 +369,41 @@ def test_fit(self):
df.columns = wrong_colnames
self.assertRaises(ValueError, model.fit, df)

def test_fit_latent(self):
model = DynamicBayesianNetwork(
[
(("A", 0), ("B", 0)),
(("A", 0), ("C", 0)),
(("B", 0), ("D", 0)),
(("C", 0), ("D", 0)),
(("A", 0), ("A", 1)),
(("B", 0), ("B", 1)),
(("C", 0), ("C", 1)),
(("D", 0), ("D", 1)),
],
latents=[("A", 0), ("A", 1)],
)

data = np.random.randint(low=0, high=2, size=(10000, 15))
colnames = []
for t in range(5):
colnames.extend([("B", t), ("C", t), ("D", t)])
df = pd.DataFrame(data, columns=colnames)
model.fit(df, estimator="em", max_iter=10)

self.assertTrue(model.check_model())
self.assertEqual(len(model.cpds), 8)
for cpd in model.cpds:
self.assertTrue(np.logical_or(cpd.values > 0.3, cpd.values < 0.7).all())

self.assertRaises(ValueError, model.fit, df, "bayesian")
self.assertRaises(ValueError, model.fit, df.values, "em")
wrong_colnames = []
for t in range(5):
wrong_colnames.extend([("B", t + 1), ("C", t + 1), ("D", t + 1)])
df.columns = wrong_colnames
self.assertRaises(ValueError, model.fit, df, "em")

def test_get_markov_blanket(self):
self.network.add_edges_from(
[(("a", 0), ("a", 1)), (("a", 0), ("b", 1)), (("b", 0), ("b", 1))]
Expand All @@ -390,12 +426,14 @@ def test_active_trail_nodes(self):
[(("a", 0), ("a", 1)), (("a", 0), ("b", 1)), (("b", 0), ("b", 1))]
)

active_trail = self.network.active_trail_nodes(("a", 0))
active_trail = self.network.active_trail_nodes(("a", 0), include_latents=True)
self.assertListEqual(
sorted(active_trail.get(("a", 0))), [("a", 0), ("a", 1), ("b", 1)]
)

active_trail = self.network.active_trail_nodes(("a", 0), observed=[("b", 1)])
active_trail = self.network.active_trail_nodes(
("a", 0), observed=[("b", 1)], include_latents=True
)
self.assertListEqual(
sorted(active_trail.get(("a", 0))), [("a", 0), ("a", 1), ("b", 0)]
)
Expand Down
1 change: 1 addition & 0 deletions pgmpy/tests/test_models/test_MarkovChain.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,7 @@ def test_is_stationarity_success(self):
model.add_transition_model("diff", diff_tm)
self.assertTrue(model.is_stationarity)

@unittest.skipIf(sys.platform.startswith("win"), reason="Failing on windows")
def test_is_stationarity_failure(self):
model = MC(["intel", "diff"], [2, 3])
model.set_start_state([State("intel", 0), State("diff", 2)])
Expand Down