Skip to content

Commit

Permalink
Extended BaseEstimator to accept optional dict of values that nodes c…
Browse files Browse the repository at this point in the history
…an take, adapted MLE, tests
  • Loading branch information
chrisittner committed May 31, 2016
1 parent ca2f7b6 commit cd06747
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 26 deletions.
41 changes: 21 additions & 20 deletions pgmpy/estimators/MLE.py
@@ -1,3 +1,5 @@
# coding:utf-8

from pgmpy.estimators import BaseEstimator
from pgmpy.factors import TabularCPD
from pgmpy.models import BayesianModel
Expand Down Expand Up @@ -27,11 +29,11 @@ class MaximumLikelihoodEstimator(BaseEstimator):
>>> model = BayesianModel([('A', 'B'), ('C', 'B'), ('C', 'D'), ('B', 'E')])
>>> estimator = MaximumLikelihoodEstimator(model, data)
"""
def __init__(self, model, data):
def __init__(self, model, data, node_values=None):
if not isinstance(model, BayesianModel):
raise NotImplementedError("Maximum Likelihood Estimate is only implemented for BayesianModel")

super(MaximumLikelihoodEstimator, self).__init__(model, data)
super(MaximumLikelihoodEstimator, self).__init__(model, data, node_values)

def get_parameters(self):
"""
Expand All @@ -48,16 +50,15 @@ def get_parameters(self):
>>> import pandas as pd
>>> from pgmpy.models import BayesianModel
>>> from pgmpy.estimators import MaximumLikelihoodEstimator
>>> data = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 5)),
... columns=['A', 'B', 'C', 'D', 'E'])
>>> model = BayesianModel([('A', 'B'), ('C', 'B'), ('C', 'D'), ('B', 'E')])
>>> estimator = MaximumLikelihoodEstimator(model, data)
>>> values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 4)),
... columns=['A', 'B', 'C', 'D'])
>>> model = BayesianModel([('A', 'B'), ('C', 'B'), ('C', 'D')])
>>> estimator = MaximumLikelihoodEstimator(model, values)
>>> estimator.get_parameters()
[<TabularCPD representing P(B:2 | A:2, C:2) at 0x7f682187fb70>,
<TabularCPD representing P(A:2) at 0x7f682187f860>,
<TabularCPD representing P(E:2 | B:2) at 0x7f6826a7a9e8>,
<TabularCPD representing P(C:2) at 0x7f682187ff98>,
<TabularCPD representing P(D:2 | C:2) at 0x7f682187fdd8>]
[<TabularCPD representing P(C:2) at 0x7f7b534251d0>,
<TabularCPD representing P(B:2 | C:2, A:2) at 0x7f7b4dfd4da0>,
<TabularCPD representing P(A:2) at 0x7f7b4dfd4fd0>,
<TabularCPD representing P(D:2 | C:2) at 0x7f7b4df822b0>]
"""
parameters = []

Expand Down Expand Up @@ -87,7 +88,7 @@ def _estimate_cpd(self, node):
>>> from pgmpy.estimators import MaximumLikelihoodEstimator
>>> data = pd.DataFrame(data={'A': [0, 0, 1], 'B': [0, 1, 0], 'C': [1, 1, 0]})
>>> model = BayesianModel([('A', 'C'), ('B', 'C')])
>>> cpd_A = MaximumLikelihoodEstimator(model, data)._get_CPD('A')
>>> cpd_A = MaximumLikelihoodEstimator(model, data)._estimate_cpd('A')
>>> print(str(cpd_A))
╒═════╤══════════╕
│ A_0 │ 0.666667 │
Expand All @@ -100,21 +101,21 @@ def _estimate_cpd(self, node):
if not parents:
state_counts = self.data.ix[:, node].value_counts()
state_counts = state_counts.reindex(sorted(state_counts.index))
cpd = TabularCPD(node, self.node_card[node],
cpd = TabularCPD(node, len(self.node_values[node]),
state_counts.values[:, np.newaxis])
else:
parent_card = np.array([self.node_card[parent] for parent in parents])
var_card = self.node_card[node]
parent_cardinalities = np.array([len(self.node_values[parent]) for parent in parents])
node_cardinality = len(self.node_values[node])

values = self.data.groupby([node] + parents).size().unstack(parents).fillna(0)
if not len(values.columns) == np.prod(parent_card):
if not len(values.columns) == np.prod(parent_cardinalities):
# some columns are missing if for some states of the parents no data was observed.
# reindex to add missing columns and fill in uniform (conditional) probabilities:
full_index = pd.MultiIndex.from_product([range(card) for card in parent_card], names=parents)
values = values.reindex(columns=full_index).fillna(1.0/var_card)
full_index = pd.MultiIndex.from_product([range(card) for card in parent_cardinalities], names=parents)
values = values.reindex(columns=full_index).fillna(1.0/node_cardinality)

cpd = TabularCPD(node, var_card, np.array(values),
cpd = TabularCPD(node, node_cardinality, np.array(values),
evidence=parents,
evidence_card=parent_card.astype('int'))
evidence_card=parent_cardinalities.astype('int'))
cpd.normalize()
return cpd
28 changes: 22 additions & 6 deletions pgmpy/estimators/base.py
Expand Up @@ -4,20 +4,36 @@

class BaseEstimator(object):
"""
Base class for estimator class in pgmpy. Estimator class is used for parameter estimation as well
as structure estimation
Base class for parameter estimators in pgmpy.
Parameters
----------
model: pgmpy.models.BayesianModel or pgmpy.models.MarkovModel or pgmpy.models.NoisyOrModel
model for which parameter estimation is to be done
data: pandas DataFrame object
datafame object with column names same as the variable names of the network
DataFrame object with column names identical to the variable names of the model
node_values: dict (optional)
A dict indicating, for each variable, the discrete set of values (realizations)
that the variable can take. If unspecified, the observed values in the data set
are taken as the only possible states.
"""
def __init__(self, model, data):
def __init__(self, model, data, node_values=None):
self.model = model
self.data = data.astype(np.int)
if not isinstance(node_values, dict):
self.node_values = {node: self._get_node_values(node) for node in model.nodes()}
else:
self.node_values = dict()
for node in model.nodes():
if node in node_values:
if not set(self._get_node_values(node)) <= set(node_values[node]):
raise ValueError("Data contains unexpected values for variable '" + str(node) + "'.")
self.node_values[node] = node_values[node]
else:
self.node_values[node] = self._get_node_values(node)

get_node_card = lambda _node, _data: _data.ix[:, _node].value_counts().shape[0]
self.node_card = {_node: get_node_card(_node, data) for _node in self.model.nodes()}
def _get_node_values(self, node):
values = list(self.data.ix[:, node].unique())
return values
Expand Up @@ -25,6 +25,11 @@ def test_estimate_cpd(self):
self.assertEqual(self.mle1._estimate_cpd('B'), self.cpds[1])
self.assertEqual(self.mle1._estimate_cpd('C'), self.cpds[2])

def test_class_init(self):
mle2 = MaximumLikelihoodEstimator(self.m1, self.d1,
node_values={'A': [0, 1], 'B': [0, 1], 'C': [0, 1]})
self.assertSetEqual(set(mle2.get_parameters()), set(self.cpds))

def tearDown(self):
del self.m1
del self.d1

0 comments on commit cd06747

Please sign in to comment.