Skip to content

Commit

Permalink
MLE: Removed restriction that variable values/names must be integers …
Browse files Browse the repository at this point in the history
…+ minor bugfix
  • Loading branch information
chrisittner committed Jun 6, 2016
1 parent 0815f3c commit 04c4d14
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 30 deletions.
46 changes: 26 additions & 20 deletions pgmpy/estimators/MLE.py
Expand Up @@ -37,7 +37,7 @@ def __init__(self, model, data, node_values=None):

def get_parameters(self):
"""
Method used to get parameters.
Method to estimate the model parameters (CPDs).
Returns
-------
Expand Down Expand Up @@ -97,27 +97,33 @@ def _estimate_cpd(self, node):
╘══════╧══════════╛
"""

parents = self.model.get_parents(node)
parents = sorted(self.model.get_parents(node))
node_cardinality = len(self.node_values[node])
parents_cardinalities = np.array([len(self.node_values[parent]) for parent in parents])

if not parents:
state_counts = self.data.ix[:, node].value_counts()
state_counts = state_counts.reindex(sorted(state_counts.index))
cpd = TabularCPD(node, len(self.node_values[node]),
state_counts.values[:, np.newaxis],
state_names=self.node_values)
else:
parent_cardinalities = np.array([len(self.node_values[parent]) for parent in parents])
node_cardinality = len(self.node_values[node])
state_count_data = self.data.ix[:, node].value_counts()
state_counts = state_count_data.reindex(sorted(self.node_values[node])).fillna(0).values[:, np.newaxis]

values = self.data.groupby([node] + parents).size().unstack(parents).fillna(0)
if not len(values.columns) == np.prod(parent_cardinalities):
# some columns are missing if for some states of the parents no data was observed.
else:
state_count_data = self.data.groupby([node] + parents).size()
state_counts = state_count_data.unstack(parents).reindex(sorted(self.node_values[node])).fillna(0)
if isinstance(state_counts.index, pd.MultiIndex):
state_counts = state_counts.sortlevel(axis=1)
else:
state_counts = state_counts.sort_index(axis=1)

# some columns may be missing when no data was observed for some combinations of parent states:
if not len(state_counts.columns) == np.prod(parents_cardinalities):
possible_parents_states = [sorted(self.node_values[parent]) for parent in parents]
# reindex to add missing columns and fill in uniform (conditional) probabilities:
full_index = pd.MultiIndex.from_product([range(card) for card in parent_cardinalities], names=parents)
values = values.reindex(columns=full_index).fillna(1.0/node_cardinality)

cpd = TabularCPD(node, node_cardinality, np.array(values),
evidence=parents,
evidence_card=parent_cardinalities.astype('int'),
state_names=self.node_values)
full_index = pd.MultiIndex.from_product(possible_parents_states, names=parents)
state_counts = state_counts.reindex(columns=full_index).fillna(1.0 / node_cardinality)

state_names = {var: sorted(states) for var, states in self.node_values.items()}
cpd = TabularCPD(node, node_cardinality, np.array(state_counts),
evidence=parents,
evidence_card=parents_cardinalities,
state_names=state_names)
cpd.normalize()
return cpd
4 changes: 2 additions & 2 deletions pgmpy/estimators/base.py
Expand Up @@ -17,11 +17,11 @@ class BaseEstimator(object):
node_values: dict (optional)
A dict indicating, for each variable, the discrete set of values (realizations)
that the variable can take. If unspecified, the observed values in the data set
are taken as the only possible states.
are taken to be the only possible states.
"""
def __init__(self, model, data, node_values=None):
self.model = model
self.data = data.astype(np.int)
self.data = data
if not isinstance(node_values, dict):
self.node_values = {node: self._get_node_values(node) for node in model.nodes()}
else:
Expand Down
46 changes: 38 additions & 8 deletions pgmpy/tests/test_estimators/test_MaximumLikelihoodEstimator.py
Expand Up @@ -10,11 +10,11 @@ class TestMLE(unittest.TestCase):
def setUp(self):
self.m1 = BayesianModel([('A', 'C'), ('B', 'C')])
self.d1 = pd.DataFrame(data={'A': [0, 0, 1], 'B': [0, 1, 0], 'C': [1, 1, 0]})
self.cpds = cpds = [TabularCPD('A', 2, [[2.0/3], [1.0/3]]),
TabularCPD('B', 2, [[2.0/3], [1.0/3]]),
TabularCPD('C', 2, [[0.0, 0.0, 1.0, 0.5],
[1.0, 1.0, 0.0, 0.5]],
evidence=['A', 'B'], evidence_card=[2, 2])]
self.cpds = [TabularCPD('A', 2, [[2.0/3], [1.0/3]]),
TabularCPD('B', 2, [[2.0/3], [1.0/3]]),
TabularCPD('C', 2, [[0.0, 0.0, 1.0, 0.5],
[1.0, 1.0, 0.0, 0.5]],
evidence=['A', 'B'], evidence_card=[2, 2])]
self.mle1 = MaximumLikelihoodEstimator(self.m1, self.d1)

def test_get_parameters_missing_data(self):
Expand All @@ -25,10 +25,40 @@ def test_estimate_cpd(self):
self.assertEqual(self.mle1._estimate_cpd('B'), self.cpds[1])
self.assertEqual(self.mle1._estimate_cpd('C'), self.cpds[2])

def test_state_names1(self):
    """Estimate the CPD of 'B' given 'A' from data with non-integer-range states.

    'A' takes the integer values 2, 3 and 8, while 'B' takes the string
    values 'X' and 'O'. The estimated CPD for 'B' must equal the
    hand-written 2x3 table below.
    """
    network = BayesianModel([('A', 'B')])
    samples = pd.DataFrame(data={'A': [2, 3, 8, 8, 8],
                                 'B': ['X', 'O', 'X', 'O', 'X']})
    expected_table = [[0, 1, 1.0 / 3],
                      [1, 0, 2.0 / 3]]
    expected_cpd = TabularCPD('B', 2, expected_table,
                              evidence=['A'], evidence_card=[3])
    estimator = MaximumLikelihoodEstimator(network, samples)
    estimated_cpd = estimator._estimate_cpd('B')
    self.assertEqual(estimated_cpd, expected_cpd)

def test_state_names2(self):
    """Estimate the CPD of 'Color' from data with string and boolean states.

    'Color' has two parents, 'Light?' (booleans) and 'Fruit' (strings).
    The expected CPD lists the evidence as ['Fruit', 'Light?'] and has
    four rows, one per observed colour.
    """
    edges = [('Light?', 'Color'), ('Fruit', 'Color')]
    network = BayesianModel(edges)
    observations = {'Fruit': ['Apple', 'Apple', 'Apple', 'Banana', 'Banana'],
                    'Light?': [True, True, False, False, True],
                    'Color': ['red', 'green', 'black', 'black', 'yellow']}
    samples = pd.DataFrame(data=observations)
    expected_table = [[1, 0, 1, 0],
                      [0, 0.5, 0, 0],
                      [0, 0.5, 0, 0],
                      [0, 0, 0, 1]]
    expected_cpd = TabularCPD('Color', 4, expected_table,
                              evidence=['Fruit', 'Light?'],
                              evidence_card=[2, 2])
    estimator = MaximumLikelihoodEstimator(network, samples)
    self.assertEqual(estimator._estimate_cpd('Color'), expected_cpd)

def test_class_init(self):
mle2 = MaximumLikelihoodEstimator(self.m1, self.d1,
node_values={'A': [0, 1], 'B': [0, 1], 'C': [0, 1]})
self.assertSetEqual(set(mle2.get_parameters()), set(self.cpds))
mle = MaximumLikelihoodEstimator(self.m1, self.d1,
node_values={'A': [0, 1], 'B': [0, 1], 'C': [0, 1]})
self.assertSetEqual(set(mle.get_parameters()), set(self.cpds))

def test_nonoccurring_values(self):
    """Estimate all CPDs when ``node_values`` declares extra states.

    The declared states include values (23 for 'A', 42 for 'C') in
    addition to the 0/1 states used by the other tests, plus an entry
    under the key ``1``. The set of estimated CPDs must equal the set of
    hand-written CPDs below.
    """
    declared_states = {'A': [0, 1, 23], 'B': [0, 1], 'C': [0, 42, 1], 1: [2]}
    estimator = MaximumLikelihoodEstimator(self.m1, self.d1,
                                           node_values=declared_states)

    expected_a = TabularCPD('A', 3, [[2.0/3], [1.0/3], [0]])
    expected_b = TabularCPD('B', 2, [[2.0/3], [1.0/3]])
    # The last three columns of the expected table are uniform (1/3 each).
    expected_c = TabularCPD('C', 3,
                            [[0.0, 0.0, 1.0, 1.0/3, 1.0/3, 1.0/3],
                             [1.0, 1.0, 0.0, 1.0/3, 1.0/3, 1.0/3],
                             [0.0, 0.0, 0.0, 1.0/3, 1.0/3, 1.0/3]],
                            evidence=['A', 'B'], evidence_card=[3, 2])
    expected = {expected_a, expected_b, expected_c}

    self.assertSetEqual(set(estimator.get_parameters()), expected)

def tearDown(self):
del self.m1
Expand Down

0 comments on commit 04c4d14

Please sign in to comment.