MLE: Removed restriction that variable values/names must be integers + minor bugfix
pgmpy · Jun 6, 2016 · 04c4d14
1 parent 0815f3c
commit 04c4d14
Show file tree

Hide file tree

Showing 3 changed files with 66 additions and 30 deletions.
diff --git a/pgmpy/estimators/MLE.py b/pgmpy/estimators/MLE.py
@@ -37,7 +37,7 @@ def __init__(self, model, data, node_values=None):
 
     def get_parameters(self):
         """
-        Method used to get parameters.
+        Method to estimate the model parameters (CPDs).
 
         Returns
         -------
@@ -97,27 +97,33 @@ def _estimate_cpd(self, node):
         ╘══════╧══════════╛
         """
 
-        parents = self.model.get_parents(node)
+        parents = sorted(self.model.get_parents(node))
+        node_cardinality = len(self.node_values[node])
+        parents_cardinalities = np.array([len(self.node_values[parent]) for parent in parents])
+
         if not parents:
-            state_counts = self.data.ix[:, node].value_counts()
-            state_counts = state_counts.reindex(sorted(state_counts.index))
-            cpd = TabularCPD(node, len(self.node_values[node]),
-                             state_counts.values[:, np.newaxis],
-                             state_names=self.node_values)
-        else:
-            parent_cardinalities = np.array([len(self.node_values[parent]) for parent in parents])
-            node_cardinality = len(self.node_values[node])
+            state_count_data = self.data.ix[:, node].value_counts()
+            state_counts = state_count_data.reindex(sorted(self.node_values[node])).fillna(0).values[:, np.newaxis]
 
-            values = self.data.groupby([node] + parents).size().unstack(parents).fillna(0)
-            if not len(values.columns) == np.prod(parent_cardinalities):
-                # some columns are missing if for some states of the parents no data was observed.
+        else:
+            state_count_data = self.data.groupby([node] + parents).size()
+            state_counts = state_count_data.unstack(parents).reindex(sorted(self.node_values[node])).fillna(0)
+            if isinstance(state_counts.index, pd.MultiIndex):
+                state_counts = state_counts.sortlevel(axis=1)
+            else:
+                state_counts = state_counts.sort_index(axis=1)
+
+            # some columns might be missing if for some states of the parents no data was observed:
+            if not len(state_counts.columns) == np.prod(parents_cardinalities):
+                possible_parents_states = [sorted(self.node_values[parent]) for parent in parents]
                 # reindex to add missing columns and fill in uniform (conditional) probabilities:
-                full_index = pd.MultiIndex.from_product([range(card) for card in parent_cardinalities], names=parents)
-                values = values.reindex(columns=full_index).fillna(1.0/node_cardinality)
-
-            cpd = TabularCPD(node, node_cardinality, np.array(values),
-                             evidence=parents,
-                             evidence_card=parent_cardinalities.astype('int'),
-                             state_names=self.node_values)
+                full_index = pd.MultiIndex.from_product(possible_parents_states, names=parents)
+                state_counts = state_counts.reindex(columns=full_index).fillna(1.0 / node_cardinality)
+
+        state_names = {var: sorted(states) for var, states in self.node_values.items()}
+        cpd = TabularCPD(node, node_cardinality, np.array(state_counts),
+                         evidence=parents,
+                         evidence_card=parents_cardinalities,
+                         state_names=state_names)
         cpd.normalize()
         return cpd
diff --git a/pgmpy/estimators/base.py b/pgmpy/estimators/base.py
@@ -17,11 +17,11 @@ class BaseEstimator(object):
     node_values: dict (optional)
         A dict indicating, for each variable, the discrete set of values (realizations)
         that the variable can take. If unspecified, the observed values in the data set
-        are taken as the only possible states.
+        are taken to be the only possible states.
     """
     def __init__(self, model, data, node_values=None):
         self.model = model
-        self.data = data.astype(np.int)
+        self.data = data
         if not isinstance(node_values, dict):
             self.node_values = {node: self._get_node_values(node) for node in model.nodes()}
         else:

diff --git a/pgmpy/tests/test_estimators/test_MaximumLikelihoodEstimator.py b/pgmpy/tests/test_estimators/test_MaximumLikelihoodEstimator.py
@@ -10,11 +10,11 @@ class TestMLE(unittest.TestCase):
     def setUp(self):
         self.m1 = BayesianModel([('A', 'C'), ('B', 'C')])
         self.d1 = pd.DataFrame(data={'A': [0, 0, 1], 'B': [0, 1, 0], 'C': [1, 1, 0]})
-        self.cpds = cpds = [TabularCPD('A', 2, [[2.0/3], [1.0/3]]),
-                            TabularCPD('B', 2, [[2.0/3], [1.0/3]]),
-                            TabularCPD('C', 2, [[0.0, 0.0, 1.0, 0.5],
-                                                [1.0, 1.0, 0.0, 0.5]],
-                                       evidence=['A', 'B'], evidence_card=[2, 2])]
+        self.cpds = [TabularCPD('A', 2, [[2.0/3], [1.0/3]]),
+                     TabularCPD('B', 2, [[2.0/3], [1.0/3]]),
+                     TabularCPD('C', 2, [[0.0, 0.0, 1.0, 0.5],
+                                         [1.0, 1.0, 0.0, 0.5]],
+                                evidence=['A', 'B'], evidence_card=[2, 2])]
         self.mle1 = MaximumLikelihoodEstimator(self.m1, self.d1)
 
     def test_get_parameters_missing_data(self):
@@ -25,10 +25,40 @@ def test_estimate_cpd(self):
         self.assertEqual(self.mle1._estimate_cpd('B'), self.cpds[1])
         self.assertEqual(self.mle1._estimate_cpd('C'), self.cpds[2])
 
+    def test_state_names1(self):
+        m = BayesianModel([('A', 'B')])
+        d = pd.DataFrame(data={'A': [2, 3, 8, 8, 8], 'B': ['X', 'O', 'X', 'O', 'X']})
+        cpd_b = TabularCPD('B', 2, [[0, 1, 1.0 / 3], [1, 0, 2.0 / 3]],
+                           evidence=['A'], evidence_card=[3])
+        mle2 = MaximumLikelihoodEstimator(m, d)
+        self.assertEqual(mle2._estimate_cpd('B'), cpd_b)
+
+    def test_state_names2(self):
+        m = BayesianModel([('Light?', 'Color'), ('Fruit', 'Color')])
+        d = pd.DataFrame(data={'Fruit': ['Apple', 'Apple', 'Apple', 'Banana', 'Banana'],
+                               'Light?': [True,   True,   False,   False,    True],
+                               'Color': ['red',   'green', 'black', 'black',  'yellow']})
+        color_cpd = TabularCPD('Color', 4, [[1, 0, 1, 0], [0, 0.5, 0, 0],
+                                            [0, 0.5, 0, 0], [0, 0, 0, 1]],
+                               evidence=['Fruit', 'Light?'], evidence_card=[2, 2])
+        mle2 = MaximumLikelihoodEstimator(m, d)
+        self.assertEqual(mle2._estimate_cpd('Color'), color_cpd)
+
     def test_class_init(self):
-        mle2 = MaximumLikelihoodEstimator(self.m1, self.d1,
-                                          node_values={'A': [0, 1], 'B': [0, 1], 'C': [0, 1]})
-        self.assertSetEqual(set(mle2.get_parameters()), set(self.cpds))
+        mle = MaximumLikelihoodEstimator(self.m1, self.d1,
+                                         node_values={'A': [0, 1], 'B': [0, 1], 'C': [0, 1]})
+        self.assertSetEqual(set(mle.get_parameters()), set(self.cpds))
+
+    def test_nonoccurring_values(self):
+        mle = MaximumLikelihoodEstimator(self.m1, self.d1,
+                                         node_values={'A': [0, 1, 23], 'B': [0, 1], 'C': [0, 42, 1], 1: [2]})
+        cpds = [TabularCPD('A', 3, [[2.0/3], [1.0/3], [0]]),
+                TabularCPD('B', 2, [[2.0/3], [1.0/3]]),
+                TabularCPD('C', 3, [[0.0, 0.0, 1.0, 1.0/3, 1.0/3, 1.0/3],
+                                    [1.0, 1.0, 0.0, 1.0/3, 1.0/3, 1.0/3],
+                                    [0.0, 0.0, 0.0, 1.0/3, 1.0/3, 1.0/3]],
+                           evidence=['A', 'B'], evidence_card=[3, 2])]
+        self.assertSetEqual(set(mle.get_parameters()), set(cpds))
 
     def tearDown(self):
         del self.m1