Extended BaseEstimator to accept optional dict of values that nodes can take, adapted MLE, tests
pgmpy · May 31, 2016 · cd06747 · cd06747
1 parent ca2f7b6
commit cd06747
Show file tree

Hide file tree

Showing 3 changed files with 48 additions and 26 deletions.
diff --git a/pgmpy/estimators/MLE.py b/pgmpy/estimators/MLE.py
@@ -1,3 +1,5 @@
+# coding:utf-8
+
 from pgmpy.estimators import BaseEstimator
 from pgmpy.factors import TabularCPD
 from pgmpy.models import BayesianModel
@@ -27,11 +29,11 @@ class MaximumLikelihoodEstimator(BaseEstimator):
     >>> model = BayesianModel([('A', 'B'), ('C', 'B'), ('C', 'D'), ('B', 'E')])
     >>> estimator = MaximumLikelihoodEstimator(model, data)
     """
-    def __init__(self, model, data):
+    def __init__(self, model, data, node_values=None):
         if not isinstance(model, BayesianModel):
             raise NotImplementedError("Maximum Likelihood Estimate is only implemented for BayesianModel")
 
-        super(MaximumLikelihoodEstimator, self).__init__(model, data)
+        super(MaximumLikelihoodEstimator, self).__init__(model, data, node_values)
 
     def get_parameters(self):
         """
@@ -48,16 +50,15 @@ def get_parameters(self):
         >>> import pandas as pd
         >>> from pgmpy.models import BayesianModel
         >>> from pgmpy.estimators import MaximumLikelihoodEstimator
-        >>> data = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 5)),
-        ...                       columns=['A', 'B', 'C', 'D', 'E'])
-        >>> model = BayesianModel([('A', 'B'), ('C', 'B'), ('C', 'D'), ('B', 'E')])
-        >>> estimator = MaximumLikelihoodEstimator(model, data)
+        >>> values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 4)),
+        ...                       columns=['A', 'B', 'C', 'D'])
+        >>> model = BayesianModel([('A', 'B'), ('C', 'B'), ('C', 'D')])
+        >>> estimator = MaximumLikelihoodEstimator(model, values)
         >>> estimator.get_parameters()
-        [<TabularCPD representing P(B:2 | A:2, C:2) at 0x7f682187fb70>,
-        <TabularCPD representing P(A:2) at 0x7f682187f860>,
-        <TabularCPD representing P(E:2 | B:2) at 0x7f6826a7a9e8>,
-        <TabularCPD representing P(C:2) at 0x7f682187ff98>,
-        <TabularCPD representing P(D:2 | C:2) at 0x7f682187fdd8>]
+        [<TabularCPD representing P(C:2) at 0x7f7b534251d0>,
+        <TabularCPD representing P(B:2 | C:2, A:2) at 0x7f7b4dfd4da0>,
+        <TabularCPD representing P(A:2) at 0x7f7b4dfd4fd0>,
+        <TabularCPD representing P(D:2 | C:2) at 0x7f7b4df822b0>]
         """
         parameters = []
 
@@ -87,7 +88,7 @@ def _estimate_cpd(self, node):
         >>> from pgmpy.estimators import MaximumLikelihoodEstimator
         >>> data = pd.DataFrame(data={'A': [0, 0, 1], 'B': [0, 1, 0], 'C': [1, 1, 0]})
         >>> model = BayesianModel([('A', 'C'), ('B', 'C')])
-        >>> cpd_A = MaximumLikelihoodEstimator(model, data)._get_CPD('A')
+        >>> cpd_A = MaximumLikelihoodEstimator(model, data)._estimate_cpd('A')
         >>> print(str(cpd_A))
         ╒═════╤══════════╕
         │ A_0 │ 0.666667 │
@@ -100,21 +101,21 @@ def _estimate_cpd(self, node):
         if not parents:
             state_counts = self.data.ix[:, node].value_counts()
             state_counts = state_counts.reindex(sorted(state_counts.index))
-            cpd = TabularCPD(node, self.node_card[node],
+            cpd = TabularCPD(node, len(self.node_values[node]),
                              state_counts.values[:, np.newaxis])
         else:
-            parent_card = np.array([self.node_card[parent] for parent in parents])
-            var_card = self.node_card[node]
+            parent_cardinalities = np.array([len(self.node_values[parent]) for parent in parents])
+            node_cardinality = len(self.node_values[node])
 
             values = self.data.groupby([node] + parents).size().unstack(parents).fillna(0)
-            if not len(values.columns) == np.prod(parent_card):
+            if not len(values.columns) == np.prod(parent_cardinalities):
                 # some columns are missing if for some states of the parents no data was observed.
                 # reindex to add missing columns and fill in uniform (conditional) probabilities:
-                full_index = pd.MultiIndex.from_product([range(card) for card in parent_card], names=parents)
-                values = values.reindex(columns=full_index).fillna(1.0/var_card)
+                full_index = pd.MultiIndex.from_product([range(card) for card in parent_cardinalities], names=parents)
+                values = values.reindex(columns=full_index).fillna(1.0/node_cardinality)
 
-            cpd = TabularCPD(node, var_card, np.array(values),
+            cpd = TabularCPD(node, node_cardinality, np.array(values),
                              evidence=parents,
-                             evidence_card=parent_card.astype('int'))
+                             evidence_card=parent_cardinalities.astype('int'))
         cpd.normalize()
         return cpd
diff --git a/pgmpy/estimators/base.py b/pgmpy/estimators/base.py
@@ -4,20 +4,36 @@
 
 class BaseEstimator(object):
     """
-    Base class for estimator class in pgmpy. Estimator class is used for parameter estimation as well
-    as structure estimation
+    Base class for parameter estimators in pgmpy.
 
     Parameters
     ----------
     model: pgmpy.models.BayesianModel or pgmpy.models.MarkovModel or pgmpy.models.NoisyOrModel
         model for which parameter estimation is to be done
 
     data: pandas DataFrame object
-        datafame object with column names same as the variable names of the network
+        dataframe object with column names identical to the variable names of the model
+
+    node_values: dict (optional)
+        A dict mapping variable names to the discrete set of values (realizations)
+        that the variable can take. For any variable without an entry (or if no dict
+        is given), the values observed in the data set are taken as the only possible states.
     """
-    def __init__(self, model, data):
+    def __init__(self, model, data, node_values=None):
         self.model = model
         self.data = data.astype(np.int)
+        if not isinstance(node_values, dict):
+            self.node_values = {node: self._get_node_values(node) for node in model.nodes()}
+        else:
+            self.node_values = dict()
+            for node in model.nodes():
+                if node in node_values:
+                    if not set(self._get_node_values(node)) <= set(node_values[node]):
+                        raise ValueError("Data contains unexpected values for variable '" + str(node) + "'.")
+                    self.node_values[node] = node_values[node]
+                else:
+                    self.node_values[node] = self._get_node_values(node)
 
-        get_node_card = lambda _node, _data: _data.ix[:, _node].value_counts().shape[0]
-        self.node_card = {_node: get_node_card(_node, data) for _node in self.model.nodes()}
+    def _get_node_values(self, node):
+        values = list(self.data.ix[:, node].unique())
+        return values
diff --git a/pgmpy/tests/test_estimators/test_MaximumLikelihoodEstimator.py b/pgmpy/tests/test_estimators/test_MaximumLikelihoodEstimator.py
@@ -25,6 +25,11 @@ def test_estimate_cpd(self):
         self.assertEqual(self.mle1._estimate_cpd('B'), self.cpds[1])
         self.assertEqual(self.mle1._estimate_cpd('C'), self.cpds[2])
 
+    def test_class_init(self):
+        mle2 = MaximumLikelihoodEstimator(self.m1, self.d1,
+                                          node_values={'A': [0, 1], 'B': [0, 1], 'C': [0, 1]})
+        self.assertSetEqual(set(mle2.get_parameters()), set(self.cpds))
+
     def tearDown(self):
         del self.m1
         del self.d1