Restructured pgmpy/estimators/ for structure learning

pgmpy · Aug 11, 2016 · d052b42 · d052b42
1 parent 258b5bd
commit d052b42
Show file tree

Hide file tree

Showing 6 changed files with 188 additions and 38 deletions.
diff --git a/pgmpy/estimators/BayesianEstimator.py b/pgmpy/estimators/BayesianEstimator.py
@@ -1,13 +1,13 @@
 # coding:utf-8
 
-from pgmpy.estimators import BaseEstimator
+from pgmpy.estimators import ParameterEstimator
 from pgmpy.factors import TabularCPD
 from pgmpy.models import BayesianModel
 import numpy as np
 import pandas as pd
 
 
-class BayesianEstimator(BaseEstimator):
+class BayesianEstimator(ParameterEstimator):
     def __init__(self, model, data, **kwargs):
         """
         Class used to compute parameters for a model using Bayesian Parameter Estimation.

diff --git a/pgmpy/estimators/MLE.py b/pgmpy/estimators/MLE.py
@@ -1,13 +1,13 @@
 # coding:utf-8
 
-from pgmpy.estimators import BaseEstimator
+from pgmpy.estimators import ParameterEstimator
 from pgmpy.factors import TabularCPD
 from pgmpy.models import BayesianModel
 import numpy as np
 import pandas as pd
 
 
-class MaximumLikelihoodEstimator(BaseEstimator):
+class MaximumLikelihoodEstimator(ParameterEstimator):
     def __init__(self, model, data, **kwargs):
         """
         Class used to compute parameters for a model using Maximum Likelihood Estimation.

diff --git a/pgmpy/estimators/__init__.py b/pgmpy/estimators/__init__.py
@@ -1,7 +1,11 @@
-from pgmpy.estimators.base import BaseEstimator
+from pgmpy.estimators.base import BaseEstimator, ParameterEstimator, StructureEstimator
 from pgmpy.estimators.MLE import MaximumLikelihoodEstimator
 from pgmpy.estimators.BayesianEstimator import BayesianEstimator
+from pgmpy.estimators.StructureScore import StructureScore
+from pgmpy.estimators.BayesianScore import BayesianScore
+from pgmpy.estimators.ExhaustiveSearch import ExhaustiveSearch
 
 __all__ = ['BaseEstimator',
-           'MaximumLikelihoodEstimator',
-           'BayesianEstimator']
+           'ParameterEstimator', 'MaximumLikelihoodEstimator', 'BayesianEstimator',
+           'StructureEstimator', 'ExhaustiveSearch',
+           'StructureScore', 'BayesianScore']
diff --git a/pgmpy/estimators/base.py b/pgmpy/estimators/base.py
@@ -4,17 +4,16 @@
 
 
 class BaseEstimator(object):
-    def __init__(self, model, data, state_names=None, complete_samples_only=True):
+    def __init__(self, data, state_names=None, complete_samples_only=True):
         """
-        Base class for parameter estimators in pgmpy.
+        Base class for estimators in pgmpy; `ParameterEstimator`,
+        `StructureEstimator` and `StructureScore` derive from this class.
 
         Parameters
         ----------
-        model: pgmpy.models.BayesianModel or pgmpy.models.MarkovModel or pgmpy.models.NoisyOrModel
-            model for which parameter estimation is to be done
 
         data: pandas DataFrame object
-            datafame object with column names identical to the variable names of the model.
+            dataframe object where each column represents one variable.
             (If some values in the data are missing the data cells should be set to `numpy.NaN`.
             Note that pandas converts each column containing `numpy.NaN`s to dtype `float`.)
 
@@ -30,37 +29,42 @@ def __init__(self, model, data, state_names=None, complete_samples_only=True):
             This sets the behavior of the `state_count`-method.
         """
 
-        self.model = model
         self.data = data
         self.complete_samples_only = complete_samples_only
 
+        variables = list(data.columns.values)
+
         if not isinstance(state_names, dict):
-            self.state_names = {node: self._collect_state_names(node) for node in model.nodes()}
+            self.state_names = {var: self._collect_state_names(var) for var in variables}
         else:
             self.state_names = dict()
-            for node in model.nodes():
-                if node in state_names:
-                    if not set(self._collect_state_names(node)) <= set(state_names[node]):
-                        raise ValueError("Data contains unexpected states for variable '{0}'.".format(str(node)))
-                    self.state_names[node] = sorted(state_names[node])
+            for var in variables:
+                if var in state_names:
+                    if not set(self._collect_state_names(var)) <= set(state_names[var]):
+                        raise ValueError("Data contains unexpected states for variable '{0}'.".format(str(var)))
+                    self.state_names[var] = sorted(state_names[var])
                 else:
-                    self.state_names[node] = self._collect_state_names(node)
+                    self.state_names[var] = self._collect_state_names(var)
 
     def _collect_state_names(self, variable):
         "Return a list of states that the variable takes in the data"
         states = sorted(list(self.data.ix[:, variable].dropna().unique()))
         return states
 
-    def state_counts(self, variable, complete_samples_only=None):
+    def state_counts(self, variable, parents=[], complete_samples_only=None):
         """
         Return counts how often each state of 'variable' occured in the data.
-        If the variable has parents, counting is done conditionally
+        If a list of parents is provided, counting is done conditionally
         for each state configuration of the parents.
 
         Parameters
         ----------
         variable: string
-            Name of the variable for which the state count is to be done
+            Name of the variable for which the state count is to be done.
+
+        parents: list
+            Optional list of variable parents, if conditional counting is desired.
+            The order of parents in the list is reflected in the returned DataFrame.
 
         complete_samples_only: bool
             Specifies how to deal with missing data, if present. If set to `True` all rows
@@ -76,27 +80,23 @@ def state_counts(self, variable, complete_samples_only=None):
         Examples
         --------
         >>> import pandas as pd
-        >>> from pgmpy.models import BayesianModel
-        >>> model = BayesianModel([('A', 'C'), ('B', 'C')])
+        >>> from pgmpy.estimators import BaseEstimator
         >>> data = pd.DataFrame(data={'A': ['a1', 'a1', 'a2'],
                                       'B': ['b1', 'b2', 'b1'],
                                       'C': ['c1', 'c1', 'c2']})
-        >>> estimator = BaseEstimator(model, data)
+        >>> estimator = BaseEstimator(data)
         >>> estimator.state_counts('A')
             A
         a1  2
         a2  1
-        >>> estimator.state_counts('C')
+        >>> estimator.state_counts('C', parents=['A', 'B'])
         A  a1      a2
         B  b1  b2  b1  b2
         C
         c1  1   1   0   0
         c2  0   0   1   0
         """
 
-        parents = sorted(self.model.get_parents(variable))
-        parents_states = [self.state_names[parent] for parent in parents]
-
         # default for how to deal with missing data can be set in class constructor
         if complete_samples_only is None:
             complete_samples_only = self.complete_samples_only
@@ -109,6 +109,7 @@ def state_counts(self, variable, complete_samples_only=None):
             state_counts = state_count_data.reindex(self.state_names[variable]).fillna(0).to_frame()
 
         else:
+            parents_states = [self.state_names[parent] for parent in parents]
             # count how often each state of 'variable' occured, conditional on parents' states
             state_count_data = data.groupby([variable] + parents).size().unstack(parents)
 
@@ -121,3 +122,118 @@ def state_counts(self, variable, complete_samples_only=None):
             state_counts = state_count_data.reindex(index=row_index, columns=column_index).fillna(0)
 
         return state_counts
+
+
+class ParameterEstimator(BaseEstimator):
+    def __init__(self, model, data, **kwargs):
+        """
+        Base class for parameter estimators in pgmpy.
+
+        Parameters
+        ----------
+        model: pgmpy.models.BayesianModel or pgmpy.models.MarkovModel or pgmpy.models.NoisyOrModel
+            model for which parameter estimation is to be done
+
+        data: pandas DataFrame object
+            dataframe object with column names identical to the variable names of the model.
+            (If some values in the data are missing the data cells should be set to `numpy.NaN`.
+            Note that pandas converts each column containing `numpy.NaN`s to dtype `float`.)
+
+        state_names: dict (optional)
+            A dict indicating, for each variable, the discrete set of states (or values)
+            that the variable can take. If unspecified, the observed values in the data set
+            are taken to be the only possible states.
+
+        complete_samples_only: bool (optional, default `True`)
+            Specifies how to deal with missing data, if present. If set to `True` all rows
+            that contain `np.NaN` somewhere are ignored. If `False` then, for each variable,
+            every row where neither the variable nor its parents are `np.NaN` is used.
+            This sets the behavior of the `state_counts`-method.
+        """
+
+        assert set(model.nodes()) <= set(data.columns.values), \
+            "variable names of the model must be identical to column names in data"
+        self.model = model
+
+        super(ParameterEstimator, self).__init__(data, **kwargs)
+
+    def state_counts(self, variable, **kwargs):
+        """
+        Return counts of how often each state of 'variable' occurred in the data.
+        If the variable has parents, counting is done conditionally
+        for each state configuration of the parents.
+
+        Parameters
+        ----------
+        variable: string
+            Name of the variable for which the state count is to be done.
+
+        complete_samples_only: bool
+            Specifies how to deal with missing data, if present. If set to `True` all rows
+            that contain `np.NaN` somewhere are ignored. If `False` then
+            every row where neither the variable nor its parents are `np.NaN` is used.
+            Desired default behavior can be passed to the class constructor.
+
+        Returns
+        -------
+        state_counts: pandas.DataFrame
+            Table with state counts for 'variable'
+
+        Examples
+        --------
+        >>> import pandas as pd
+        >>> from pgmpy.models import BayesianModel
+        >>> from pgmpy.estimators import ParameterEstimator
+        >>> model = BayesianModel([('A', 'C'), ('B', 'C')])
+        >>> data = pd.DataFrame(data={'A': ['a1', 'a1', 'a2'],
+                                      'B': ['b1', 'b2', 'b1'],
+                                      'C': ['c1', 'c1', 'c2']})
+        >>> estimator = ParameterEstimator(model, data)
+        >>> estimator.state_counts('A')
+            A
+        a1  2
+        a2  1
+        >>> estimator.state_counts('C')
+        A  a1      a2
+        B  b1  b2  b1  b2
+        C
+        c1  1   1   0   0
+        c2  0   0   1   0
+        """
+
+        parents = sorted(self.model.get_parents(variable))
+        return super(ParameterEstimator, self).state_counts(variable, parents=parents, **kwargs)
+
+    def get_parameters(self):
+        pass
+
+
+class StructureEstimator(BaseEstimator):
+    def __init__(self, data, **kwargs):
+        """
+        Base class for structure estimators in pgmpy.
+
+        Parameters
+        ----------
+
+        data: pandas DataFrame object
+            dataframe object where each column represents one variable.
+            (If some values in the data are missing the data cells should be set to `numpy.NaN`.
+            Note that pandas converts each column containing `numpy.NaN`s to dtype `float`.)
+
+        state_names: dict (optional)
+            A dict indicating, for each variable, the discrete set of states (or values)
+            that the variable can take. If unspecified, the observed values in the data set
+            are taken to be the only possible states.
+
+        complete_samples_only: bool (optional, default `True`)
+            Specifies how to deal with missing data, if present. If set to `True` all rows
+            that contain `np.NaN` somewhere are ignored. If `False` then, for each variable,
+            every row where neither the variable nor its parents are `np.NaN` is used.
+            This sets the behavior of the `state_counts`-method.
+        """
+
+        super(StructureEstimator, self).__init__(data, **kwargs)
+
+    def estimate(self):
+        pass
diff --git a/pgmpy/tests/test_estimators/test_BaseEstimator.py b/pgmpy/tests/test_estimators/test_BaseEstimator.py
@@ -2,33 +2,30 @@
 
 import pandas as pd
 from numpy import NaN
-from pgmpy.models import BayesianModel
 from pgmpy.estimators import MaximumLikelihoodEstimator
 from pgmpy.estimators import BaseEstimator
 from pgmpy.factors import TabularCPD
 
 
 class TestBaseEstimator(unittest.TestCase):
     def setUp(self):
-        self.m1 = BayesianModel([('A', 'C'), ('B', 'C'), ('D', 'B')])
         self.d1 = pd.DataFrame(data={'A': [0, 0, 1], 'B': [0, 1, 0], 'C': [1, 1, 0], 'D': ['X', 'Y', 'Z']})
         self.d2 = pd.DataFrame(data={'A': [0, NaN, 1], 'B': [0, 1, 0], 'C': [1, 1, NaN], 'D': [NaN, 'Y', NaN]})
 
     def test_state_count(self):
-        e = BaseEstimator(self.m1, self.d1)
+        e = BaseEstimator(self.d1)
         self.assertEqual(e.state_counts('A').values.tolist(), [[2], [1]])
-        self.assertEqual(e.state_counts('C').values.tolist(),
+        self.assertEqual(e.state_counts('C', ['A', 'B']).values.tolist(),
                          [[0., 0., 1., 0.], [1., 1., 0., 0.]])
 
     def test_missing_data(self):
-        e = BaseEstimator(self.m1, self.d2, state_names={'C': [0, 1]}, complete_samples_only=False)
+        e = BaseEstimator(self.d2, state_names={'C': [0, 1]}, complete_samples_only=False)
         self.assertEqual(e.state_counts('A', complete_samples_only=True).values.tolist(), [[0], [0]])
         self.assertEqual(e.state_counts('A').values.tolist(), [[1], [1]])
-        self.assertEqual(e.state_counts('C', complete_samples_only=True).values.tolist(),
+        self.assertEqual(e.state_counts('C', parents=['A', 'B'], complete_samples_only=True).values.tolist(),
                          [[0, 0, 0, 0], [0, 0, 0, 0]])
-        self.assertEqual(e.state_counts('C').values.tolist(),
+        self.assertEqual(e.state_counts('C', parents=['A', 'B']).values.tolist(),
                          [[0, 0, 0, 0], [1, 0, 0, 0]])
 
     def tearDown(self):
-        del self.m1
         del self.d1
diff --git a/pgmpy/tests/test_estimators/test_ParameterEstimator.py b/pgmpy/tests/test_estimators/test_ParameterEstimator.py
@@ -0,0 +1,33 @@
+import unittest
+
+import pandas as pd
+from numpy import NaN
+from pgmpy.models import BayesianModel
+from pgmpy.estimators import ParameterEstimator
+from pgmpy.factors import TabularCPD
+
+
+class TestParameterEstimator(unittest.TestCase):
+    def setUp(self):
+        self.m1 = BayesianModel([('A', 'C'), ('B', 'C'), ('D', 'B')])
+        self.d1 = pd.DataFrame(data={'A': [0, 0, 1], 'B': [0, 1, 0], 'C': [1, 1, 0], 'D': ['X', 'Y', 'Z']})
+        self.d2 = pd.DataFrame(data={'A': [0, NaN, 1], 'B': [0, 1, 0], 'C': [1, 1, NaN], 'D': [NaN, 'Y', NaN]})
+
+    def test_state_count(self):
+        e = ParameterEstimator(self.m1, self.d1)
+        self.assertEqual(e.state_counts('A').values.tolist(), [[2], [1]])
+        self.assertEqual(e.state_counts('C').values.tolist(),
+                         [[0., 0., 1., 0.], [1., 1., 0., 0.]])
+
+    def test_missing_data(self):
+        e = ParameterEstimator(self.m1, self.d2, state_names={'C': [0, 1]}, complete_samples_only=False)
+        self.assertEqual(e.state_counts('A', complete_samples_only=True).values.tolist(), [[0], [0]])
+        self.assertEqual(e.state_counts('A').values.tolist(), [[1], [1]])
+        self.assertEqual(e.state_counts('C', complete_samples_only=True).values.tolist(),
+                         [[0, 0, 0, 0], [0, 0, 0, 0]])
+        self.assertEqual(e.state_counts('C').values.tolist(),
+                         [[0, 0, 0, 0], [1, 0, 0, 0]])
+
+    def tearDown(self):
+        del self.m1
+        del self.d1