Skip to content

Commit

Permalink
Restructured pgmpy/estimators/ for structure learning
Browse files Browse the repository at this point in the history
  • Loading branch information
chrisittner committed Aug 11, 2016
1 parent 258b5bd commit d052b42
Show file tree
Hide file tree
Showing 6 changed files with 188 additions and 38 deletions.
4 changes: 2 additions & 2 deletions pgmpy/estimators/BayesianEstimator.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
# coding:utf-8

from pgmpy.estimators import BaseEstimator
from pgmpy.estimators import ParameterEstimator
from pgmpy.factors import TabularCPD
from pgmpy.models import BayesianModel
import numpy as np
import pandas as pd


class BayesianEstimator(BaseEstimator):
class BayesianEstimator(ParameterEstimator):
def __init__(self, model, data, **kwargs):
"""
Class used to compute parameters for a model using Bayesian Parameter Estimation.
Expand Down
4 changes: 2 additions & 2 deletions pgmpy/estimators/MLE.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
# coding:utf-8

from pgmpy.estimators import BaseEstimator
from pgmpy.estimators import ParameterEstimator
from pgmpy.factors import TabularCPD
from pgmpy.models import BayesianModel
import numpy as np
import pandas as pd


class MaximumLikelihoodEstimator(BaseEstimator):
class MaximumLikelihoodEstimator(ParameterEstimator):
def __init__(self, model, data, **kwargs):
"""
Class used to compute parameters for a model using Maximum Likelihood Estimation.
Expand Down
10 changes: 7 additions & 3 deletions pgmpy/estimators/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
from pgmpy.estimators.base import BaseEstimator
from pgmpy.estimators.base import BaseEstimator, ParameterEstimator, StructureEstimator
from pgmpy.estimators.MLE import MaximumLikelihoodEstimator
from pgmpy.estimators.BayesianEstimator import BayesianEstimator
from pgmpy.estimators.StructureScore import StructureScore
from pgmpy.estimators.BayesianScore import BayesianScore
from pgmpy.estimators.ExhaustiveSearch import ExhaustiveSearch

__all__ = ['BaseEstimator',
'MaximumLikelihoodEstimator',
'BayesianEstimator']
'ParameterEstimator', 'MaximumLikelihoodEstimator', 'BayesianEstimator',
'StructureEstimator', 'ExhaustiveSearch',
'StructureScore', 'BayesianScore']
162 changes: 139 additions & 23 deletions pgmpy/estimators/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,16 @@


class BaseEstimator(object):
def __init__(self, model, data, state_names=None, complete_samples_only=True):
def __init__(self, data, state_names=None, complete_samples_only=True):
"""
Base class for parameter estimators in pgmpy.
Base class for estimators in pgmpy; `ParameterEstimator`,
`StructureEstimator` and `StructureScore` derive from this class.
Parameters
----------
model: pgmpy.models.BayesianModel or pgmpy.models.MarkovModel or pgmpy.models.NoisyOrModel
model for which parameter estimation is to be done
data: pandas DataFrame object
dataframe object with column names identical to the variable names of the model.
dataframe object where each column represents one variable.
(If some values in the data are missing the data cells should be set to `numpy.NaN`.
Note that pandas converts each column containing `numpy.NaN`s to dtype `float`.)
Expand All @@ -30,37 +29,42 @@ def __init__(self, model, data, state_names=None, complete_samples_only=True):
This sets the behavior of the `state_count`-method.
"""

self.model = model
self.data = data
self.complete_samples_only = complete_samples_only

variables = list(data.columns.values)

if not isinstance(state_names, dict):
self.state_names = {node: self._collect_state_names(node) for node in model.nodes()}
self.state_names = {var: self._collect_state_names(var) for var in variables}
else:
self.state_names = dict()
for node in model.nodes():
if node in state_names:
if not set(self._collect_state_names(node)) <= set(state_names[node]):
raise ValueError("Data contains unexpected states for variable '{0}'.".format(str(node)))
self.state_names[node] = sorted(state_names[node])
for var in variables:
if var in state_names:
if not set(self._collect_state_names(var)) <= set(state_names[var]):
raise ValueError("Data contains unexpected states for variable '{0}'.".format(str(var)))
self.state_names[var] = sorted(state_names[var])
else:
self.state_names[node] = self._collect_state_names(node)
self.state_names[var] = self._collect_state_names(var)

def _collect_state_names(self, variable):
"Return a list of states that the variable takes in the data"
states = sorted(list(self.data.ix[:, variable].dropna().unique()))
return states

def state_counts(self, variable, complete_samples_only=None):
def state_counts(self, variable, parents=[], complete_samples_only=None):
"""
Return counts of how often each state of 'variable' occurred in the data.
If the variable has parents, counting is done conditionally
If a list of parents is provided, counting is done conditionally
for each state configuration of the parents.
Parameters
----------
variable: string
Name of the variable for which the state count is to be done
Name of the variable for which the state count is to be done.
parents: list
Optional list of variable parents, if conditional counting is desired.
Order of parents in list is reflected in the returned DataFrame
complete_samples_only: bool
Specifies how to deal with missing data, if present. If set to `True` all rows
Expand All @@ -76,27 +80,23 @@ def state_counts(self, variable, complete_samples_only=None):
Examples
--------
>>> import pandas as pd
>>> from pgmpy.models import BayesianModel
>>> model = BayesianModel([('A', 'C'), ('B', 'C')])
>>> from pgmpy.estimators import BaseEstimator
>>> data = pd.DataFrame(data={'A': ['a1', 'a1', 'a2'],
'B': ['b1', 'b2', 'b1'],
'C': ['c1', 'c1', 'c2']})
>>> estimator = BaseEstimator(model, data)
>>> estimator = BaseEstimator(data)
>>> estimator.state_counts('A')
A
a1 2
a2 1
>>> estimator.state_counts('C')
>>> estimator.state_counts('C', parents=['A', 'B'])
A a1 a2
B b1 b2 b1 b2
C
c1 1 1 0 0
c2 0 0 1 0
"""

parents = sorted(self.model.get_parents(variable))
parents_states = [self.state_names[parent] for parent in parents]

# default for how to deal with missing data can be set in class constructor
if complete_samples_only is None:
complete_samples_only = self.complete_samples_only
Expand All @@ -109,6 +109,7 @@ def state_counts(self, variable, complete_samples_only=None):
state_counts = state_count_data.reindex(self.state_names[variable]).fillna(0).to_frame()

else:
parents_states = [self.state_names[parent] for parent in parents]
# count how often each state of 'variable' occurred, conditional on parents' states
state_count_data = data.groupby([variable] + parents).size().unstack(parents)

Expand All @@ -121,3 +122,118 @@ def state_counts(self, variable, complete_samples_only=None):
state_counts = state_count_data.reindex(index=row_index, columns=column_index).fillna(0)

return state_counts


class ParameterEstimator(BaseEstimator):
    def __init__(self, model, data, **kwargs):
        """
        Base class for parameter estimators in pgmpy.

        Parameters
        ----------
        model: pgmpy.models.BayesianModel or pgmpy.models.MarkovModel or pgmpy.models.NoisyOrModel
            model for which parameter estimation is to be done

        data: pandas DataFrame object
            dataframe object that contains one column for every variable of the model
            (the column name must equal the variable name; additional columns are accepted).
            (If some values in the data are missing the data cells should be set to `numpy.NaN`.
            Note that pandas converts each column containing `numpy.NaN`s to dtype `float`.)

        state_names: dict (optional)
            A dict indicating, for each variable, the discrete set of states (or values)
            that the variable can take. If unspecified, the observed values in the data set
            are taken to be the only possible states.
            Forwarded unchanged to `BaseEstimator.__init__`.

        complete_samples_only: bool (optional, default `True`)
            Specifies how to deal with missing data, if present. If set to `True` all rows
            that contain `np.NaN` somewhere are ignored. If `False` then, for each variable,
            every row where neither the variable nor its parents are `np.NaN` is used.
            This sets the behavior of the `state_counts`-method.
            Forwarded unchanged to `BaseEstimator.__init__`.

        Raises
        ------
        ValueError
            If at least one node of `model` has no column of the same name in `data`.
        """

        # Validate with an explicit exception instead of `assert`: assert statements
        # are removed when Python runs with -O, which would silently skip this check.
        if not set(model.nodes()) <= set(data.columns.values):
            raise ValueError("variable names of the model must be identical to column names in data")
        self.model = model

        super(ParameterEstimator, self).__init__(data, **kwargs)

    def state_counts(self, variable, **kwargs):
        """
        Return counts of how often each state of 'variable' occurred in the data.
        If the variable has parents in the model, counting is done conditionally
        for each state configuration of the parents. The parents are looked up
        in `self.model` and passed on in sorted order.

        Parameters
        ----------
        variable: string
            Name of the variable for which the state count is to be done.

        complete_samples_only: bool
            Specifies how to deal with missing data, if present. If set to `True` all rows
            that contain `np.NaN` somewhere are ignored. If `False` then
            every row where neither the variable nor its parents are `np.NaN` is used.
            Desired default behavior can be passed to the class constructor.

        Returns
        -------
        state_counts: pandas.DataFrame
            Table with state counts for 'variable'

        Examples
        --------
        >>> import pandas as pd
        >>> from pgmpy.models import BayesianModel
        >>> from pgmpy.estimators import ParameterEstimator
        >>> model = BayesianModel([('A', 'C'), ('B', 'C')])
        >>> data = pd.DataFrame(data={'A': ['a1', 'a1', 'a2'],
                                      'B': ['b1', 'b2', 'b1'],
                                      'C': ['c1', 'c1', 'c2']})
        >>> estimator = ParameterEstimator(model, data)
        >>> estimator.state_counts('A')
            A
        a1  2
        a2  1
        >>> estimator.state_counts('C')
        A  a1      a2
        B  b1  b2  b1  b2
        C
        c1  1   1   0   0
        c2  0   0   1   0
        """

        # Sorting gives a deterministic column order, independent of the order
        # in which the model reports the parents.
        parents = sorted(self.model.get_parents(variable))
        return super(ParameterEstimator, self).state_counts(variable, parents=parents, **kwargs)

    def get_parameters(self):
        """
        Placeholder that does nothing and returns `None`.

        NOTE(review): presumably concrete estimators override this method -- confirm.
        """
        pass


class StructureEstimator(BaseEstimator):
    def __init__(self, data, **kwargs):
        """
        Common base class of the structure estimators in pgmpy.

        All arguments are handed over unchanged to `BaseEstimator.__init__`.

        Parameters
        ----------
        data: pandas DataFrame object
            dataframe object in which every column stands for one variable.
            (Missing values should be stored as `numpy.NaN`.
            Note that pandas converts each column containing `numpy.NaN`s to dtype `float`.)

        state_names: dict (optional)
            Maps a variable to the discrete set of states (or values) it can take.
            Where unspecified, the values observed in the data set
            are taken to be the only possible states.

        complete_samples_only: bool (optional, default `True`)
            Specifies how to deal with missing data, if present. If set to `True` all rows
            that contain `np.NaN` somewhere are ignored. If `False` then, for each variable,
            every row where neither the variable nor its parents are `np.NaN` is used.
            This sets the behavior of the `state_counts`-method.
        """

        super(StructureEstimator, self).__init__(data, **kwargs)

    def estimate(self):
        """
        Placeholder that does nothing and returns `None`.

        NOTE(review): presumably concrete structure estimators override this -- confirm.
        """
        return None
13 changes: 5 additions & 8 deletions pgmpy/tests/test_estimators/test_BaseEstimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,33 +2,30 @@

import pandas as pd
from numpy import NaN
from pgmpy.models import BayesianModel
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.estimators import BaseEstimator
from pgmpy.factors import TabularCPD


class TestBaseEstimator(unittest.TestCase):
def setUp(self):
self.m1 = BayesianModel([('A', 'C'), ('B', 'C'), ('D', 'B')])
self.d1 = pd.DataFrame(data={'A': [0, 0, 1], 'B': [0, 1, 0], 'C': [1, 1, 0], 'D': ['X', 'Y', 'Z']})
self.d2 = pd.DataFrame(data={'A': [0, NaN, 1], 'B': [0, 1, 0], 'C': [1, 1, NaN], 'D': [NaN, 'Y', NaN]})

def test_state_count(self):
e = BaseEstimator(self.m1, self.d1)
e = BaseEstimator(self.d1)
self.assertEqual(e.state_counts('A').values.tolist(), [[2], [1]])
self.assertEqual(e.state_counts('C').values.tolist(),
self.assertEqual(e.state_counts('C', ['A', 'B']).values.tolist(),
[[0., 0., 1., 0.], [1., 1., 0., 0.]])

def test_missing_data(self):
e = BaseEstimator(self.m1, self.d2, state_names={'C': [0, 1]}, complete_samples_only=False)
e = BaseEstimator(self.d2, state_names={'C': [0, 1]}, complete_samples_only=False)
self.assertEqual(e.state_counts('A', complete_samples_only=True).values.tolist(), [[0], [0]])
self.assertEqual(e.state_counts('A').values.tolist(), [[1], [1]])
self.assertEqual(e.state_counts('C', complete_samples_only=True).values.tolist(),
self.assertEqual(e.state_counts('C', parents=['A', 'B'], complete_samples_only=True).values.tolist(),
[[0, 0, 0, 0], [0, 0, 0, 0]])
self.assertEqual(e.state_counts('C').values.tolist(),
self.assertEqual(e.state_counts('C', parents=['A', 'B']).values.tolist(),
[[0, 0, 0, 0], [1, 0, 0, 0]])

def tearDown(self):
del self.m1
del self.d1
33 changes: 33 additions & 0 deletions pgmpy/tests/test_estimators/test_ParameterEstimator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import unittest

import pandas as pd
from numpy import NaN
from pgmpy.models import BayesianModel
from pgmpy.estimators import ParameterEstimator
from pgmpy.factors import TabularCPD


class TestParameterEstimator(unittest.TestCase):
    """Tests for `ParameterEstimator.state_counts` with complete and incomplete data."""

    def setUp(self):
        # Edges A -> C, B -> C and D -> B: 'C' has the parents 'A' and 'B'.
        self.m1 = BayesianModel([('A', 'C'), ('B', 'C'), ('D', 'B')])
        # d1 has no missing values; in d2 every row contains at least one NaN.
        self.d1 = pd.DataFrame(data={'A': [0, 0, 1], 'B': [0, 1, 0], 'C': [1, 1, 0], 'D': ['X', 'Y', 'Z']})
        self.d2 = pd.DataFrame(data={'A': [0, NaN, 1], 'B': [0, 1, 0], 'C': [1, 1, NaN], 'D': [NaN, 'Y', NaN]})

    def test_state_count(self):
        e = ParameterEstimator(self.m1, self.d1)
        # 'A' has no parents: plain counts for the states 0 and 1.
        self.assertEqual(e.state_counts('A').values.tolist(), [[2], [1]])
        # 'C' is counted per (A, B) configuration; the parents come from the model.
        self.assertEqual(e.state_counts('C').values.tolist(),
                         [[0., 0., 1., 0.], [1., 1., 0., 0.]])

    def test_missing_data(self):
        # State 0 of 'C' never occurs in d2, so it has to be declared via state_names.
        e = ParameterEstimator(self.m1, self.d2, state_names={'C': [0, 1]}, complete_samples_only=False)
        # Every row of d2 contains a NaN, so nothing is left if only complete rows count.
        self.assertEqual(e.state_counts('A', complete_samples_only=True).values.tolist(), [[0], [0]])
        self.assertEqual(e.state_counts('A').values.tolist(), [[1], [1]])
        self.assertEqual(e.state_counts('C', complete_samples_only=True).values.tolist(),
                         [[0, 0, 0, 0], [0, 0, 0, 0]])
        # Only the first row has values for 'C' and for both of its parents.
        self.assertEqual(e.state_counts('C').values.tolist(),
                         [[0, 0, 0, 0], [1, 0, 0, 0]])

    def tearDown(self):
        # Release every fixture created in setUp (d2 was missing before).
        del self.m1
        del self.d1
        del self.d2

0 comments on commit d052b42

Please sign in to comment.