/
ExhaustiveSearch.py
119 lines (100 loc) · 4.76 KB
/
ExhaustiveSearch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/usr/bin/env python
import numpy as np
import pandas as pd
import networkx as nx
from warnings import warn
from itertools import combinations
from pgmpy.estimators import StructureEstimator, BayesianScore
from pgmpy.utils.mathext import powerset
from pgmpy.models import BayesianModel
class ExhaustiveSearch(StructureEstimator):
def __init__(self, data, scoring_method=None, **kwargs):
"""
Search class for exhaustive searches over all BayesianModels with a given set of variables.
Takes a `StructureScore`-Instance as parameter; `estimate` finds the model with maximal score.
Parameters
----------
model: pgmpy.models.BayesianModel or pgmpy.models.MarkovModel or pgmpy.models.NoisyOrModel
model for which parameter estimation is to be done
data: pandas DataFrame object
datafame object where each column represents one variable.
(If some values in the data are missing the data cells should be set to `numpy.NaN`.
Note that pandas converts each column containing `numpy.NaN`s to dtype `float`.)
scoring_method: Instance of a `StructureScore`-subclass (`BayesianScore` is used if not set)
An instance of either `BayesianScore` or `BicScore`.
This score is optimized during structure estimation by the `estimate`-method.
state_names: dict (optional)
A dict indicating, for each variable, the discrete set of states (or values)
that the variable can take. If unspecified, the observed values in the data set
are taken to be the only possible states.
complete_samples_only: bool (optional, default `True`)
Specifies how to deal with missing data, if present. If set to `True` all rows
that contain `np.Nan` somewhere are ignored. If `False` then, for each variable,
every row where neither the variable nor its parents are `np.NaN` is used.
This sets the behavior of the `state_count`-method.
Returns
-------
state_counts: pandas.DataFrame
Table with state counts for 'variable'
Examples
--------
>>> import pandas as pd
>>> from pgmpy.models import BayesianModel
>>> from pgmpy.estimators import ParameterEstimator
>>> model = BayesianModel([('A', 'C'), ('B', 'C')])
>>> data = pd.DataFrame(data={'A': ['a1', 'a1', 'a2'],
'B': ['b1', 'b2', 'b1'],
'C': ['c1', 'c1', 'c2']})
>>> estimator = ParameterEstimator(model, data)
>>> estimator.state_counts('A')
A
a1 2
a2 1
>>> estimator.state_counts('C')
A a1 a2
B b1 b2 b1 b2
C
c1 1 1 0 0
c2 0 0 1 0
"""
if scoring_method is not None:
self.scoring_method = scoring_method
else:
from pgmpy.estimators import BayesianScore
self.scoring_method = BayesianScore(data, **kwargs)
super(ExhaustiveSearch, self).__init__(data, **kwargs)
def all_dags(self, nodes=None):
"Generates all possible DAGs with a given set of nodes; sparse ones first"
if nodes is None:
nodes = sorted(self.state_names.keys())
if len(nodes) > 6:
warn("Generating all DAGs of n nodes likely not feasible for n>6")
edges = list(combinations(nodes, 2)) # n*(n-1) possible directed edges
edges.extend([(y, x) for x, y in edges])
all_graphs = powerset(edges) # 2^(n*(n-1)) graphs
for graph_edges in all_graphs:
graph = nx.DiGraph()
graph.add_nodes_from(nodes)
graph.add_edges_from(graph_edges)
if nx.is_directed_acyclic_graph(graph):
yield graph
def all_scores(self):
"Computes an list of DAGs and their structure scores, ordered by score"
scored_dags = sorted([(self.scoring_method.score(dag), dag) for dag in self.all_dags()],
key=lambda x: x[0])
return scored_dags
def estimate(self):
"""
Estimates the `BayesianModel` structure that fits best to the given data set,
according to the scoring method supplied in the constructor.
Exhaustively searches through all models. Only estimates network structure, no parametrization.
Returns
-------
model: `BayesianModel` instance
A `BayesianModel` with maximal score.
"""
best_dag = max(self.all_dags(), key=self.scoring_method.score)
best_model = BayesianModel()
best_model.add_nodes_from(sorted(best_dag.nodes()))
best_model.add_edges_from(sorted(best_dag.edges()))
return best_model