In [1]:
from pgmpy.models import MarkovModel, BayesianModel
from pgmpy.inference import BeliefPropagation
from pgmpy.factors import Factor
import numpy as np
import pandas as pd
from scipy import optimize
from pgmpy.estimators import MaximumLikelihoodEstimator
import pickle

In [2]:
# Read the feature table, then discard the 'Unnamed: 0' column stored in the CSV.
data = pd.read_csv('5_formatted.csv')
data = data.drop('Unnamed: 0', axis=1)
data.head()

Unnamed: 0,A-2,C-2,D-2,E-2,F-2,G-2,H-2,I-2,K-2,L-2,...,N2,P2,Q2,R2,S2,T2,V2,W2,Y2,-2
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [3]:
# Read the label table, then discard the 'Unnamed: 0' column stored in the CSV.
labels = pd.read_csv('5_formatted_labels.csv')
labels = labels.drop('Unnamed: 0', axis=1)

In [4]:
# Name the label column 'State'; the same name is used for the class node
# of the network built further down. Note this mutates `labels` in place.
labels.columns = ['State']
labels.head()

Unnamed: 0,State
0,5.0
1,5.0
2,0.0
3,0.0
4,0.0


In [5]:
# create the singleton nodes: one node per feature column.
# 'State' is filtered out so that re-running this cell after the label column
# has been attached to `data` (a later cell does `data['State'] = labels`)
# does not turn the class node into a feature, which would yield a
# ('State', 'State') self-loop edge. On a fresh top-to-bottom run the result
# is unchanged because `data` has no 'State' column yet.
singletons = [col for col in data.columns if col != 'State']
singletons[:2]

['A-2', 'C-2']

In [6]:
# ys = list(df.columns[-6:])
# ys[-1] = 'U'
# ys

In [7]:
# Star-shaped structure: the class node 'State' points at every feature node.
edges = list(zip(['State'] * len(singletons), singletons))

edges[:2]

[('State', 'A-2'), ('State', 'C-2')]

In [8]:
# Number of directed edges: one per entry of `singletons`.
len(edges)

105

In [9]:
# Attach the labels as a 'State' column so features and class live in one
# frame. This mutates `data` in place: from here on `data.columns` ends with
# 'State'.
data['State'] = labels

In [10]:
# Preview the combined frame; 'State' now appears as the last column.
data.head()

Unnamed: 0,A-2,C-2,D-2,E-2,F-2,G-2,H-2,I-2,K-2,L-2,...,P2,Q2,R2,S2,T2,V2,W2,Y2,-2,State
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,5.0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0.0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0.0


In [11]:
# Build the Bayesian network directly from the edge list
# (nodes are created implicitly from the edge endpoints).
bn = BayesianModel(edges)

In [12]:
# No CPDs have been attached to the network yet, so this shows an empty list.
bn.get_cpds()

[]

In [13]:
# Inspect the edges the network actually holds.
bn.edges()

[('State', 'P0'),
 ('State', 'D2'),
 ('State', 'S1'),
 ('State', 'S0'),
 ('State', 'A-1'),
 ('State', 'N-1'),
 ('State', 'F-1'),
 ('State', 'E2'),
 ('State', '-2'),
 ('State', 'G2'),
 ('State', '-0'),
 ('State', 'N0'),
 ('State', 'P-2'),
 ('State', 'T1'),
 ('State', 'K0'),
 ('State', 'R-2'),
 ('State', 'F0'),
 ('State', 'D-1'),
 ('State', 'H-2'),
 ('State', 'Q2'),
 ('State', 'P1'),
 ('State', 'H0'),
 ('State', 'M-1'),
 ('State', 'E-2'),
 ('State', 'C-1'),
 ('State', 'R0'),
 ('State', 'I0'),
 ('State', 'Y1'),
 ('State', 'D-2'),
 ('State', 'V1'),
 ('State', 'E1'),
 ('State', 'P-1'),
 ('State', 'S2'),
 ('State', 'V2'),
 ('State', 'G0'),
 ('State', 'V-1'),
 ('State', 'M1'),
 ('State', 'W-2'),
 ('State', 'T0'),
 ('State', 'H1'),
 ('State', 'W1'),
 ('State', 'W-1'),
 ('State', 'E-1'),
 ('State', 'K2'),
 ('State', 'Q-2'),
 ('State', 'N2'),
 ('State', 'D0'),
 ('State', 'N1'),
 ('State', 'M0'),
 ('State', 'D1'),
 ('State', 'Q-1'),
 ('State', 'L-1'),
 ('State', 'R-1'),
 ('State', 'F1'),
 ('State

In [14]:
# calculate thetas in our CPDs (maximum-likelihood estimates)
# NOTE(review): this estimator is built on the full `data` frame, i.e. it
# includes the rows that are later held out as the test split.
est = MaximumLikelihoodEstimator(bn, data)

In [15]:
# Look at the first estimated CPD as a sanity check.
est.get_parameters()[0]

0,1,2,3,4,5,6
State,State_0,State_1,State_2,State_3,State_4,State_5
P0_0,0.9748,0.9797,0.9297,0.9500,0.9509,0.9613
P0_1,0.0252,0.0203,0.0703,0.0500,0.0491,0.0387


In [16]:
# Attach every CPD returned by the estimator to the network.
bn.add_cpds(*est.get_parameters())

In [17]:
# List the CPDs now attached to the network.
bn.cpds

[<TabularCPD representing P(P0:2 | State:6) at 0x1033cdb70>,
 <TabularCPD representing P(D2:2 | State:6) at 0x1033cdd30>,
 <TabularCPD representing P(S1:2 | State:6) at 0x1033e77f0>,
 <TabularCPD representing P(S0:2 | State:6) at 0x1033e7da0>,
 <TabularCPD representing P(A-1:2 | State:6) at 0x1033e7eb8>,
 <TabularCPD representing P(N-1:2 | State:6) at 0x1033e79b0>,
 <TabularCPD representing P(F-1:2 | State:6) at 0x1033e7a90>,
 <TabularCPD representing P(E2:2 | State:6) at 0x1033e7fd0>,
 <TabularCPD representing P(-2:2 | State:6) at 0x1033e7630>,
 <TabularCPD representing P(G2:2 | State:6) at 0x1033e7d30>,
 <TabularCPD representing P(-0:2 | State:6) at 0x1033e7b00>,
 <TabularCPD representing P(N0:2 | State:6) at 0x1033e7cc0>,
 <TabularCPD representing P(P-2:2 | State:6) at 0x1135a8da0>,
 <TabularCPD representing P(T1:2 | State:6) at 0x1135d5710>,
 <TabularCPD representing P(K0:2 | State:6) at 0x1135d5b38>,
 <TabularCPD representing P(R-2:2 | State:6) at 0x1135d5438>,
 <TabularCPD repres

In [18]:
# Convert the network to a junction tree. In the visible cells `tree` is only
# inspected, not passed to the inference step further down.
tree = bn.to_junction_tree()

In [20]:
# Inspect the junction tree's edges (each endpoint is a clique of node names).
tree.edges()

[(('State', 'M0'), ('State', 'H0')),
 (('State', 'K1'), ('State', 'H0')),
 (('State', 'G2'), ('State', 'H0')),
 (('State', 'S-1'), ('State', 'H0')),
 (('State', 'W2'), ('State', 'H0')),
 (('State', 'L-1'), ('State', 'H0')),
 (('State', 'P1'), ('State', 'H0')),
 (('State', 'E2'), ('State', 'H0')),
 (('State', 'P2'), ('State', 'H0')),
 (('State', 'G-2'), ('State', 'H0')),
 (('State', 'W1'), ('State', 'H0')),
 (('State', 'W-1'), ('State', 'H0')),
 (('State', 'L2'), ('State', 'H0')),
 (('State', 'K0'), ('State', 'H0')),
 (('State', 'T-1'), ('State', 'H0')),
 (('State', 'F-2'), ('State', 'H0')),
 (('State', 'Y2'), ('State', 'H0')),
 (('State', '-2'), ('State', 'H0')),
 (('State', 'E1'), ('State', 'H0')),
 (('State', 'A0'), ('State', 'H0')),
 (('State', 'D0'), ('State', 'H0')),
 (('State', '--2'), ('State', 'H0')),
 (('State', 'G0'), ('State', 'H0')),
 (('State', 'Y-2'), ('State', 'H0')),
 (('State', 'R0'), ('State', 'H0')),
 (('State', 'Q2'), ('State', 'H0')),
 (('State', 'C2'), ('State', '

In [21]:
# our own train, test split: each row goes to train with probability 0.75
# and to test otherwise.
# A dedicated, seeded generator makes the partition -- and therefore every
# number computed from it below -- reproducible across kernel restarts. The
# mask is drawn in one vectorized call instead of one RNG call per row.
split_rng = np.random.RandomState(0)
choices = split_rng.choice([True, False], size=len(data), p=[.75, 0.25])
train_indices = [i for i, choice in enumerate(choices) if choice]
# NOTE: the misspelled name `test_inices` is kept because a later cell reads it.
test_inices = [i for i, choice in enumerate(choices) if not choice]

In [22]:
# Convert the frame (features + 'State') to a NumPy array so rows can be
# selected by integer position.
# NOTE(review): the train.head() output further down shows 0.0/1.0 values, so
# the conversion presumably upcasts the integer feature columns to float
# because 'State' is float -- confirm.
raw_data = np.asarray(data)

In [23]:
# Slice the array into the two partitions chosen by the split above.
train, test = raw_data[train_indices], raw_data[test_inices]

In [24]:
# Wrap the training rows back into a DataFrame with the original column names.
# Note `train` is rebound from a NumPy array to a DataFrame here, while `test`
# stays a NumPy array.
train = pd.DataFrame(train, columns=data.columns)

In [25]:
# Preview the training frame.
train.head()

Unnamed: 0,A-2,C-2,D-2,E-2,F-2,G-2,H-2,I-2,K-2,L-2,...,P2,Q2,R2,S2,T2,V2,W2,Y2,-2,State
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# Re-create the estimator, this time on the training rows only.
# This rebinds `est`, replacing the estimator built on the full data set.
est = MaximumLikelihoodEstimator(bn, train)

In [27]:
# Look at the first CPD estimated from the training rows.
est.get_parameters()[0]

0,1,2,3,4,5,6
State,State_0,State_1,State_2,State_3,State_4,State_5
P0_0,0.9750,0.9794,0.9292,0.9482,0.9521,0.9613
P0_1,0.0250,0.0206,0.0708,0.0518,0.0479,0.0387


In [28]:
# Take the model held by the estimator.
# NOTE(review): presumably this is the very `bn` object passed to the
# estimator above rather than a copy -- confirm.
fit_bn = est.model

In [29]:
# add all of the estimated CPDs (estimated from the training rows)
# NOTE(review): if `fit_bn` is the same object as `bn`, it already holds the
# CPDs fitted on the full data set by the earlier add_cpds call -- confirm how
# add_cpds treats a second CPD for the same variable.
fit_bn.add_cpds(*est.get_parameters())

In [30]:
# List the CPDs attached to the fitted model.
fit_bn.cpds

[<TabularCPD representing P(P0:2 | State:6) at 0x114653fd0>,
 <TabularCPD representing P(D2:2 | State:6) at 0x1146539b0>,
 <TabularCPD representing P(S1:2 | State:6) at 0x114653f98>,
 <TabularCPD representing P(S0:2 | State:6) at 0x114653be0>,
 <TabularCPD representing P(A-1:2 | State:6) at 0x114653e80>,
 <TabularCPD representing P(N-1:2 | State:6) at 0x114653b00>,
 <TabularCPD representing P(F-1:2 | State:6) at 0x114653eb8>,
 <TabularCPD representing P(E2:2 | State:6) at 0x1146537f0>,
 <TabularCPD representing P(-2:2 | State:6) at 0x114653cf8>,
 <TabularCPD representing P(G2:2 | State:6) at 0x114653a90>,
 <TabularCPD representing P(-0:2 | State:6) at 0x1146a35f8>,
 <TabularCPD representing P(N0:2 | State:6) at 0x1146a3e80>,
 <TabularCPD representing P(P-2:2 | State:6) at 0x1146a3978>,
 <TabularCPD representing P(T1:2 | State:6) at 0x1146a3dd8>,
 <TabularCPD representing P(K0:2 | State:6) at 0x1146a3b38>,
 <TabularCPD representing P(R-2:2 | State:6) at 0x1146a3ac8>,
 <TabularCPD repres

In [31]:
# save the optimized model
# A context manager guarantees the file handle is flushed and closed as soon
# as the dump finishes, instead of leaving it open until garbage collection.
with open("fit_bn.p", 'wb') as model_file:
    pickle.dump(fit_bn, model_file)

In [32]:
# Build a belief-propagation inference engine over the fitted model.
bp = BeliefPropagation(fit_bn)

In [33]:
# Run calibration before issuing any queries.
bp.calibrate()

In [34]:
# True label of the first test row (the last column of each row is 'State').
print(test[0][-1])

5.0


In [36]:
# Evidence for the first test row: feature name -> observed integer value.
# The row's last entry (the label) is sliced off; zip() stops at the shorter
# input, so the trailing 'State' column name is never paired with a value.
evi = {
    name: int(value)
    for name, value in zip(data.columns, test[0][:-1])
}
evi

{'--1': 0,
 '--2': 1,
 '-0': 0,
 '-1': 0,
 '-2': 0,
 'A-1': 0,
 'A-2': 0,
 'A0': 0,
 'A1': 0,
 'A2': 0,
 'C-1': 0,
 'C-2': 0,
 'C0': 0,
 'C1': 0,
 'C2': 0,
 'D-1': 0,
 'D-2': 0,
 'D0': 0,
 'D1': 0,
 'D2': 0,
 'E-1': 0,
 'E-2': 0,
 'E0': 0,
 'E1': 0,
 'E2': 0,
 'F-1': 0,
 'F-2': 0,
 'F0': 0,
 'F1': 0,
 'F2': 0,
 'G-1': 0,
 'G-2': 0,
 'G0': 0,
 'G1': 0,
 'G2': 0,
 'H-1': 0,
 'H-2': 0,
 'H0': 0,
 'H1': 0,
 'H2': 0,
 'I-1': 0,
 'I-2': 0,
 'I0': 0,
 'I1': 0,
 'I2': 0,
 'K-1': 0,
 'K-2': 0,
 'K0': 0,
 'K1': 0,
 'K2': 0,
 'L-1': 0,
 'L-2': 0,
 'L0': 1,
 'L1': 0,
 'L2': 0,
 'M-1': 0,
 'M-2': 0,
 'M0': 0,
 'M1': 0,
 'M2': 0,
 'N-1': 0,
 'N-2': 0,
 'N0': 0,
 'N1': 0,
 'N2': 0,
 'P-1': 0,
 'P-2': 0,
 'P0': 0,
 'P1': 0,
 'P2': 1,
 'Q-1': 0,
 'Q-2': 0,
 'Q0': 0,
 'Q1': 0,
 'Q2': 0,
 'R-1': 0,
 'R-2': 0,
 'R0': 0,
 'R1': 0,
 'R2': 0,
 'S-1': 0,
 'S-2': 0,
 'S0': 0,
 'S1': 1,
 'S2': 0,
 'T-1': 0,
 'T-2': 0,
 'T0': 0,
 'T1': 0,
 'T2': 0,
 'V-1': 1,
 'V-2': 0,
 'V0': 0,
 'V1': 0,
 'V2': 0,
 'W-1': 0,
 

In [37]:
# finally query: posterior over 'State' given the evidence built from the
# first test row.
answer = bp.query(['State'], evidence=evi)

In [38]:
# Show the posterior factor over 'State'.
print(answer['State'])

╒═════════╤══════════════╕
│ State   │   phi(State) │
╞═════════╪══════════════╡
│ State_0 │       0.0229 │
├─────────┼──────────────┤
│ State_1 │       0.1757 │
├─────────┼──────────────┤
│ State_2 │       0.0158 │
├─────────┼──────────────┤
│ State_3 │       0.0021 │
├─────────┼──────────────┤
│ State_4 │       0.0092 │
├─────────┼──────────────┤
│ State_5 │       0.7742 │
╘═════════╧══════════════╛


In [39]:
# Index of the largest posterior value, i.e. the predicted state index.
print(np.argmax(answer['State'].values))

5


In [40]:
# finally iterate through: predict the most probable 'State' for every test
# row and write the predictions to preds.csv

preds = []

for t in test:
    # t[:-1] drops the label; zip() stops at the shorter input, so the
    # trailing 'State' entry of data.columns is never paired with a value.
    ev = {name: int(v) for name, v in zip(data.columns, t[:-1])}
    answer = bp.query(['State'], evidence=ev)
    # The prediction is the index of the largest posterior value.
    preds.append(np.argmax(answer['State'].values))

pred_df = pd.DataFrame(preds, columns=['State'])
pred_df.to_csv('preds.csv')

NameError: name 'cols' is not defined

In [None]:
# 55% accuracy
# Fraction of test rows whose predicted state index equals the true label
# (the label is the last column of each test row, truncated to int).
print(np.mean([
    pred == int(y_i)
    for pred, y_i in zip(preds, test[:, -1])
]))