In [1]:
from pgmpy.models import MarkovModel, BayesianModel
from pgmpy.inference import BeliefPropagation
from pgmpy.factors import Factor
import numpy as np
import pandas as pd
from scipy import optimize
from pgmpy.estimators import MaximumLikelihoodEstimator
import pickle

In [2]:
# Load the feature table. The 'Unnamed: 0' column is dropped -- presumably the
# row index that was written out when the CSV was saved (TODO confirm).
data = pd.read_csv('5_formatted.csv').drop('Unnamed: 0', axis=1)
data.head()

Unnamed: 0,A-2,C-2,D-2,E-2,F-2,G-2,H-2,I-2,K-2,L-2,...,N2,P2,Q2,R2,S2,T2,V2,W2,Y2,-2
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [3]:
# Load the label table, dropping its 'Unnamed: 0' column as well.
labels = pd.read_csv('5_formatted_labels.csv').drop('Unnamed: 0', axis=1)

In [4]:
# Rename the label column to 'State'. Assigning a one-element list only works
# when `labels` has exactly one column left after the drop above.
labels.columns = ['State']
labels.head()

Unnamed: 0,State
0,5.0
1,5.0
2,0.0
3,0.0
4,0.0


In [5]:
# Names of the feature columns; each one becomes a node of the network.
singletons = data.columns.tolist()
singletons[:2]

['A-2', 'C-2']

In [6]:
# NOTE(review): dead, commented-out code -- `df` is not defined in any cell
# shown in this notebook. Kept for reference only.
# ys = list(df.columns[-6:])
# ys[-1] = 'U'
# ys

In [20]:
# Star-shaped structure: one directed edge from the 'State' node to every
# feature node, in the same order as `singletons`.
edges = list(zip(['State'] * len(singletons), singletons))

edges[:2]

[('State', 'A-2'), ('State', 'C-2')]

In [21]:
# Number of edges -- one per entry of `singletons`.
len(edges)

105

In [22]:
# Attach the labels as a 'State' column. This mutates `data` in place, so
# `data.columns` now includes 'State'. `singletons` was captured before this
# assignment, so it holds only the feature names when cells are run in order.
data['State'] = labels

In [23]:
# Preview the combined table; 'State' is now the last column.
data.head()

Unnamed: 0,A-2,C-2,D-2,E-2,F-2,G-2,H-2,I-2,K-2,L-2,...,P2,Q2,R2,S2,T2,V2,W2,Y2,-2,State
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,5.0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0.0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0.0


In [None]:
# Build the Bayesian network directly from the edge list.
bn = BayesianModel(edges)

In [31]:
# List the CPDs currently attached to the model.
# NOTE(review): this cell's execution count (31) is later than the cell that
# calls `bn.add_cpds(...)` (27), so the output shown was produced out of order;
# on a top-to-bottom run this executes before any CPDs have been added.
bn.get_cpds()

[<TabularCPD representing P(N-2:2 | State:6) at 0x1137fef98>,
 <TabularCPD representing P(S1:2 | State:6) at 0x1137feba8>,
 <TabularCPD representing P(W0:2 | State:6) at 0x1137feeb8>,
 <TabularCPD representing P(H1:2 | State:6) at 0x1137fea58>,
 <TabularCPD representing P(A2:2 | State:6) at 0x1137feb38>,
 <TabularCPD representing P(N0:2 | State:6) at 0x1137fee48>,
 <TabularCPD representing P(W2:2 | State:6) at 0x1137fe908>,
 <TabularCPD representing P(L0:2 | State:6) at 0x1137fec88>,
 <TabularCPD representing P(M2:2 | State:6) at 0x1137fecf8>,
 <TabularCPD representing P(C2:2 | State:6) at 0x1137f4ba8>,
 <TabularCPD representing P(E0:2 | State:6) at 0x1137f4a58>,
 <TabularCPD representing P(I0:2 | State:6) at 0x1137f4748>,
 <TabularCPD representing P(P-1:2 | State:6) at 0x1137f4cf8>,
 <TabularCPD representing P(G1:2 | State:6) at 0x1137f4f98>,
 <TabularCPD representing P(--1:2 | State:6) at 0x1137f4828>,
 <TabularCPD representing P(E2:2 | State:6) at 0x1137f4e10>,
 <TabularCPD represen

In [30]:
# List the edges registered on the model.
bn.edges()

[('State', 'N-2'),
 ('State', 'S1'),
 ('State', 'W0'),
 ('State', 'H1'),
 ('State', 'A2'),
 ('State', 'N0'),
 ('State', 'W2'),
 ('State', 'L0'),
 ('State', 'C2'),
 ('State', 'E0'),
 ('State', 'I0'),
 ('State', 'P-1'),
 ('State', 'G1'),
 ('State', '--1'),
 ('State', 'E2'),
 ('State', 'G-2'),
 ('State', 'R2'),
 ('State', 'Y2'),
 ('State', 'R-1'),
 ('State', 'Q2'),
 ('State', 'D2'),
 ('State', 'Y-1'),
 ('State', 'E-2'),
 ('State', 'M-1'),
 ('State', 'D1'),
 ('State', 'R1'),
 ('State', 'A-1'),
 ('State', '--2'),
 ('State', 'L-1'),
 ('State', 'Y0'),
 ('State', 'P0'),
 ('State', 'L1'),
 ('State', 'D-2'),
 ('State', 'C0'),
 ('State', 'V-2'),
 ('State', 'A1'),
 ('State', 'H-2'),
 ('State', 'K-1'),
 ('State', 'A-2'),
 ('State', 'L2'),
 ('State', 'I2'),
 ('State', 'E1'),
 ('State', 'T-2'),
 ('State', '-2'),
 ('State', 'P-2'),
 ('State', 'K0'),
 ('State', 'V-1'),
 ('State', 'I1'),
 ('State', '-1'),
 ('State', 'F-2'),
 ('State', 'I-1'),
 ('State', 'S2'),
 ('State', 'M2'),
 ('State', 'W1'),
 ('Stat

In [25]:
# calculate thetas in our CPDs
# NOTE(review): this estimator is given the full `data` (no hold-out); a later
# cell rebinds `est` to an estimator built from the training rows only.
est = MaximumLikelihoodEstimator(bn, data)

In [26]:
# Inspect the first estimated CPD.
est.get_parameters()[0]

0,1,2,3,4,5,6
State,State_0,State_1,State_2,State_3,State_4,State_5
N-2_0,0.9620,0.9634,0.9528,0.9511,0.9656,0.9818
N-2_1,0.0380,0.0366,0.0472,0.0489,0.0344,0.0182


In [27]:
# Attach every estimated CPD to the model.
bn.add_cpds(*est.get_parameters())

In [28]:
# Convert the network to its junction tree. `tree` is only inspected in the
# next cell; no later cell shown here uses it.
tree = bn.to_junction_tree()

In [37]:
# List the junction tree's edges.
tree.edges()

[(('State', 'N1'), ('State', 'Y-2')),
 (('State', 'N1'), ('State', 'K1')),
 (('State', 'N1'), ('State', 'W1')),
 (('State', 'N1'), ('State', 'A1')),
 (('State', 'N1'), ('State', 'G1')),
 (('State', 'N1'), ('State', 'G0')),
 (('State', 'N1'), ('State', 'S1')),
 (('State', 'N1'), ('State', 'I-2')),
 (('State', 'N1'), ('State', 'D-2')),
 (('State', 'N1'), ('State', 'F0')),
 (('State', 'N1'), ('State', 'H-1')),
 (('State', 'N1'), ('State', 'C-1')),
 (('State', 'N1'), ('State', 'Y0')),
 (('State', 'N1'), ('State', 'E0')),
 (('State', 'N1'), ('State', 'M0')),
 (('State', 'N1'), ('State', 'G-1')),
 (('State', 'N1'), ('State', 'E1')),
 (('State', 'N1'), ('State', 'G-2')),
 (('State', 'N1'), ('State', 'N2')),
 (('State', 'N1'), ('State', 'N-2')),
 (('State', 'N1'), ('State', 'S-2')),
 (('State', 'N1'), ('State', 'P-1')),
 (('State', 'N1'), ('State', 'I1')),
 (('State', 'N1'), ('State', 'D-1')),
 (('State', 'N1'), ('State', 'Q-2')),
 (('State', 'N1'), ('State', 'F-2')),
 (('State', 'N1'), ('Stat

In [None]:
# take a look at one such estimated CPD (same call as the earlier inspection cell)
est.get_parameters()[0]

In [None]:
# our own train, test split: each row independently goes to train with
# probability 0.75 and to test otherwise.
# A dedicated, seeded generator keeps the split -- and therefore every number
# computed from it -- reproducible across kernel restarts, without touching
# NumPy's global random state.
split_rng = np.random.RandomState(0)
choices = split_rng.choice([True, False], size=len(data), p=[.75, 0.25])
train_indices = np.flatnonzero(choices).tolist()
# NOTE: `test_inices` (sic) keeps its original spelling because a later cell
# reads it under exactly this name.
test_inices = np.flatnonzero(~choices).tolist()

In [None]:
# Plain NumPy view of the table so rows can be picked by integer position.
raw_data = np.asarray(data)

In [None]:
# Select the train / test rows by position.
# `test_inices` (sic) is the name assigned by the split cell.
train = raw_data[train_indices]
test = raw_data[test_inices]

In [None]:
# Wrap the training rows back into a DataFrame carrying the original column
# names. `test` is left as a raw array.
train = pd.DataFrame(train, columns=data.columns)

In [None]:
# Preview the training frame.
train.head()

In [None]:
# Rebind `est` to an estimator built from the training rows only.
est = MaximumLikelihoodEstimator(bn, train)

In [None]:
# Inspect the first CPD estimated from the training rows.
est.get_parameters()[0]

In [None]:
# NOTE(review): `est.model` is presumably the same object that was passed to
# the estimator as `bn`, making `fit_bn` an alias rather than a copy -- confirm.
fit_bn = est.model

In [None]:
# add all of the estimated CPDs (from the training-only estimator) to the model
fit_bn.add_cpds(*est.get_parameters())

In [None]:
# Show the CPDs now attached to the fitted model.
fit_bn.cpds

In [None]:
# save the optimized model
# The `with` block guarantees the file handle is flushed and closed even if
# pickling raises; the bare `open(...)` used before never closed it.
with open("fit_bn.p", 'wb') as model_file:
    pickle.dump(fit_bn, model_file)

In [None]:
%%time
# Build the belief-propagation inference object over the fitted model (timed).
bp = BeliefPropagation(fit_bn)

In [None]:
# Calibrate the inference object before running queries.
bp.calibrate()

In [None]:
# Last entry of the first test row, i.e. its 'State' value ('State' is the
# last column of `data`).
print(test[0][-1])

In [None]:
# Evidence for the first test row: feature name -> observed value (as int).
# `singletons` holds the feature column names in the same order as the row's
# leading entries; the row's last entry is the 'State' label and is excluded.
# (The name `cols` that was used here is not defined anywhere in the notebook,
# so the cell failed with NameError on a fresh kernel.)
evi = dict(zip(singletons, [int(t) for t in test[0][:-1]]))
evi

In [None]:
# finally query: distribution over 'State' given the first test row's evidence
answer = bp.query(['State'], evidence=evi)

In [None]:
# Show the query result for 'State'.
print(answer['State'])

In [None]:
# Index of the largest entry in the result's values -- the predicted state.
print(np.argmax(answer['State'].values))

In [None]:
# finally iterate through every test row and record the most probable State

preds = []

for t in test:
    # Feature name -> observed value; the row's last entry is the 'State'
    # label and is left out of the evidence. `singletons` replaces the name
    # `cols`, which is not defined anywhere in the notebook.
    ev = dict(zip(singletons, [int(v) for v in t[:-1]]))
    answer = bp.query(['State'], evidence=ev)
    # Position of the largest value = predicted state index.
    max_v = np.argmax(answer['State'].values)
    preds.append(max_v)

In [None]:
# Persist the predictions as a one-column ('State') CSV.
pred_df = pd.DataFrame(preds, columns=['State'])
pred_df.to_csv('preds.csv')

In [None]:
# Fraction of test rows whose predicted state equals the true label
# (a recorded run gave 55% accuracy).
print(np.mean([float(guess == int(truth)) for guess, truth in zip(preds, test[:, -1])]))