In [1]:
from pgmpy.models import MarkovModel, BayesianModel
from pgmpy.inference import BeliefPropagation
from pgmpy.factors import Factor
import numpy as np
import pandas as pd
from scipy import optimize
from pgmpy.estimators import MaximumLikelihoodEstimator
import pickle

In [3]:
# Read the feature table, then discard the CSV's leftover 'Unnamed: 0' column.
features_csv = pd.read_csv('5_formatted.csv')
data = features_csv.drop('Unnamed: 0', axis=1)
data.head()

Unnamed: 0,A-2,C-2,D-2,E-2,F-2,G-2,H-2,I-2,K-2,L-2,...,N2,P2,Q2,R2,S2,T2,V2,W2,Y2,-2
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [4]:
# Read the label table, then discard the CSV's leftover 'Unnamed: 0' column.
labels_csv = pd.read_csv('5_formatted_labels.csv')
labels = labels_csv.drop('Unnamed: 0', axis=1)

In [5]:
# Name the single remaining label column 'State' (this mutates `labels` in place),
# then preview the first rows.
labels.columns = ['State']
labels.head()

Unnamed: 0,State
0,5
1,5
2,0
3,0
4,0


In [6]:
# Create the singleton (feature) nodes: every column of `data` except the label.
# 'State' is excluded explicitly so that re-running this cell after the label
# column has been attached to `data` does not turn the label into a feature
# (which would later produce a ('State', 'State') self-edge).
singletons = [col for col in data.columns if col != 'State']
singletons[:2]

['A-2', 'C-2']

In [None]:
# ys = list(df.columns[-6:])
# ys[-1] = 'U'
# ys

In [7]:
# One directed edge from the 'State' node to every feature node.
edges = []
for feature in singletons:
    edges.append(('State', feature))

# Peek at the first two edges.
edges[:2]

[('State', 'A-2'), ('State', 'C-2')]

In [8]:
# Number of edges: one per entry in `singletons`.
len(edges)

105

In [9]:
# Attach the labels to the feature table as a 'State' column.
# This mutates `data` in place; `singletons` was captured before this step.
data['State'] = labels

In [10]:
# Preview the table now that the 'State' column has been added.
data.head()

Unnamed: 0,A-2,C-2,D-2,E-2,F-2,G-2,H-2,I-2,K-2,L-2,...,P2,Q2,R2,S2,T2,V2,W2,Y2,-2,State
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,5
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [11]:
# Build the Bayesian network directly from the State -> feature edge list.
bn = BayesianModel(edges)

In [12]:
# Set up a maximum-likelihood estimator for the CPD parameters (thetas) of `bn`.
# NOTE: this uses the full labelled table, including rows that are later held
# out for testing; `est` is rebuilt on the training split in a later cell.
est = MaximumLikelihoodEstimator(bn, data)

In [13]:
# Take a look at one estimated CPD: the first entry returned by get_parameters().
est.get_parameters()[0]

0,1
State_0,0.3166
State_1,0.139
State_2,0.0897
State_3,0.0688
State_4,0.0101
State_5,0.3759


In [14]:
# Our own train/test split: each row independently goes to train with
# probability 0.75 and to test otherwise.
# The global NumPy RNG is seeded first so the split (and every metric computed
# from it) is reproducible across kernel restarts.
np.random.seed(42)
# Draw the whole membership mask in one vectorised call (True -> train row).
choices = np.random.choice([True, False], size=len(data), p=[.75, 0.25])
train_indices = np.flatnonzero(choices)
test_indices = np.flatnonzero(~choices)
# Misspelled alias kept because a later cell indexes with this name.
test_inices = test_indices

In [16]:
# Convert the labelled table to a NumPy array so rows can be selected by
# integer position.
raw_data = np.asarray(data)

In [17]:
# Select the train and test rows by position; both results are plain arrays
# whose columns follow the column order of `data`.
train = raw_data[train_indices]
test = raw_data[test_inices]

In [18]:
# Wrap the training rows back into a DataFrame carrying the original column
# labels. Note that `train` is rebound from an array to a DataFrame here,
# while `test` stays a raw array.
train = pd.DataFrame(train, columns=data.columns)

In [19]:
# Preview the training frame.
train.head()

Unnamed: 0,A-2,C-2,D-2,E-2,F-2,G-2,H-2,I-2,K-2,L-2,...,P2,Q2,R2,S2,T2,V2,W2,Y2,-2,State
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Rebuild the estimator on the training split only; this rebinds `est`,
# replacing the estimator that was built from the full table.
est = MaximumLikelihoodEstimator(bn, train)

In [21]:
# Inspect the first CPD estimated from the training split.
est.get_parameters()[0]

0,1
State_0,0.3178
State_1,0.1387
State_2,0.0891
State_3,0.0692
State_4,0.01
State_5,0.3752


In [22]:
# Take the model object held by the estimator.
# NOTE(review): presumably this is the same object as `bn` rather than a copy - confirm.
fit_bn = est.model

In [23]:
# Add all of the estimated CPDs to the model; get_parameters() returns a
# sequence that is unpacked into separate arguments.
fit_bn.add_cpds(*est.get_parameters())

In [24]:
# List the CPDs now attached to the model.
fit_bn.cpds

[<TabularCPD representing P(State:6) at 0x1077afd68>,
 <TabularCPD representing P(T0:2 | State:6) at 0x1077afda0>,
 <TabularCPD representing P(M-2:2 | State:6) at 0x1077afbe0>,
 <TabularCPD representing P(F2:2 | State:6) at 0x1077af860>,
 <TabularCPD representing P(L0:2 | State:6) at 0x1077afcc0>,
 <TabularCPD representing P(M1:2 | State:6) at 0x1077aff98>,
 <TabularCPD representing P(N-1:2 | State:6) at 0x1077bae80>,
 <TabularCPD representing P(G0:2 | State:6) at 0x1077bada0>,
 <TabularCPD representing P(A1:2 | State:6) at 0x1077ba748>,
 <TabularCPD representing P(I2:2 | State:6) at 0x1077ba828>,
 <TabularCPD representing P(G1:2 | State:6) at 0x1077ba908>,
 <TabularCPD representing P(D0:2 | State:6) at 0x1077baf28>,
 <TabularCPD representing P(C-2:2 | State:6) at 0x1077baf98>,
 <TabularCPD representing P(A2:2 | State:6) at 0x1077bac88>,
 <TabularCPD representing P(N0:2 | State:6) at 0x1077c1ac8>,
 <TabularCPD representing P(V-2:2 | State:6) at 0x1077c1860>,
 <TabularCPD representing P

In [25]:
# Save the fitted model to disk.
# The `with` block guarantees the file handle is flushed and closed even if
# pickling raises; the original left the handle from open() unclosed.
with open("fit_bn.p", 'wb') as model_file:
    pickle.dump(fit_bn, model_file)

In [26]:
%%time
# Build a belief-propagation inference engine over the fitted model
# (the cell magic above reports how long construction takes).
bp = BeliefPropagation(fit_bn)

CPU times: user 13.1 s, sys: 95 ms, total: 13.1 s
Wall time: 13.1 s


In [27]:
# Calibrate the belief-propagation engine before running any queries.
bp.calibrate()

In [28]:
# Show the last entry of the first held-out row (the 'State' label column).
print(test[0, -1])

5.0


In [29]:
# Evidence for the first held-out row: feature name -> observed value as an int.
# The last entry of the row is dropped before pairing because it is the
# 'State' label, which is what gets queried.
# `singletons` holds the feature column names in table order; the `cols` name
# used previously is not assigned in any cell shown, so the cell failed on a
# fresh kernel.
evi = dict(zip(singletons, [int(t) for t in test[0][:-1]]))
evi

{'--1': 0,
 '--2': 1,
 '-0': 0,
 '-1': 0,
 '-2': 0,
 'A-1': 0,
 'A-2': 0,
 'A0': 0,
 'A1': 0,
 'A2': 0,
 'C-1': 0,
 'C-2': 0,
 'C0': 0,
 'C1': 0,
 'C2': 0,
 'D-1': 0,
 'D-2': 0,
 'D0': 0,
 'D1': 0,
 'D2': 0,
 'E-1': 0,
 'E-2': 0,
 'E0': 0,
 'E1': 0,
 'E2': 0,
 'F-1': 0,
 'F-2': 0,
 'F0': 0,
 'F1': 0,
 'F2': 0,
 'G-1': 0,
 'G-2': 0,
 'G0': 0,
 'G1': 0,
 'G2': 0,
 'H-1': 0,
 'H-2': 0,
 'H0': 0,
 'H1': 0,
 'H2': 0,
 'I-1': 0,
 'I-2': 0,
 'I0': 0,
 'I1': 0,
 'I2': 0,
 'K-1': 0,
 'K-2': 0,
 'K0': 0,
 'K1': 0,
 'K2': 0,
 'L-1': 0,
 'L-2': 0,
 'L0': 1,
 'L1': 0,
 'L2': 0,
 'M-1': 0,
 'M-2': 0,
 'M0': 0,
 'M1': 0,
 'M2': 0,
 'N-1': 0,
 'N-2': 0,
 'N0': 0,
 'N1': 0,
 'N2': 0,
 'P-1': 0,
 'P-2': 0,
 'P0': 0,
 'P1': 0,
 'P2': 1,
 'Q-1': 0,
 'Q-2': 0,
 'Q0': 0,
 'Q1': 0,
 'Q2': 0,
 'R-1': 0,
 'R-2': 0,
 'R0': 0,
 'R1': 0,
 'R2': 0,
 'S-1': 0,
 'S-2': 0,
 'S0': 0,
 'S1': 1,
 'S2': 0,
 'T-1': 0,
 'T-2': 0,
 'T0': 0,
 'T1': 0,
 'T2': 0,
 'V-1': 1,
 'V-2': 0,
 'V0': 0,
 'V1': 0,
 'V2': 0,
 'W-1': 0,
 

In [30]:
# Finally query 'State' given the evidence built above; the result is indexed
# by variable name in the following cells.
answer = bp.query(['State'], evidence=evi)

In [31]:
# Print the factor returned for 'State'.
print(answer['State'])

╒═════════╤══════════════╕
│ State   │   phi(State) │
╞═════════╪══════════════╡
│ State_0 │       0.3178 │
├─────────┼──────────────┤
│ State_1 │       0.1387 │
├─────────┼──────────────┤
│ State_2 │       0.0891 │
├─────────┼──────────────┤
│ State_3 │       0.0692 │
├─────────┼──────────────┤
│ State_4 │       0.0100 │
├─────────┼──────────────┤
│ State_5 │       0.3752 │
╘═════════╧══════════════╛


In [32]:
# Index of the largest value in the returned factor, i.e. the predicted state.
print(np.argmax(answer['State'].values))

5


In [35]:
# Finally iterate through every held-out row and record the most probable state.
# `singletons` supplies the feature names in table order; the `cols` name used
# previously is not assigned in any cell shown, so the loop failed on a fresh
# kernel. The unused enumerate() index has been dropped as well.

preds = []

for row in test:
    # Pair each feature name with its observed value; the trailing entry of
    # the row (the 'State' label) is left out of the evidence.
    ev = dict(zip(singletons, [int(v) for v in row[:-1]]))
    posterior = bp.query(['State'], evidence=ev)
    # Predicted state = index of the largest value in the returned factor.
    preds.append(np.argmax(posterior['State'].values))

In [36]:
# Collect the predictions in a one-column frame and write them to disk.
pred_df = pd.DataFrame({'State': preds}, columns=['State'])
pred_df.to_csv('preds.csv')

In [39]:
# Accuracy: fraction of held-out rows whose predicted state equals the true
# label (the last column of `test`, truncated to an int).
true_states = [int(label) for label in test[:, -1]]
hits = [float(guess == truth) for guess, truth in zip(preds, true_states)]
print(np.mean(hits))

0.584521263958
