In [1]:
import pickle
import pandas as pd
import numpy as np
from pgmpy.models import LinearGaussianBayesianNetwork
from pgmpy.factors.continuous import LinearGaussianCPD
from pgmpy.inference.ExactInference import BeliefPropagation
from scipy.stats import multivariate_normal
from pgmpy.factors.continuous import ContinuousFactor
from pgmpy.models import BayesianModel
import numpy.linalg as la
import emcee
from tqdm import tqdm_notebook

In [2]:
## DAG

PC_DAG = pd.read_csv('data/DAG_PC.csv',index_col = 'Unnamed: 0')
with open('data/MLE_PC.pkl','rb') as f:
    PC_MLE = pickle.load(f)
    
mean = pd.read_csv('data/final_means.csv',header=None,index_col=0)

In [3]:
g = LinearGaussianBayesianNetwork()

g.add_nodes_from(PC_DAG.columns)

edges = [(PC_DAG.index[edge[0]], PC_DAG.columns[edge[1]]) for edge in np.argwhere(PC_DAG.values)]

g.add_edges_from(edges)

for factor in PC_MLE:
    name = factor[0]
    beta = factor[1]
    var = factor[2]
    pars = factor[3]
    if len(pars)==0:
        cpd = LinearGaussianCPD(name, [], var, pars)
        cpd.beta_0 = mean.loc[name]
        g.add_cpds(cpd)
    else:
        cpd = LinearGaussianCPD(name,  beta, var, pars)
        cpd.beta_0 = mean.loc[name]
        g.add_cpds(cpd)

norm = g.to_joint_gaussian()

In [4]:
# MRF
mrf = pd.read_csv('data/MRF.csv',index_col = 'Unnamed: 0')
sigma_mrf = la.inv(mrf.values)
mu_mrf = mean.values.squeeze()
vars_mrf = list(PC_DAG.columns)

In [5]:
vars_dag = norm.variables
mu_dag = pd.Series(mu_mrf,index=vars_mrf).reindex(vars_dag)
sigma_dag = norm.covariance


In [6]:
def lnprob(x, mu, cov,icov, conditions, vars = None):
    for condition in conditions:
        ind = vars.index(condition)
        x[ind] = conditions[condition]
    k = len(x)
    diff = x-mu
    term1 = np.log(la.det(cov))
    term2 = np.dot(diff,np.dot(icov,diff))
    term3 = k*np.log(2*np.pi)
    return -1/2*(term1 + term2 + term3)

In [7]:
def mcmc(mu,sigma,conditions,vars):
    ndim = 51
    nwalkers = 250
    p0 = np.random.rand(ndim * nwalkers).reshape((nwalkers, ndim))
    sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob, args=[mu,sigma, la.inv(sigma), conditions, vars])
    pos, prob, state = sampler.run_mcmc(p0, 100)
    sampler.reset()
    sampler.run_mcmc(pos, 1000)
    print(np.mean(sampler.acceptance_fraction))
    return pd.Series(sampler.flatchain.mean(0),index = vars)

def mh(mu,sigma,conditions,vars):
    ndim = 51
    nwalkers = 250
    p0 = np.random.rand(ndim)
    sampler = emcee.MHSampler(sigma, ndim, lnprob, args=[mu,sigma, la.inv(sigma), conditions, vars])
    pos, prob, state = sampler.run_mcmc(p0, 1000)
    sampler.reset()
    sampler.run_mcmc(pos, 10000)
    print(np.mean(sampler.acceptance_fraction))
    return pd.Series(sampler.flatchain.mean(0),index = vars)
    

In [8]:
election = pd.read_csv('data/results.csv',index_col='State')
temp = list(election.index)

temp[-1] = 'District of Columbia'
election.index = temp

In [9]:
def pred_clinton_mcmc(mu,sigma,vars,time = 6.5):
    evidence = election[election.Poll_Closing <= time].Clinton.to_dict()
    vote_perc = mcmc(mu_dag, sigma_dag, evidence,vars_dag)
    clinton_win = (vote_perc>.50).astype(int).reindex(election.index)
    return vote_perc.loc[['Florida','Michigan','Pennsylvania','Wisconsin']], (clinton_win*election['number of votes']).sum()

In [10]:
def pred_clinton_mh(mu,sigma,vars,time = 6.5):
    evidence = election[election.Poll_Closing <= time].Clinton.to_dict()
    vote_perc = mh(mu_dag, sigma_dag, evidence,vars_dag)
    clinton_win = (vote_perc>.50).astype(int).reindex(election.index)
    return vote_perc.loc[['Florida','Michigan','Pennsylvania','Wisconsin']], (clinton_win*election['number of votes']).sum()

In [11]:
times = np.unique(election.Poll_Closing)
times = [6.5] + list(times)
for model in tqdm_notebook(['dag','mrf']):
    for time in tqdm_notebook(times):
        swings = []
        ecs = []
        for i in tqdm_notebook(range(10)):
            if model == 'dag':
                swing, ec = pred_clinton_mcmc(mu_dag,sigma_dag,vars_dag,time)
            elif model == 'mrf':
                swing, ec = pred_clinton_mcmc(mu_mrf,sigma_mrf,vars_mrf,time)
            title = 'data/{}_GW_{}.pkl'.format(model,time)
            swings.append(swing)
            ecs.append(ec)
        ec = np.mean(ecs)
        swing = pd.Series(np.array(swings).mean(0),index = swing.index)
        with open(title, 'wb') as f:
            pickle.dump((swing,ec), f)

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

0.20383199999999999
0.205264
0.203732
0.20675600000000002
0.20368400000000003
0.20397200000000004
0.20566800000000002
0.20256400000000002
0.20681200000000002
0.199608


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

0.20434400000000003
0.206112
0.20718799999999998
0.203644
0.203156
0.20616000000000004
0.20731600000000003
0.20442000000000002
0.20712
0.204268


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

0.20306799999999997
0.202988
0.204656
0.20458400000000002
0.20592000000000002
0.20331600000000002
0.20172800000000002
0.20306
0.201992
0.20325200000000002


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

0.20452800000000002
0.20523600000000003
0.204928
0.20506799999999997
0.20373200000000002
0.20554
0.20582000000000003
0.20476
0.20429600000000003
0.20616000000000004


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

0.205636
0.204188
0.20512
0.206632
0.20613599999999999
0.20624
0.204128
0.204852
0.20345999999999997
0.20642000000000002


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

0.23244800000000002
0.23231200000000002
0.23502800000000001
0.23093200000000003
0.23510399999999998
0.23044
0.233208
0.23298000000000002
0.228672
0.231484


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

0.25896
0.25012
0.22290400000000002
0.23610400000000004
0.23142000000000001
0.219812
0.238456
0.267504
0.24765600000000004
0.2502


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

0.5991839999999999
0.6011479999999999
0.601456
0.601144
0.5992999999999999
0.600684
0.5997319999999999
0.599872
0.5995
0.599156


HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

0.20438800000000001
0.204488
0.20856400000000003
0.20083600000000001
0.211348
0.20831600000000003
0.20456
0.206352
0.204844
0.20318


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

0.20500400000000002
0.204976
0.207216
0.203272
0.20278800000000002
0.204908
0.20214400000000002
0.202576
0.20735200000000004
0.20517600000000003


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

0.2075
0.206156
0.204988
0.20348800000000003
0.20420000000000002
0.20350800000000002
0.20457999999999998
0.20122
0.20101999999999998
0.204876


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

0.204264
0.20628400000000002
0.204748
0.203844
0.205096
0.20569199999999999
0.204904
0.203628
0.2045
0.20445600000000003


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

0.204096
0.205632
0.20459600000000003
0.20512
0.20595600000000003
0.20460399999999998
0.2033
0.20544
0.20486000000000001
0.20427199999999998


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

0.231964
0.23189600000000002
0.23015200000000002
0.23424399999999998
0.23374400000000004
0.231576
0.229816
0.231112
0.229552
0.23218799999999998


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

0.224608
0.228552
0.22728399999999999
0.22534
0.242
0.22916
0.251636
0.23788
0.219776
0.236688


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

0.599464
0.601288
0.598144
0.5993999999999999
0.5997279999999999
0.599452
0.6002839999999999
0.6011599999999998
0.600592
0.599132



In [12]:
for model in tqdm_notebook(['dag','mrf']):
    for time in tqdm_notebook(times):
        swings = []
        ecs = []
        for i in tqdm_notebook(range(10)):
            if model == 'dag':
                swing, ec = pred_clinton_mh(mu_dag,sigma_dag,vars_dag,time)
            elif model == 'mrf':
                swing, ec = pred_clinton_mh(mu_mrf,sigma_mrf,vars_mrf,time)
            title = 'data/{}_MH_{}.pkl'.format(model,time)

            swings.append(swing)
            ecs.append(ec)
        ec = np.mean(ecs)
        swing = pd.Series(np.array(swings).mean(0),index = swing.index)
        with open(title, 'wb') as f:
            pickle.dump((swing,ec), f)

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

0.005
0.0042
0.0023
0.0037
0.0031
0.0039
0.0035
0.0032
0.0022
0.0028


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

0.0048
0.0044
0.0032
0.0012
0.0039
0.0022
0.0014
0.003
0.0027
0.0038


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

0.0029
0.0023
0.0025
0.0026
0.003
0.0032
0.0024
0.006
0.0034
0.0031


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

0.0116
0.0078
0.009
0.0139
0.006
0.0153
0.0103
0.0091
0.0089
0.0116


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

0.0118
0.0101
0.0099
0.0112
0.0149
0.0088
0.0083
0.0132
0.0072
0.008


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

0.0977
0.0929
0.0924
0.1013
0.0997
0.0975
0.0956
0.0844
0.095
0.0992


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

0.677
0.6759
0.6755
0.6626
0.6756
0.6737
0.67
0.6743
0.6779
0.6665


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0


HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

0.0048
0.0049
0.004
0.0038
0.0044
0.0035
0.0037
0.0033
0.0026
0.0025


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

0.0027
0.0021
0.0043
0.0035
0.0029
0.0033
0.0025
0.003
0.0045
0.0021


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

0.0043
0.0029
0.0016
0.0022
0.0021
0.0037
0.0024
0.0029
0.0016
0.0057


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

0.0088
0.0095
0.0096
0.0131
0.0093
0.0058
0.0122
0.0065
0.0127
0.0096


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

0.0119
0.0114
0.0173
0.011
0.0111
0.0112
0.0122
0.0098
0.012
0.0143


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

0.0951
0.0882
0.0996
0.0927
0.0929
0.0974
0.0936
0.0974
0.0944
0.1037


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

0.6731
0.6749
0.6643
0.6764
0.673
0.6764
0.6701
0.676
0.6663
0.6649


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0

