In [22]:
import pandas as pd 
import numpy as np
import datetime
import pickle
import us
from scipy.stats import ttest_ind
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score

# model parameters
reg = 1000
max_depth = 3

In [14]:
compact_df = pd.read_csv('cd_scores.csv', encoding = "ISO-8859-1")
houseresults = pd.read_csv('vote_results/clean_csv_11_23/house.csv', encoding = "ISO-8859-1")

compact_df['id'] = compact_df['STATEFP'].astype(str) + '-' + compact_df['CD114FP'].astype(str)
houseresults['id'] = houseresults['STATEFP'].astype(str) + '-' + houseresults['CD114FP'].astype(str)

state_to_party, housedist_to_party = pickle.load(open('pundits.pkl', 'rb')) # read in baselines.pkl or pundits.pkl depending

housedist_to_party = pickle.load(open("baselines.pkl", "rb"))[1]

housedist_to_id = {}
state_to_statefp = {}

for i in range(len(houseresults)):
    row = houseresults.iloc[i]
    state = us.states.lookup(row['state_id']).name.lower()
    state_to_statefp[state] = row['STATEFP']
    
    housedist_to_id[state+str(row['seat_id'])] = row['id']

id_to_housedist = {v: k for k, v in housedist_to_id.items()}
statefp_to_state = {v: k for k, v in state_to_statefp.items()}

In [15]:
compact_version = 'ReockPTB'

features = []
labels = []

for i in range(len(houseresults)):
    row = houseresults.iloc[i]
    prev_result = housedist_to_party[id_to_housedist[row['id']]]
    prev_result = 1 if prev_result == 'democrat' else 0
    curr_result = 1 if row['d_won'] else 0
    compact_score = compact_df[compact_df['id'] == row['id']][compact_version].values
    if len(compact_score) != 1:
        print(compact_score)
        assert False
    compact_score = compact_score[0]
    
    features.append([prev_result, compact_score])
    labels.append(curr_result)

In [16]:
features = np.array(features)
labels = np.array(labels)

## 1. Predictions with compactness data AND past election results

In [17]:

train_X, test_X, train_y, test_y = train_test_split(features, labels, test_size=0.33)

log_reg = LogisticRegression(C=reg).fit(train_X, train_y)
rdm_for = RandomForestClassifier(max_depth=max_depth).fit(train_X, train_y)
log_reg_score = log_reg.score(test_X, test_y)
rdm_for_score = rdm_for.score(test_X, test_y)
log_reg_f1score = f1_score(test_y, log_reg.predict(test_X))
rdm_for_f1score = f1_score(test_y, rdm_for.predict(test_X))
log_reg_auc = roc_auc_score(test_y, log_reg.predict_proba(test_X)[:, 1])
rdm_for_auc = roc_auc_score(test_y, rdm_for.predict_proba(test_X)[:, 1])

print("--------- Compactness AND Election ---------")
print("Accuracy LR: {0}, RF: {1}".format(round(log_reg_score, 5), round(rdm_for_score, 5)))
print("F1 for LR: {0}, RF: {1}".format(round(log_reg_f1score, 4), round(rdm_for_f1score, 4)))
print("AUC for LR: {0}, RF: {1}\n".format(round(log_reg_auc, 4), round(rdm_for_auc, 4)))

--------- Compactness AND Election ---------
Accuracy LR: 0.94964, RF: 0.94964
F1 for LR: 0.9489, RF: 0.9489
AUC for LR: 0.9536, RF: 0.953



## 2. Predictions ONLY with Compactness Data

In [24]:

train_X, test_X, train_y, test_y = train_test_split(features[:, 1:], labels, test_size=0.33)

log_reg = LogisticRegression(C=reg).fit(train_X, train_y)
rdm_for = RandomForestClassifier(max_depth=max_depth).fit(train_X, train_y)
log_reg_score = log_reg.score(test_X, test_y)
rdm_for_score = rdm_for.score(test_X, test_y)
log_reg_f1score = f1_score(test_y, log_reg.predict(test_X))
rdm_for_f1score = f1_score(test_y, rdm_for.predict(test_X))
log_reg_auc = roc_auc_score(test_y, log_reg.predict_proba(test_X)[:, 1])
rdm_for_auc = roc_auc_score(test_y, rdm_for.predict_proba(test_X)[:, 1])

print("--------- Just Compactness ---------")
print("Accuracy LR: {0}, RF: {1}".format(round(log_reg_score, 5), round(rdm_for_score, 5)))
print("F1 for LR: {0}, RF: {1}".format(round(log_reg_f1score, 4), round(rdm_for_f1score, 4)))
print("AUC for LR: {0}, RF: {1}\n".format(round(log_reg_auc, 4), round(rdm_for_auc, 4)))

--------- Just Compactness ---------
Accuracy LR: 0.61151, RF: 0.58993
F1 for LR: 0.6582, RF: 0.5839
AUC for LR: 0.676, RF: 0.6337



In [19]:
print(np.mean(labels))

0.522673031026253
