In [2]:
import pandas as pd 
import numpy as np
import datetime
import pickle
import us
from scipy.stats import ttest_ind
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score

# model parameters
reg = 1000
max_depth = 5

# Load election result data and COVID-19 numbers per congressional district

In [3]:
house_results = pd.read_csv("vote_results/clean_csv_11_23/house.csv")
house_results.head()

Unnamed: 0.1,Unnamed: 0,state_id,STATEFP,seat_id,CD114FP,d_votes,r_votes,other_votes,d_vote_share,r_vote_share,other_vote_share,d_won,r_won,other_won
0,0,AK,2,1,1,159765,191568,0,0.45474,0.54526,0.0,0,1,0
1,1,AL,1,1,1,116949,211825,0,0.355712,0.644288,0.0,0,1,0
2,2,AL,1,2,2,105286,197996,287,0.346827,0.652227,0.000945,0,1,0
3,3,AL,1,3,3,104595,217384,0,0.32485,0.67515,0.0,0,1,0
4,4,AL,1,4,4,56237,261553,0,0.176963,0.823037,0.0,0,1,0


## Load election predictions (based on 2016 election results)

In [4]:
# Get predictions for each congressional district
housedist_to_party = pickle.load(open("baselines.pkl", "rb"))[1]

# Map state name to STATEFP
state_to_statefp = {}

for i, row in house_results.iterrows():
    state = us.states.lookup(row['state_id']).name.lower()
    state_to_statefp[state] = row['STATEFP']

statefp_to_state = {v: k for k, v in state_to_statefp.items()}

## Load COVID-19 congressional district data from Harvard + Microsoft AI for Health

In [5]:
# Get COVID CD data
covid_cd = pd.read_csv("data/clean-covid-cd.csv")
covid_cd["Date"] = pd.to_datetime(covid_cd["Date"])
covid_cd.head()

Unnamed: 0,Date,STATEFP,state_id,StateName,CD114FP,incremental_cases,incremental_deaths
0,2020-01-22,1,AL,Alabama,1,0.0,0.0
1,2020-01-23,1,AL,Alabama,1,0.0,0.0
2,2020-01-24,1,AL,Alabama,1,0.0,0.0
3,2020-01-25,1,AL,Alabama,1,0.0,0.0
4,2020-01-26,1,AL,Alabama,1,0.0,0.0


## Merge DataFrames to include COVID cases, COVID deaths, and election predictions

In [6]:
# Prepare COVID CD numbers in 5 sets
covid_datasets = list()
# 1. Up to Election Day (Nov. 3rd) by congressional district
covid_election_day = covid_cd.groupby(["STATEFP", "CD114FP", "state_id"]).sum()
covid_election_day = covid_election_day.add_suffix("_total").reset_index()
covid_election_day["CD114FP"][covid_election_day["CD114FP"] == 0] = 1
covid_datasets.append(covid_election_day)

# 2-5. Excluding 1-2-3-4 weeks (respectively) worth of COVID data
for i in range(1, 5):
    excluded_weeks = datetime.timedelta(days=7*i)
    cut = datetime.datetime(2020, 11, 3) - excluded_weeks
    covid_data = covid_cd[covid_cd["Date"] <= cut]
    covid_data = covid_data.groupby(["STATEFP", "CD114FP", "state_id"]).sum()
    covid_data = covid_data.add_suffix("_total").reset_index()
    covid_data["CD114FP"][covid_data["CD114FP"] == 0] = 1
    covid_datasets.append(covid_data)

In [7]:
house_datasets = list()

for dataset in covid_datasets:
    # Merge COVID CD data with house results
    house_results["covid_total"] = None
    house_results["covid_deaths"] = None
    house_results["d_predicted"] = None
    house_results["r_predicted"] = None

    for i, row in dataset.iterrows():
        statefp = row["STATEFP"]
        if statefp == 11: # skip DC
            continue
        cd = row["CD114FP"]
        predicted_winner = housedist_to_party[statefp_to_state[statefp] + str(cd)]
        house_results["d_predicted"][(house_results["STATEFP"] == statefp) & (house_results["CD114FP"] == cd)] = 1 if predicted_winner == "democrat" else 0
        house_results["r_predicted"][(house_results["STATEFP"] == statefp) & (house_results["CD114FP"] == cd)] = 1 if predicted_winner == "republican" else 0
        house_results["covid_total"][(house_results["STATEFP"] == statefp) & (house_results["CD114FP"] == cd)] = row["incremental_cases_total"]
        house_results["covid_deaths"][(house_results["STATEFP"] == statefp) & (house_results["CD114FP"] == cd)] = row["incremental_deaths_total"]
    house_datasets.append(house_results)

## 1. Predictions with COVID-19 data AND past election results

In [15]:
for index, house_dataset in enumerate(house_datasets):
    # Get predictions with covid
    features = pd.DataFrame()
    features["covid_total"] = house_dataset["covid_total"]
    features["covid_deaths"] = house_dataset["covid_deaths"]
    # Getting d_predicted because d_predicted = ~r_predicted
    features["d_predicted"] = house_dataset["d_predicted"]

    # Getting d_won because d_won = ~r_won
    labels = house_dataset["d_won"]

    train_X, test_X, train_y, test_y = train_test_split(features, labels, test_size=0.33)

    log_reg = LogisticRegression(C=reg).fit(train_X, train_y)
    rdm_for = RandomForestClassifier(max_depth=max_depth).fit(train_X, train_y)
    log_reg_score = log_reg.score(test_X, test_y)
    rdm_for_score = rdm_for.score(test_X, test_y)
    log_reg_f1score = f1_score(test_y, log_reg.predict(test_X))
    rdm_for_f1score = f1_score(test_y, rdm_for.predict(test_X))
    log_reg_auc = roc_auc_score(test_y, log_reg.predict_proba(test_X)[:, 1])
    rdm_for_auc = roc_auc_score(test_y, rdm_for.predict_proba(test_X)[:, 1])

    print(f"--------- COVID DATA EXCLUDING {index} WEEK(S) PRIOR TO NOV 3 ---------")
    print("Accuracy LR: {0}, RF: {1}".format(round(log_reg_score, 5), round(rdm_for_score, 5)))
    print("F1 for LR: {0}, RF: {1}".format(round(log_reg_f1score, 4), round(rdm_for_f1score, 4)))
    print("AUC for LR: {0}, RF: {1}\n".format(round(log_reg_auc, 4), round(rdm_for_auc, 4)))

--------- COVID DATA EXCLUDING 0 WEEK(S) PRIOR TO NOV 3 ---------
Accuracy LR: 0.66906, RF: 0.94245
F1 for LR: 0.6933, RF: 0.9467
AUC for LR: 0.7253, RF: 0.9652

--------- COVID DATA EXCLUDING 1 WEEK(S) PRIOR TO NOV 3 ---------
Accuracy LR: 0.95683, RF: 0.96403
F1 for LR: 0.961, RF: 0.9673
AUC for LR: 0.969, RF: 0.9655

--------- COVID DATA EXCLUDING 2 WEEK(S) PRIOR TO NOV 3 ---------
Accuracy LR: 0.64029, RF: 0.95683
F1 for LR: 0.6429, RF: 0.9583
AUC for LR: 0.7698, RF: 0.976

--------- COVID DATA EXCLUDING 3 WEEK(S) PRIOR TO NOV 3 ---------
Accuracy LR: 0.97122, RF: 0.97122
F1 for LR: 0.9701, RF: 0.9701
AUC for LR: 0.9774, RF: 0.9761

--------- COVID DATA EXCLUDING 4 WEEK(S) PRIOR TO NOV 3 ---------
Accuracy LR: 0.96403, RF: 0.96403
F1 for LR: 0.963, RF: 0.963
AUC for LR: 0.9696, RF: 0.9716



## 2. Predictions ONLY with past election results

In [21]:
house_dataset = house_datasets[0]
# Get predictions WITHOUT covid
features = pd.DataFrame()
# Getting d_predicted because d_predicted = ~r_predicted
features["d_predicted"] = house_dataset["d_predicted"]

# Getting d_won because d_won = ~r_won
labels = house_dataset["d_won"]

train_X, test_X, train_y, test_y = train_test_split(features, labels, test_size=0.33)

log_reg = LogisticRegression(C=reg).fit(train_X, train_y)
rdm_for = RandomForestClassifier(max_depth=max_depth).fit(train_X, train_y)
log_reg_score = log_reg.score(test_X, test_y)
rdm_for_score = rdm_for.score(test_X, test_y)
log_reg_f1score = f1_score(test_y, log_reg.predict(test_X))
rdm_for_f1score = f1_score(test_y, rdm_for.predict(test_X))
log_reg_auc = roc_auc_score(test_y, log_reg.predict_proba(test_X)[:, 1])
rdm_for_auc = roc_auc_score(test_y, rdm_for.predict_proba(test_X)[:, 1])

print(f"--------- BASELINE PREDICTIONS ---------")
print("Accuracy LR: {0}, RF: {1}".format(log_reg_score, rdm_for_score))
print("F1 for LR: {0}, RF: {1}".format(log_reg_f1score, rdm_for_f1score))
print("AUC for LR: {0}, RF: {1}\n".format(log_reg_auc, rdm_for_auc))

--------- BASELINE PREDICTIONS ---------
Accuracy LR: 0.9496402877697842, RF: 0.9496402877697842
F1 for LR: 0.9523809523809524, RF: 0.9523809523809524
AUC for LR: 0.9488400994200497, RF: 0.9488400994200497



## 3. Predictions with ONLY COVID-19 data

In [14]:
 for index, house_dataset in enumerate(house_datasets):
    # Get predictions with covid

    features = pd.DataFrame()
    features["covid_total"] = house_dataset["covid_total"]
    features["covid_deaths"] = house_dataset["covid_deaths"]

    # Getting d_won because d_won = ~r_won
    labels = house_dataset["d_won"]

    train_X, test_X, train_y, test_y = train_test_split(features, labels, test_size=0.33)

    log_reg = LogisticRegression(C=reg).fit(train_X, train_y)
    rdm_for = RandomForestClassifier(max_depth=max_depth).fit(train_X, train_y)
    log_reg_score = log_reg.score(test_X, test_y)
    rdm_for_score = rdm_for.score(test_X, test_y)
    log_reg_f1score = f1_score(test_y, log_reg.predict(test_X))
    rdm_for_f1score = f1_score(test_y, rdm_for.predict(test_X))
    log_reg_auc = roc_auc_score(test_y, log_reg.predict_proba(test_X)[:, 1])
    rdm_for_auc = roc_auc_score(test_y, rdm_for.predict_proba(test_X)[:, 1])

    print(f"--------- COVID DATA EXCLUDING {index} WEEK(S) PRIOR TO NOV 3 ---------")
    print("Accuracy LR: {0}, RF: {1}".format(round(log_reg_score, 5), round(rdm_for_score, 5)))
    print("F1 for LR: {0}, RF: {1}".format(round(log_reg_f1score, 4), round(rdm_for_f1score, 4)))
    print("AUC for LR: {0}, RF: {1}\n".format(round(log_reg_auc, 4), round(rdm_for_auc, 4)))

--------- COVID DATA EXCLUDING 0 WEEK(S) PRIOR TO NOV 3 ---------
Accuracy LR: 0.61151, RF: 0.57554
F1 for LR: 0.6029, RF: 0.604
AUC for LR: 0.6583, RF: 0.6941

--------- COVID DATA EXCLUDING 1 WEEK(S) PRIOR TO NOV 3 ---------
Accuracy LR: 0.6259, RF: 0.63309
F1 for LR: 0.6438, RF: 0.6483
AUC for LR: 0.705, RF: 0.7205

--------- COVID DATA EXCLUDING 2 WEEK(S) PRIOR TO NOV 3 ---------
Accuracy LR: 0.64029, RF: 0.66187
F1 for LR: 0.6575, RF: 0.6846
AUC for LR: 0.7204, RF: 0.741

--------- COVID DATA EXCLUDING 3 WEEK(S) PRIOR TO NOV 3 ---------
Accuracy LR: 0.59712, RF: 0.61871
F1 for LR: 0.5625, RF: 0.6131
AUC for LR: 0.7327, RF: 0.7605

--------- COVID DATA EXCLUDING 4 WEEK(S) PRIOR TO NOV 3 ---------
Accuracy LR: 0.6259, RF: 0.58993
F1 for LR: 0.6119, RF: 0.6069
AUC for LR: 0.6768, RF: 0.6713

