In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

## Helper Functions

In [2]:
def encode(df):
    """
    Label-encode every object/string-typed column of ``df`` in place.

    Each such column is replaced by the integer codes of a fresh
    ``LabelEncoder`` fitted on that column alone, so codes are not shared
    between columns or between different DataFrames.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so ``df = encode(df)`` and a bare
        ``encode(df)`` are both valid ways to call it.
    """
    for column in df.columns:
        # The previous check compared the dtype with ``type(object)`` (which is
        # ``type``) and only matched object columns through numpy's dtype
        # coercion rules. Ask pandas directly, and also accept dedicated
        # string dtypes.
        is_text = (pd.api.types.is_object_dtype(df[column])
                   or pd.api.types.is_string_dtype(df[column]))
        if is_text:
            le = LabelEncoder()
            df[column] = le.fit_transform(df[column])
    return df

In [3]:
def oversample(data_f, random_state=None):
    """
    Rebalance ``data_f`` so tight races are as numerous as non-tight races.

    Rows with ``TIGHT_RACE == True`` are bootstrapped (drawn with replacement)
    until there are as many of them as rows with ``TIGHT_RACE == False``.

    Parameters
    ----------
    data_f : pandas.DataFrame
        Must contain a ``TIGHT_RACE`` column.
    random_state : int or None, optional
        Seed for the bootstrap draw. ``None`` (the default) draws from NumPy's
        global random state, which is what the function always did before.

    Returns
    -------
    pandas.DataFrame
        The non-tight rows followed by the bootstrapped tight rows. Each part
        keeps its own 0-based index, so index labels repeat in the result.
        If there are no tight rows, only the non-tight rows are returned.
    """
    # Explicit comparisons (rather than truthiness) keep rows whose flag is
    # neither True nor False out of both groups, as before.
    df_contested = data_f[data_f.TIGHT_RACE == True].reset_index(drop=True)
    df_not_contested = data_f[data_f.TIGHT_RACE == False].reset_index(drop=True)

    # Nothing to bootstrap from: np.random.choice would raise on an empty pool.
    if df_contested.empty:
        return df_not_contested

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    sample = rng.choice(len(df_contested), size=len(df_not_contested), replace=True)
    df_contested_bootstrapped = df_contested.iloc[sample].reset_index(drop=True)
    return pd.concat([df_not_contested, df_contested_bootstrapped])

# Datasets

- **rN**: N% of precincts reporting, for N = 100, 90, 75, 50, 25, 15, 10, 5 (e.g. **r100** is 100% of precincts reporting and **r5** is 5% of precincts reporting)
- **_3**: 3 strata
- **_6**: 6 strata

In [4]:
# 100% of precincts reporting (the *_3 / *_6 suffix is the number of strata)
r100_3 = pd.read_csv("../full_data/final.csv")
r100_6 = pd.read_csv("../full_data/full_final.csv")

# 90% of precincts reporting
r90_3 = pd.read_csv("../partial_data/90/90_reporting.csv")
r90_6 = pd.read_csv("../partial_data/90/90_reporting_6.csv")

# 75% of precincts reporting
r75_3 = pd.read_csv("../partial_data/75/75_reporting.csv")
r75_6 = pd.read_csv("../partial_data/75/75_reporting_6.csv")

# 50% of precincts reporting
r50_3 = pd.read_csv("../partial_data/50/50_reporting.csv")
r50_6 = pd.read_csv("../partial_data/50/50_reporting_6.csv")

# 25% of precincts reporting
r25_3 = pd.read_csv("../partial_data/25/25_reporting.csv")
r25_6 = pd.read_csv("../partial_data/25/25_reporting_6.csv")

# 15% of precincts reporting
r15_3 = pd.read_csv("../partial_data/15/15_reporting.csv")
r15_6 = pd.read_csv("../partial_data/15/15_reporting_6.csv")

# 10% of precincts reporting
r10_3 = pd.read_csv("../partial_data/10/10_reporting.csv")
r10_6 = pd.read_csv("../partial_data/10/10_reporting_6.csv")

# 5% of precincts reporting
r5_3 = pd.read_csv("../partial_data/5/5_reporting.csv")
r5_6 = pd.read_csv("../partial_data/5/5_reporting_6.csv")

Encode all of the dataframes

In [5]:
# Every loaded table: full reporting first, then decreasing reporting levels.
data_frames = [r100_3, r100_6, r90_3, r90_6, r75_3, r75_6, r50_3, r50_6,
               r25_3, r25_6, r15_3, r15_6, r10_3, r10_6, r5_3, r5_6]

# encode() rewrites the text columns of each frame in place. Rebinding the
# loop variable to its return value (the old `d = encode(d)`) had no effect on
# the list or on the frames, so the call is made for its side effect only.
for d in data_frames:
    encode(d)

Initialize LeaveOneOut

In [6]:
# Leave-one-out splitter shared by all model-evaluation functions below:
# every split holds out a single row and trains on the rest.
from sklearn.model_selection import LeaveOneOut
leave_one_out = LeaveOneOut()

# Master list for plotting

In [55]:
# Collects one [scores, tight_from_test, p_scores, p_tight] list per
# (model, reporting percentage) run, in the order the runs are executed.
for_plots = []

# Decision Tree

In [57]:
# Reporting levels (percent of precincts) of the partial tables, highest first.
percents = [90, 75, 50, 25, 15, 10, 5]

# 3-strata full-reporting table restricted to the per-stratum DEM/REP ratios,
# the TIGHT_RACE flag and the WINNER target (the "no correlation" subset).
r100_3_nc = r100_3.loc[:, ['S1_DEM_RATIO','S2_DEM_RATIO', 'S3_DEM_RATIO',
                           'S1_REP_RATIO', 'S2_REP_RATIO', 'S3_REP_RATIO',
                           'TIGHT_RACE', 'WINNER']]

In [58]:
# For training: separate the target (WINNER) from the feature columns.
y_ = r100_3_nc['WINNER']
X_ = r100_3_nc.drop('WINNER', axis=1)

In [59]:
# Partial-reporting 3-strata tables, in the same order as `percents`.
testing_dfs = [r90_3, r75_3, r50_3, r25_3, r15_3, r10_3, r5_3]

# Same column subset as the training table, applied to every partial table.
testing_nc_dfs = [
    partial.loc[:, ['S1_DEM_RATIO','S2_DEM_RATIO', 'S3_DEM_RATIO',
                    'S1_REP_RATIO', 'S2_REP_RATIO', 'S3_REP_RATIO',
                    'TIGHT_RACE', 'WINNER']]
    for partial in testing_dfs
]

In [60]:
def decision_tree(partial_reporting_df, perc):
    """
    Leave-one-out evaluation of a depth-6 decision tree on the 3-strata data.

    For every row of ``r100_3_nc`` the remaining rows are rebalanced with
    ``oversample``, a fresh ``DecisionTreeClassifier(max_depth=6)`` is fitted
    on them, and that model is scored on the held-out row, on all of
    ``partial_reporting_df`` and on its ``TIGHT_RACE`` rows only.

    Parameters
    ----------
    partial_reporting_df : pandas.DataFrame
        Partial-reporting table with the same columns as ``r100_3_nc``;
        ``TIGHT_RACE`` is used as a boolean mask and ``WINNER`` as the target.
    perc : int
        Reporting percentage; only echoed at the end of the printed summary.

    Returns
    -------
    list
        ``[scores, tight_from_test, p_scores, p_tight]``: per-fold accuracy on
        the held-out row, the same restricted to folds whose held-out row is a
        tight race, accuracy on the whole partial table, and accuracy on the
        partial table's tight races.

    Notes
    -----
    Results vary between runs: the oversampling is random and the classifier
    is created without a ``random_state``.
    """
    scores = []
    tight_from_test = []
    p_scores = []
    p_tight = []

    p_X = partial_reporting_df.drop(columns=['WINNER'])
    p_y = partial_reporting_df.WINNER
    p_tight_only = partial_reporting_df[partial_reporting_df.TIGHT_RACE]
    p_tight_X = p_tight_only.drop(columns=['WINNER'])
    p_tight_y = p_tight_only.WINNER

    # Take the held-out rows from the same frame the training folds come from,
    # not from the notebook-level X_/y_, which a later cell rebinds to the
    # 6-strata table.
    full_df = r100_3_nc
    X_all = full_df.drop(columns=['WINNER'])
    y_all = full_df.WINNER

    for train_index, test_index in leave_one_out.split(X_all):
        X_test = X_all.iloc[test_index]
        y_test = y_all.iloc[test_index]

        oversampled_df = oversample(full_df.iloc[train_index])
        X_train = oversampled_df.drop(columns=['WINNER'])
        y_train = oversampled_df.WINNER

        dt = DecisionTreeClassifier(max_depth=6)
        dt.fit(X_train, y_train)

        held_out_score = dt.score(X_test, y_test)
        scores.append(held_out_score)
        # Series.bool() is deprecated; read the single held-out flag directly.
        if bool(X_test.TIGHT_RACE.iloc[0]):
            tight_from_test.append(held_out_score)
        p_scores.append(dt.score(p_X, p_y))
        p_tight.append(dt.score(p_tight_X, p_tight_y))

    print(np.mean(scores), np.mean(tight_from_test), np.mean(p_scores), np.mean(p_tight), perc)
    return [scores, tight_from_test, p_scores, p_tight]

In [61]:
# Evaluate the decision tree at every reporting level; keep the results for plotting.
for i, perc in enumerate(percents):
    for_plots.append(decision_tree(testing_nc_dfs[i], perc))

0.8125 0.5909090909090909 0.8541238584474886 0.7890809692671393 90
0.8229166666666666 0.6363636363636364 0.8454147640791475 0.7751436781609197 75
0.7708333333333334 0.6363636363636364 0.853595890410959 0.7910539215686274 50
0.7604166666666666 0.7272727272727273 0.853950403690888 0.798068576388889 25
0.7708333333333334 0.6363636363636364 0.859965341255332 0.7992424242424242 15
0.7708333333333334 0.6363636363636364 0.8388032724505327 0.7466032608695653 10
0.75 0.6363636363636364 0.8529757993867717 0.79173801369863 5


# Random Forest

In [62]:
# 6-strata full-reporting table restricted to the per-stratum DEM/REP ratios,
# the TIGHT_RACE flag and the WINNER target.
r100_6_nc = r100_6.loc[:, ['S1_DEM_RATIO','S2_DEM_RATIO', 'S3_DEM_RATIO',
                           'S4_DEM_RATIO','S5_DEM_RATIO', 'S6_DEM_RATIO',
                           'S1_REP_RATIO', 'S2_REP_RATIO', 'S3_REP_RATIO',
                           'S4_REP_RATIO', 'S5_REP_RATIO', 'S6_REP_RATIO',
                           'TIGHT_RACE', 'WINNER']]

In [63]:
# Partial-reporting 6-strata tables, in the same order as `percents`.
testing_dfs = [r90_6, r75_6, r50_6, r25_6, r15_6, r10_6, r5_6]

# Same column subset as the 6-strata training table, for every partial table.
testing_nc_dfs = [
    partial.loc[:, ['S1_DEM_RATIO','S2_DEM_RATIO', 'S3_DEM_RATIO',
                    'S4_DEM_RATIO','S5_DEM_RATIO', 'S6_DEM_RATIO',
                    'S1_REP_RATIO', 'S2_REP_RATIO', 'S3_REP_RATIO',
                    'S4_REP_RATIO', 'S5_REP_RATIO', 'S6_REP_RATIO',
                    'TIGHT_RACE', 'WINNER']]
    for partial in testing_dfs
]

In [64]:
# For training: separate the target (WINNER) from the feature columns.
y_ = r100_6_nc['WINNER']
X_ = r100_6_nc.drop('WINNER', axis=1)

In [65]:
def random_forest(partial_reporting_df, perc):
    """
    Leave-one-out evaluation of a random forest on the 6-strata data.

    For every row of ``r100_6_nc`` the remaining rows are rebalanced with
    ``oversample``, a fresh ``RandomForestClassifier(max_depth=5,
    n_estimators=100)`` is fitted on them, and that model is scored on the
    held-out row, on all of ``partial_reporting_df`` and on its ``TIGHT_RACE``
    rows only.

    Parameters
    ----------
    partial_reporting_df : pandas.DataFrame
        Partial-reporting table with the same columns as ``r100_6_nc``;
        ``TIGHT_RACE`` is used as a boolean mask and ``WINNER`` as the target.
    perc : int
        Reporting percentage; only echoed at the end of the printed summary.

    Returns
    -------
    list
        ``[scores, tight_from_test, p_scores, p_tight]``: per-fold accuracy on
        the held-out row, the same restricted to folds whose held-out row is a
        tight race, accuracy on the whole partial table, and accuracy on the
        partial table's tight races.

    Notes
    -----
    Results vary between runs: the oversampling is random and the classifier
    is created without a ``random_state``.
    """
    scores = []
    tight_from_test = []
    p_scores = []
    p_tight = []

    p_X = partial_reporting_df.drop(columns=['WINNER'])
    p_y = partial_reporting_df.WINNER
    p_tight_only = partial_reporting_df[partial_reporting_df.TIGHT_RACE]
    p_tight_X = p_tight_only.drop(columns=['WINNER'])
    p_tight_y = p_tight_only.WINNER

    # Take the held-out rows from the same frame the training folds come from
    # rather than from the notebook-level X_/y_, so the function does not
    # depend on which cell last rebound those globals.
    full_df = r100_6_nc
    X_all = full_df.drop(columns=['WINNER'])
    y_all = full_df.WINNER

    for train_index, test_index in leave_one_out.split(X_all):
        X_test = X_all.iloc[test_index]
        y_test = y_all.iloc[test_index]

        oversampled_df = oversample(full_df.iloc[train_index])
        X_train = oversampled_df.drop(columns=['WINNER'])
        y_train = oversampled_df.WINNER

        rf = RandomForestClassifier(max_depth=5, n_estimators=100)
        rf.fit(X_train, y_train)

        held_out_score = rf.score(X_test, y_test)
        scores.append(held_out_score)
        # Series.bool() is deprecated; read the single held-out flag directly.
        if bool(X_test.TIGHT_RACE.iloc[0]):
            tight_from_test.append(held_out_score)
        p_scores.append(rf.score(p_X, p_y))
        p_tight.append(rf.score(p_tight_X, p_tight_y))

    print(np.mean(scores), np.mean(tight_from_test), np.mean(p_scores), np.mean(p_tight), perc)
    return [scores, tight_from_test, p_scores, p_tight]

In [66]:
# Evaluate the random forest at every reporting level; keep the results for plotting.
for i, perc in enumerate(percents):
    for_plots.append(random_forest(testing_nc_dfs[i], perc))

0.7916666666666666 0.7727272727272727 0.9122574200913242 0.8503989361702128 90
0.8333333333333334 0.8181818181818182 0.9046803652968037 0.8329741379310344 75
0.8229166666666666 0.7727272727272727 0.9322203196347031 0.8811274509803924 50
0.8229166666666666 0.8181818181818182 0.9028309853353106 0.840576171875 25
0.84375 0.7272727272727273 0.9099824801950032 0.8547647527910686 15
0.8541666666666666 0.8181818181818182 0.90515601217656 0.8276721014492754 10
0.8229166666666666 0.7727272727272727 0.9042925974594832 0.8462828196347032 5


# Gradient Boosted Trees

In [67]:
def boost_tree(partial_reporting_df, perc):
    """
    Leave-one-out evaluation of gradient-boosted trees on the 6-strata data.

    For every row of ``r100_6_nc`` the remaining rows are rebalanced with
    ``oversample``, a fresh ``GradientBoostingClassifier(n_estimators=100,
    max_depth=3)`` is fitted on them, and that model is scored on the held-out
    row, on all of ``partial_reporting_df`` and on its ``TIGHT_RACE`` rows
    only.

    Parameters
    ----------
    partial_reporting_df : pandas.DataFrame
        Partial-reporting table with the same columns as ``r100_6_nc``;
        ``TIGHT_RACE`` is used as a boolean mask and ``WINNER`` as the target.
    perc : int
        Reporting percentage; only echoed at the end of the printed summary.

    Returns
    -------
    list
        ``[scores, tight_from_test, p_scores, p_tight]``: per-fold accuracy on
        the held-out row, the same restricted to folds whose held-out row is a
        tight race, accuracy on the whole partial table, and accuracy on the
        partial table's tight races.

    Notes
    -----
    Results vary between runs: the oversampling is random and the classifier
    is created without a ``random_state``.
    """
    scores = []
    tight_from_test = []
    p_scores = []
    p_tight = []

    p_X = partial_reporting_df.drop(columns=['WINNER'])
    p_y = partial_reporting_df.WINNER
    p_tight_only = partial_reporting_df[partial_reporting_df.TIGHT_RACE]
    p_tight_X = p_tight_only.drop(columns=['WINNER'])
    p_tight_y = p_tight_only.WINNER

    # Take the held-out rows from the same frame the training folds come from
    # rather than from the notebook-level X_/y_, so the function does not
    # depend on which cell last rebound those globals.
    full_df = r100_6_nc
    X_all = full_df.drop(columns=['WINNER'])
    y_all = full_df.WINNER

    for train_index, test_index in leave_one_out.split(X_all):
        X_test = X_all.iloc[test_index]
        y_test = y_all.iloc[test_index]

        oversampled_df = oversample(full_df.iloc[train_index])
        X_train = oversampled_df.drop(columns=['WINNER'])
        y_train = oversampled_df.WINNER

        gbt = GradientBoostingClassifier(n_estimators=100, max_depth=3)
        gbt.fit(X_train, y_train)

        held_out_score = gbt.score(X_test, y_test)
        scores.append(held_out_score)
        # Series.bool() is deprecated; read the single held-out flag directly.
        if bool(X_test.TIGHT_RACE.iloc[0]):
            tight_from_test.append(held_out_score)
        p_scores.append(gbt.score(p_X, p_y))
        p_tight.append(gbt.score(p_tight_X, p_tight_y))

    print(np.mean(scores), np.mean(tight_from_test), np.mean(p_scores), np.mean(p_tight), perc)
    return [scores, tight_from_test, p_scores, p_tight]

In [68]:
# Evaluate the boosted trees at every reporting level; keep the results for plotting.
for i, perc in enumerate(percents):
    for_plots.append(boost_tree(testing_nc_dfs[i], perc))

0.8645833333333334 0.8636363636363636 0.9138984018264841 0.8561613475177307 90
0.84375 0.9090909090909091 0.9068207762557078 0.8454262452107278 75
0.8854166666666666 0.9545454545454546 0.9443493150684933 0.9362745098039217 50
0.8854166666666666 0.9090909090909091 0.9036651425275993 0.8437771267361113 25
0.84375 0.9090909090909091 0.9114583333333334 0.8584529505582138 15
0.8541666666666666 0.9545454545454546 0.909484398782344 0.8525815217391305 10
0.8541666666666666 0.9090909090909091 0.9076325010950503 0.8497431506849314 5


In [70]:
# Sanity check: 3 models x 7 reporting levels should give 21 entries.
len(for_plots)

21