In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

## Helper Functions

In [2]:
def encode(df):
    """
    Label-encode every object-dtype column of ``df`` in place.

    Each object column is replaced by the integer codes produced by a fresh
    ``LabelEncoder`` fitted on that column alone; all other columns are left
    untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so that ``d = encode(d)`` keeps a usable
        reference instead of rebinding ``d`` to ``None``.
    """
    for column in df.columns:
        # Compare against `object` itself: `type(object)` is the metaclass
        # `type`, which is not what this check is meant to express.
        if df[column].dtype == object:
            le = LabelEncoder()
            df[column] = le.fit_transform(df[column])
    return df

In [3]:
def oversample(data_f, random_state=None):
    """
    Balance tight and non-tight races by bootstrapping the tight ones.

    Rows with ``TIGHT_RACE == True`` are resampled with replacement until
    there are exactly as many of them as rows with ``TIGHT_RACE == False``.
    The non-tight rows are kept unchanged and come first in the result.

    Parameters
    ----------
    data_f : pandas.DataFrame
        Frame with a ``TIGHT_RACE`` column.
    random_state : int or None, optional
        Seed for the bootstrap draw. ``None`` (the default) draws from the
        global NumPy random state, exactly as before.

    Returns
    -------
    pandas.DataFrame
        Non-tight rows followed by the bootstrapped tight rows. The index is
        not renumbered after concatenation, so labels may repeat. If
        ``data_f`` has no tight rows, only the non-tight rows are returned.
    """
    df_contested = data_f[data_f.TIGHT_RACE == True].reset_index(drop=True)
    df_not_contested = data_f[data_f.TIGHT_RACE == False].reset_index(drop=True)

    # With no tight rows there is nothing to bootstrap from; sampling from an
    # empty population would raise instead of returning the usable rows.
    if df_contested.empty:
        return df_not_contested

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    sample = rng.choice(df_contested.shape[0], size=df_not_contested.shape[0], replace=True)
    df_contested_bootstrapped = df_contested.iloc[sample].reset_index(drop=True)

    return pd.concat([df_not_contested, df_contested_bootstrapped])

## Datasets

- **r100** : 100% of precincts reporting
- **r5**: 5% of precincts reporting
- **_3**: 3 strata
- **_6**: 6 strata

In [4]:
# Every pair below is (3-strata file, 6-strata file) for one reporting level,
# read in that order.

# 100 % reporting
r100_3, r100_6 = map(pd.read_csv, ("../full_data/final.csv",
                                   "../full_data/full_final.csv"))

# 90 % reporting
r90_3, r90_6 = map(pd.read_csv, ("../partial_data/90/90_reporting.csv",
                                 "../partial_data/90/90_reporting_6.csv"))

# 75 % reporting
r75_3, r75_6 = map(pd.read_csv, ("../partial_data/75/75_reporting.csv",
                                 "../partial_data/75/75_reporting_6.csv"))

# 50 % reporting
r50_3, r50_6 = map(pd.read_csv, ("../partial_data/50/50_reporting.csv",
                                 "../partial_data/50/50_reporting_6.csv"))

# 25 % reporting
r25_3, r25_6 = map(pd.read_csv, ("../partial_data/25/25_reporting.csv",
                                 "../partial_data/25/25_reporting_6.csv"))

# 15 % reporting
r15_3, r15_6 = map(pd.read_csv, ("../partial_data/15/15_reporting.csv",
                                 "../partial_data/15/15_reporting_6.csv"))

# 10 % reporting
r10_3, r10_6 = map(pd.read_csv, ("../partial_data/10/10_reporting.csv",
                                 "../partial_data/10/10_reporting_6.csv"))

# 5 % reporting
r5_3, r5_6 = map(pd.read_csv, ("../partial_data/5/5_reporting.csv",
                               "../partial_data/5/5_reporting_6.csv"))

Encode all of the data frames

In [5]:
# Every frame loaded above, full and partial reporting, 3 and 6 strata.
data_frames = [r100_3, r100_6, r90_3, r90_6, r75_3, r75_6, r50_3, r50_6, 
               r25_3, r25_6, r15_3, r15_6, r10_3, r10_6, r5_3, r5_6]

# encode() rewrites the object columns of the frame it is given, so the call
# alone is enough. Assigning its result to the loop variable would neither
# update the list nor the frames.
for d in data_frames:
    encode(d)

Initialize leave-one-out cross-validation: each split holds out a single row for testing

In [6]:
# NOTE(review): this import sits mid-notebook; the other imports live in the first cell.
from sklearn.model_selection import LeaveOneOut
# Single splitter instance reused by every `leave_one_out.split(...)` loop below.
leave_one_out = LeaveOneOut()

# Random Forest

### Six Strata

Random Forest with correlations and 6 strata

In [8]:
# Target is WINNER; every other column of r100_6 is a feature.
# NOTE(review): TIGHT_RACE therefore stays in X_ as a feature - confirm intended.
y_ = r100_6['WINNER']
X_ = r100_6.drop('WINNER', axis=1)

# TIGHT_RACE separated out as well; the loop below does not read these two.
y_over_s = r100_6['TIGHT_RACE']
X_over_s = r100_6.drop('TIGHT_RACE', axis=1)

tight_from_test = []
scores = []
for train_index, test_index in leave_one_out.split(X_):
    # Oversample tight races within the training rows only.
    oversampled_df = oversample(r100_6.iloc[train_index])
    y_train = oversampled_df['WINNER']
    X_train = oversampled_df.drop('WINNER', axis=1)

    rf = RandomForestClassifier(n_estimators=100, max_depth=7)
    rf.fit(X_train, y_train)

    # Score the fitted forest on the held-out rows of this split.
    X_test, y_test = X_.iloc[test_index], y_.iloc[test_index]
    scores.append(rf.score(X_test, y_test))

# Mean of the per-split scores.
print(np.mean(scores))

0.7395833333333334


With 6 strata but no correlations

In [9]:
# Columns kept for the "no correlations" variant of the 6-strata data:
# the S*_DEM_RATIO / S*_REP_RATIO columns plus TIGHT_RACE and WINNER.
columns_6_nc = [
    'S1_DEM_RATIO', 'S2_DEM_RATIO', 'S3_DEM_RATIO',
    'S4_DEM_RATIO', 'S5_DEM_RATIO', 'S6_DEM_RATIO',
    'S1_REP_RATIO', 'S2_REP_RATIO', 'S3_REP_RATIO',
    'S4_REP_RATIO', 'S5_REP_RATIO', 'S6_REP_RATIO',
    'TIGHT_RACE', 'WINNER',
]
r100_6_nc = r100_6[columns_6_nc]

In [10]:
# Target is WINNER; every other column of r100_6_nc is a feature.
y_ = r100_6_nc['WINNER']
X_ = r100_6_nc.drop('WINNER', axis=1)

# TIGHT_RACE separated out as well; the loop below does not read these two.
y_over_s = r100_6_nc['TIGHT_RACE']
X_over_s = r100_6_nc.drop('TIGHT_RACE', axis=1)

tight_from_test = []
scores = []
for train_index, test_index in leave_one_out.split(X_):
    # Oversample tight races within the training rows only.
    oversampled_df = oversample(r100_6_nc.iloc[train_index])
    y_train = oversampled_df['WINNER']
    X_train = oversampled_df.drop('WINNER', axis=1)

    rf = RandomForestClassifier(n_estimators=100, max_depth=7)
    rf.fit(X_train, y_train)

    # Score the fitted forest on the held-out rows of this split.
    X_test, y_test = X_.iloc[test_index], y_.iloc[test_index]
    scores.append(rf.score(X_test, y_test))

# Mean of the per-split scores.
print(np.mean(scores))

0.84375


3 strata with correlations

In [11]:
# Target is WINNER; every other column of r100_3 is a feature.
y_ = r100_3['WINNER']
X_ = r100_3.drop('WINNER', axis=1)

# TIGHT_RACE separated out as well; the loop below does not read these two.
y_over_s = r100_3['TIGHT_RACE']
X_over_s = r100_3.drop('TIGHT_RACE', axis=1)

tight_from_test = []
scores = []
for train_index, test_index in leave_one_out.split(X_):
    # Oversample tight races within the training rows only.
    oversampled_df = oversample(r100_3.iloc[train_index])
    y_train = oversampled_df['WINNER']
    X_train = oversampled_df.drop('WINNER', axis=1)

    rf = RandomForestClassifier(n_estimators=100, max_depth=7)
    rf.fit(X_train, y_train)

    # Score the fitted forest on the held-out rows of this split.
    X_test, y_test = X_.iloc[test_index], y_.iloc[test_index]
    scores.append(rf.score(X_test, y_test))

# Mean of the per-split scores.
print(np.mean(scores))

0.71875


3 strata no correlations

In [12]:
# Columns kept for the "no correlations" variant of the 3-strata data:
# the S*_DEM_RATIO / S*_REP_RATIO columns plus TIGHT_RACE and WINNER.
columns_3_nc = [
    'S1_DEM_RATIO', 'S2_DEM_RATIO', 'S3_DEM_RATIO',
    'S1_REP_RATIO', 'S2_REP_RATIO', 'S3_REP_RATIO',
    'TIGHT_RACE', 'WINNER',
]
r100_3_nc = r100_3[columns_3_nc]

In [13]:
# Target is WINNER; every other column of r100_3_nc is a feature.
y_ = r100_3_nc['WINNER']
X_ = r100_3_nc.drop('WINNER', axis=1)

# TIGHT_RACE separated out as well; the loop below does not read these two.
y_over_s = r100_3_nc['TIGHT_RACE']
X_over_s = r100_3_nc.drop('TIGHT_RACE', axis=1)

tight_from_test = []
scores = []
for train_index, test_index in leave_one_out.split(X_):
    # Oversample tight races within the training rows only.
    oversampled_df = oversample(r100_3_nc.iloc[train_index])
    y_train = oversampled_df['WINNER']
    X_train = oversampled_df.drop('WINNER', axis=1)

    rf = RandomForestClassifier(n_estimators=100, max_depth=7)
    rf.fit(X_train, y_train)

    # Score the fitted forest on the held-out rows of this split.
    X_test, y_test = X_.iloc[test_index], y_.iloc[test_index]
    scores.append(rf.score(X_test, y_test))

# Mean of the per-split scores.
print(np.mean(scores))

0.8333333333333334


# Decision Tree

In [14]:
# Target is WINNER; every other column of r100_6_nc is a feature.
y_ = r100_6_nc['WINNER']
X_ = r100_6_nc.drop('WINNER', axis=1)

# TIGHT_RACE separated out as well; the loop below does not read these two.
y_over_s = r100_6_nc['TIGHT_RACE']
X_over_s = r100_6_nc.drop('TIGHT_RACE', axis=1)

tight_from_test = []
scores = []
for train_index, test_index in leave_one_out.split(X_):
    # Oversample tight races within the training rows only.
    oversampled_df = oversample(r100_6_nc.iloc[train_index])
    y_train = oversampled_df['WINNER']
    X_train = oversampled_df.drop('WINNER', axis=1)

    dt = DecisionTreeClassifier(max_depth=7)
    dt.fit(X_train, y_train)

    # Score the fitted tree on the held-out rows of this split.
    X_test, y_test = X_.iloc[test_index], y_.iloc[test_index]
    scores.append(dt.score(X_test, y_test))

# Mean of the per-split scores.
print(np.mean(scores))

0.78125


In [17]:
# Target is WINNER; every other column of r100_3_nc is a feature.
y_ = r100_3_nc['WINNER']
X_ = r100_3_nc.drop('WINNER', axis=1)

# TIGHT_RACE separated out as well; the loop below does not read these two.
y_over_s = r100_3_nc['TIGHT_RACE']
X_over_s = r100_3_nc.drop('TIGHT_RACE', axis=1)

tight_from_test = []
scores = []
for train_index, test_index in leave_one_out.split(X_):
    # Oversample tight races within the training rows only.
    oversampled_df = oversample(r100_3_nc.iloc[train_index])
    y_train = oversampled_df['WINNER']
    X_train = oversampled_df.drop('WINNER', axis=1)

    dt = DecisionTreeClassifier(max_depth=7)
    dt.fit(X_train, y_train)

    # Score the fitted tree on the held-out rows of this split.
    X_test, y_test = X_.iloc[test_index], y_.iloc[test_index]
    scores.append(dt.score(X_test, y_test))

# Mean of the per-split scores.
print(np.mean(scores))

0.8541666666666666


# Boosted Trees

In [18]:
# Target is WINNER; every other column of r100_6_nc is a feature.
y_ = r100_6_nc['WINNER']
X_ = r100_6_nc.drop('WINNER', axis=1)

# TIGHT_RACE separated out as well; the loop below does not read these two.
y_over_s = r100_6_nc['TIGHT_RACE']
X_over_s = r100_6_nc.drop('TIGHT_RACE', axis=1)

tight_from_test = []
scores = []
for train_index, test_index in leave_one_out.split(X_):
    # Oversample tight races within the training rows only.
    oversampled_df = oversample(r100_6_nc.iloc[train_index])
    y_train = oversampled_df['WINNER']
    X_train = oversampled_df.drop('WINNER', axis=1)

    gbc = GradientBoostingClassifier(max_depth=3, n_estimators=100)
    gbc.fit(X_train, y_train)

    # Score the fitted model on the held-out rows of this split.
    X_test, y_test = X_.iloc[test_index], y_.iloc[test_index]
    scores.append(gbc.score(X_test, y_test))

# Mean of the per-split scores.
print(np.mean(scores))

0.875


In [19]:
# Target is WINNER; every other column of r100_3_nc is a feature.
y_ = r100_3_nc['WINNER']
X_ = r100_3_nc.drop('WINNER', axis=1)

# TIGHT_RACE separated out as well; the loop below does not read these two.
y_over_s = r100_3_nc['TIGHT_RACE']
X_over_s = r100_3_nc.drop('TIGHT_RACE', axis=1)

tight_from_test = []
scores = []
for train_index, test_index in leave_one_out.split(X_):
    # Oversample tight races within the training rows only.
    oversampled_df = oversample(r100_3_nc.iloc[train_index])
    y_train = oversampled_df['WINNER']
    X_train = oversampled_df.drop('WINNER', axis=1)

    gbc = GradientBoostingClassifier(max_depth=6, n_estimators=100)
    gbc.fit(X_train, y_train)

    # Score the fitted model on the held-out rows of this split.
    X_test, y_test = X_.iloc[test_index], y_.iloc[test_index]
    scores.append(gbc.score(X_test, y_test))

# Mean of the per-split scores.
print(np.mean(scores))

0.8229166666666666


# Test With Fewer Precincts Reporting

Copying the winning model from above

In [20]:
# 6-strata partial-reporting frames, from 90 % down to 5 % reporting.
testing_dfs = [r90_6, r75_6, r50_6, r25_6, r15_6, r10_6, r5_6]

# Same column subset as the 6-strata "no correlations" training frame.
nc_columns = [
    'S1_DEM_RATIO', 'S2_DEM_RATIO', 'S3_DEM_RATIO',
    'S4_DEM_RATIO', 'S5_DEM_RATIO', 'S6_DEM_RATIO',
    'S1_REP_RATIO', 'S2_REP_RATIO', 'S3_REP_RATIO',
    'S4_REP_RATIO', 'S5_REP_RATIO', 'S6_REP_RATIO',
    'TIGHT_RACE', 'WINNER',
]
testing_nc_dfs = [df[nc_columns] for df in testing_dfs]

In [21]:
# Percent of precincts reporting for each frame in testing_dfs, in the same order.
percents = [90, 75, 50, 25, 15, 10, 5]

In [22]:
# Target is WINNER; every other column of r100_6_nc is a feature.
# These notebook-level names are read by the evaluation functions defined below.
y_ = r100_6_nc['WINNER']
X_ = r100_6_nc.drop('WINNER', axis=1)

# TIGHT_RACE separated out as well.
y_over_s = r100_6_nc['TIGHT_RACE']
X_over_s = r100_6_nc.drop('TIGHT_RACE', axis=1)

### Random Forest

In [25]:
def random_forest(partial_reporting_df, perc):
    """
    Leave-one-out evaluation of a random forest trained on ``r100_6_nc``,
    additionally scored against one partial-reporting frame.

    For every leave-one-out split of ``r100_6_nc`` the training rows are
    balanced with ``oversample`` and a ``RandomForestClassifier`` is fitted.
    Each fitted model is scored on the held-out rows, on all of
    ``partial_reporting_df``, and on its ``TIGHT_RACE`` rows only.

    Parameters
    ----------
    partial_reporting_df : pandas.DataFrame
        Frame with the same columns as ``r100_6_nc`` (including ``WINNER``
        and ``TIGHT_RACE``).
    perc : int
        Label printed with the scores; it does not affect the computation.

    Returns
    -------
    tuple
        ``(mean held-out score, perc, mean score on the partial frame,
        mean score on its tight races)`` - the same four values that are
        printed.
    """
    # Build features/target from r100_6_nc here instead of reading the
    # notebook-level X_ / y_, which a later cell rebinds to the 3-strata data.
    full_X = r100_6_nc.drop(columns=['WINNER'])
    full_y = r100_6_nc.WINNER

    p_X = partial_reporting_df.drop(columns=['WINNER'])
    p_y = partial_reporting_df.WINNER
    # Explicit comparison, as in oversample(), so the mask is boolean.
    p_tight_only = partial_reporting_df[partial_reporting_df.TIGHT_RACE == True]
    p_tight_X = p_tight_only.drop(columns=['WINNER'])
    p_tight_y = p_tight_only.WINNER

    scores, p_scores, p_tight = [], [], []
    for train_index, test_index in leave_one_out.split(full_X):
        X_test = full_X.iloc[test_index]
        y_test = full_y.iloc[test_index]

        # Oversample tight races within the training rows only.
        oversampled_df = oversample(r100_6_nc.iloc[train_index])
        X_train = oversampled_df.drop(columns=['WINNER'])
        y_train = oversampled_df.WINNER

        rf = RandomForestClassifier(max_depth=5, n_estimators=100)
        rf.fit(X_train, y_train)

        scores.append(rf.score(X_test, y_test))
        p_scores.append(rf.score(p_X, p_y))
        p_tight.append(rf.score(p_tight_X, p_tight_y))

    summary = (np.mean(scores), perc, np.mean(p_scores), np.mean(p_tight))
    print(*summary)
    return summary

In [26]:
# One evaluation per partial-reporting frame, paired with its percent label by position.
for i, perc in enumerate(percents):
    random_forest(testing_nc_dfs[i], perc)

0.8125 90 0.9121432648401827 0.8471483451536644
0.8020833333333334 75 0.9049657534246576 0.8292624521072797
0.84375 50 0.9340753424657534 0.8786764705882355
0.8229166666666666 25 0.9028206870983689 0.8402235243055555
0.8125 15 0.9103538238878732 0.8562599681020734
0.8645833333333334 10 0.904632800608828 0.8272192028985507
0.8645833333333334 5 0.9049564717477003 0.843357591324201


### GBT

In [27]:
def boosting_tree(partial_reporting_df, perc):
    """
    Leave-one-out evaluation of gradient-boosted trees trained on
    ``r100_6_nc``, additionally scored against one partial-reporting frame.

    For every leave-one-out split of ``r100_6_nc`` the training rows are
    balanced with ``oversample`` and a ``GradientBoostingClassifier`` is
    fitted. Each fitted model is scored on the held-out rows, on all of
    ``partial_reporting_df``, and on its ``TIGHT_RACE`` rows only.

    Parameters
    ----------
    partial_reporting_df : pandas.DataFrame
        Frame with the same columns as ``r100_6_nc`` (including ``WINNER``
        and ``TIGHT_RACE``).
    perc : int
        Label printed with the scores; it does not affect the computation.

    Returns
    -------
    tuple
        ``(mean held-out score, perc, mean score on the partial frame,
        mean score on its tight races)`` - the same four values that are
        printed.
    """
    # Build features/target from r100_6_nc here instead of reading the
    # notebook-level X_ / y_, which a later cell rebinds to the 3-strata data.
    full_X = r100_6_nc.drop(columns=['WINNER'])
    full_y = r100_6_nc.WINNER

    p_X = partial_reporting_df.drop(columns=['WINNER'])
    p_y = partial_reporting_df.WINNER
    # Explicit comparison, as in oversample(), so the mask is boolean.
    p_tight_only = partial_reporting_df[partial_reporting_df.TIGHT_RACE == True]
    p_tight_X = p_tight_only.drop(columns=['WINNER'])
    p_tight_y = p_tight_only.WINNER

    scores, p_scores, p_tight = [], [], []
    for train_index, test_index in leave_one_out.split(full_X):
        X_test = full_X.iloc[test_index]
        y_test = full_y.iloc[test_index]

        # Oversample tight races within the training rows only.
        oversampled_df = oversample(r100_6_nc.iloc[train_index])
        X_train = oversampled_df.drop(columns=['WINNER'])
        y_train = oversampled_df.WINNER

        gbt = GradientBoostingClassifier(n_estimators=100, max_depth=3)
        gbt.fit(X_train, y_train)

        scores.append(gbt.score(X_test, y_test))
        p_scores.append(gbt.score(p_X, p_y))
        p_tight.append(gbt.score(p_tight_X, p_tight_y))

    summary = (np.mean(scores), perc, np.mean(p_scores), np.mean(p_tight))
    print(*summary)
    return summary

In [28]:
# One evaluation per partial-reporting frame, paired with its percent label by position.
for i, perc in enumerate(percents):
    boosting_tree(testing_nc_dfs[i], perc)

0.875 90 0.9134988584474885 0.8569001182033098
0.8541666666666666 75 0.9049419710806696 0.8465038314176244
0.8645833333333334 50 0.9433504566210044 0.9368872549019609
0.8958333333333334 25 0.9022542840665677 0.844482421875
0.875 15 0.9141339122486288 0.8622408293460925
0.8854166666666666 10 0.9068683409436833 0.8528079710144927
0.8645833333333334 5 0.9067496167323696 0.850492294520548


### Decision Tree

In [29]:
# Switch the notebook-level features/target to the 3-strata frame:
# WINNER is the target, every other column of r100_3_nc is a feature.
y_ = r100_3_nc['WINNER']
X_ = r100_3_nc.drop('WINNER', axis=1)

In [30]:
def decision_tree(partial_reporting_df, perc):
    """
    Leave-one-out evaluation of a decision tree trained on ``r100_3_nc``,
    additionally scored against one partial-reporting frame.

    For every leave-one-out split of ``r100_3_nc`` the training rows are
    balanced with ``oversample`` and a ``DecisionTreeClassifier`` is fitted.
    Each fitted tree is scored on the held-out rows, on all of
    ``partial_reporting_df``, and on its ``TIGHT_RACE`` rows only.

    Parameters
    ----------
    partial_reporting_df : pandas.DataFrame
        Frame with the same columns as ``r100_3_nc`` (including ``WINNER``
        and ``TIGHT_RACE``).
    perc : int
        Label printed with the scores; it does not affect the computation.

    Returns
    -------
    tuple
        ``(mean held-out score, perc, mean score on the partial frame,
        mean score on its tight races)`` - the same four values that are
        printed.
    """
    # Build features/target from r100_3_nc here instead of reading the
    # notebook-level X_ / y_, which other cells rebind to different frames.
    full_X = r100_3_nc.drop(columns=['WINNER'])
    full_y = r100_3_nc.WINNER

    p_X = partial_reporting_df.drop(columns=['WINNER'])
    p_y = partial_reporting_df.WINNER
    # Explicit comparison, as in oversample(), so the mask is boolean.
    p_tight_only = partial_reporting_df[partial_reporting_df.TIGHT_RACE == True]
    p_tight_X = p_tight_only.drop(columns=['WINNER'])
    p_tight_y = p_tight_only.WINNER

    scores, p_scores, p_tight = [], [], []
    for train_index, test_index in leave_one_out.split(full_X):
        X_test = full_X.iloc[test_index]
        y_test = full_y.iloc[test_index]

        # Oversample tight races within the training rows only.
        oversampled_df = oversample(r100_3_nc.iloc[train_index])
        X_train = oversampled_df.drop(columns=['WINNER'])
        y_train = oversampled_df.WINNER

        dt = DecisionTreeClassifier(max_depth=7)
        dt.fit(X_train, y_train)

        scores.append(dt.score(X_test, y_test))
        p_scores.append(dt.score(p_X, p_y))
        p_tight.append(dt.score(p_tight_X, p_tight_y))

    summary = (np.mean(scores), perc, np.mean(p_scores), np.mean(p_tight))
    print(*summary)
    return summary

In [34]:
# 3-strata partial-reporting frames, from 90 % down to 5 % reporting.
# This rebinds testing_dfs / testing_nc_dfs, which held the 6-strata frames before.
testing_dfs = [r90_3, r75_3, r50_3, r25_3, r15_3, r10_3, r5_3]

# Same column subset as the 3-strata "no correlations" training frame.
nc_columns = [
    'S1_DEM_RATIO', 'S2_DEM_RATIO', 'S3_DEM_RATIO',
    'S1_REP_RATIO', 'S2_REP_RATIO', 'S3_REP_RATIO',
    'TIGHT_RACE', 'WINNER',
]
testing_nc_dfs = [df[nc_columns] for df in testing_dfs]

In [36]:
# One evaluation per partial-reporting frame, paired with its percent label by position.
for i, perc in enumerate(percents):
    decision_tree(testing_nc_dfs[i], perc)

0.8333333333333334 90 0.8573059360730593 0.7934397163120567
0.8020833333333334 75 0.8493864155251142 0.7672413793103449
0.8229166666666666 50 0.8550228310502282 0.7867647058823529
0.7604166666666666 25 0.8543262893392652 0.7885199652777778
0.84375 15 0.8596035191956125 0.8032795055821372
0.8645833333333334 10 0.8369006849315069 0.7391304347826088
0.78125 5 0.8541735107314937 0.7908818493150686
