In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

## Helper Functions

In [2]:
def encode(df):
    """
    Uses LabelEncoder to encode discrete (object-dtype) columns in place.

    Every column whose dtype is ``object`` is replaced by the integer codes
    of a fresh ``LabelEncoder`` fitted on that column. Columns of any other
    dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, so ``d = encode(d)``
        keeps a usable reference instead of ``None``.
    """
    for column in df.columns:
        # Compare against ``object`` directly. ``type(object)`` is the
        # metaclass ``type``, which only matched object columns indirectly.
        if df[column].dtype == object:
            df[column] = LabelEncoder().fit_transform(df[column])
    return df

In [3]:
def oversample(data_f, random_state=None):
    """
    Oversamples the highly contested races so both groups have equal size.

    Rows with ``TIGHT_RACE == True`` are drawn with replacement until there
    are as many of them as there are rows with ``TIGHT_RACE == False``.

    Parameters
    ----------
    data_f : pandas.DataFrame
        Frame containing a ``TIGHT_RACE`` column.
    random_state : int, optional
        Seed for the bootstrap draw. When omitted, NumPy's global random
        state is used, which is what this function always did.

    Returns
    -------
    pandas.DataFrame
        The non-contested rows followed by the bootstrapped contested rows,
        with a fresh 0..n-1 index. When there are no contested rows there
        is nothing to sample from, so only the non-contested rows are
        returned.
    """
    df_contested = data_f[data_f.TIGHT_RACE == True].reset_index(drop=True)
    df_not_contested = data_f[data_f.TIGHT_RACE == False].reset_index(drop=True)

    # Sampling from an empty population raises in NumPy; return early instead.
    if df_contested.shape[0] == 0:
        return df_not_contested

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    sample = rng.choice(df_contested.shape[0], size=df_not_contested.shape[0], replace=True)
    df_contested_bootstrapped = df_contested.iloc[sample]

    # ignore_index avoids duplicate row labels in the combined frame.
    return pd.concat([df_not_contested, df_contested_bootstrapped], ignore_index=True)

## Datasets

- **r100**: 100% of precincts reporting
- **r90, r75, r50, r25, r15, r10**: 90%, 75%, 50%, 25%, 15% and 10% of precincts reporting
- **r5**: 5% of precincts reporting
- **_3**: 3 strata
- **_6**: 6 strata

In [4]:
# Each pair below is read in the order (3-strata file, 6-strata file).

# 100 % reporting
r100_3, r100_6 = map(pd.read_csv, ("../full_data/final.csv",
                                   "../full_data/full_final.csv"))

# 90 % reporting
r90_3, r90_6 = map(pd.read_csv, ("../partial_data/90/90_reporting.csv",
                                 "../partial_data/90/90_reporting_6.csv"))

# 75 % reporting
r75_3, r75_6 = map(pd.read_csv, ("../partial_data/75/75_reporting.csv",
                                 "../partial_data/75/75_reporting_6.csv"))

# 50 % reporting
r50_3, r50_6 = map(pd.read_csv, ("../partial_data/50/50_reporting.csv",
                                 "../partial_data/50/50_reporting_6.csv"))

# 25 % reporting
r25_3, r25_6 = map(pd.read_csv, ("../partial_data/25/25_reporting.csv",
                                 "../partial_data/25/25_reporting_6.csv"))

# 15 % reporting
r15_3, r15_6 = map(pd.read_csv, ("../partial_data/15/15_reporting.csv",
                                 "../partial_data/15/15_reporting_6.csv"))

# 10 % reporting
r10_3, r10_6 = map(pd.read_csv, ("../partial_data/10/10_reporting.csv",
                                 "../partial_data/10/10_reporting_6.csv"))

# 5 % reporting
r5_3, r5_6 = map(pd.read_csv, ("../partial_data/5/5_reporting.csv",
                               "../partial_data/5/5_reporting_6.csv"))

Encode all of the data frames

In [5]:
data_frames = [r100_3, r100_6, r90_3, r90_6, r75_3, r75_6, r50_3, r50_6,
               r25_3, r25_6, r15_3, r15_6, r10_3, r10_6, r5_3, r5_6]

# encode() assigns the encoded columns back onto the frame it is given, so it
# is called only for that side effect; rebinding the loop variable to its
# return value had no effect on the frames in the list.
for d in data_frames:
    encode(d)

Initialize a StratifiedKFold splitter with 3 splits

In [6]:
# Shared 3-fold stratified splitter; the cross-validation loops below call skf.split(...).
skf = StratifiedKFold(n_splits=3)

# Random Forest

### Six Strata

Random Forest with correlations and 6 strata

In [7]:
# Stratification inputs: the folds are stratified on TIGHT_RACE.
y_over_s = r100_6['TIGHT_RACE']
X_over_s = r100_6.drop('TIGHT_RACE', axis=1)

# Modelling inputs: WINNER is the prediction target.
y_ = r100_6['WINNER']
X_ = r100_6.drop('WINNER', axis=1)

scores, tight_from_test = [], []
for train_index, test_index in skf.split(X_over_s, y_over_s):
    # Held-out fold, taken from the original (un-resampled) rows.
    X_test, y_test = X_.iloc[test_index], y_.iloc[test_index]

    # Training fold, passed through oversample() before fitting.
    oversampled_df = oversample(r100_6.iloc[train_index])
    y_train = oversampled_df['WINNER']
    X_train = oversampled_df.drop('WINNER', axis=1)

    rf = RandomForestClassifier(n_estimators=100, max_depth=7).fit(X_train, y_train)

    # Accuracy on the whole fold, then on its tight races only.
    scores.append(rf.score(X_test, y_test))
    tight_from_test.append(rf.score(X_test[X_test['TIGHT_RACE']], y_test[X_test['TIGHT_RACE']]))

print(np.mean(scores), np.mean(tight_from_test))

0.6677663734115346 0.5952380952380952


With 6 strata but no correlations

In [8]:
# Keep only the six DEM and six REP stratum ratio columns plus TIGHT_RACE and WINNER.
r100_6_nc = r100_6.loc[:, [
    'S1_DEM_RATIO', 'S2_DEM_RATIO', 'S3_DEM_RATIO',
    'S4_DEM_RATIO', 'S5_DEM_RATIO', 'S6_DEM_RATIO',
    'S1_REP_RATIO', 'S2_REP_RATIO', 'S3_REP_RATIO',
    'S4_REP_RATIO', 'S5_REP_RATIO', 'S6_REP_RATIO',
    'TIGHT_RACE', 'WINNER',
]]

In [9]:
# Stratification inputs: the folds are stratified on TIGHT_RACE.
y_over_s = r100_6_nc['TIGHT_RACE']
X_over_s = r100_6_nc.drop('TIGHT_RACE', axis=1)

# Modelling inputs: WINNER is the prediction target.
y_ = r100_6_nc['WINNER']
X_ = r100_6_nc.drop('WINNER', axis=1)

scores, tight_from_test = [], []
for train_index, test_index in skf.split(X_over_s, y_over_s):
    # Held-out fold, taken from the original (un-resampled) rows.
    X_test, y_test = X_.iloc[test_index], y_.iloc[test_index]

    # Training fold, passed through oversample() before fitting.
    oversampled_df = oversample(r100_6_nc.iloc[train_index])
    y_train = oversampled_df['WINNER']
    X_train = oversampled_df.drop('WINNER', axis=1)

    rf = RandomForestClassifier(n_estimators=100, max_depth=7).fit(X_train, y_train)

    # Accuracy on the whole fold, then on its tight races only.
    scores.append(rf.score(X_test, y_test))
    tight_from_test.append(rf.score(X_test[X_test['TIGHT_RACE']], y_test[X_test['TIGHT_RACE']]))

print(np.mean(scores), np.mean(tight_from_test))

0.8032644998370805 0.7797619047619048


3 strata with correlations

In [10]:
# Stratification inputs: the folds are stratified on TIGHT_RACE.
y_over_s = r100_3['TIGHT_RACE']
X_over_s = r100_3.drop('TIGHT_RACE', axis=1)

# Modelling inputs: WINNER is the prediction target.
y_ = r100_3['WINNER']
X_ = r100_3.drop('WINNER', axis=1)

scores, tight_from_test = [], []
for train_index, test_index in skf.split(X_over_s, y_over_s):
    # Held-out fold, taken from the original (un-resampled) rows.
    X_test, y_test = X_.iloc[test_index], y_.iloc[test_index]

    # Training fold, passed through oversample() before fitting.
    oversampled_df = oversample(r100_3.iloc[train_index])
    y_train = oversampled_df['WINNER']
    X_train = oversampled_df.drop('WINNER', axis=1)

    rf = RandomForestClassifier(n_estimators=100, max_depth=7).fit(X_train, y_train)

    # Accuracy on the whole fold, then on its tight races only.
    scores.append(rf.score(X_test, y_test))
    tight_from_test.append(rf.score(X_test[X_test['TIGHT_RACE']], y_test[X_test['TIGHT_RACE']]))

print(np.mean(scores), np.mean(tight_from_test))

0.6781830400782013 0.6428571428571428


3 strata no correlations

In [11]:
# Keep only the three DEM and three REP stratum ratio columns plus TIGHT_RACE and WINNER.
r100_3_nc = r100_3.loc[:, [
    'S1_DEM_RATIO', 'S2_DEM_RATIO', 'S3_DEM_RATIO',
    'S1_REP_RATIO', 'S2_REP_RATIO', 'S3_REP_RATIO',
    'TIGHT_RACE', 'WINNER',
]]

In [12]:
# Stratification inputs: the folds are stratified on TIGHT_RACE.
y_over_s = r100_3_nc['TIGHT_RACE']
X_over_s = r100_3_nc.drop('TIGHT_RACE', axis=1)

# Modelling inputs: WINNER is the prediction target.
y_ = r100_3_nc['WINNER']
X_ = r100_3_nc.drop('WINNER', axis=1)

scores, tight_from_test = [], []
for train_index, test_index in skf.split(X_over_s, y_over_s):
    # Held-out fold, taken from the original (un-resampled) rows.
    X_test, y_test = X_.iloc[test_index], y_.iloc[test_index]

    # Training fold, passed through oversample() before fitting.
    oversampled_df = oversample(r100_3_nc.iloc[train_index])
    y_train = oversampled_df['WINNER']
    X_train = oversampled_df.drop('WINNER', axis=1)

    rf = RandomForestClassifier(n_estimators=100, max_depth=7).fit(X_train, y_train)

    # Accuracy on the whole fold, then on its tight races only.
    scores.append(rf.score(X_test, y_test))
    tight_from_test.append(rf.score(X_test[X_test['TIGHT_RACE']], y_test[X_test['TIGHT_RACE']]))

print(np.mean(scores), np.mean(tight_from_test))

0.7918601335940045 0.6845238095238094


# Decision Tree

In [13]:
# Stratification inputs: the folds are stratified on TIGHT_RACE.
y_over_s = r100_6_nc['TIGHT_RACE']
X_over_s = r100_6_nc.drop('TIGHT_RACE', axis=1)

# Modelling inputs: WINNER is the prediction target.
y_ = r100_6_nc['WINNER']
X_ = r100_6_nc.drop('WINNER', axis=1)

scores, tight_from_test = [], []
for train_index, test_index in skf.split(X_over_s, y_over_s):
    # Held-out fold, taken from the original (un-resampled) rows.
    X_test, y_test = X_.iloc[test_index], y_.iloc[test_index]

    # Training fold, passed through oversample() before fitting.
    oversampled_df = oversample(r100_6_nc.iloc[train_index])
    y_train = oversampled_df['WINNER']
    X_train = oversampled_df.drop('WINNER', axis=1)

    dt = DecisionTreeClassifier(max_depth=7).fit(X_train, y_train)

    # Accuracy on the whole fold, then on its tight races only.
    scores.append(dt.score(X_test, y_test))
    tight_from_test.append(dt.score(X_test[X_test['TIGHT_RACE']], y_test[X_test['TIGHT_RACE']]))

print(np.mean(scores), np.mean(tight_from_test))

0.7286677256435321 0.7261904761904763


In [14]:
# Stratification inputs: the folds are stratified on TIGHT_RACE.
y_over_s = r100_3_nc['TIGHT_RACE']
X_over_s = r100_3_nc.drop('TIGHT_RACE', axis=1)

# Modelling inputs: WINNER is the prediction target.
y_ = r100_3_nc['WINNER']
X_ = r100_3_nc.drop('WINNER', axis=1)

scores, tight_from_test = [], []
for train_index, test_index in skf.split(X_over_s, y_over_s):
    # Held-out fold, taken from the original (un-resampled) rows.
    X_test, y_test = X_.iloc[test_index], y_.iloc[test_index]

    # Training fold, passed through oversample() before fitting.
    oversampled_df = oversample(r100_3_nc.iloc[train_index])
    y_train = oversampled_df['WINNER']
    X_train = oversampled_df.drop('WINNER', axis=1)

    dt = DecisionTreeClassifier(max_depth=7).fit(X_train, y_train)

    # Accuracy on the whole fold, then on its tight races only.
    scores.append(dt.score(X_test, y_test))
    tight_from_test.append(dt.score(X_test[X_test['TIGHT_RACE']], y_test[X_test['TIGHT_RACE']]))

print(np.mean(scores), np.mean(tight_from_test))

0.7413550830889539 0.6904761904761906


# Boosted Trees

In [15]:
# Stratification inputs: the folds are stratified on TIGHT_RACE.
y_over_s = r100_6_nc['TIGHT_RACE']
X_over_s = r100_6_nc.drop('TIGHT_RACE', axis=1)

# Modelling inputs: WINNER is the prediction target.
y_ = r100_6_nc['WINNER']
X_ = r100_6_nc.drop('WINNER', axis=1)

scores, tight_from_test = [], []
for train_index, test_index in skf.split(X_over_s, y_over_s):
    # Held-out fold, taken from the original (un-resampled) rows.
    X_test, y_test = X_.iloc[test_index], y_.iloc[test_index]

    # Training fold, passed through oversample() before fitting.
    oversampled_df = oversample(r100_6_nc.iloc[train_index])
    y_train = oversampled_df['WINNER']
    X_train = oversampled_df.drop('WINNER', axis=1)

    gbc = GradientBoostingClassifier(max_depth=3, n_estimators=100).fit(X_train, y_train)

    # Accuracy on the whole fold, then on its tight races only.
    scores.append(gbc.score(X_test, y_test))
    tight_from_test.append(gbc.score(X_test[X_test['TIGHT_RACE']], y_test[X_test['TIGHT_RACE']]))

print(np.mean(scores), np.mean(tight_from_test))

0.8127341968067775 0.9047619047619048


In [16]:
# Stratification inputs: the folds are stratified on TIGHT_RACE.
y_over_s = r100_3_nc['TIGHT_RACE']
X_over_s = r100_3_nc.drop('TIGHT_RACE', axis=1)

# Modelling inputs: WINNER is the prediction target.
y_ = r100_3_nc['WINNER']
X_ = r100_3_nc.drop('WINNER', axis=1)

scores, tight_from_test = [], []
for train_index, test_index in skf.split(X_over_s, y_over_s):
    # Held-out fold, taken from the original (un-resampled) rows.
    X_test, y_test = X_.iloc[test_index], y_.iloc[test_index]

    # Training fold, passed through oversample() before fitting.
    oversampled_df = oversample(r100_3_nc.iloc[train_index])
    y_train = oversampled_df['WINNER']
    X_train = oversampled_df.drop('WINNER', axis=1)

    gbc = GradientBoostingClassifier(max_depth=6, n_estimators=100).fit(X_train, y_train)

    # Accuracy on the whole fold, then on its tight races only.
    scores.append(gbc.score(X_test, y_test))
    tight_from_test.append(gbc.score(X_test[X_test['TIGHT_RACE']], y_test[X_test['TIGHT_RACE']]))

print(np.mean(scores), np.mean(tight_from_test))

0.7189027370478983 0.6904761904761906


# Test With Fewer Precincts Reporting

Copying the winning model from above

In [17]:
# 6-strata partial-reporting frames, ordered from 90% down to 5% reporting.
testing_dfs = [r90_6, r75_6, r50_6, r25_6, r15_6, r10_6, r5_6]

# The same frames restricted to the stratum ratio columns plus TIGHT_RACE and WINNER.
testing_nc_dfs = [
    frame.loc[:, [
        'S1_DEM_RATIO', 'S2_DEM_RATIO', 'S3_DEM_RATIO',
        'S4_DEM_RATIO', 'S5_DEM_RATIO', 'S6_DEM_RATIO',
        'S1_REP_RATIO', 'S2_REP_RATIO', 'S3_REP_RATIO',
        'S4_REP_RATIO', 'S5_REP_RATIO', 'S6_REP_RATIO',
        'TIGHT_RACE', 'WINNER',
    ]]
    for frame in testing_dfs
]

In [18]:
# Percent of precincts reporting for each entry of testing_dfs, in the same order.
percents = [90, 75, 50, 25, 15, 10, 5]

In [19]:
# Stratification inputs: the folds are stratified on TIGHT_RACE.
y_over_s = r100_6_nc['TIGHT_RACE']
X_over_s = r100_6_nc.drop('TIGHT_RACE', axis=1)

# Modelling inputs: WINNER is the prediction target.
y_ = r100_6_nc['WINNER']
X_ = r100_6_nc.drop('WINNER', axis=1)

In [28]:
def random_forest(partial_reporting_df, perc):
    """
    Cross-validate a random forest on the full-reporting data and score each
    fold's model on a partial-reporting frame.

    Reads the module-level ``skf``, ``X_over_s``, ``y_over_s``, ``X_``,
    ``y_`` and ``r100_6_nc``. For every fold a forest is fitted on the
    oversampled training rows and scored on four sets: the held-out fold,
    the tight races of that fold, the whole partial-reporting frame, and the
    tight races of the partial-reporting frame.

    Parameters
    ----------
    partial_reporting_df : pandas.DataFrame
        Frame with the same columns as ``r100_6_nc``, including the boolean
        ``TIGHT_RACE`` column and the ``WINNER`` target.
    perc : int
        Label printed in front of the scores (the reporting percentage).

    Returns
    -------
    tuple of float
        ``(cv_accuracy, cv_tight_accuracy, partial_accuracy,
        partial_tight_accuracy)``, each averaged over the folds. The same
        values are printed after ``perc``.
    """
    scores = []
    tight_from_test = []
    p_scores = []
    p_tight = []

    p_X = partial_reporting_df.drop(columns=['WINNER'])
    p_y = partial_reporting_df.WINNER
    p_tight_only = partial_reporting_df[partial_reporting_df.TIGHT_RACE]
    p_tight_X = p_tight_only.drop(columns=['WINNER'])
    p_tight_y = p_tight_only.WINNER

    for train_index, test_index in skf.split(X_over_s, y_over_s):
        X_test = X_.iloc[test_index]
        y_test = y_.iloc[test_index]

        oversampled_df = oversample(r100_6_nc.iloc[train_index])
        X_train = oversampled_df.drop(columns=['WINNER'])
        y_train = oversampled_df.WINNER

        rf = RandomForestClassifier(max_depth=7, n_estimators=100)
        rf.fit(X_train, y_train)

        # Score the forest fitted in this fold. The previous code scored a
        # global ``dt`` left over from another cell, not this model.
        scores.append(rf.score(X_test, y_test))
        tight_from_test.append(rf.score(X_test[X_test.TIGHT_RACE], y_test[X_test.TIGHT_RACE]))
        p_scores.append(rf.score(p_X, p_y))
        p_tight.append(rf.score(p_tight_X, p_tight_y))

    means = (np.mean(scores), np.mean(tight_from_test), np.mean(p_scores), np.mean(p_tight))
    print(perc, *means)
    return means

In [29]:
# Pair each partial-reporting frame with its reporting percentage directly
# instead of indexing two parallel lists by position.
for partial_df, perc in zip(testing_nc_dfs, percents):
    random_forest(partial_df, perc)

90 0.8029488432714239 0.7321428571428571 0.8767123287671232 0.8392434988179668
75 0.8227944770283481 0.875 0.867579908675799 0.8160919540229884
50 0.7824311665037471 0.7321428571428571 0.8995433789954338 0.8039215686274509
25 0.8039161779081133 0.8273809523809524 0.8606030647553139 0.7743055555555555
15 0.7908927989573152 0.6845238095238094 0.8750761730652042 0.8341307814992026
10 0.7823904366243076 0.8273809523809524 0.8538812785388128 0.7608695652173912
5 0.7814434669273379 0.7321428571428572 0.8633377135348225 0.8082191780821918
