In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

## Helper Functions

In [2]:
def encode(df):
    """
    Uses LabelEncoder to encode disrecete values
    """
    for column in df.columns:
        if df[column].dtype == type(object):
            le = LabelEncoder()
            df[column] = le.fit_transform(df[column])

In [3]:
def oversample(data_f):
    """
    Oversamples the highly contested races
    """
    df_contested = data_f[data_f.TIGHT_RACE == True].reset_index(drop=True)
    df_not_contested = data_f[data_f.TIGHT_RACE == False].reset_index(drop=True)
    sample = np.random.choice(range(df_contested.shape[0]), size=df_not_contested.shape[0], replace=True)
    df_contested_bootstrapped = df_contested.iloc[sample].reset_index(drop=True)
    frames = [df_not_contested, df_contested_bootstrapped]
    df_new = pd.concat(frames)
    return df_new

In [4]:
def tight_race_split(df, num_strata, include_corrs=True):
    """
    Given a dataframe, grabs the races that are tight for testing
    """
    
    tight_only = df[df.TIGHT_RACE]
    
    if not include_corrs:
        if num_strata == 3:
            tight_X = tight_only[['S1_DEM_RATIO','S2_DEM_RATIO', 'S3_DEM_RATIO', 
                                      'S1_REP_RATIO', 'S2_REP_RATIO', 'S3_REP_RATIO', 
                                      'TIGHT_RACE']]

        elif num_strata == 6:
            tight_X = tight_only[['S1_DEM_RATIO','S2_DEM_RATIO', 'S3_DEM_RATIO',
                                  'S4_DEM_RATIO','S5_DEM_RATIO', 'S6_DEM_RATIO',
                                  'S1_REP_RATIO', 'S2_REP_RATIO', 'S3_REP_RATIO',
                                  'S4_REP_RATIO', 'S5_REP_RATIO', 'S6_REP_RATIO',
                                  'TIGHT_RACE']]
    else:
        tight_X = tight_only.drop(columns=['WINNER'])
    
    tight_y = tight_only.WINNER
    
    return tight_X, tight_y

## Datasets

- **r100** : 100% of precincts reporting
- **r5**: 5% of precincts reporting
- **_3**: 3 strata
- **_6**: 6 strata

In [5]:
# 100 % reporting
r100_3 = pd.read_csv("../full_data/final.csv")
r100_6 = pd.read_csv("../full_data/full_final.csv")

# 90 % reporting
r90_3 = pd.read_csv("../partial_data/90/90_reporting.csv")
r90_6 = pd.read_csv("../partial_data/90/90_reporting_6.csv")

# 75 % reporting
r75_3 = pd.read_csv("../partial_data/75/75_reporting.csv")
r75_6 = pd.read_csv("../partial_data/75/75_reporting_6.csv")

# 50 % reporting
r50_3 = pd.read_csv("../partial_data/50/50_reporting.csv")
r50_6 = pd.read_csv("../partial_data/50/50_reporting_6.csv")

# 25 % reporting
r25_3 = pd.read_csv("../partial_data/25/25_reporting.csv")
r25_6 = pd.read_csv("../partial_data/25/25_reporting_6.csv")

# 15 % reporting
r15_3 = pd.read_csv("../partial_data/15/15_reporting.csv")
r15_6 = pd.read_csv("../partial_data/15/15_reporting_6.csv")

# 10 % reporting
r10_3 = pd.read_csv("../partial_data/10/10_reporting.csv")
r10_6 = pd.read_csv("../partial_data/10/10_reporting_6.csv")

# 5 % reporting
r5_3 = pd.read_csv("../partial_data/5/5_reporting.csv")
r5_6 = pd.read_csv("../partial_data/5/5_reporting_6.csv")

Encode all of the data frames

In [6]:
data_frames = [r100_3, r100_6, r90_3, r90_6, r75_3, r75_6, r50_3, r50_6, 
               r25_3, r25_6, r15_3, r15_6, r10_3, r10_6, r5_3, r5_6]

for d in data_frames:
    d = encode(d)

Initialize StratifiedKFold's to be set to 5 spilts

In [7]:
skf = StratifiedKFold(n_splits=10)

# Random Forest

### Six Strata

Compare a Decision Tree with Correlations and 6 Strata

In [8]:
# for stratification - seperate the TIGHT_RACE
X_over_s = r100_6.drop(columns=['TIGHT_RACE'])
y_over_s = r100_6.TIGHT_RACE

# for training - seperate the target (WINNER)
X_ = r100_6.drop(columns=['WINNER'])
y_ = r100_6.WINNER

scores = []
tight_from_test = []
for train_index, test_index in skf.split(X_over_s, y_over_s):
    X_test = X_.iloc[test_index]
    y_test = y_.iloc[test_index]
    
    oversampled_df = oversample(r100_6.iloc[train_index])
    X = oversampled_df.drop(columns=['WINNER'])
    y = oversampled_df.WINNER
    
    # print("TRAIN:", train_index, "TEST:", test_index)
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    
    rf = RandomForestClassifier(max_depth=7, n_estimators=100)
    rf.fit(X_train, y_train)
    
    scores.append(rf.score(X_test, y_test))
    tight_from_test.append(rf.score(X_test[X_test.TIGHT_RACE], y_test[X_test.TIGHT_RACE]))
    
print(np.mean(scores), np.mean(tight_from_test))

0.685959595959596 0.5833333333333333


With 6 strata but no correlations

In [9]:
r100_6_nc = r100_6[['S1_DEM_RATIO','S2_DEM_RATIO', 'S3_DEM_RATIO',
                         'S4_DEM_RATIO','S5_DEM_RATIO', 'S6_DEM_RATIO',
                         'S1_REP_RATIO', 'S2_REP_RATIO', 'S3_REP_RATIO',
                         'S4_REP_RATIO', 'S5_REP_RATIO', 'S6_REP_RATIO',
                         'TIGHT_RACE', 'WINNER']]

In [10]:
# for stratification - seperate the TIGHT_RACE
X_over_s = r100_6_nc.drop(columns=['TIGHT_RACE'])
y_over_s = r100_6_nc.TIGHT_RACE

# for training - seperate the target (WINNER)
X_ = r100_6_nc.drop(columns=['WINNER'])
y_ = r100_6_nc.WINNER

scores = []
tight_from_test = []
for train_index, test_index in skf.split(X_over_s, y_over_s):
    X_test = X_.iloc[test_index]
    y_test = y_.iloc[test_index]
    
    oversampled_df = oversample(r100_6_nc.iloc[train_index])
    X = oversampled_df.drop(columns=['WINNER'])
    y = oversampled_df.WINNER
    
    # print("TRAIN:", train_index, "TEST:", test_index)
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    
    rf = RandomForestClassifier(max_depth=7, n_estimators=100)
    rf.fit(X_train, y_train)
    
    scores.append(rf.score(X_test, y_test))
    tight_from_test.append(rf.score(X_test[X_test.TIGHT_RACE], y_test[X_test.TIGHT_RACE]))
    
print(np.mean(scores), np.mean(tight_from_test))

0.7383838383838384 0.7833333333333333


3 strata with correlations

In [11]:
# for stratification - seperate the TIGHT_RACE
X_over_s = r100_3.drop(columns=['TIGHT_RACE'])
y_over_s = r100_3.TIGHT_RACE

# for training - seperate the target (WINNER)
X_ = r100_3.drop(columns=['WINNER'])
y_ = r100_3.WINNER

scores = []
tight_from_test = []
for train_index, test_index in skf.split(X_over_s, y_over_s):
    X_test = X_.iloc[test_index]
    y_test = y_.iloc[test_index]
    
    oversampled_df = oversample(r100_3.iloc[train_index])
    X = oversampled_df.drop(columns=['WINNER'])
    y = oversampled_df.WINNER
    
    # print("TRAIN:", train_index, "TEST:", test_index)
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    
    rf = RandomForestClassifier(max_depth=7, n_estimators=100)
    rf.fit(X_train, y_train)
    
    scores.append(rf.score(X_test, y_test))
    tight_from_test.append(rf.score(X_test[X_test.TIGHT_RACE], y_test[X_test.TIGHT_RACE]))
    
print(np.mean(scores), np.mean(tight_from_test))

0.731111111111111 0.65


3 strata no correlations

In [12]:
r100_3_nc = r100_3[['S1_DEM_RATIO','S2_DEM_RATIO', 'S3_DEM_RATIO',
                    'S1_REP_RATIO', 'S2_REP_RATIO', 'S3_REP_RATIO',
                    'TIGHT_RACE', 'WINNER']]

In [13]:
# for stratification - seperate the TIGHT_RACE
X_over_s = r100_3_nc.drop(columns=['TIGHT_RACE'])
y_over_s = r100_3_nc.TIGHT_RACE

# for training - seperate the target (WINNER)
X_ = r100_3_nc.drop(columns=['WINNER'])
y_ = r100_3_nc.WINNER

scores = []
tight_from_test = []
for train_index, test_index in skf.split(X_over_s, y_over_s):
    X_test = X_.iloc[test_index]
    y_test = y_.iloc[test_index]
    
    oversampled_df = oversample(r100_3_nc.iloc[train_index])
    X = oversampled_df.drop(columns=['WINNER'])
    y = oversampled_df.WINNER
    
    # print("TRAIN:", train_index, "TEST:", test_index)
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    
    rf = RandomForestClassifier(max_depth=7, n_estimators=100)
    rf.fit(X_train, y_train)
    
    scores.append(rf.score(X_test, y_test))
    tight_from_test.append(rf.score(X_test[X_test.TIGHT_RACE], y_test[X_test.TIGHT_RACE]))
    
print(np.mean(scores), np.mean(tight_from_test))

0.7908080808080808 0.7833333333333333


# Decision Tree

In [23]:
# for stratification - seperate the TIGHT_RACE
X_over_s = r100_6_nc.drop(columns=['TIGHT_RACE'])
y_over_s = r100_6_nc.TIGHT_RACE

# for training - seperate the target (WINNER)
X_ = r100_6_nc.drop(columns=['WINNER'])
y_ = r100_6_nc.WINNER

scores = []
tight_from_test = []
for train_index, test_index in skf.split(X_over_s, y_over_s):
    X_test = X_.iloc[test_index]
    y_test = y_.iloc[test_index]
    
    oversampled_df = oversample(r100_6_nc.iloc[train_index])
    X = oversampled_df.drop(columns=['WINNER'])
    y = oversampled_df.WINNER
    
    # print("TRAIN:", train_index, "TEST:", test_index)
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    
    dt = DecisionTreeClassifier(max_depth=7)
    dt.fit(X_train, y_train)
    
    scores.append(dt.score(X_test, y_test))
    tight_from_test.append(dt.score(X_test[X_test.TIGHT_RACE], y_test[X_test.TIGHT_RACE]))
    
print(np.mean(scores), np.mean(tight_from_test))

0.8085858585858585 0.85


In [30]:
# for stratification - seperate the TIGHT_RACE
X_over_s = r100_3_nc.drop(columns=['TIGHT_RACE'])
y_over_s = r100_3_nc.TIGHT_RACE

# for training - seperate the target (WINNER)
X_ = r100_3_nc.drop(columns=['WINNER'])
y_ = r100_3_nc.WINNER

scores = []
tight_from_test = []
for train_index, test_index in skf.split(X_over_s, y_over_s):
    X_test = X_.iloc[test_index]
    y_test = y_.iloc[test_index]
    
    oversampled_df = oversample(r100_3_nc.iloc[train_index])
    X = oversampled_df.drop(columns=['WINNER'])
    y = oversampled_df.WINNER
    
    # print("TRAIN:", train_index, "TEST:", test_index)
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    
    dt = DecisionTreeClassifier(max_depth=7)
    dt.fit(X_train, y_train)
    
    scores.append(dt.score(X_test, y_test))
    tight_from_test.append(dt.score(X_test[X_test.TIGHT_RACE], y_test[X_test.TIGHT_RACE]))
    
print(np.mean(scores), np.mean(tight_from_test))

0.7572727272727272 0.75


# Boosted Trees

In [42]:
# for stratification - seperate the TIGHT_RACE
X_over_s = r100_6_nc.drop(columns=['TIGHT_RACE'])
y_over_s = r100_6_nc.TIGHT_RACE

# for training - seperate the target (WINNER)
X_ = r100_6_nc.drop(columns=['WINNER'])
y_ = r100_6_nc.WINNER

scores = []
tight_from_test = []
for train_index, test_index in skf.split(X_over_s, y_over_s):
    X_test = X_.iloc[test_index]
    y_test = y_.iloc[test_index]
    
    oversampled_df = oversample(r100_6_nc.iloc[train_index])
    X = oversampled_df.drop(columns=['WINNER'])
    y = oversampled_df.WINNER
    
    # print("TRAIN:", train_index, "TEST:", test_index)
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    
    gbc = GradientBoostingClassifier(n_estimators=100, max_depth=4)
    gbc.fit(X_train, y_train)
    
    scores.append(gbc.score(X_test, y_test))
    tight_from_test.append(gbc.score(X_test[X_test.TIGHT_RACE], y_test[X_test.TIGHT_RACE]))
    
print(np.mean(scores), np.mean(tight_from_test))

0.8118181818181818 0.9


In [46]:
# for stratification - seperate the TIGHT_RACE
X_over_s = r100_3_nc.drop(columns=['TIGHT_RACE'])
y_over_s = r100_3_nc.TIGHT_RACE

# for training - seperate the target (WINNER)
X_ = r100_3_nc.drop(columns=['WINNER'])
y_ = r100_3_nc.WINNER

scores = []
tight_from_test = []
for train_index, test_index in skf.split(X_over_s, y_over_s):
    X_test = X_.iloc[test_index]
    y_test = y_.iloc[test_index]
    
    oversampled_df = oversample(r100_3_nc.iloc[train_index])
    X = oversampled_df.drop(columns=['WINNER'])
    y = oversampled_df.WINNER
    
    # print("TRAIN:", train_index, "TEST:", test_index)
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    
    gbc = GradientBoostingClassifier(n_estimators=100, max_depth=6)
    gbc.fit(X_train, y_train)
    
    scores.append(gbc.score(X_test, y_test))
    tight_from_test.append(gbc.score(X_test[X_test.TIGHT_RACE], y_test[X_test.TIGHT_RACE]))
    
print(np.mean(scores), np.mean(tight_from_test))

0.7765656565656565 0.7
