In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

## Helper Functions

In [3]:
def encode(df):
    """
    Label-encode every discrete (object/string dtype) column of ``df`` in place.

    Each qualifying column is replaced by the integer codes produced by a
    fresh ``LabelEncoder`` fitted on that column alone; columns of any other
    dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the call can also be used as an
        expression (e.g. ``d = encode(d)``).
    """
    for column in df.columns:
        dtype = df[column].dtype
        # Ask pandas whether the column holds Python objects / strings rather
        # than comparing the dtype with ``type(object)``, which is ``type``
        # and not ``object``.
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
            le = LabelEncoder()
            df[column] = le.fit_transform(df[column])
    return df

In [4]:
def oversample(data_f, random_state=None):
    """
    Balance ``data_f`` by oversampling the highly contested races.

    Rows with ``TIGHT_RACE == True`` are bootstrapped (sampled with
    replacement) until there are as many of them as there are rows with
    ``TIGHT_RACE == False``; every non-contested row is kept exactly once.

    Parameters
    ----------
    data_f : pandas.DataFrame
        Frame containing a ``TIGHT_RACE`` column. It is not modified.
    random_state : int or None, optional
        Seed for a private ``numpy.random.RandomState``. With the default
        ``None`` the global NumPy random state is used.

    Returns
    -------
    pandas.DataFrame
        The non-contested rows followed by the bootstrapped contested rows,
        with a fresh ``0..n-1`` index.

    Raises
    ------
    ValueError
        If there are non-contested rows but no contested rows to sample from.
    """
    # Build both masks from the frame that was passed in; relying on a global
    # ``df`` fails on a fresh kernel and silently misaligns for other frames.
    tight = data_f["TIGHT_RACE"]
    df_contested = data_f[tight.eq(True)].reset_index(drop=True)
    df_not_contested = data_f[tight.eq(False)].reset_index(drop=True)

    n_contested = df_contested.shape[0]
    n_target = df_not_contested.shape[0]
    if n_contested == 0 and n_target > 0:
        raise ValueError("oversample: no rows with TIGHT_RACE == True to sample from")

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    sample = rng.choice(n_contested, size=n_target, replace=True)
    df_contested_bootstrapped = df_contested.iloc[sample].reset_index(drop=True)

    # ignore_index avoids duplicate labels, since both pieces are indexed from 0.
    return pd.concat([df_not_contested, df_contested_bootstrapped], ignore_index=True)

## Datasets

- **r100**: 100% of precincts reporting
- **r5**: 5% of precincts reporting (likewise **r90**, **r75**, **r50**, **r25**, **r15**, **r10**)
- **_3**: 3 strata
- **_6**: 6 strata

In [5]:
# Each pair below is (3-strata file, 6-strata file) for one reporting level.

# Full returns: 100 % of precincts reporting
r100_3, r100_6 = (
    pd.read_csv("../full_data/final.csv"),
    pd.read_csv("../full_data/full_final.csv"),
)

# Partial returns: 90 % of precincts reporting
r90_3, r90_6 = (
    pd.read_csv("../partial_data/90/90_reporting.csv"),
    pd.read_csv("../partial_data/90/90_reporting_6.csv"),
)

# Partial returns: 75 % of precincts reporting
r75_3, r75_6 = (
    pd.read_csv("../partial_data/75/75_reporting.csv"),
    pd.read_csv("../partial_data/75/75_reporting_6.csv"),
)

# Partial returns: 50 % of precincts reporting
r50_3, r50_6 = (
    pd.read_csv("../partial_data/50/50_reporting.csv"),
    pd.read_csv("../partial_data/50/50_reporting_6.csv"),
)

# Partial returns: 25 % of precincts reporting
r25_3, r25_6 = (
    pd.read_csv("../partial_data/25/25_reporting.csv"),
    pd.read_csv("../partial_data/25/25_reporting_6.csv"),
)

# Partial returns: 15 % of precincts reporting
r15_3, r15_6 = (
    pd.read_csv("../partial_data/15/15_reporting.csv"),
    pd.read_csv("../partial_data/15/15_reporting_6.csv"),
)

# Partial returns: 10 % of precincts reporting
r10_3, r10_6 = (
    pd.read_csv("../partial_data/10/10_reporting.csv"),
    pd.read_csv("../partial_data/10/10_reporting_6.csv"),
)

# Partial returns: 5 % of precincts reporting
r5_3, r5_6 = (
    pd.read_csv("../partial_data/5/5_reporting.csv"),
    pd.read_csv("../partial_data/5/5_reporting_6.csv"),
)

Encode all of the data frames

In [8]:
# Every loaded frame, from 100 % down to 5 % reporting; within each level the
# 3-strata frame comes before the 6-strata frame.
data_frames = [
    r100_3, r100_6,
    r90_3, r90_6,
    r75_3, r75_6,
    r50_3, r50_6,
    r25_3, r25_6,
    r15_3, r15_6,
    r10_3, r10_6,
    r5_3, r5_6,
]

# encode() rewrites each frame in place, so the call is made purely for its
# side effect; rebinding the loop variable to its result never changed the
# frames held in ``data_frames``.
for d in data_frames:
    encode(d)

Initialize a `StratifiedKFold` splitter with 5 splits

In [10]:
# Fold count for the shared cross-validation splitter; every other
# StratifiedKFold option is left at its default.
N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS)

# Decision Tree

### Six Strata

In [None]:
# Bind the frame used in this section explicitly: ``df`` was otherwise never
# defined, so the cell failed on a fresh kernel (Restart & Run All).
df = r100_6

# Inputs for the stratified split: everything except TIGHT_RACE, with
# TIGHT_RACE as the label.
X_kf = df.drop(columns=['TIGHT_RACE'])
y_kf = df.TIGHT_RACE

# Inputs for the classifier: everything except WINNER, with WINNER as the target.
# NOTE(review): X_ still contains TIGHT_RACE and X_kf still contains WINNER;
# confirm that this is intended.
X_ = df.drop(columns=['WINNER'])
y_ = df.WINNER