In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("../data/final.csv")

In [4]:
from sklearn.preprocessing import LabelEncoder

In [5]:
for column in df.columns:
    if df[column].dtype == type(object):
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])

In [6]:
from sklearn.model_selection import StratifiedKFold

In [7]:
X_kf = df.drop(columns=['TIGHT_RACE'])
y_kf = df.TIGHT_RACE

X_ = df.drop(columns=['WINNER'])
y_ = df.WINNER

skf = StratifiedKFold(n_splits=10)

In [8]:
def oversample(data_f):
    df_contested = data_f[df.TIGHT_RACE == True].reset_index(drop=True)
    df_not_contested = data_f[df.TIGHT_RACE == False].reset_index(drop=True)
    sample = np.random.choice(range(df_contested.shape[0]), size=df_not_contested.shape[0], replace=True)
    df_contested_bootstrapped = df_contested.iloc[sample].reset_index(drop=True)
    frames = [df_not_contested, df_contested_bootstrapped]
    df_new = pd.concat(frames)
    return df_new

# Decision Tree

In [9]:
from sklearn.tree import DecisionTreeClassifier

In [10]:
scores = []
for train_index, test_index in skf.split(X_kf, y_kf):
    X_test = X_.iloc[test_index]
    y_test = y_.iloc[test_index]
    
    oversampled_df = oversample(df)
    X = oversampled_df.drop(columns=['WINNER'])
    y = oversampled_df.WINNER
    
    # print("TRAIN:", train_index, "TEST:", test_index)
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    
    dt = DecisionTreeClassifier(max_depth=7)
    dt.fit(X_train, y_train)
    scores.append(dt.score(X_test, y_test))
    #print(dt.tree_.node_count, dt.tree_.max_depth)
    
print(scores); print(np.mean(scores))

[0.7272727272727273, 0.6363636363636364, 0.6, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8888888888888888, 1.0]
0.8852525252525252


### Use fewer features

In [11]:
X_kf = df[['S1_DEM_RATIO','S2_DEM_RATIO', 'S3_DEM_RATIO', 'S1_REP_RATIO', 'S2_REP_RATIO', 'S3_REP_RATIO']]
y_kf = df.TIGHT_RACE

X_ = df[['S1_DEM_RATIO','S2_DEM_RATIO', 'S3_DEM_RATIO', 'S1_REP_RATIO', 'S2_REP_RATIO', 'S3_REP_RATIO', 'TIGHT_RACE']]
y_ = df.WINNER

skf = StratifiedKFold(n_splits=10, shuffle=True)

In [12]:
scores = []
for train_index, test_index in skf.split(X_kf, y_kf):
    
    X_test = X_.iloc[test_index]
    y_test = y_.iloc[test_index]
    
    oversampled_df = oversample(df)
    X = oversampled_df[['S1_DEM_RATIO','S2_DEM_RATIO', 'S3_DEM_RATIO', 'S1_REP_RATIO', 'S2_REP_RATIO', 'S3_REP_RATIO', 'TIGHT_RACE']]
    y = oversampled_df.WINNER
    
    # print("TRAIN:", train_index, "TEST:", test_index)
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    
    dt = DecisionTreeClassifier(max_depth=5)
    dt.fit(X_train, y_train)
    scores.append(dt.score(X_test, y_test))
    # print(dt.tree_.max_depth)
    
print(scores); print(np.mean(scores))

[1.0, 1.0, 1.0, 0.9, 1.0, 0.7777777777777778, 1.0, 1.0, 0.8888888888888888, 1.0]
0.9566666666666667


# Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier

In [14]:
scores = []
for train_index, test_index in skf.split(X_kf, y_kf):
    X_test = X_.iloc[test_index]
    y_test = y_.iloc[test_index]
    
    oversampled_df = oversample(df)
    X = oversampled_df[['S1_DEM_RATIO','S2_DEM_RATIO', 'S3_DEM_RATIO', 'S1_REP_RATIO', 'S2_REP_RATIO', 'S3_REP_RATIO', 'TIGHT_RACE']]
    y = oversampled_df.WINNER
    
    # print("TRAIN:", train_index, "TEST:", test_index)
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    
    rf = RandomForestClassifier(max_depth=7, n_estimators=100)
    rf.fit(X_train, y_train)
    scores.append(rf.score(X_test, y_test))
    # print(dt.tree_.max_depth)
    
print(scores); print(np.mean(scores))

[0.9090909090909091, 0.9090909090909091, 0.9, 0.8, 1.0, 0.8888888888888888, 1.0, 1.0, 1.0, 0.8888888888888888]
0.9295959595959596


# Boosted Trees

In [15]:
from sklearn.ensemble import GradientBoostingClassifier

In [16]:
scores = []
for train_index, test_index in skf.split(X_kf, y_kf):
    X_test = X_.iloc[test_index]
    y_test = y_.iloc[test_index]
    
    oversampled_df = oversample(df)
    X = oversampled_df[['S1_DEM_RATIO','S2_DEM_RATIO', 'S3_DEM_RATIO', 'S1_REP_RATIO', 'S2_REP_RATIO', 'S3_REP_RATIO', 'TIGHT_RACE']]
    y = oversampled_df.WINNER
    
    # print("TRAIN:", train_index, "TEST:", test_index)
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    
    gbt = GradientBoostingClassifier(n_estimators=100, max_depth=2)
    gbt.fit(X_train, y_train)
    scores.append(gbt.score(X_test, y_test))
    
print(scores); print(np.mean(scores))

[1.0, 1.0, 1.0, 0.9, 0.7777777777777778, 1.0, 1.0, 0.7777777777777778, 1.0, 0.8888888888888888]
0.9344444444444445


test the accuracy on the tight races

In [17]:
tight_only = df[df.TIGHT_RACE]

In [18]:
tight_X = tight_only[['S1_DEM_RATIO','S2_DEM_RATIO', 'S3_DEM_RATIO', 'S1_REP_RATIO', 'S2_REP_RATIO', 'S3_REP_RATIO', 'TIGHT_RACE']]
tight_y = tight_only.WINNER

In [19]:
scores = []
score_tight = []
tight_from_test = []
for train_index, test_index in skf.split(X_kf, y_kf):
    X_test = X_.iloc[test_index]
    y_test = y_.iloc[test_index]
    
    oversampled_df = oversample(df)
    X = oversampled_df[['S1_DEM_RATIO','S2_DEM_RATIO', 'S3_DEM_RATIO', 'S1_REP_RATIO', 'S2_REP_RATIO', 'S3_REP_RATIO', 'TIGHT_RACE']]
    y = oversampled_df.WINNER
    
    # print("TRAIN:", train_index, "TEST:", test_index)
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    
    gbt = GradientBoostingClassifier(n_estimators=100)
    gbt.fit(X_train, y_train)
    scores.append(gbt.score(X_test, y_test))
    score_tight.append(gbt.score(tight_X, tight_y))
    tight_from_test.append(gbt.score(X_test[X_test.TIGHT_RACE], y_test[X_test.TIGHT_RACE]))
    
print(np.mean(scores), np.mean(score_tight), np.mean(tight_from_test))

0.9333333333333333 0.9045454545454545 0.85
