In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("../full_data/final.csv")

In [4]:
from sklearn.preprocessing import LabelEncoder

In [5]:
for column in df.columns:
    if df[column].dtype == type(object):
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])

In [6]:
from sklearn.model_selection import StratifiedKFold

In [15]:
X_kf = df.drop(columns=['TIGHT_RACE'])
y_kf = df.TIGHT_RACE

X_ = df.drop(columns=['WINNER'])
y_ = df.WINNER

skf = StratifiedKFold(n_splits=10)

In [16]:
def oversample(data_f):
    df_contested = data_f[df.TIGHT_RACE == True].reset_index(drop=True)
    df_not_contested = data_f[df.TIGHT_RACE == False].reset_index(drop=True)
    sample = np.random.choice(range(df_contested.shape[0]), size=df_not_contested.shape[0], replace=True)
    df_contested_bootstrapped = df_contested.iloc[sample].reset_index(drop=True)
    frames = [df_not_contested, df_contested_bootstrapped]
    df_new = pd.concat(frames)
    return df_new

# Decision Tree - includes corrs

In [9]:
from sklearn.tree import DecisionTreeClassifier

In [17]:
scores = []
for train_index, test_index in skf.split(X_kf, y_kf):
    X_test = X_.iloc[test_index]
    y_test = y_.iloc[test_index]
    
    oversampled_df = oversample(df)
    X = oversampled_df.drop(columns=['WINNER'])
    y = oversampled_df.WINNER
    
    # print("TRAIN:", train_index, "TEST:", test_index)
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    
    dt = DecisionTreeClassifier(max_depth=5)
    dt.fit(X_train, y_train)
    scores.append(dt.score(X_test, y_test))
    #print(dt.tree_.node_count, dt.tree_.max_depth)
    
print(scores); print(np.mean(scores))

[0.2727272727272727, 0.7272727272727273, 0.8, 1.0, 1.0, 1.0, 1.0, 1.0, 0.7777777777777778, 1.0]
0.8577777777777778


### Use fewer features

In [18]:
X_kf = df[['S1_DEM_RATIO','S2_DEM_RATIO', 'S3_DEM_RATIO', 'S1_REP_RATIO', 'S2_REP_RATIO', 'S3_REP_RATIO']]
y_kf = df.TIGHT_RACE

X_ = df[['S1_DEM_RATIO','S2_DEM_RATIO', 'S3_DEM_RATIO', 'S1_REP_RATIO', 'S2_REP_RATIO', 'S3_REP_RATIO', 'TIGHT_RACE']]
y_ = df.WINNER

skf = StratifiedKFold(n_splits=10, shuffle=True)

In [19]:
scores = []
for train_index, test_index in skf.split(X_kf, y_kf):
    
    X_test = X_.iloc[test_index]
    y_test = y_.iloc[test_index]
    
    oversampled_df = oversample(df)
    X = oversampled_df[['S1_DEM_RATIO','S2_DEM_RATIO', 'S3_DEM_RATIO', 'S1_REP_RATIO', 'S2_REP_RATIO', 'S3_REP_RATIO', 'TIGHT_RACE']]
    y = oversampled_df.WINNER
    
    # print("TRAIN:", train_index, "TEST:", test_index)
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    
    dt = DecisionTreeClassifier(max_depth=5)
    dt.fit(X_train, y_train)
    scores.append(dt.score(X_test, y_test))
    # print(dt.tree_.max_depth)
    
print(scores); print(np.mean(scores))

[0.8181818181818182, 0.8181818181818182, 0.8, 0.9, 1.0, 1.0, 0.8888888888888888, 1.0, 0.7777777777777778, 1.0]
0.9003030303030304


# Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier

In [20]:
scores = []
for train_index, test_index in skf.split(X_kf, y_kf):
    X_test = X_.iloc[test_index]
    y_test = y_.iloc[test_index]
    
    oversampled_df = oversample(df)
    X = oversampled_df[['S1_DEM_RATIO','S2_DEM_RATIO', 'S3_DEM_RATIO', 'S1_REP_RATIO', 'S2_REP_RATIO', 'S3_REP_RATIO', 'TIGHT_RACE']]
    y = oversampled_df.WINNER
    
    # print("TRAIN:", train_index, "TEST:", test_index)
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    
    rf = RandomForestClassifier(max_depth=7, n_estimators=100)
    rf.fit(X_train, y_train)
    scores.append(rf.score(X_test, y_test))
    # print(dt.tree_.max_depth)
    
print(scores); print(np.mean(scores))

[1.0, 1.0, 1.0, 1.0, 0.8888888888888888, 0.8888888888888888, 0.8888888888888888, 0.8888888888888888, 0.8888888888888888, 1.0]
0.9444444444444444


test the accuracy on the tight only races

In [24]:
tight_only = df[df.TIGHT_RACE]

In [25]:
tight_X = tight_only[['S1_DEM_RATIO','S2_DEM_RATIO', 'S3_DEM_RATIO', 'S1_REP_RATIO', 'S2_REP_RATIO', 'S3_REP_RATIO', 'TIGHT_RACE']]
tight_y = tight_only.WINNER

Test on lower reporting

In [30]:
five_p_reporting = pd.read_csv('../partial_data/5/5_reporting.csv')

In [33]:
for column in five_p_reporting.columns:
    if five_p_reporting[column].dtype == type(object):
        le = LabelEncoder()
        five_p_reporting[column] = le.fit_transform(five_p_reporting[column])

In [34]:
five_x = five_p_reporting[['S1_DEM_RATIO','S2_DEM_RATIO', 'S3_DEM_RATIO', 'S1_REP_RATIO', 'S2_REP_RATIO', 'S3_REP_RATIO', 'TIGHT_RACE']]
five_y = five_p_reporting.WINNER

In [37]:
five_p_tight_only = five_p_reporting[five_p_reporting.TIGHT_RACE]
five_p_tight_X = five_p_tight_only[['S1_DEM_RATIO','S2_DEM_RATIO', 'S3_DEM_RATIO', 'S1_REP_RATIO', 'S2_REP_RATIO', 'S3_REP_RATIO', 'TIGHT_RACE']]
five_p_tight_y = five_p_tight_only.WINNER

In [41]:
scores = []
score_tight = []
tight_from_test = []
five_p_scores = []
five_p_tight = []
for train_index, test_index in skf.split(X_kf, y_kf):
    X_test = X_.iloc[test_index]
    y_test = y_.iloc[test_index]
    
    oversampled_df = oversample(df)
    X = oversampled_df[['S1_DEM_RATIO','S2_DEM_RATIO', 'S3_DEM_RATIO', 'S1_REP_RATIO', 'S2_REP_RATIO', 'S3_REP_RATIO', 'TIGHT_RACE']]
    y = oversampled_df.WINNER
    
    # print("TRAIN:", train_index, "TEST:", test_index)
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    
    rf = RandomForestClassifier(max_depth=7, n_estimators=100)
    rf.fit(X_train, y_train)
    scores.append(rf.score(X_test, y_test))
    score_tight.append(rf.score(tight_X, tight_y))
    tight_from_test.append(rf.score(X_test[X_test.TIGHT_RACE], y_test[X_test.TIGHT_RACE]))
    five_p_scores.append(rf.score(five_x, five_y))
    five_p_tight.append(rf.score(five_p_tight_X, five_p_tight_y))
    
print(np.mean(scores), np.mean(score_tight), np.mean(tight_from_test), np.mean(five_p_scores), np.mean(five_p_tight))

0.9284848484848485 0.9227272727272726 0.8 0.9138441725692209 0.8246575342465752


# Boosted Trees

In [22]:
from sklearn.ensemble import GradientBoostingClassifier

In [23]:
scores = []
for train_index, test_index in skf.split(X_kf, y_kf):
    X_test = X_.iloc[test_index]
    y_test = y_.iloc[test_index]
    
    oversampled_df = oversample(df)
    X = oversampled_df[['S1_DEM_RATIO','S2_DEM_RATIO', 'S3_DEM_RATIO', 'S1_REP_RATIO', 'S2_REP_RATIO', 'S3_REP_RATIO', 'TIGHT_RACE']]
    y = oversampled_df.WINNER
    
    # print("TRAIN:", train_index, "TEST:", test_index)
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    
    gbt = GradientBoostingClassifier(n_estimators=100, max_depth=2)
    gbt.fit(X_train, y_train)
    scores.append(gbt.score(X_test, y_test))
    
print(scores); print(np.mean(scores))

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8888888888888888, 1.0, 0.8888888888888888, 0.8888888888888888]
0.9666666666666668


test the accuracy on the tight races

In [26]:
scores = []
score_tight = []
tight_from_test = []
for train_index, test_index in skf.split(X_kf, y_kf):
    X_test = X_.iloc[test_index]
    y_test = y_.iloc[test_index]
    
    oversampled_df = oversample(df)
    X = oversampled_df[['S1_DEM_RATIO','S2_DEM_RATIO', 'S3_DEM_RATIO', 'S1_REP_RATIO', 'S2_REP_RATIO', 'S3_REP_RATIO', 'TIGHT_RACE']]
    y = oversampled_df.WINNER
    
    # print("TRAIN:", train_index, "TEST:", test_index)
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    
    gbt = GradientBoostingClassifier(n_estimators=100)
    gbt.fit(X_train, y_train)
    scores.append(gbt.score(X_test, y_test))
    score_tight.append(gbt.score(tight_X, tight_y))
    tight_from_test.append(gbt.score(X_test[X_test.TIGHT_RACE], y_test[X_test.TIGHT_RACE]))
    
print(np.mean(scores), np.mean(score_tight), np.mean(tight_from_test))

0.9282828282828284 0.9090909090909092 0.8333333333333333
