In [1]:
import warnings
# Install a blanket "ignore" filter: no warning category or module is excluded,
# so every warning raised after this cell runs is suppressed.
# NOTE(review): this also hides deprecation and convergence warnings emitted by
# the libraries used below; consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the dataset and balance the two TIGHT_RACE groups by oversampling the
# contested rows (with replacement) up to the number of non-contested rows.
SEED = 42  # fixed so the bootstrap draw is identical on every Restart & Run All

df_raw = pd.read_csv("../data/final.csv")

# Split the rows on the TIGHT_RACE flag; each half gets a fresh 0-based index.
df_contested = df_raw[df_raw.TIGHT_RACE == True].reset_index(drop=True)
df_not_contested = df_raw[df_raw.TIGHT_RACE == False].reset_index(drop=True)

# Positional indices into df_contested, drawn with replacement from a seeded
# generator so the resampled frame is reproducible.
rng = np.random.RandomState(SEED)
sample = rng.choice(df_contested.shape[0], size=df_not_contested.shape[0], replace=True)
df_contested_bootstrapped = df_contested.iloc[sample].reset_index(drop=True)

# ignore_index=True gives the combined frame a unique 0..n-1 index; without it
# both halves keep their own 0-based labels and every label appears twice.
frames = [df_not_contested, df_contested_bootstrapped]
df_new = pd.concat(frames, ignore_index=True)
df = df_new

# NOTE(review): the duplicated contested rows are created before any
# cross-validation split, so copies of the same row can end up in both the
# training and the test fold of later cells. Confirm whether the oversampling
# should instead be applied to the training fold only.

In [4]:
from sklearn.preprocessing import LabelEncoder

In [5]:
# Replace every object-dtype column of df with integer codes, in place.
# A fresh LabelEncoder is fitted per column, so codes are independent between
# columns; the fitted encoders are not kept (only the last one stays in `le`).
for column in df.columns:
    # Test the dtype explicitly. Comparing against `type(object)` would compare
    # against the class `type`, not `object`, and only matches object columns
    # indirectly through NumPy's dtype coercion.
    if pd.api.types.is_object_dtype(df[column]):
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])

In [6]:
from sklearn.model_selection import StratifiedKFold

In [7]:
# Frame/labels used only to build the folds: splits are stratified on TIGHT_RACE.
y_kf = df["TIGHT_RACE"]
X_kf = df.drop(columns=["TIGHT_RACE"])

# Frame/labels used for modelling: predict WINNER from every other column
# (TIGHT_RACE stays in as a feature).
y = df["WINNER"]
X = df.drop(columns=["WINNER"])

# Ten stratified folds, taken in row order (no shuffling).
skf = StratifiedKFold(n_splits=10)

# Decision Tree

In [8]:
from sklearn.tree import DecisionTreeClassifier

In [9]:
# Cross-validate a depth-limited decision tree. Fold indices come from skf
# (stratified on y_kf) and are applied positionally to the modelling data X / y.
scores = []
for train_index, test_index in skf.split(X_kf, y_kf):
    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]

    # fit() returns the estimator itself, so construction and training chain.
    dt = DecisionTreeClassifier(max_depth=7).fit(X_train, y_train)
    fold_accuracy = dt.score(X_test, y_test)
    scores.append(fold_accuracy)

    # Size and depth of the fitted tree for this fold.
    print(dt.tree_.node_count, dt.tree_.max_depth)

# Per-fold accuracies, then their mean.
print(scores)
print(np.mean(scores))

29 7
31 7
25 6
25 6
27 6
29 7
31 6
27 7
27 7
31 7
[0.875, 0.8125, 0.75, 0.8125, 0.9285714285714286, 0.9285714285714286, 0.9285714285714286, 0.7857142857142857, 0.7857142857142857, 0.7142857142857143]
0.8321428571428571


### Use fewer features

In [10]:
# Restrict the inputs to the six *_RATIO columns. The list is defined once so
# the fold-building frame and the modelling frame cannot drift apart.
ratio_features = ['S1_DEM_RATIO', 'S2_DEM_RATIO', 'S3_DEM_RATIO',
                  'S1_REP_RATIO', 'S2_REP_RATIO', 'S3_REP_RATIO']

# Frame/labels used only to build the folds (stratified on TIGHT_RACE).
X_kf = df[ratio_features]
y_kf = df.TIGHT_RACE

# Modelling data: the ratio columns plus the TIGHT_RACE flag, predicting WINNER.
X = df[ratio_features + ['TIGHT_RACE']]
y = df.WINNER

# random_state is fixed on purpose: with shuffle=True and no seed, every
# skf.split() call draws a different partition, so the models in the following
# cells would be scored on different folds and the numbers would change on
# each re-run.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

In [11]:
# Cross-validate a shallower decision tree (max_depth=5) on the current X / y,
# using the folds produced by skf over X_kf / y_kf.
scores = []
for train_index, test_index in skf.split(X_kf, y_kf):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    dt = DecisionTreeClassifier(max_depth=5).fit(X_train, y_train)
    fold_accuracy = dt.score(X_test, y_test)
    scores.append(fold_accuracy)

# Per-fold accuracies, then their mean.
print(scores)
print(np.mean(scores))

[0.875, 1.0, 0.8125, 0.875, 0.9285714285714286, 0.8571428571428571, 0.8571428571428571, 1.0, 0.8571428571428571, 1.0]
0.90625


# Random Forest

In [12]:
from sklearn.ensemble import RandomForestClassifier

In [13]:
# Cross-validate a random forest (100 trees, depth <= 7) on the current X / y,
# using the folds produced by skf over X_kf / y_kf.
scores = []
for train_index, test_index in skf.split(X_kf, y_kf):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # random_state pins the forest's internal randomness so the reported
    # accuracies are reproducible for a given set of folds.
    rf = RandomForestClassifier(max_depth=7, n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)
    scores.append(rf.score(X_test, y_test))

# Per-fold accuracies, then their mean.
print(scores); print(np.mean(scores))

[0.9375, 0.9375, 1.0, 0.875, 0.9285714285714286, 0.7857142857142857, 0.9285714285714286, 0.9285714285714286, 1.0, 0.8571428571428571]
0.9178571428571429


# Boosted Trees

In [14]:
from sklearn.ensemble import GradientBoostingClassifier

In [15]:
# Cross-validate gradient-boosted trees (100 estimators, depth-2 base learners)
# on the current X / y, using the folds produced by skf over X_kf / y_kf.
scores = []
for train_index, test_index in skf.split(X_kf, y_kf):
    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]

    gbt = GradientBoostingClassifier(n_estimators=100, max_depth=2)
    gbt.fit(X_train, y_train)

    fold_accuracy = gbt.score(X_test, y_test)
    scores.append(fold_accuracy)

# Per-fold accuracies, then their mean.
print(scores)
print(np.mean(scores))

[0.875, 0.8125, 1.0, 0.9375, 1.0, 0.9285714285714286, 0.9285714285714286, 0.9285714285714286, 0.7857142857142857, 0.9285714285714286]
0.9125


Test the accuracy on the tight races.

In [16]:
# Keep only the rows whose TIGHT_RACE flag is set (the column is used directly
# as a boolean mask).
tight_only = df.loc[df["TIGHT_RACE"]]

In [17]:
# Features and target for the tight-race subset, using the same column layout
# as the modelling frame X (six ratio columns followed by TIGHT_RACE).
tight_feature_columns = [
    'S1_DEM_RATIO', 'S2_DEM_RATIO', 'S3_DEM_RATIO',
    'S1_REP_RATIO', 'S2_REP_RATIO', 'S3_REP_RATIO',
    'TIGHT_RACE',
]
tight_X = tight_only[tight_feature_columns]
tight_y = tight_only["WINNER"]

In [36]:
# Cross-validate gradient-boosted trees and record three accuracies per fold.
scores = []            # accuracy on the whole held-out fold
score_tight = []       # accuracy on tight_X / tight_y
tight_from_test = []   # accuracy on the tight races of the held-out fold only
for train_index, test_index in skf.split(X_kf, y_kf):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    gbt = GradientBoostingClassifier(n_estimators=100, max_depth=2)
    gbt.fit(X_train, y_train)

    scores.append(gbt.score(X_test, y_test))

    # NOTE(review): tight_X is built from all tight rows of df, so it overlaps
    # the rows this model was just trained on; this is not a held-out score.
    score_tight.append(gbt.score(tight_X, tight_y))

    # Boolean mask selecting the tight races inside the held-out fold.
    is_tight = X_test.TIGHT_RACE
    tight_from_test.append(gbt.score(X_test[is_tight], y_test[is_tight]))

# Mean of each of the three accuracy lists, in the order collected above.
print(np.mean(scores), np.mean(score_tight), np.mean(tight_from_test))

0.9392857142857143 1.0 1.0
