In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the dataset, then balance the two TIGHT_RACE groups by oversampling
# the contested rows (with replacement) up to the non-contested row count.
RANDOM_SEED = 42  # fixed so the bootstrap draws the same rows on every run

df = pd.read_csv("../data/final.csv")

df_contested = df[df.TIGHT_RACE == True].reset_index(drop=True)
df_not_contested = df[df.TIGHT_RACE == False].reset_index(drop=True)

# A dedicated seeded generator keeps the resample reproducible under
# Restart & Run All and independent of any other use of np.random.
rng = np.random.default_rng(RANDOM_SEED)
sample = rng.choice(df_contested.shape[0], size=df_not_contested.shape[0], replace=True)
df_contested_bootstrapped = df_contested.iloc[sample].reset_index(drop=True)

# NOTE(review): the oversampling happens before the cross-validation split
# further down, so copies of the same contested row can end up in both the
# training and the test fold of a split. Confirm whether that is acceptable
# or whether resampling should be done inside each training fold instead.
frames = [df_not_contested, df_contested_bootstrapped]
# ignore_index=True gives the combined frame a clean 0..n-1 index rather than
# two overlapping runs of labels.
df_new = pd.concat(frames, ignore_index=True)
df = df_new

In [3]:
from sklearn.preprocessing import LabelEncoder

In [4]:
# Replace every text-valued column of `df` with integer codes so the
# estimators below receive purely numeric input.
for column in df.columns:
    column_values = df[column]
    # `type(object)` evaluates to `type`, not `object`; comparing a dtype to
    # it only matched object columns through NumPy's coercion of arbitrary
    # classes to the object dtype, and it never matches pandas string
    # extension dtypes. The explicit dtype predicates state the intent.
    is_text = (
        pd.api.types.is_object_dtype(column_values)
        or pd.api.types.is_string_dtype(column_values)
    )
    if is_text:
        le = LabelEncoder()  # a fresh encoder per column
        df[column] = le.fit_transform(column_values)

In [5]:
from sklearn.model_selection import StratifiedKFold

In [6]:
# Inputs used only to build the folds: splits are stratified on TIGHT_RACE.
y_kf = df['TIGHT_RACE']
X_kf = df.drop('TIGHT_RACE', axis=1)

# Inputs used to fit and score the model: WINNER is the target and every
# other column of `df` is a feature.
y = df['WINNER']
X = df.drop('WINNER', axis=1)

# Ten stratified folds, taken in row order (no shuffling).
skf = StratifiedKFold(n_splits=10)

# Decision Tree

In [7]:
from sklearn.tree import DecisionTreeClassifier

In [20]:
# Cross-validate a depth-limited decision tree on WINNER.
# The folds come from `skf`, stratified on y_kf; the positional indices it
# yields are applied to X / y, which hold the model features and target.
scores = []
for train_index, test_index in skf.split(X_kf, y_kf):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # random_state is pinned because the tree breaks ties between equally
    # good splits at random; without it the fitted tree, and therefore the
    # fold score, can differ between runs on identical data.
    dt = DecisionTreeClassifier(max_depth=7, random_state=0)
    dt.fit(X_train, y_train)
    scores.append(dt.score(X_test, y_test))  # mean accuracy on the held-out fold
    print(dt.tree_.node_count, dt.tree_.max_depth)  # size of the fitted tree

print(scores)
print(np.mean(scores))

27 7
29 6
25 6
25 6
29 7
31 7
29 6
31 7
27 7
29 7
[0.8125, 0.8125, 0.75, 0.625, 0.875, 0.9285714285714286, 1.0, 0.8571428571428571, 0.8571428571428571, 0.8571428571428571]
0.8375


### Use fewer features

In [27]:
# Restrict the model to the six S*_DEM_RATIO / S*_REP_RATIO columns.
X_kf = df[['S1_DEM_RATIO','S2_DEM_RATIO', 'S3_DEM_RATIO', 'S1_REP_RATIO', 'S2_REP_RATIO', 'S3_REP_RATIO']]
y_kf = df.TIGHT_RACE  # folds are still stratified on TIGHT_RACE

# The same feature frame is used for building the folds and for fitting.
X = X_kf
y = df.WINNER

# shuffle=True without a random_state reshuffles the rows differently on every
# run, so fold membership and the reported scores would not be reproducible.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

In [29]:
# Cross-validate a shallower decision tree on the current X / y, printing
# each fold's fitted depth and feature importances.
scores = []
for train_index, test_index in skf.split(X_kf, y_kf):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # random_state is pinned because the tree breaks ties between equally
    # good splits at random; without it the importances printed below can
    # change between runs on identical data.
    dt = DecisionTreeClassifier(max_depth=5, random_state=0)
    dt.fit(X_train, y_train)
    scores.append(dt.score(X_test, y_test))  # mean accuracy on the held-out fold
    print(dt.tree_.max_depth)
    print(dt.feature_importances_)  # one value per column of X, in column order

print(scores)
print(np.mean(scores))

5
[0.05432249 0.44245417 0.13356381 0.         0.34362839 0.02603114]
5
[0.1735043  0.34635139 0.14721226 0.25712549 0.07580656 0.        ]
5
[0.         0.67039847 0.03073109 0.12688965 0.17198079 0.        ]
5
[0.         0.41195506 0.12180416 0.15969195 0.27685129 0.02969754]
5
[0.13099654 0.43573383 0.0802899  0.0575851  0.26903405 0.02636057]
5
[0.07390128 0.37963235 0.23408029 0.02956907 0.25498729 0.02782971]
5
[0.01769009 0.38582412 0.13151257 0.027253   0.39549226 0.04222795]
5
[0.07945871 0.37331878 0.08692582 0.28446406 0.14642604 0.02940659]
5
[0.02451511 0.44881646 0.02846915 0.13934902 0.33071118 0.02813908]
5
[0.         0.39815309 0.14324745 0.1106718  0.25809174 0.08983591]
[0.875, 0.8125, 0.875, 0.875, 0.875, 0.8571428571428571, 0.8571428571428571, 0.9285714285714286, 0.9285714285714286, 0.8571428571428571]
0.8741071428571429
