In [18]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

In [10]:
# Read 
senate = pd.read_csv("senate_analytical.csv")

senate.head()

Unnamed: 0,Cand_Id,Cand_Incumbent_Challenger_Open_Seat,Cand_Name,Cand_Office,Cand_State,GE WINNER INDICATOR,Individual_Contribution,Other_Committee_Contribution,Party_Committee_Contribution,Total_Disbursement,...,Cand_Office_St_SD,Cand_Office_St_TN,Cand_Office_St_TX,Cand_Office_St_UT,Cand_Office_St_VA,Cand_Office_St_VT,Cand_Office_St_WA,Cand_Office_St_WI,Cand_Office_St_WV,Cand_Office_St_WY
0,S6AK00078,CHALLENGER,"CUDDY, DAVID W",S,AK,0,31261.0,260.0,0.0,862663.0,...,0,0,0,0,0,0,0,0,0,0
1,S8AK00074,CHALLENGER,"CALDERO, ROCKY CHRISTOPHER",S,AK,0,31622.0,0.0,0.0,25891.0,...,0,0,0,0,0,0,0,0,0,0
2,S8AK00082,CHALLENGER,"METCALFE, RAY",S,AK,0,30267.0,0.0,0.0,30562.0,...,0,0,0,0,0,0,0,0,0,0
3,S8AK00108,CHALLENGER,"SIKMA, RODERIC H 'RICK'",S,AK,0,4746.0,0.0,0.0,12203.0,...,0,0,0,0,0,0,0,0,0,0
4,S8AK00124,CHALLENGER,"VICKERS, RAYMOND B VIC",S,AK,0,10215.0,0.0,0.0,1010617.0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
y = senate["GE WINNER INDICATOR"]
X = senate.drop(columns=["Cand_Id","Cand_Incumbent_Challenger_Open_Seat","Cand_Name","Cand_Office","Cand_State","GE WINNER INDICATOR","Individual_Contribution","Other_Committee_Contribution","Party_Committee_Contribution","Total_Disbursement","Total_Receipt","year"])

print(y.value_counts())
print(X.shape)
# y.head()

0    2073
1     169
Name: GE WINNER INDICATOR, dtype: int64
(2242, 60)


In [12]:
senate_test = pd.read_csv("test_analytical.csv")

# print(senate_test["Cand_Office_St"].value_counts())

y_test = senate_test["GE WINNER INDICATOR"]

test_states = senate_test["Cand_Office_St"]
X_test = senate_test.drop(columns=["Cand_Office_St","Cand_Office_Dist","Cand_Id","Cand_Incumbent_Challenger_Open_Seat","Cand_Name","Cand_Office","Cand_State","GE WINNER INDICATOR","Individual_Contribution","Other_Committee_Contribution","Party_Committee_Contribution","Total_Disbursement","Total_Receipt"])


print(X_test.shape)
X_test.columns

# y_test = y_test.astype(int)
X_test.head()

(483, 60)


Unnamed: 0,top_individual_contribution,top_total_disbursement,top_other_comm_contribution,top_party_comm_contribution,in_state,incumbent,open,Cand_Party_Affiliation_DEM,Cand_Party_Affiliation_OTHER,Cand_Party_Affiliation_REP,...,Cand_Office_St_ID,Cand_Office_St_NH,Cand_Office_St_AR,Cand_Office_St_OK,Cand_Office_St_KS,Cand_Office_St_LA,Cand_Office_St_IL,Cand_Office_St_AL,Cand_Office_St_SD,Cand_Office_St_SC
0,0,0,0,0,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


### Random Forest

In [13]:
rf = RandomForestClassifier(n_estimators=300)
rf.fit(X,y)
X_test_rf = X_test.copy()
y_prob_rf = rf.predict_proba(X_test_rf)[:,1]

print("Score: " + str(rf.score(X_test_rf,y_test)))

X_test_rf["State"] = test_states
X_test_rf["y_prob"] = y_prob_rf

X_test_rf["winner"] = 0
X_test_rf.loc[X_test_rf.groupby(['State'])["y_prob"].idxmax(),"winner"] = 1

print("AUC: " + str(roc_auc_score(y_test, y_prob_rf)))
precision_recall_fscore_support(y_test, X_test_rf["winner"])

Score: 0.9730848861283644
AUC: 0.9859259259259259


(array([0.98886414, 0.82352941]),
 array([0.98666667, 0.84848485]),
 array([0.98776418, 0.8358209 ]),
 array([450,  33], dtype=int64))

In [19]:
X_test_rf["actual"] = y_test

Dem_wins = X_test_rf[(X_test_rf["winner"]==1) & (X_test_rf['Cand_Party_Affiliation_DEM']==1)].shape[0]
print("Pred DEM: " + str(Dem_wins))

Dem_tp = X_test_rf[(X_test_rf["winner"]==1) & (X_test_rf["actual"] == 1) & (X_test_rf['Cand_Party_Affiliation_DEM']==1)].shape[0]
print("True positives for DEM: " + str(Dem_tp))


Rep_wins = X_test_rf[(X_test_rf["winner"]==1) & (X_test_rf['Cand_Party_Affiliation_REP']==1)].shape[0]
print("Pred REP: " + str(Rep_wins))

Rep_tp = X_test_rf[(X_test_rf["winner"]==1) & (X_test_rf["actual"] == 1) & (X_test_rf['Cand_Party_Affiliation_REP']==1)].shape[0]
print("True positives for REP: " + str(Rep_tp))

#x2[(x2["pred"]==1) & (x2['Cand_Party_Affiliation_REP']==1)].shape
#x2[(x2["pred"]==1) & (x2['Cand_Party_Affiliation_OTHER']==1)].shape
tn, fp, fn, tp = confusion_matrix(y_test, X_test_rf["winner"]).ravel()
print(tn, fp, fn, tp)

Pred DEM: 24
True positives for DEM: 20
Pred REP: 8
True positives for REP: 6
444 6 5 28


### KNN

In [6]:
neigh = KNeighborsClassifier(n_neighbors=5)
neigh.fit(X, y)
X_test_knn = X_test.copy()
y_prob_knn = neigh.predict_proba(X_test_knn)[:,1]

print("Score: " + str(neigh.score(X_test_knn,y_test)))

X_test_knn["State"] = test_states
X_test_knn["y_prob"] = y_prob_knn

X_test_knn["winner"] = 0
X_test_knn.loc[X_test_knn.groupby(['State'])["y_prob"].idxmax(),"winner"] = 1

print("AUC: " + str(roc_auc_score(y_test, y_prob_knn)))
precision_recall_fscore_support(y_test, X_test_knn["winner"])

Score: 0.9751552795031055
AUC: 0.928956228956229


(array([0.98663697, 0.79411765]),
 array([0.98444444, 0.81818182]),
 array([0.98553949, 0.80597015]),
 array([450,  33], dtype=int64))

### SVMs

In [7]:
c_values = [0.01,0.03,0.1,0.3,1,3,10,30,100,300]
training_acc = []
testing_acc = []

opt_test_c1 =  0.01
opt_test_c1 =  0.01

opt_test_precision = 0
opt_test_recall = 0

for c in c_values:
    svm_model_op = SVC(C=c,probability=True)
    svm_model_op.fit(X, y) 

    X_test_svm = X_test.copy()
    y_prob = svm_model_op.predict_proba(X_test_svm)[:,1]
    
    X_test_svm["State"] = test_states
    X_test_svm["y_prob"] = y_prob

    X_test_svm["winner"] = 0
    X_test_svm.loc[X_test_svm.groupby(['State'])["y_prob"].idxmax(),"winner"] = 1

    test_scores = precision_recall_fscore_support(y_test, X_test_svm["winner"])
        
    testing_acc.append(test_scores)
    
    if test_scores[0][1] > opt_test_precision:
        opt_test_precision = test_scores[0][1]
        opt_test_c1 = c

    if test_scores[1][1] > opt_test_recall:
        opt_test_recall = test_scores[1][1]
        opt_test_c2 = c

# print("Optimal Precision: "+ str(opt_test_precision) + "\t C: " + str(opt_test_c1))
# print("Optimal Recall: "+ str(opt_test_recall) + "\t C: " + str(opt_test_c2))
svm_model_op = SVC(C=opt_test_c1,probability=True)

svm_model_op.fit(X, y)
X_test_svm = X_test.copy()
y_test_svm = svm_model_op.predict_proba(X_test_svm)[:,1]

print("Score: " + str(svm_model_op.score(X_test_svm,y_test)))

X_test_svm["State"] = test_states
X_test_svm["y_prob"] = y_test_svm

X_test_svm["winner"] = 0
X_test_svm.loc[X_test_svm.groupby(['State'])["y_prob"].idxmax(),"winner"] = 1

print("AUC: " + str(roc_auc_score(y_test, y_test_svm)))
precision_recall_fscore_support(y_test, X_test_svm["winner"])

Score: 0.9710144927536232
AUC: 0.988080808080808


(array([0.98663697, 0.79411765]),
 array([0.98444444, 0.81818182]),
 array([0.98553949, 0.80597015]),
 array([450,  33], dtype=int64))

### Logistic Regression

In [8]:
logit = LogisticRegression(C=1)
logit.fit(X, y)
X_test_log = X_test.copy()
y_test_log = logit.predict_proba(X_test_log)[:,1]

print("Score: " + str(logit.score(X_test_log,y_test)))

X_test_log["State"] = test_states
X_test_log["y_prob"] = y_test_log

X_test_log["winner"] = 0
X_test_log.loc[X_test_log.groupby(['State'])["y_prob"].idxmax(),"winner"] = 1

print("AUC: " + str(roc_auc_score(y_test, y_test_log)))
precision_recall_fscore_support(y_test, X_test_log["winner"])

Score: 0.9710144927536232
AUC: 0.9857912457912459


(array([0.98663697, 0.79411765]),
 array([0.98444444, 0.81818182]),
 array([0.98553949, 0.80597015]),
 array([450,  33], dtype=int64))