In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler 
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import roc_auc_score

Read the Senate analytical dataset and randomize the rows

In [2]:
# Read 
senate = pd.read_csv("../senate_analytical.csv")

# Randomize the data
senate = shuffle(senate)

senate.columns

Index(['Cand_Id', 'Cand_Incumbent_Challenger_Open_Seat', 'Cand_Name',
       'Cand_Office', 'Cand_State', 'GE WINNER INDICATOR',
       'Individual_Contribution', 'Other_Committee_Contribution',
       'Party_Committee_Contribution', 'Total_Disbursement', 'Total_Receipt',
       'year', 'top_individual_contribution', 'top_total_disbursement',
       'top_other_comm_contribution', 'top_party_comm_contribution',
       'in_state', 'incumbent', 'open', 'Cand_Party_Affiliation_DEM',
       'Cand_Party_Affiliation_OTHER', 'Cand_Party_Affiliation_REP',
       'Cand_Office_St_AK', 'Cand_Office_St_AL', 'Cand_Office_St_AR',
       'Cand_Office_St_AZ', 'Cand_Office_St_CA', 'Cand_Office_St_CO',
       'Cand_Office_St_CT', 'Cand_Office_St_DE', 'Cand_Office_St_FL',
       'Cand_Office_St_GA', 'Cand_Office_St_HI', 'Cand_Office_St_IA',
       'Cand_Office_St_ID', 'Cand_Office_St_IL', 'Cand_Office_St_IN',
       'Cand_Office_St_KS', 'Cand_Office_St_KY', 'Cand_Office_St_LA',
       'Cand_Office_St_MA',

In [3]:
y = senate["GE WINNER INDICATOR"]
X = senate.drop(columns=["Cand_Id","Cand_Incumbent_Challenger_Open_Seat","Cand_Name","Cand_Office","Cand_State","GE WINNER INDICATOR","Individual_Contribution","Other_Committee_Contribution","Party_Committee_Contribution","Total_Disbursement","Total_Receipt","year"])

print(y.value_counts())
print(X.shape)
# y.head()

0    2073
1     169
Name: GE WINNER INDICATOR, dtype: int64
(2242, 60)


Read the Test dataset (2018 Senate data) !

In [4]:
senate_test = pd.read_csv("../test_analytical.csv")

# print(senate_test["Cand_Office_St"].value_counts())

y_test = senate_test["GE WINNER INDICATOR"]

test_states = senate_test["Cand_Office_St"]
X_test = senate_test.drop(columns=["Cand_Office_St","Cand_Office_Dist","Cand_Id","Cand_Incumbent_Challenger_Open_Seat","Cand_Name","Cand_Office","Cand_State","GE WINNER INDICATOR","Individual_Contribution","Other_Committee_Contribution","Party_Committee_Contribution","Total_Disbursement","Total_Receipt"])


print(X_test.shape)
X_test.columns

# y_test = y_test.astype(int)
X_test.head()

(483, 60)


Unnamed: 0,top_individual_contribution,top_total_disbursement,top_other_comm_contribution,top_party_comm_contribution,in_state,incumbent,open,Cand_Party_Affiliation_DEM,Cand_Party_Affiliation_OTHER,Cand_Party_Affiliation_REP,...,Cand_Office_St_ID,Cand_Office_St_NH,Cand_Office_St_AR,Cand_Office_St_OK,Cand_Office_St_KS,Cand_Office_St_LA,Cand_Office_St_IL,Cand_Office_St_AL,Cand_Office_St_SD,Cand_Office_St_SC
0,0,0,0,0,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


### Oversampling

In [5]:
sampling_over = RandomOverSampler(random_state=44)
X_train, y_train = sampling_over.fit_sample(X.values, y.values)

print(pd.Series(y_train).value_counts())

1    2073
0    2073
dtype: int64


In [6]:
clf = RandomForestClassifier(n_estimators=200)
clf.fit(X_train,y_train)

X_test_os = X_test.copy()
y_prob = clf.predict_proba(X_test_os)[:,1]

print("Score: " + str(clf.score(X_test_os,y_test)))

Score: 0.968944099378882


In [7]:
X_test_os["State"] = test_states
X_test_os["y_prob"] = y_prob

X_test_os["winner"] = 0
X_test_os.loc[X_test_os.groupby(['State'])["y_prob"].idxmax(),"winner"] = 1

print("AUC: " + str(roc_auc_score(y_test, y_prob)))
precision_recall_fscore_support(y_test, X_test_os["winner"])

AUC: 0.9848148148148148


(array([0.98663697, 0.79411765]),
 array([0.98444444, 0.81818182]),
 array([0.98553949, 0.80597015]),
 array([450,  33], dtype=int64))

### SMOTE

In [8]:
sampling_smote = SMOTE(random_state=44)
X_train, y_train = sampling_smote.fit_sample(X.values, y.values)

print(pd.Series(y_train).value_counts())

1    2073
0    2073
dtype: int64


In [9]:
clf = RandomForestClassifier(n_estimators=200)
clf.fit(X_train,y_train)

X_test_smote = X_test.copy()

y_prob = clf.predict_proba(X_test_smote)[:,1]

print("Score: " + str(clf.score(X_test_smote,y_test)))

Score: 0.9730848861283644


In [10]:
X_test_smote["State"] = test_states
X_test_smote["y_prob"] = y_prob

X_test_smote["winner"] = 0
X_test_smote.loc[X_test_smote.groupby(['State'])["y_prob"].idxmax(),"winner"] = 1

print("AUC: " + str(roc_auc_score(y_test, y_prob)))
precision_recall_fscore_support(y_test, X_test_smote["winner"])

AUC: 0.9848148148148148


(array([0.98886414, 0.82352941]),
 array([0.98666667, 0.84848485]),
 array([0.98776418, 0.8358209 ]),
 array([450,  33], dtype=int64))

### Undersampling

In [11]:
sampling_under = RandomUnderSampler(random_state=44)
X_train, y_train = sampling_under.fit_sample(X.values, y.values)

print(pd.Series(y_train).value_counts())

1    169
0    169
dtype: int64


In [12]:
clf = RandomForestClassifier(n_estimators=200)
clf.fit(X_train,y_train)

X_test_under = X_test.copy()

y_prob = clf.predict_proba(X_test_under)[:,1]
print("Score: " + str(clf.score(X_test_under,y_test)))

Score: 0.9503105590062112


In [13]:
X_test_under["State"] = test_states
X_test_under["y_prob"] = y_prob

X_test_under["winner"] = 0
X_test_under.loc[X_test_under.groupby(['State'])["y_prob"].idxmax(),"winner"] = 1

print("AUC: " + str(roc_auc_score(y_test, y_prob)))
precision_recall_fscore_support(y_test, X_test_under["winner"])

AUC: 0.9871043771043772


(array([0.98886414, 0.82352941]),
 array([0.98666667, 0.84848485]),
 array([0.98776418, 0.8358209 ]),
 array([450,  33], dtype=int64))

### None

In [14]:
clf = RandomForestClassifier(n_estimators=200)
clf.fit(X.values,y.values)

X_test_none = X_test.copy()

y_prob = clf.predict_proba(X_test_none)[:,1]

print("Score: " + str(clf.score(X_test_none,y_test)))

Score: 0.9730848861283644


In [15]:
X_test_none["State"] = test_states
X_test_none["y_prob"] = y_prob

X_test_none["winner"] = 0
X_test_none.loc[X_test_none.groupby(['State'])["y_prob"].idxmax(),"winner"] = 1

print("AUC: " + str(roc_auc_score(y_test, y_prob)))
precision_recall_fscore_support(y_test, X_test_none["winner"])

AUC: 0.9852525252525253


(array([0.98886414, 0.82352941]),
 array([0.98666667, 0.84848485]),
 array([0.98776418, 0.8358209 ]),
 array([450,  33], dtype=int64))

In [16]:
clf = RandomForestClassifier(n_estimators=200)
clf.fit(X_train,y_train)
X_test_1 = X_test.copy()
clf.score(X_test_1,y_test)

0.9668737060041408

### Let's plot graph for each sampling method