In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

import warnings
warnings.filterwarnings('ignore')

In [2]:
def convert_value(value):
    """Translate a 'Y'/'N' flag into 1.0/0.0.

    Returns None for any other input, including missing values.
    """
    return 1.0 if value == 'Y' else 0.0 if value == 'N' else None

# Read the CSV file from the Resources folder into a Pandas DataFrame
candidate_summary_df = pd.read_csv(
    Path("./Resources/CandidateSummaryAction1.csv")
)

# Select specific columns. The explicit .copy() makes this an independent frame,
# so the column assignments below never write into a slice of the raw data.
col_list = ['can_id','can_off_sta','can_inc_cha_ope_sea','can_off_dis','net_con','winner']
candidate_summary_df = candidate_summary_df[col_list].copy()

# Convert net contributions to float by stripping '$' and ',' and turning an
# opening parenthesis into a minus sign (the closing one is dropped).
# regex=False forces literal matching: '$', '(' and ')' are regex metacharacters,
# and the default of `regex` differs between pandas versions.
candidate_summary_df["net_con"] = (
    candidate_summary_df["net_con"]
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.replace('(', '-', regex=False)
    .str.replace(')', '', regex=False)
    .astype('float32')
)

# Convert the winner column to 1.0 / 0.0. Anything other than 'Y' -- including a
# missing value -- becomes 0.0, so this column never causes a row to be dropped below.
candidate_summary_df['winner'] = candidate_summary_df['winner'].eq('Y').astype(float)

# Drop rows that still have a missing value in any of the selected columns
candidate_summary_df = candidate_summary_df.dropna()

# Review the DataFrame
candidate_summary_df.head(10000)

Unnamed: 0,can_id,can_off_sta,can_inc_cha_ope_sea,can_off_dis,net_con,winner
0,H2GA12121,GA,INCUMBENT,12.0,1.074950e+06,1.0
1,H6PA02171,PA,CHALLENGER,2.0,1.406719e+06,1.0
2,H6FL04105,FL,OPEN,4.0,6.508554e+05,1.0
3,H4MT01041,MT,INCUMBENT,0.0,4.938944e+06,1.0
4,H8CA09060,CA,INCUMBENT,13.0,1.197677e+06,1.0
...,...,...,...,...,...,...
1794,H6MS01164,MS,OPEN,1.0,1.025384e+05,0.0
1795,S6CA00618,CA,OPEN,0.0,1.525000e+04,0.0
1800,H6MS01198,MS,OPEN,1.0,1.408580e+05,0.0
1803,P20003158,US,OPEN,0.0,2.502500e+04,0.0


In [3]:
# Lookup table: number of candidates running in each (state, district) race.
# Kept as a dict for any downstream cell that looks counts up by key.
num_comp_lookup = candidate_summary_df.groupby(['can_off_sta','can_off_dis'])['can_id'].count().to_dict()

# Append the number of competitors to every row. groupby().transform('count')
# broadcasts each race's candidate count back onto its rows, replacing a
# row-by-row Python lookup with a single vectorized operation.
candidate_summary_df['num_comp'] = (
    candidate_summary_df
    .groupby(['can_off_sta','can_off_dis'])['can_id']
    .transform('count')
)

# Keep contested races only (more than one candidate). Rows whose count could
# not be computed compare False and are dropped as well. The .copy() gives later
# cells an independent frame to add columns to.
candidate_summary_df = candidate_summary_df[candidate_summary_df['num_comp'] > 1].copy()

In [4]:
# Add column: fraction of spend for this district

# Lookup table of total net contributions per (state, district) race.
# Kept as a dict for any downstream cell that looks totals up by key.
total_net_con_for_district = candidate_summary_df.groupby(['can_off_sta','can_off_dis'])['net_con'].sum().to_dict()

# Add column giving this total of net contributions. transform('sum') broadcasts
# each race's total back onto its rows, so no per-row dict lookup (and no chance
# of testing membership against the wrong lookup table) is needed.
# The cast to float64 matches the dtype a row-wise lookup of Python floats produces.
candidate_summary_df['total_net_con_for_district'] = (
    candidate_summary_df
    .groupby(['can_off_sta','can_off_dis'])['net_con']
    .transform('sum')
    .astype('float64')
)

# Calculate the fraction.
# NOTE(review): a race whose contributions sum to zero yields inf/NaN here;
# confirm whether such rows can occur and should be excluded before modelling.
candidate_summary_df['fraction_net_con_for_district'] = (
    candidate_summary_df['net_con'] / candidate_summary_df['total_net_con_for_district']
)

In [5]:
# Display the frame, now including the per-race columns
# (num_comp, total_net_con_for_district, fraction_net_con_for_district)
candidate_summary_df

Unnamed: 0,can_id,can_off_sta,can_inc_cha_ope_sea,can_off_dis,net_con,winner,num_comp,total_net_con_for_district,fraction_net_con_for_district
0,H2GA12121,GA,INCUMBENT,12.0,1.074950e+06,1.0,2,1.102857e+06,0.974695
1,H6PA02171,PA,CHALLENGER,2.0,1.406719e+06,1.0,4,2.025600e+06,0.694470
2,H6FL04105,FL,OPEN,4.0,6.508554e+05,1.0,7,1.318453e+06,0.493651
3,H4MT01041,MT,INCUMBENT,0.0,4.938944e+06,1.0,2,7.256408e+06,0.680632
4,H8CA09060,CA,INCUMBENT,13.0,1.197677e+06,1.0,2,1.202198e+06,0.996239
...,...,...,...,...,...,...,...,...,...
1794,H6MS01164,MS,OPEN,1.0,1.025384e+05,0.0,13,2.719151e+06,0.037710
1795,S6CA00618,CA,OPEN,0.0,1.525000e+04,0.0,19,1.779746e+07,0.000857
1800,H6MS01198,MS,OPEN,1.0,1.408580e+05,0.0,13,2.719151e+06,0.051802
1803,P20003158,US,OPEN,0.0,2.502500e+04,0.0,63,8.042554e+09,0.000003


In [6]:
# Feature columns used by the model
regress_cols = ['num_comp','fraction_net_con_for_district']

# Keep only the label and feature columns and drop rows with a missing value.
# The result gets a real name (not `_`, which IPython reserves for the last
# cell output) and is actually used to build X and y below.
model_df = candidate_summary_df[['winner'] + regress_cols].dropna()

# Separate the data into labels and features
# Separate the y variable, the labels
y = model_df["winner"]

# Separate the X variable, the features
X = model_df[regress_cols]

# Class balance of the labels
y.value_counts()

0.0    1168
1.0     402
Name: winner, dtype: int64

In [7]:
# Dimensions of the feature matrix as (rows, columns)
X.shape

(1570, 2)

In [8]:
# Dimensions of the label vector; its length should equal the row count of X
y.shape

(1570,)

In [9]:
# Import the train_test_split function
from sklearn.model_selection import train_test_split

# Split the data using train_test_split:
# 30% of the rows are held out for testing (test_size=0.3) and
# random_state=0 fixes the shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3,random_state=0)

In [10]:
# Import the LogisticRegression class from scikit-learn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model (random_state pinned to 0)
logistic_regression_model = LogisticRegression(random_state=0)

# Fit the model using training data. As the last expression of the cell,
# the estimator returned by fit() is what gets displayed.
logistic_regression_model.fit(X_train, y_train)

LogisticRegression(random_state=0)

In [11]:
# Predict labels for both splits with the fitted model, so performance on the
# held-out test rows can be compared against performance on the training rows
testing_predictions = logistic_regression_model.predict(X_test)
training_predictions = logistic_regression_model.predict(X_train)

In [12]:
from sklearn.metrics import accuracy_score

# Display the plain accuracy score of the model on the test split.
# NOTE(review): this is accuracy_score, not balanced_accuracy_score (which is
# imported in the first cell but not called here). The label counts are uneven,
# so the two metrics can differ -- confirm which one is intended.
accuracy_score(y_test, testing_predictions)

0.9554140127388535

In [13]:
# Import the classification_report function
from sklearn.metrics import classification_report

# Save and print a classification report for the training split
# (actual training labels vs. the model's predictions on the same rows)
training_report = classification_report(y_train, training_predictions)

print(training_report)

              precision    recall  f1-score   support

         0.0       0.96      0.98      0.97       822
         1.0       0.95      0.87      0.91       277

    accuracy                           0.96      1099
   macro avg       0.95      0.93      0.94      1099
weighted avg       0.96      0.96      0.96      1099



In [14]:
# Import the confusion_matrix function
# (redundant: the notebook's first cell already imports it)
from sklearn.metrics import confusion_matrix

# Compute and print the confusion matrix for the training split
# (actual training labels vs. the model's predictions on the same rows)
training_matrix = confusion_matrix(y_train, training_predictions)
print(training_matrix)

[[809  13]
 [ 35 242]]


In [15]:
# Save and print a classification report for the held-out test split
# (actual test labels vs. the model's predictions on the same rows)
testing_report = classification_report(y_test, testing_predictions)

print(testing_report)

              precision    recall  f1-score   support

         0.0       0.97      0.97      0.97       346
         1.0       0.92      0.91      0.92       125

    accuracy                           0.96       471
   macro avg       0.94      0.94      0.94       471
weighted avg       0.96      0.96      0.96       471



In [32]:
# Collect each candidate's id together with the model's feature columns,
# discarding any row that has a missing value, ready to receive predictions
data_grp_preds = candidate_summary_df.loc[:, ['can_id', *regress_cols]].dropna()

In [34]:
# Attach the model's predicted outcome to every row, then key the frame by
# candidate id so it can be joined back onto the main data
data_grp_preds = (
    data_grp_preds
    .assign(winner_prediction=logistic_regression_model.predict(data_grp_preds[regress_cols]))
    .set_index('can_id')
)
data_grp_preds.head()

Unnamed: 0_level_0,num_comp,fraction_net_con_for_district,winner_prediction
can_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
H2GA12121,2,0.974695,1.0
H6PA02171,4,0.69447,1.0
H6FL04105,7,0.493651,1.0
H4MT01041,2,0.680632,1.0
H8CA09060,2,0.996239,1.0


In [35]:
# Join main data with predictions data on can_id.
# The main frame is re-indexed by can_id, then joined (index on index) with the
# winner_prediction column of data_grp_preds.
candidate_summary_df = (
    candidate_summary_df
    .set_index('can_id')
    .join(data_grp_preds['winner_prediction'])
)

In [36]:
# Index the data by (state, district) and sort on those keys so that the
# candidates of each race sit next to each other.
# The source is named explicitly: IPython's `_` (last cell output) depends on
# which cell happened to run last and is not reproducible on a fresh run.
candidate_summary_df_grp = (
    candidate_summary_df
    .set_index(['can_off_sta','can_off_dis'])
    .sort_values(by=['can_off_sta','can_off_dis'])
)
candidate_summary_df_grp

Unnamed: 0_level_0,Unnamed: 1_level_0,can_inc_cha_ope_sea,net_con,winner,num_comp,total_net_con_for_district,fraction_net_con_for_district,winner_prediction
can_off_sta,can_off_dis,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AK,0.0,INCUMBENT,1.051122e+06,1.0,8,5990981.00,0.175451,0.0
AK,0.0,CHALLENGER,5.471000e+03,0.0,8,5990981.00,0.000913,0.0
AK,0.0,CHALLENGER,1.354600e+04,0.0,8,5990981.00,0.002261,0.0
AK,0.0,CHALLENGER,2.725680e+05,0.0,8,5990981.00,0.045496,0.0
AK,0.0,CHALLENGER,6.633100e+05,0.0,8,5990981.00,0.110718,0.0
...,...,...,...,...,...,...,...,...
WY,0.0,OPEN,5.096600e+04,0.0,10,2421253.75,0.021049,0.0
WY,0.0,OPEN,1.938261e+05,0.0,10,2421253.75,0.080052,0.0
WY,0.0,OPEN,1.464268e+05,0.0,10,2421253.75,0.060476,0.0
WY,0.0,OPEN,2.154800e+04,0.0,10,2421253.75,0.008900,0.0


In [39]:
# Keep only the rows where the model's prediction equals the actual outcome
candidate_summary_df_grp[
    candidate_summary_df_grp['winner'].eq(candidate_summary_df_grp['winner_prediction'])
]

Unnamed: 0_level_0,Unnamed: 1_level_0,can_inc_cha_ope_sea,net_con,winner,num_comp,total_net_con_for_district,fraction_net_con_for_district,winner_prediction
can_off_sta,can_off_dis,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AK,0.0,CHALLENGER,5471.000000,0.0,8,5990981.00,0.000913,0.0
AK,0.0,CHALLENGER,13546.000000,0.0,8,5990981.00,0.002261,0.0
AK,0.0,CHALLENGER,272568.000000,0.0,8,5990981.00,0.045496,0.0
AK,0.0,CHALLENGER,663310.000000,0.0,8,5990981.00,0.110718,0.0
AK,0.0,CHALLENGER,1470.000000,0.0,8,5990981.00,0.000245,0.0
...,...,...,...,...,...,...,...,...
WY,0.0,OPEN,50966.000000,0.0,10,2421253.75,0.021049,0.0
WY,0.0,OPEN,193826.125000,0.0,10,2421253.75,0.080052,0.0
WY,0.0,OPEN,146426.828125,0.0,10,2421253.75,0.060476,0.0
WY,0.0,OPEN,21548.000000,0.0,10,2421253.75,0.008900,0.0
