In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [2]:
candidates_df = pd.read_csv('../data/candidates_clean.csv', dtype = {'district': 'str'})

In [3]:
candidates_df.head()

Unnamed: 0,candidate_name,party,incumbent,winner,perc_vote,money_raised,money_spent,state_name,district,state_dist,perc_vote_max,winner_bool
0,Jerry Carl,R,0,1,64.9,1971321,1859349,AL,1,AL01,64.9,1
1,James Averhart,D,0,0,35.0,80095,78973,AL,1,AL01,64.9,0
2,Barry Moore,R,0,1,65.3,650807,669368,AL,2,AL02,65.3,1
3,Phyllis Harvey-Hall,D,0,0,34.6,56050,55988,AL,2,AL02,65.3,0
4,Mike D Rogers,R,1,1,67.5,1193111,1218564,AL,3,AL03,67.5,1


In [4]:
grouped_candidates = candidates_df.groupby(['state_name', 'state_dist'])


# Calculate the total cash raised for each district
total_cash_raised_per_district = grouped_candidates['money_raised'].transform('sum')
total_cash_spent_per_district = grouped_candidates['money_spent'].transform('sum')

# Calculate the percentage of Cash Raised for each candidate
candidates_df['perc_money_raised'] = (candidates_df['money_raised'] / total_cash_raised_per_district) * 100
candidates_df['perc_money_spent'] = (candidates_df['money_spent'] / total_cash_spent_per_district) * 100
# candidates_df['raised_spent_diff'] = candidates_df['money_raised'] - candidates_df['money_spent']

We create a function that will 
* Generate random test-train splits, putting 10% of the data in a test set and 90% in a training set
* build the regression model
* predict on the test set and calculate the accuracy
* use a different seed each time (this is probably overkill!) 

In [14]:
import time
t = 1000 * time.time() # current time in milliseconds
np.random.seed(int(t) % 2**32)

def check_acc():

    t = 1000 * time.time() # current time in milliseconds
    np.random.seed(int(t) % 2**32)

    # Define the proportion of data to use for the test set (e.g., 80% train, 20% test)
    test_size = 0.1

    # Generate a boolean mask for selecting rows for the test set
    test_mask = np.random.rand(len(candidates_df)) < test_size

    # Split the data into training and test sets
    train_data = candidates_df[~test_mask]
    test_data = candidates_df[test_mask]

    # reset the index of the new DataFrames
    train_data = train_data.reset_index(drop=True)
    test_data = test_data.reset_index(drop=True)
    model = smf.logit('winner_bool ~ perc_money_spent + incumbent', data = train_data).fit()
    preds = (model.predict(test_data) > 0.5).astype(int)
    return (test_data['winner'] == preds).sum()/len(test_data['winner'])

Run the function 100 times and store the accuracies in an array

In [29]:
accuracies = []
for i in range(1000):
    accuracies.append(check_acc())

Optimization terminated successfully.
         Current function value: 0.170315
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.171656
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.180416
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.175478
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.178007
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.182190
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.177272
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.172232
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.169242
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.177134
  

In [31]:
print(min(accuracies))
print(max(accuracies))
print(sum(accuracies)/len(accuracies))

0.8522727272727273
1.0
0.9354939641437359
