In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import LinearRegression

In [2]:
# Load the cleaned dataset and discard the 'Unnamed: 0' column
# (presumably a row index written out by an earlier to_csv call -- confirm).
data = pd.read_csv("final_q1_cleaned.csv").drop('Unnamed: 0', axis=1)

In [3]:
# Encode the three boolean indicator columns as 0/1.
# The result is assigned back to the frame rather than calling
# Series.replace(..., inplace=True) on data[col]: that chained in-place form
# triggers a pandas chained-assignment warning and does not modify `data`
# when Copy-on-Write is enabled.
for flag_col in ['received_endorsements?', 'Challenger?(else_open)', 'Republican?']:
    data[flag_col] = data[flag_col].replace({False: 0, True: 1})

# Drop the unused disbursement column, returning a new frame instead of mutating.
data = data.drop(columns='Exempt_Legal_Accounting_Disbursement')

# NOTE(review): this fills every remaining missing value in every column with 0,
# including the outcome and covariates used below -- confirm that 0 is a
# meaningful stand-in for "missing" in each of them.
data = data.fillna(0)

In [4]:
# Keep only rows whose office code is 'H' (presumably House races -- confirm).
data = data.loc[data['Cand_Office'].eq('H')]

In [5]:
# Bucket races by field size: 1-2, 3-5, 6-9 and 10+ candidates.
# Series.between is inclusive on both ends, matching the original <= / >= bounds.
data_12 = data[data['num_candidates_in_race'].between(1, 2)]
data_35 = data[data['num_candidates_in_race'].between(3, 5)]
data_69 = data[data['num_candidates_in_race'].between(6, 9)]
data_10 = data[data['num_candidates_in_race'].ge(10)]

## IPW (Inverse Propensity Weighting)

In [6]:
# Split the frame on the 'Republican?' flag. reset_index() (without drop=True)
# keeps the previous row labels in a new 'index' column.
# NOTE(review): `dem` is every row where the flag is False/0 -- presumably
# Democrats, but it would also include any other non-Republican rows; confirm.
dem = data.loc[data['Republican?'] == False].reset_index()
rep = data.loc[data['Republican?'] == True].reset_index()

In [7]:
# Propensity-model inputs for the `dem` subset: covariates (x), outcome (y)
# and treatment indicator (z).
# NOTE(review): 'Endorsements' is used as a covariate while the treatment is
# 'received_endorsements?'. If 'Endorsements' is the endorsement count, it
# determines the treatment and would push propensity scores towards 0/1 --
# confirm it belongs in the propensity model.
x_dem = dem.loc[:, [
    'district_lean',
    'Endorsements',
    'num_candidates_in_race',
    'Total_Receipt',
    'Total_Contribution',
    'Challenger?(else_open)',
]]
y_dem = dem.loc[:, 'Primary %']
z_dem = dem.loc[:, 'received_endorsements?']

In [8]:
# Unpenalized logistic regression used as the propensity model for `dem`.
# C=np.inf switches the regularization term off. The string penalty='none'
# was deprecated in scikit-learn 1.2 and is rejected by later releases, while
# an infinite C is accepted by both old and new versions.
lr_dem = LR(C=np.inf, max_iter=200, random_state=0)

In [9]:
# Fit the propensity model: covariates x_dem -> treatment indicator z_dem.
# The fitted estimator is the cell's displayed value.
lr_dem.fit(x_dem, z_dem)

LogisticRegression(max_iter=200, penalty='none', random_state=0)

In [10]:
def estimate_treatment_effect(lr, X, Y, Z, prop=None):
    """Inverse-propensity-weighted estimate of the average treatment effect.

    Computes ``mean(Z * Y / e) - mean((1 - Z) * Y / (1 - e))`` where ``e`` is
    the propensity score of each observation.

    Parameters
    ----------
    lr : fitted classifier exposing ``predict_proba``
        Only used when ``prop`` is None, to compute the propensity scores.
    X : array-like
        Covariates passed to ``lr.predict_proba`` when ``prop`` is None.
    Y : array-like
        Observed outcomes.
    Z : array-like
        Binary treatment indicator (1 = treated, 0 = control).
    prop : array-like or scalar, optional
        Propensity scores P(Z = 1 | X). When omitted they are taken from
        column 1 of ``lr.predict_proba(X)``.

    Returns
    -------
    float
        The IPW estimate. It is ``inf``/``-inf`` when a treated unit has
        propensity 0 or a control unit has propensity 1 (overlap violation).

    Raises
    ------
    ValueError
        If ``Y``, ``Z`` and ``prop`` cannot be broadcast to a common shape.
    """
    if prop is None:
        prop = lr.predict_proba(X)[:, 1]

    # Work on plain float arrays so the arithmetic is positional. With pandas
    # inputs carrying different indexes, label alignment would otherwise
    # introduce NaNs that Series.mean silently skips.
    outcome, treated, score = np.broadcast_arrays(
        np.asarray(Y, dtype=float),
        np.asarray(Z, dtype=float),
        np.asarray(prop, dtype=float),
    )

    # Only divide where the term can be non-zero. A treated unit with score 1
    # (or a control unit with score 0) contributes exactly 0 to the opposite
    # arm instead of 0/0 = NaN.
    treated_term = np.divide(
        treated * outcome, score,
        out=np.zeros(outcome.shape), where=treated != 0,
    )
    control_term = np.divide(
        (1 - treated) * outcome, 1 - score,
        out=np.zeros(outcome.shape), where=treated != 1,
    )
    return treated_term.mean() - control_term.mean()

In [11]:
# IPW estimate on the full `dem` sample (no trimming).
estimate_treatment_effect(
    lr=lr_dem,
    X=x_dem,
    Y=y_dem,
    Z=z_dem,
    prop=lr_dem.predict_proba(x_dem)[:, 1],
)

-21.489203196613904

In [12]:
# Store column 1 of predict_proba (the second class in lr_dem.classes_, i.e. the
# treated label when z_dem is coded 0/1) as a new column on `dem`.
dem['Propensity Scores'] = lr_dem.predict_proba(x_dem)[:, 1]

In [13]:
# Trim to observations with propensity scores strictly between 0.1 and 0.9.
trimmed_dem = dem.loc[
    dem['Propensity Scores'].gt(0.1) & dem['Propensity Scores'].lt(0.9)
]

In [14]:
# Covariates, outcome and treatment indicator for the trimmed `dem` sample.
x_dem_trim = trimmed_dem.loc[:, [
    'district_lean',
    'Endorsements',
    'num_candidates_in_race',
    'Total_Receipt',
    'Total_Contribution',
    'Challenger?(else_open)',
]]
y_dem_trim = trimmed_dem.loc[:, 'Primary %']
z_dem_trim = trimmed_dem.loc[:, 'received_endorsements?']

In [15]:
# IPW estimate on the trimmed `dem` sample, reusing the stored propensity scores.
estimate_treatment_effect(
    lr=lr_dem,
    X=x_dem_trim,
    Y=y_dem_trim,
    Z=z_dem_trim,
    prop=trimmed_dem['Propensity Scores'],
)

-21.380596123393747

In [16]:
# Propensity-model inputs for the `rep` subset: covariates (x), outcome (y)
# and treatment indicator (z).
# NOTE(review): 'Endorsements' is used as a covariate while the treatment is
# 'received_endorsements?' -- confirm it belongs in the propensity model.
x_rep = rep.loc[:, [
    'district_lean',
    'Endorsements',
    'num_candidates_in_race',
    'Total_Receipt',
    'Total_Contribution',
    'Challenger?(else_open)',
]]
y_rep = rep.loc[:, 'Primary %']
z_rep = rep.loc[:, 'received_endorsements?']

In [17]:
# Unpenalized logistic regression used as the propensity model for `rep`.
# C=np.inf switches the regularization term off. The string penalty='none'
# was deprecated in scikit-learn 1.2 and is rejected by later releases, while
# an infinite C is accepted by both old and new versions.
lr_rep = LR(C=np.inf, max_iter=200, random_state=0)

In [18]:
# Fit the propensity model: covariates x_rep -> treatment indicator z_rep.
# The fitted estimator is the cell's displayed value.
lr_rep.fit(x_rep, z_rep)

LogisticRegression(max_iter=200, penalty='none', random_state=0)

In [19]:
# Store column 1 of predict_proba (the second class in lr_rep.classes_, i.e. the
# treated label when z_rep is coded 0/1) as a new column on `rep`.
rep['Propensity Scores'] = lr_rep.predict_proba(x_rep)[:, 1]

In [20]:
# Trim to observations with propensity scores strictly between 0.1 and 0.9.
trimmed_rep = rep.loc[
    rep['Propensity Scores'].gt(0.1) & rep['Propensity Scores'].lt(0.9)
]

In [21]:
# Covariates, outcome and treatment indicator for the trimmed `rep` sample.
x_rep_trim = trimmed_rep.loc[:, [
    'district_lean',
    'Endorsements',
    'num_candidates_in_race',
    'Total_Receipt',
    'Total_Contribution',
    'Challenger?(else_open)',
]]
y_rep_trim = trimmed_rep.loc[:, 'Primary %']
z_rep_trim = trimmed_rep.loc[:, 'received_endorsements?']

In [22]:
# IPW estimate on the trimmed `rep` sample, reusing the stored propensity scores.
estimate_treatment_effect(
    lr=lr_rep,
    X=x_rep_trim,
    Y=y_rep_trim,
    Z=z_rep_trim,
    prop=trimmed_rep['Propensity Scores'],
)

-41.754555299499174

In [23]:
# IPW estimate on the full `rep` sample (no trimming).
estimate_treatment_effect(
    lr=lr_rep,
    X=x_rep,
    Y=y_rep,
    Z=z_rep,
    prop=lr_rep.predict_proba(x_rep)[:, 1],
)

-38.84562263143313

## Linear Regression

In [24]:
def fit_OLS_model(df, target_variable, explanatory_variables, intercept = False):
    """Fit an ordinary least squares regression with statsmodels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the response and the regressors.
    target_variable : str
        Column of ``df`` used as the response.
    explanatory_variables : list of str
        Columns of ``df`` used as regressors.
    intercept : bool, default False
        When True, a constant column is added to the regressors via
        ``sm.add_constant``; when False the model is fitted without an
        intercept term.

    Returns
    -------
    The fitted statsmodels results object (call ``.summary()`` on it).
    """
    response = df[target_variable]
    design = df[explanatory_variables]
    if intercept:
        design = sm.add_constant(design)
    return sm.OLS(response, design).fit()

In [25]:
# OLS on all races, fitted without an intercept
# (the summary therefore reports the uncentered R-squared).
first = fit_OLS_model(
    data,
    target_variable='Primary %',
    explanatory_variables=[
        'district_lean',
        'Endorsements',
        'num_candidates_in_race',
        'Total_Contribution',
        'Challenger?(else_open)',
    ],
)
print(first.summary())

                                 OLS Regression Results                                
Dep. Variable:              Primary %   R-squared (uncentered):                   0.542
Model:                            OLS   Adj. R-squared (uncentered):              0.539
Method:                 Least Squares   F-statistic:                              180.3
Date:                Sun, 08 May 2022   Prob (F-statistic):                   1.32e-126
Time:                        15:56:10   Log-Likelihood:                         -3697.3
No. Observations:                 768   AIC:                                      7405.
Df Residuals:                     763   BIC:                                      7428.
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                             coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------

In [26]:
# Second specification: intercept included, 'Challenger?(else_open)' left out.
features_second = [
    'district_lean',
    'Endorsements',
    'num_candidates_in_race',
    'Total_Contribution',
]
second = fit_OLS_model(data, 'Primary %', features_second, intercept=True)
print(second.summary())

                            OLS Regression Results                            
Dep. Variable:              Primary %   R-squared:                       0.273
Model:                            OLS   Adj. R-squared:                  0.269
Method:                 Least Squares   F-statistic:                     71.50
Date:                Sun, 08 May 2022   Prob (F-statistic):           1.89e-51
Time:                        15:56:10   Log-Likelihood:                -3572.6
No. Observations:                 768   AIC:                             7155.
Df Residuals:                     763   BIC:                             7178.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                     44

  x = pd.concat(x[::order], 1)


In [27]:
# Distribution of field sizes inside the 10+ candidate bucket (rows per field size).
data_10['num_candidates_in_race'].value_counts()

10    47
11    26
12    22
14    21
32    20
16    17
13    16
22    14
18     8
Name: num_candidates_in_race, dtype: int64

In [28]:
# Same no-intercept specification, restricted to races with 1-2 candidates.
reg12 = fit_OLS_model(
    data_12,
    target_variable='Primary %',
    explanatory_variables=[
        'district_lean',
        'Endorsements',
        'num_candidates_in_race',
        'Total_Contribution',
        'Challenger?(else_open)',
    ],
)
print(reg12.summary())

                                 OLS Regression Results                                
Dep. Variable:              Primary %   R-squared (uncentered):                   0.836
Model:                            OLS   Adj. R-squared (uncentered):              0.830
Method:                 Least Squares   F-statistic:                              151.5
Date:                Sun, 08 May 2022   Prob (F-statistic):                    1.45e-56
Time:                        15:56:10   Log-Likelihood:                         -750.40
No. Observations:                 154   AIC:                                      1511.
Df Residuals:                     149   BIC:                                      1526.
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                             coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------

In [29]:
# Same no-intercept specification, restricted to races with 3-5 candidates.
reg35 = fit_OLS_model(
    data_35,
    target_variable='Primary %',
    explanatory_variables=[
        'district_lean',
        'Endorsements',
        'num_candidates_in_race',
        'Total_Contribution',
        'Challenger?(else_open)',
    ],
)
print(reg35.summary())

                                 OLS Regression Results                                
Dep. Variable:              Primary %   R-squared (uncentered):                   0.677
Model:                            OLS   Adj. R-squared (uncentered):              0.670
Method:                 Least Squares   F-statistic:                              94.61
Date:                Sun, 08 May 2022   Prob (F-statistic):                    1.98e-53
Time:                        15:56:10   Log-Likelihood:                         -1027.6
No. Observations:                 231   AIC:                                      2065.
Df Residuals:                     226   BIC:                                      2082.
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                             coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------

In [30]:
# Same no-intercept specification, restricted to races with 6-9 candidates.
reg69 = fit_OLS_model(
    data_69,
    target_variable='Primary %',
    explanatory_variables=[
        'district_lean',
        'Endorsements',
        'num_candidates_in_race',
        'Total_Contribution',
        'Challenger?(else_open)',
    ],
)
print(reg69.summary())

                                 OLS Regression Results                                
Dep. Variable:              Primary %   R-squared (uncentered):                   0.598
Model:                            OLS   Adj. R-squared (uncentered):              0.587
Method:                 Least Squares   F-statistic:                              55.61
Date:                Sun, 08 May 2022   Prob (F-statistic):                    3.27e-35
Time:                        15:56:10   Log-Likelihood:                         -826.00
No. Observations:                 192   AIC:                                      1662.
Df Residuals:                     187   BIC:                                      1678.
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                             coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------

In [31]:
# Same no-intercept specification, restricted to races with 10 or more candidates.
reg10 = fit_OLS_model(
    data_10,
    target_variable='Primary %',
    explanatory_variables=[
        'district_lean',
        'Endorsements',
        'num_candidates_in_race',
        'Total_Contribution',
        'Challenger?(else_open)',
    ],
)
print(reg10.summary())

                                 OLS Regression Results                                
Dep. Variable:              Primary %   R-squared (uncentered):                   0.515
Model:                            OLS   Adj. R-squared (uncentered):              0.502
Method:                 Least Squares   F-statistic:                              39.52
Date:                Sun, 08 May 2022   Prob (F-statistic):                    1.49e-27
Time:                        15:56:10   Log-Likelihood:                         -794.93
No. Observations:                 191   AIC:                                      1600.
Df Residuals:                     186   BIC:                                      1616.
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                             coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------