# Regression Model

Attempting to predict congressmembers returns to flag someone who may be doing extremely well could be useful. Therefore, a regression model is fitted on aspects of the congressmembers to see how their features might allow us to predict their stock performance.

In [98]:
import pandas as pd
import numpy as np

# import models here 
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score
from sklearn.linear_model import Ridge, Lasso, LinearRegression
#setting up random seed for project requirements 
np.random.seed(73)


In [99]:
#reading in final dataset from EDA 
df = pd.read_csv('./data/stock_percentage_difference.csv')

In [100]:
df.columns

Index(['level_0', 'transaction_date', 'ticker', 'asset_description', 'type',
       'amount', 'representative', 'timestamp', 'state', 'party_x',
       'total_senate_years', 'total_house_years', 'total_years',
       'total_raised', 'total_from_small_donors', 'percent_from_small_donors*',
       'total_money_raised', 'start', 'day_30', 'day_30_stock_perc_diff',
       'day_60', 'day_60_stock_perc_diff', 'day_90', 'day_90_stock_perc_diff',
       'day_180', 'day_180_stock_perc_diff', 'sp_start', 'sp_30',
       'sp_30_sp_perc_diff', 'sp_60', 'sp_60_sp_perc_diff', 'sp_90',
       'sp_90_sp_perc_diff', 'sp_180', 'sp_180_sp_perc_diff'],
      dtype='object')

In [101]:
values = df.groupby('representative')['day_180', 'sp_180'].mean().sort_values('day_180')

  values = df.groupby('representative')['day_180', 'sp_180'].mean().sort_values('day_180')


In [102]:
values.day_180[:5].values

array([ 0.05      ,  6.51800003,  9.75      , 10.22999954, 12.26000023])

In [103]:
df['total_raised'] =[int(raised.replace('$','')) for raised in df['total_raised']]

In [104]:
df['percent_from_small_donors*'] = [float(pct.replace('%','')) for pct in df['percent_from_small_donors*']]

In [105]:
df['percent_from_small_donors*']

0         6.00
1         6.00
2         6.00
3         6.00
4         6.00
         ...  
10076    19.55
10077    19.55
10078    19.55
10079    19.55
10080    19.55
Name: percent_from_small_donors*, Length: 10081, dtype: float64

In [106]:
df['type'].value_counts()

purchase        5101
sale_full       2804
sale_partial    2088
exchange          88
Name: type, dtype: int64

In [107]:
beats_sp = []
for i, row in df.iterrows():
    if row['day_180_stock_perc_diff'] > row['sp_180_sp_perc_diff']:
        beats_sp.append(1)
    else:
        beats_sp.append(0)
df['beats_sp'] = beats_sp

In [108]:
df = df.loc[df['type'] == 'purchase']

In [109]:
df['beats_sp'].mean()

0.46324250147029994

In [110]:
df.columns

Index(['level_0', 'transaction_date', 'ticker', 'asset_description', 'type',
       'amount', 'representative', 'timestamp', 'state', 'party_x',
       'total_senate_years', 'total_house_years', 'total_years',
       'total_raised', 'total_from_small_donors', 'percent_from_small_donors*',
       'total_money_raised', 'start', 'day_30', 'day_30_stock_perc_diff',
       'day_60', 'day_60_stock_perc_diff', 'day_90', 'day_90_stock_perc_diff',
       'day_180', 'day_180_stock_perc_diff', 'sp_start', 'sp_30',
       'sp_30_sp_perc_diff', 'sp_60', 'sp_60_sp_perc_diff', 'sp_90',
       'sp_90_sp_perc_diff', 'sp_180', 'sp_180_sp_perc_diff', 'beats_sp'],
      dtype='object')

In [111]:
Features = ['party_x', 'total_house_years','total_senate_years', 'total_raised', 'percent_from_small_donors*']
X = df[Features]
y = df['day_180_stock_perc_diff']

X = pd.get_dummies(X, columns = ['party_x'], drop_first = True)

In [112]:
y

1        0.254
2        0.380
16       0.084
17      -0.065
18      -0.118
         ...  
10072    0.038
10073    0.159
10074    0.214
10078    0.272
10079   -0.082
Name: day_180_stock_perc_diff, Length: 5101, dtype: float64

In [113]:
X_train, X_test, y_train, y_test = train_test_split(X, y)
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

In [114]:
lr = LinearRegression()

lr.fit(X_train_sc, y_train)

LinearRegression()

In [115]:
print('Training Score:', ridge.score(X_train_sc, y_train))
print('Testing Score:', ridge.score(X_test_sc, y_test))

Training Score: 0.01835866055515012
Testing Score: 0.026083489235383173


In [116]:
coef_df = pd.DataFrame({'Features':X.columns, 'coefs': ridge.coef_})

In [117]:
coef_df.sort_values('coefs')

Unnamed: 0,Features,coefs
2,total_raised,-0.043769
1,total_senate_years,-0.03072
6,party_x_Republican,-0.004643
4,party_x_Independent,0.000691
5,party_x_Libertarian,0.003985
3,percent_from_small_donors*,0.006014
0,total_house_years,0.00839


**Interpretation**

The low accuracy of this regression model made its feature importances as a method of determining who may be flagged unusable. Therefore it was discarded for the classification model.