### Introduction to Machine Learning, UZH FS18, Group Project

### Group 2: Barbara Capl, Mathias Lüthi, Pamela Matias, Stefanie Rentsch


#     
# IV.     Logistic Regression (Features Selected with Random Forest)

In [13]:
# hide unnecessary warnings ("depreciation" of packages etc.)
import warnings
warnings.filterwarnings('ignore')

# import packages
import numpy as np
import pandas as pd
import matplotlib as pl
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
plt.style.use('seaborn-whitegrid')

# Import datasets (pre-selected with RF)

In [61]:
# Import imputed dataset
imputed_dataset_f = pd.read_csv('Data/generated/imputed_dataset_ml_f.csv', sep = ',')
# Import dataset wit dropped Nans
dropnan_dataset_f = pd.read_csv('Data/generated/dropnan_dataset_ml_f.csv', sep = ',')

print('Shape of Imputed Dataset = ' + str(imputed_dataset_f.shape))
print('Shape of Dataset with Nans dropped = ' + str(dropnan_dataset_f.shape))

display(imputed_dataset_f.head())
display(dropnan_dataset_f.head())

Shape of Imputed Dataset = (3519, 17)
Shape of Dataset with Nans dropped = (1430, 17)


Unnamed: 0.1,Unnamed: 0,vwretx,DATE,sprtrn,vwretd,ewretd,ewretx,divyield,BID,PEG_trailing,pe_op_basic,cash_lt,PEG_1yrforward,ASKHI,ptb,PEG_ltgforward,NEXT_DAY_PREDICTION
0,0,-0.003552,1.138752e+18,0.000453,-0.001637,0.004836,0.003476,0.0134,26.87,10.28,21.496,1.71,14.555,28.04,6.281,1.838,1.0
1,1,0.017585,1.141171e+18,0.011065,0.019053,0.036978,0.035232,0.0132,27.24,10.41,21.768,1.71,14.739,27.89,6.293,1.842,0.0
2,2,0.011494,1.14385e+18,0.012187,0.012965,0.009791,0.008497,0.0149,24.16,9.239,19.32,1.71,13.081,27.74,5.573,1.666,0.0
3,3,-0.033025,1.146442e+18,-0.030917,-0.031045,-0.044331,-0.045982,0.0159,22.7,0.709,17.695,1.572,-5.842,24.29,5.496,1.48,1.0
4,4,-0.001881,1.14912e+18,8.7e-05,-0.000386,-0.008479,-0.010474,0.0155,23.38,0.73,18.203,1.572,-6.01,23.4702,5.577,1.522,1.0


Unnamed: 0.1,Unnamed: 0,vwretd,ptb,vwretx,DATE,CAPEI,ewretd,ewretx,PEG_1yrforward,accrual,pretret_earnat,ps,roe,pcf,pe_op_dil,BID,NEXT_DAY_PREDICTION
0,0,0.041186,4.085,0.038914,1328054400000000000,14.863,0.036494,0.035003,-20.961,0.053,0.352,3.696,0.421,9.193,11.5,31.73,1.0
1,1,0.023994,4.155,0.02211,1330560000000000000,15.121,0.017608,0.015303,-21.784,0.053,0.352,3.76,0.421,9.353,11.687,32.24,0.0
2,2,-0.065531,3.497,-0.067942,1335830400000000000,13.332,-0.068434,-0.070404,9.552,0.06,0.346,3.358,0.391,8.204,10.653,29.19,1.0
3,3,0.038156,3.656,0.036055,1338508800000000000,13.939,0.034188,0.031622,-30.032,0.06,0.346,3.51,0.391,8.577,11.164,30.58,0.0
4,4,0.026265,3.785,0.023665,1343779200000000000,14.175,0.02482,0.022964,1.451,0.127,0.317,3.505,0.268,8.17,11.289,30.81,0.0


###   
# Logistic Regression (features pre-selected with Random Forest)
## for Version 1: Imputed Dataset
## for Version 2: Dataset with rows deleted where Nan
###   

## A. R-Style (tried multiple features)

In [88]:
# Chose formula (what to regress)
# R-Style formula
formula_1 = 'imputed_dataset_f.NEXT_DAY_PREDICTION ~ imputed_dataset_f.vwretx + imputed_dataset_f.DATE + imputed_dataset_f.sprtrn + imputed_dataset_f.vwretd + imputed_dataset_f.ewretd + imputed_dataset_f.ewretx + imputed_dataset_f.divyield + imputed_dataset_f.BID + imputed_dataset_f.PEG_trailing + imputed_dataset_f.pe_op_basic + imputed_dataset_f.cash_lt + imputed_dataset_f.PEG_1yrforward + imputed_dataset_f.ASKHI + imputed_dataset_f.ptb + imputed_dataset_f.PEG_ltgforward'
formula_2 = 'dropnan_dataset_f.NEXT_DAY_PREDICTION ~ dropnan_dataset_f.vwretd + dropnan_dataset_f.ptb + dropnan_dataset_f.vwretx + dropnan_dataset_f.DATE + dropnan_dataset_f.CAPEI + dropnan_dataset_f.ewretd + dropnan_dataset_f.ewretx + dropnan_dataset_f.PEG_1yrforward + dropnan_dataset_f.accrual + dropnan_dataset_f.pretret_earnat + dropnan_dataset_f.ps + dropnan_dataset_f.roe + dropnan_dataset_f.pcf + dropnan_dataset_f.pe_op_dil + dropnan_dataset_f.BID'

# Regress model
logReg_1r = smf.glm(formula = formula_1, data = imputed_dataset_f, family = sm.families.Binomial()).fit()
print('Summary Regression Version 1: Imputed Dataset')
print(logReg_1r.summary())
print("")
logReg_2r = smf.glm(formula = formula_2, data = dropnan_dataset_f, family = sm.families.Binomial()).fit()
print('Summary Regression Version 2: Dropnan Dataset')
print(logReg_2r.summary())

Summary Regression Version 1: Imputed Dataset
                           Generalized Linear Model Regression Results                           
Dep. Variable:     imputed_dataset_f.NEXT_DAY_PREDICTION   No. Observations:                 3519
Model:                                               GLM   Df Residuals:                     3518
Model Family:                                   Binomial   Df Model:                            0
Link Function:                                     logit   Scale:                             1.0
Method:                                             IRLS   Log-Likelihood:                -2413.1
Date:                                   Mon, 26 Mar 2018   Deviance:                       4826.1
Time:                                           13:16:58   Pearson chi2:                 3.52e+03
No. Iterations:                                        4                                         
                                       coef    std err          z      P

### weird shit: why is vwretx / vwretd NAN?

## B.  GLM

In [80]:
# Regress Model
print('Summary Regression Version 1: Imputed Dataset')
print("")
logReg_1g = sm.GLM(endog = imputed_dataset_f.NEXT_DAY_PREDICTION, 
                     exog = sm.add_constant(imputed_dataset_f.vwretx), family = sm.families.Binomial()).fit()
print(logReg_1g.summary())
print("")
print("")
print('Regression Version 2: Dropnan Dataset')

logReg_2g = sm.GLM(endog = dropnan_dataset_f.NEXT_DAY_PREDICTION, 
                     exog = sm.add_constant(dropnan_dataset_f.vwretd), family = sm.families.Binomial()).fit()
print(logReg_2g.summary())

Summary Regression Version 1: Imputed Dataset

                  Generalized Linear Model Regression Results                  
Dep. Variable:     NEXT_DAY_PREDICTION   No. Observations:                 3519
Model:                             GLM   Df Residuals:                     3517
Model Family:                 Binomial   Df Model:                            1
Link Function:                   logit   Scale:                             1.0
Method:                           IRLS   Log-Likelihood:                -2412.9
Date:                 Mon, 26 Mar 2018   Deviance:                       4825.9
Time:                         13:14:00   Pearson chi2:                 3.52e+03
No. Iterations:                      4                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.2392      0.034      7.007      0.000       0.172       0.

## C. Logit

In [81]:
# to workaround error message
from scipy import stats
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)

# Regress Model
print('Summary Regression Version 1: Imputed Dataset')
print("")

logReg_1l = sm.Logit(endog = imputed_dataset_f.NEXT_DAY_PREDICTION, 
                     exog = sm.add_constant(imputed_dataset_f.vwretx)).fit()

print(logReg_1l.summary())

print("")
print("")

print('Regression Version 2: Dropnan Dataset')
print("")

logReg_2l = sm.Logit(endog = dropnan_dataset_f.NEXT_DAY_PREDICTION, 
                     exog = sm.add_constant(dropnan_dataset_f.vwretd)).fit()
print("")

print(logReg_2l.summary())



Summary Regression Version 1: Imputed Dataset

Optimization terminated successfully.
         Current function value: 0.685690
         Iterations 4
                            Logit Regression Results                           
Dep. Variable:     NEXT_DAY_PREDICTION   No. Observations:                 3519
Model:                           Logit   Df Residuals:                     3517
Method:                            MLE   Df Model:                            1
Date:                 Mon, 26 Mar 2018   Pseudo R-squ.:               0.0002128
Time:                         13:14:03   Log-Likelihood:                -2412.9
converged:                        True   LL-Null:                       -2413.5
                                         LLR p-value:                    0.3108
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.2392      0.034      7.007      0.00

## D. Multiple Logistic Regression with Skearn

In [98]:
from sklearn.linear_model import LogisticRegression

X_1d = imputed_dataset_f.drop('NEXT_DAY_PREDICTION', axis = 1)
y_1d = imputed_dataset_f['NEXT_DAY_PREDICTION']

logReg_1d = LogisticRegression(C = 1e9, fit_intercept = False).fit(X1, y1)

logReg_1d.coef_

array([[2.83934235e-34, 2.49350651e-39, 1.88691314e-19, 3.44810382e-39,
        2.76762319e-39, 2.20490069e-39, 2.01277677e-39, 3.75776261e-39,
        8.83504272e-36, 1.96345872e-37, 2.06199654e-36, 7.75583360e-38,
        4.08867167e-37, 9.64241853e-36, 3.52023902e-37, 3.24144429e-37]])

## E. Multiple Logistic Regression

In [115]:
# Assign Features to matrix X and response vector y
X_1e = sm.add_constant(imputed_dataset_f[['vwretx', 'DATE', 'sprtrn', 'vwretd', 'ewretd', 'ewretx', 'divyield', 'BID', 'PEG_trailing', 'pe_op_basic', 'cash_lt', 'PEG_1yrforward', 'ASKHI', 'ptb', 'PEG_ltgforward']])
y_1e = imputed_dataset_f.NEXT_DAY_PREDICTION   

# Run Logistic Regresion
logReg_1e = sm.Logit(endog = y_1e, exog = X_1e).fit()
print(logReg_1e.summary().tables[0])
print(logReg_1e.summary().tables[1])



Optimization terminated successfully.
         Current function value: 0.677663
         Iterations 7
                            Logit Regression Results                           
Dep. Variable:     NEXT_DAY_PREDICTION   No. Observations:                 3519
Model:                           Logit   Df Residuals:                     3518
Method:                            MLE   Df Model:                            0
Date:                 Mon, 26 Mar 2018   Pseudo R-squ.:                 0.01192
Time:                         13:31:21   Log-Likelihood:                -2384.7
converged:                        True   LL-Null:                       -2413.5
                                         LLR p-value:                       nan
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const              0.2114      0.530      0.399      0.690      -0.828       1.250
vwretx   

#### P-Value of DATE, divyield, PEG_trailing, pe_op_basic: LARGE!
#### NO CLEAR EVIDENCE OF ASSOCIATION BETWEEN THOSE VARIABLES AND THE RESPONSE VECTOR

####     

In [118]:
# Assign Features to matrix X and response vector y
X_2e = sm.add_constant(dropnan_dataset_f[['vwretd', 'ptb', 'vwretx', 'DATE', 'CAPEI', 'ewretd', 'ewretx', 'PEG_1yrforward', 'accrual', 'pretret_earnat', 'ps', 'roe', 'pcf', 'pe_op_dil', 'BID']])   
y_2e = dropnan_dataset_f.NEXT_DAY_PREDICTION   

# Run Logistic Regresion
logReg_2e = sm.Logit(endog = y_2e, exog = X_2e).fit()
print(logReg_2e.summary().tables[0])
print(logReg_2e.summary().tables[1])




Optimization terminated successfully.
         Current function value: 0.676865
         Iterations 7
                            Logit Regression Results                           
Dep. Variable:     NEXT_DAY_PREDICTION   No. Observations:                 1430
Model:                           Logit   Df Residuals:                     1429
Method:                            MLE   Df Model:                            0
Date:                 Mon, 26 Mar 2018   Pseudo R-squ.:                0.008302
Time:                         13:36:36   Log-Likelihood:                -967.92
converged:                        True   LL-Null:                       -976.02
                                         LLR p-value:                       nan
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const              0.0641      1.034      0.062      0.951      -1.962       2.091
vwretd   

#### P-Value of ptb, DATEM CAPEI, accrual, pretret_earnat, ps, pcf, pe_op:dil: LARGE!
#### NO CLEAR EVIDENCE OF ASSOCIATION BETWEEN THOSE VARIABLES AND THE RESPONSE VECTOR

####     

## Assessing Output of A. R-Style

In [82]:
# Show Parameters Version1: Imputed Dataset
display(logReg_1r.params)
display(logReg_1r.pvalues)

# Hypothesis Testing




Intercept                           1.459114e-37
imputed_dataset_f.vwretx            0.000000e+00
imputed_dataset_f.DATE              1.886914e-19
imputed_dataset_f.sprtrn            7.490354e-40
imputed_dataset_f.vwretd            9.538252e-40
imputed_dataset_f.ewretd            8.854673e-40
imputed_dataset_f.ewretx            6.242271e-40
imputed_dataset_f.divyield          3.740795e-39
imputed_dataset_f.BID               1.049690e-35
imputed_dataset_f.PEG_trailing      3.423965e-37
imputed_dataset_f.pe_op_basic       2.220030e-36
imputed_dataset_f.cash_lt           4.657995e-38
imputed_dataset_f.PEG_1yrforward    8.327682e-38
imputed_dataset_f.ASKHI             1.093510e-35
imputed_dataset_f.ptb               5.151328e-37
imputed_dataset_f.PEG_ltgforward    2.749420e-37
dtype: float64

Intercept                           5.981352e-13
imputed_dataset_f.vwretx                     NaN
imputed_dataset_f.DATE              5.981352e-13
imputed_dataset_f.sprtrn            5.981352e-13
imputed_dataset_f.vwretd            5.981352e-13
imputed_dataset_f.ewretd            5.981352e-13
imputed_dataset_f.ewretx            5.981352e-13
imputed_dataset_f.divyield          5.981352e-13
imputed_dataset_f.BID               5.981352e-13
imputed_dataset_f.PEG_trailing      5.981352e-13
imputed_dataset_f.pe_op_basic       5.981352e-13
imputed_dataset_f.cash_lt           5.981352e-13
imputed_dataset_f.PEG_1yrforward    5.981352e-13
imputed_dataset_f.ASKHI             5.981352e-13
imputed_dataset_f.ptb               5.981352e-13
imputed_dataset_f.PEG_ltgforward    5.981352e-13
dtype: float64

In [86]:
# Show Parameters Version2: Dropnan Dataset
display(logReg_2r.params)
display(logReg_2r.pvalues)

Intercept                           1.719792e-37
dropnan_dataset_f.vwretd            0.000000e+00
dropnan_dataset_f.ptb               7.009701e-37
dropnan_dataset_f.vwretx            9.186921e-40
dropnan_dataset_f.DATE              2.242039e-19
dropnan_dataset_f.CAPEI             3.342407e-36
dropnan_dataset_f.ewretd            1.074458e-39
dropnan_dataset_f.ewretx            7.656674e-40
dropnan_dataset_f.PEG_1yrforward    1.140402e-37
dropnan_dataset_f.accrual           6.922654e-39
dropnan_dataset_f.pretret_earnat    4.140806e-38
dropnan_dataset_f.ps                3.574910e-37
dropnan_dataset_f.roe               4.753547e-38
dropnan_dataset_f.pcf               1.931762e-36
dropnan_dataset_f.pe_op_dil         2.577519e-36
dropnan_dataset_f.BID               1.221502e-35
dtype: float64

Intercept                           4.234549e-08
dropnan_dataset_f.vwretd                     NaN
dropnan_dataset_f.ptb               4.234549e-08
dropnan_dataset_f.vwretx            4.234549e-08
dropnan_dataset_f.DATE              4.234549e-08
dropnan_dataset_f.CAPEI             4.234549e-08
dropnan_dataset_f.ewretd            4.234549e-08
dropnan_dataset_f.ewretx            4.234549e-08
dropnan_dataset_f.PEG_1yrforward    4.234549e-08
dropnan_dataset_f.accrual           4.234549e-08
dropnan_dataset_f.pretret_earnat    4.234549e-08
dropnan_dataset_f.ps                4.234549e-08
dropnan_dataset_f.roe               4.234549e-08
dropnan_dataset_f.pcf               4.234549e-08
dropnan_dataset_f.pe_op_dil         4.234549e-08
dropnan_dataset_f.BID               4.234549e-08
dtype: float64

## Hypothesis Testing for A. R-Style

In [87]:
logReg_1r.conf_int(alpha = 0.01)

Unnamed: 0,0,1
Intercept,9.371758999999999e-38,1.981052e-37
imputed_dataset_f.vwretx,0.0,0.0
imputed_dataset_f.DATE,1.2119479999999998e-19,2.5618799999999997e-19
imputed_dataset_f.sprtrn,4.810987e-40,1.016972e-39
imputed_dataset_f.vwretd,6.126333e-40,1.2950170000000001e-39
imputed_dataset_f.ewretd,5.687276e-40,1.202207e-39
imputed_dataset_f.ewretx,4.009354e-40,8.475187e-40
imputed_dataset_f.divyield,2.402679e-39,5.0789120000000004e-39
imputed_dataset_f.BID,6.742065e-36,1.4251739999999999e-35
imputed_dataset_f.PEG_trailing,2.199182e-37,4.648749e-37
