# (Multiple) Logistic Regression

In [1]:
# Load Packages
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
plt.style.use("seaborn-whitegrid")
%matplotlib inline

  from pandas.core import datetools


## 1. Import Standardized Data

### 1.1 Version 1 with only ratios as predictive features

In [2]:
# import Data (already splitted to train/test-data and selected features-> bc_randomforest_feature_selection)
X1_train_s = pd.read_csv("Data/generated_splits/X1_train_s.csv")
y1_train = pd.read_csv("Data/generated_splits/y1_train")

X1_test_s = pd.read_csv("Data/generated_splits/X1_test_s.csv")
y1_test = pd.read_csv("Data/generated_splits/y1_test")

#### 1.1.1 Set index Version 1

In [3]:
# rename column 1 from unnamed to index_number
colNms_X1_train = X1_train_s.columns.values
colNms_X1_train[0] = "index_number"
colNms_y1_train = y1_train.columns.values
colNms_y1_train[0] = "index_number"
colNms_X1_test = X1_test_s.columns.values
colNms_X1_test[0] = "index_number"
colNms_y1_test = y1_test.columns.values
colNms_y1_test[0] = "index_number"

# set index
X1_train_s = X1_train_s.set_index(["index_number"])
y1_train = y1_train.set_index(["index_number"])
X1_test_s = X1_test_s.set_index(["index_number"])
y1_test = y1_test.set_index(["index_number"])

### 1.2 Version 2 with ratios + seasonality and other market data

In [4]:
# import Data (already splitted to train/test-data and selected features-> bc_randomforest_feature_selection)
X2_train_s = pd.read_csv("Data/generated_splits/X2_train_s.csv")
y2_train = pd.read_csv("Data/generated_splits/y2_train")

X2_test_s = pd.read_csv("Data/generated_splits/X2_test_s.csv")
y2_test = pd.read_csv("Data/generated_splits/y2_test")

#### 1.2.1 Set index Version 2

In [5]:
# rename column 1 from unnamed to index_number
colNms_X2_train = X2_train_s.columns.values
colNms_X2_train[0] = "index_number"
colNms_y2_train = y2_train.columns.values
colNms_y2_train[0] = "index_number"
colNms_X2_test = X2_test_s.columns.values
colNms_X2_test[0] = "index_number"
colNms_y2_test = y2_test.columns.values
colNms_y2_test[0] = "index_number"

# set index
X2_train_s = X2_train_s.set_index(["index_number"])
y2_train = y2_train.set_index(["index_number"])
X2_test_s = X2_test_s.set_index(["index_number"])
y2_test = y2_test.set_index(["index_number"])

# 2. Logistic Regressions

## 2.1 Version 1

### 2.1.1 Logistic Regression with only one exogen variable

#### Logistic Regression

In [6]:
## Logistic Regression (with most important feature from feature selection)
# Assign features to matrix X and response to y
X = sm.add_constant(X1_train_s.iloc[:, 0:1])
y = y1_train

logReg = sm.Logit(endog = y, exog= X).fit()

Optimization terminated successfully.
         Current function value: 0.685746
         Iterations 5


In [7]:
# Workaround solution for error ("AttributeError: module 'scipy.stats' has no attribute 'chisqprob'")
from scipy import stats
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)

#### Summary

In [8]:
# LogReg with only one feature as exogen variable
print(logReg.summary())

                           Logit Regression Results                           
Dep. Variable:                      0   No. Observations:                 2836
Model:                          Logit   Df Residuals:                     2834
Method:                           MLE   Df Model:                            1
Date:                Mon, 09 Apr 2018   Pseudo R-squ.:               0.0005333
Time:                        14:44:01   Log-Likelihood:                -1944.8
converged:                       True   LL-Null:                       -1945.8
                                        LLR p-value:                    0.1497
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.1839      0.055      3.349      0.001       0.076       0.291
CAPEI          0.0027      0.002      1.363      0.173      -0.001       0.007


#### Plot

In [None]:
## Plotting
# Create df with sorted results
res = pd.DataFrame()
res["CAPEI"] = X1_train_s.CAPEI
res["pred"] = logReg.predict()
res = res.sort_values("CAPEI")
#colNms_X1_train[1]
# Plot scatter and log.Reg
plt.figure(figsize =(8,5))
plt.scatter(X1_train_s.CAPEI, y1_train, marker =".")
plt.plot(res.CAPEI, res.pred, c = "k")
plt.axhline(y=0, color = "gray", linestyle = "dashed")
plt.axhline(y=1, color = "gray", linestyle = "dashed")
plt.ylabel("Probability of up", fontsize =12)
plt.xlabel("CAPEI", fontsize =12);

### 2.1.2 Multiple Logistic Regression with all selected features

#### Logistic Regression

In [9]:
## Multiple Log. Regression (with all 15 features from feature selection)
# Assign features to X and response vector y
X = sm.add_constant(X1_train_s)
y = y1_train

#### Summary

In [10]:
# LogReg 1
print("Multiple Logistic Regression with all selected features")
print(78*"_")
print("")
# Run Log.Reg
logReg = sm.Logit(endog = y, exog=X).fit()
print(logReg.summary().tables[0])
print(logReg.summary().tables[1])

Multiple Logistic Regression with all selected features
______________________________________________________________________________

Optimization terminated successfully.
         Current function value: 0.669582
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                      0   No. Observations:                 2836
Model:                          Logit   Df Residuals:                     2820
Method:                           MLE   Df Model:                           15
Date:                Mon, 09 Apr 2018   Pseudo R-squ.:                 0.02409
Time:                        14:44:15   Log-Likelihood:                -1898.9
converged:                       True   LL-Null:                       -1945.8
                                        LLR p-value:                 1.962e-13
                     coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------

In [None]:
# compare LogReg with only one feature as exogen variable & LogReg 1 
# (explicitly Log-Likelihood values-> is there an improvement? (smaller values are prefered!))

In [None]:
# Check for non significant values (p-value > 0.05) and the Log-Likelihood value

### 2.1.3 Multiple Logistic Regression with only significant features

In [None]:
# apply an other logistic regression on a transformed dataset with only all significant values from LogReg 1 (above)

In [11]:
# extracting significant features with an alpha-boundery of 0.05
sign_features = (colNms_X1_train[np.where(logReg.pvalues < 0.05)])

In [12]:
print(sign_features)

['pcf' 'divyield' 'evm' 'bm' 'pe_op_dil' 'pe_op_basic' 'ptb'
 'aftret_equity']


#### Logistic Regression

In [13]:
## Multiple Log. Regression (with significant features from logreg above)
# Assign features to X and response vector y
X = sm.add_constant(X1_train_s[sign_features])
y = y1_train

#### Summary

In [15]:
# LogReg 2
print("Multiple Logistic Regression with selected significant features")
print(78*"_")
print("")
# Run Log.Reg
logReg = sm.Logit(endog = y, exog=X).fit()
print(logReg.summary().tables[0])
print(logReg.summary().tables[1])

Multiple Logistic Regression with selected significant features
______________________________________________________________________________

Optimization terminated successfully.
         Current function value: 0.670545
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                      0   No. Observations:                 2836
Model:                          Logit   Df Residuals:                     2827
Method:                           MLE   Df Model:                            8
Date:                Mon, 09 Apr 2018   Pseudo R-squ.:                 0.02269
Time:                        14:44:47   Log-Likelihood:                -1901.7
converged:                       True   LL-Null:                       -1945.8
                                        LLR p-value:                 1.031e-15
                    coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------

In [None]:
# compare LogReg 1 & 2 (explicitly Log-Likelihood values-> is there an improvement? (smaller values are prefered!))

In [None]:
# Does the logistic regression support our choice in feature selection (with random forest)?

## 2.2 Version 2

### 2.2.1 Logistic Regression with only one exogen variable

#### Logistic Regression

In [16]:
## Logistic Regression (with most important feature from feature selection)
# Assign features to matrix X and response to y
X = sm.add_constant(X2_train_s.iloc[:, 0:1])
y = y2_train

logReg = sm.Logit(endog = y, exog= X).fit()

Optimization terminated successfully.
         Current function value: 0.057065
         Iterations 13


In [17]:
# Workaround solution for error ("AttributeError: module 'scipy.stats' has no attribute 'chisqprob'")
from scipy import stats
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)

#### Summary

In [18]:
# LogReg with only one feature as exogen variable
print(logReg.summary())

                           Logit Regression Results                           
Dep. Variable:                      0   No. Observations:                 2836
Model:                          Logit   Df Residuals:                     2834
Method:                           MLE   Df Model:                            1
Date:                Mon, 09 Apr 2018   Pseudo R-squ.:                  0.9168
Time:                        14:45:58   Log-Likelihood:                -161.84
converged:                       True   LL-Null:                       -1945.8
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.9371      0.163     -5.738      0.000      -1.257      -0.617
RET          392.1753     29.763     13.177      0.000     333.842     450.509

Possibly complete quasi-separation: A fraction 0.68

### 2.2.2 Multiple Logistic Regression with all selected features

#### Logistic Regression

In [None]:
## Multiple Log. Regression (tried with all 15 features from feature selection, but it gives an error because it seems that
# there are dependent columns)
# Assign features to X and response vector y
#X = sm.add_constant(X2_train_s)
#y = y2_train

In [19]:
# check for all independent columns
import sympy 
reduced_form, inds = sympy.Matrix(X2_train_s.values).rref()
reduced_form

Matrix([
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   0],
[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   0],
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   0],
[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   0],
[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,   0],
[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,   0],
[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,   0],
[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,   0],
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,   0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,   0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1.0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,   0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,   0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,   0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   0],
[0, 0, 0, 0

In [20]:
# independent columns
inds

(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13)

In [44]:
# Assign features to X and response vector y-> because of inds only until column 14
X = sm.add_constant(X2_train_s.iloc[:, 0:14])
y = y2_train

#### Summary

In [45]:
# LogReg A
print("Multiple Logistic Regression with all selected features")
print(78*"_")
print("")
# Run Log.Reg
logRegA = sm.Logit(endog = y, exog= X).fit()
print(logRegA.summary().tables[0])
print(logRegA.summary().tables[1])

Multiple Logistic Regression with all selected features
______________________________________________________________________________

Optimization terminated successfully.
         Current function value: 0.038523
         Iterations 22
                           Logit Regression Results                           
Dep. Variable:                      0   No. Observations:                 2836
Model:                          Logit   Df Residuals:                     2821
Method:                           MLE   Df Model:                           14
Date:                Mon, 09 Apr 2018   Pseudo R-squ.:                  0.9439
Time:                        15:07:00   Log-Likelihood:                -109.25
converged:                       True   LL-Null:                       -1945.8
                                        LLR p-value:                     0.000
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------

In [None]:
# compare LogReg with only one feature as exogen variable & LogReg A 
# (explicitly Log-Likelihood values-> is there an improvement? (smaller values are prefered!))

In [None]:
# Check for non significant values (p-value > 0.05) and the Log-Likelihood value

### 2.2.3 Multiple Logistic Regression with only significant features

In [23]:
# extracting significant features with an alpha-boundery of 0.05
sign_features = (colNms_X2_train[np.where(logReg.pvalues < 0.05)])

In [24]:
print(sign_features)

['RETX']


#### Logistic Regression

In [25]:
## Multiple Log. Regression (with significant features from logreg above)
# Assign features to X and response vector y
X = sm.add_constant(X2_train_s[sign_features])
y = y2_train

#### Summary

In [27]:
# LogReg B
print("Multiple Logistic Regression with selected significant features")
print(78*"_")
print("")
# Run Log.Reg
logReg = sm.Logit(endog = y, exog=X).fit()
print(logReg.summary().tables[0])
print(logReg.summary().tables[1])

Multiple Logistic Regression with selected significant features
______________________________________________________________________________

Optimization terminated successfully.
         Current function value: 0.039538
         Iterations 14
                           Logit Regression Results                           
Dep. Variable:                      0   No. Observations:                 2836
Model:                          Logit   Df Residuals:                     2834
Method:                           MLE   Df Model:                            1
Date:                Mon, 09 Apr 2018   Pseudo R-squ.:                  0.9424
Time:                        14:47:21   Log-Likelihood:                -112.13
converged:                       True   LL-Null:                       -1945.8
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------

In [None]:
# compare LogReg A & B (explicitly Log-Likelihood values-> is there an improvement? (smaller values are prefered!))

In [None]:
# Does the logistic regression support our choice in feature selection (with random forest)?

## 3. Prediction

### 3.1 Version 1 with best LogReg Model

In [37]:
modelA = LogisticRegression()
modelA.fit(X1_train_s, y1_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [38]:
expected = y1_test
predicted = modelA.predict(X1_test_s)

In [39]:
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

             precision    recall  f1-score   support

          0       0.61      0.21      0.31       307
          1       0.60      0.90      0.72       403

avg / total       0.60      0.60      0.54       710

[[ 64 243]
 [ 41 362]]


In [48]:
# loglikelihood for "regression" of predicted on expected
from sklearn.metrics import log_loss

log_loss(expected, predicted)

13.815784223000241

### 3.2 Version 2 with best LogReg Model

In [49]:
modelB = LogisticRegression()
modelB.fit(X2_train_s, y2_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [50]:
expected = y2_test
predicted = modelB.predict(X2_test_s)

In [51]:
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

             precision    recall  f1-score   support

          0       0.00      0.00      0.00       307
          1       0.57      1.00      0.72       403

avg / total       0.32      0.57      0.41       710

[[  0 307]
 [  0 403]]


  'precision', 'predicted', average, warn_for)


In [52]:
# loglikelihood for "regression" of predicted on expected
from sklearn.metrics import log_loss

log_loss(expected, predicted)

14.934718069927676