# (Multiple) Logistic Regression

In [1]:
# Load Packages
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and later
# releases dropped the old names, so use whichever name this installation provides.
plt.style.use(
    "seaborn-v0_8-whitegrid"
    if "seaborn-v0_8-whitegrid" in plt.style.available
    else "seaborn-whitegrid"
)
%matplotlib inline

  from pandas.core import datetools


## 1. Import Standardized Data

### 1.1 Version 1 with only ratios as predictive features

In [4]:
# Load the Version 1 data: already split into train/test sets and reduced to the
# selected features (-> bc_randomforest_feature_selection)
X1_train_s, X1_test_s = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Data/generated_splits/X1_train_s.csv",
        "Data/generated_splits/X1_test_s.csv",
    )
)
y1_train, y1_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Data/generated_splits/y1_train_s.csv",
        "Data/generated_splits/y1_test_s.csv",
    )
)

#### 1.1.1 Set index Version 1

### 1.2 Version 2 with ratios + seasonality and other market data

In [10]:
# Load the Version 2 data: already split into train/test sets and reduced to the
# selected features (-> bc_randomforest_feature_selection)
X2_train_s, X2_test_s = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Data/generated_splits/X2_train_s.csv",
        "Data/generated_splits/X2_test_s.csv",
    )
)
y2_train, y2_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Data/generated_splits/y2_train_s.csv",
        "Data/generated_splits/y2_test_s.csv",
    )
)



#### 1.2.1 Set index Version 2

## 2. Logistic Regressions

## 2.1 Version 1

### 2.1.1 Logistic Regression with only one exogenous variable

#### Logistic Regression

In [12]:
## Single-predictor logistic regression (first column = most important feature from feature selection)
# Response vector, then the design matrix: intercept plus the first feature column
y = y1_train
X = sm.add_constant(X1_train_s.iloc[:, :1])
display(X)

logReg = sm.Logit(endog=y, exog=X).fit()

Unnamed: 0,const,CAPEI
0,1.0,20.773
1,1.0,25.089
2,1.0,12.337
3,1.0,14.924
4,1.0,12.689
5,1.0,16.056
6,1.0,10.904
7,1.0,27.523
8,1.0,12.393
9,1.0,18.379


Optimization terminated successfully.
         Current function value: 0.685746
         Iterations 5


In [None]:
# Workaround for "AttributeError: module 'scipy.stats' has no attribute 'chisqprob'":
# recreate the missing function from the chi-square survival function.
# Only patch when the attribute is really absent, so a SciPy build that still
# provides chisqprob keeps its own implementation.
from scipy import stats

if not hasattr(stats, "chisqprob"):
    stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)

#### Summary

In [None]:
# Summary of the model that uses a single feature as exogenous variable
single_feature_summary = logReg.summary()
print(single_feature_summary)

#### Plot

In [None]:
## Plot the fitted probability curve over the observed outcomes
# In-sample predictions, ordered by CAPEI so the curve is drawn from left to right
res = pd.DataFrame(
    {"CAPEI": X1_train_s.CAPEI, "pred": logReg.predict()}
).sort_values("CAPEI")

# Scatter of the raw data with the logistic curve on top
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(X1_train_s.CAPEI, y1_train, marker=".")
ax.plot(res.CAPEI, res.pred, c="k")
# Dashed guide lines at y = 0 and y = 1
for bound in (0, 1):
    ax.axhline(y=bound, color="gray", linestyle="dashed")
ax.set_ylabel("Probability of up", fontsize=12)
ax.set_xlabel("CAPEI", fontsize=12);

### 2.1.2 Multiple Logistic Regression with all selected features

#### Logistic Regression

In [None]:
## Multiple logistic regression on all features kept by the feature selection
# Response vector first, then the full feature matrix with an added intercept column
y = y1_train
X = sm.add_constant(X1_train_s)

#### Summary

In [None]:
# LogReg 1: header, fit, then the first two summary tables
print("Multiple Logistic Regression with all selected features")
print("_" * 78)
print("")
# Fit the model on the design matrix prepared in the previous cell
logReg = sm.Logit(endog=y, exog=X).fit()
# Build the summary once and print its first two tables
logreg1_tables = logReg.summary().tables
for table in logreg1_tables[:2]:
    print(table)

In [None]:
# Compare the LogReg with only one feature as exogenous variable and LogReg 1
# (look at the Log-Likelihood values: is there an improvement? Higher values, i.e. closer to zero, are preferred.)

In [None]:
# Check for non significant values (p-value > 0.05) and the Log-Likelihood value

### 2.1.3 Multiple Logistic Regression with only significant features

In [None]:
# Fit another logistic regression on a reduced dataset that contains only the significant features from LogReg 1 (above)

In [None]:
# Extract the features that are significant at the alpha = 0.05 level.
# logReg.pvalues is labelled with the design-matrix column names, so the feature
# names are read from its index instead of an external column-name array.
# The intercept ("const") is dropped first: it is not a column of X1_train_s and
# would otherwise shift every position by one.
feature_pvalues = logReg.pvalues.drop("const", errors="ignore")
sign_features = feature_pvalues[feature_pvalues < 0.05].index

In [None]:
# Show the names of the features selected by the p-value < 0.05 filter
print(sign_features)

#### Logistic Regression

In [None]:
## Multiple logistic regression restricted to the significant features found above
# Response vector first, then the reduced feature matrix with an added intercept column
y = y1_train
X = sm.add_constant(X1_train_s[sign_features])

#### Summary

In [None]:
# LogReg 2: header, fit, then the first two summary tables
print("Multiple Logistic Regression with selected significant features")
print("_" * 78)
print("")
# Fit the model on the reduced design matrix prepared in the previous cell
logReg = sm.Logit(endog=y, exog=X).fit()
# Build the summary once and print its first two tables
logreg2_tables = logReg.summary().tables
for table in logreg2_tables[:2]:
    print(table)

In [None]:
# Compare LogReg 1 and LogReg 2 (look at the Log-Likelihood values: is there an improvement? Higher values, i.e. closer to zero, are preferred.)

In [None]:
# Does the logistic regression support our choice in feature selection (with random forest)?

## 2.2 Version 2

### 2.2.1 Logistic Regression with only one exogenous variable

#### Logistic Regression

In [None]:
## Single-predictor logistic regression (first column = most important feature from feature selection)
# Response vector, then the design matrix: intercept plus the first feature column
y = y2_train
X = sm.add_constant(X2_train_s.iloc[:, :1])

logReg = sm.Logit(endog=y, exog=X).fit()

In [None]:
# Workaround for "AttributeError: module 'scipy.stats' has no attribute 'chisqprob'":
# recreate the missing function from the chi-square survival function.
# Only patch when the attribute is really absent, so re-running this cell (or a
# SciPy build that still provides chisqprob) does not replace an existing function.
from scipy import stats

if not hasattr(stats, "chisqprob"):
    stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)

#### Summary

In [None]:
# Summary of the model that uses a single feature as exogenous variable
single_feature_summary = logReg.summary()
print(single_feature_summary)

### 2.2.2 Multiple Logistic Regression with all selected features

#### Logistic Regression

In [None]:
## Multiple Log. Regression (tried with all 15 features from feature selection, but it gives an error because it seems that
# there are dependent columns)
# Assign features to X and response vector y
#X = sm.add_constant(X2_train_s)
#y = y2_train

In [13]:
# Check which columns of X2_train_s are linearly independent:
# rref() returns the reduced row-echelon form together with the pivot column indices
import sympy
design_matrix = sympy.Matrix(X2_train_s.values)
reduced_form, inds = design_matrix.rref()
reduced_form

Matrix([
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   0],
[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   0],
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   0],
[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   0],
[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,   0],
[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,   0],
[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,   0],
[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,   0],
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,   0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,   0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1.0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,   0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,   0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,   0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   0],
[0, 0, 0, 0

In [14]:
# independent columns: the pivot column indices returned by rref() above
inds

(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13)

In [15]:
# Assign features to X and response vector y.
# Only the linearly independent columns are kept. They are selected through the
# pivot indices `inds` from the rref check instead of a hard-coded 0:14 slice, so
# the selection stays correct even if the dependent column is not the last one.
X = sm.add_constant(X2_train_s.iloc[:, list(inds)])
y = y2_train

#### Summary

In [16]:
# LogReg A: header, fit, then the first two summary tables
print("Multiple Logistic Regression with all selected features")
print("_" * 78)
print("")
# Fit the model on the design matrix prepared in the previous cell
logRegA = sm.Logit(endog=y, exog=X).fit()
# Build the summary once and print its first two tables
logregA_tables = logRegA.summary().tables
for table in logregA_tables[:2]:
    print(table)

Multiple Logistic Regression with all selected features
______________________________________________________________________________

Optimization terminated successfully.
         Current function value: 0.038523
         Iterations 22


AttributeError: module 'scipy.stats' has no attribute 'chisqprob'

In [None]:
# Compare the LogReg with only one feature as exogenous variable and LogReg A
# (look at the Log-Likelihood values: is there an improvement? Higher values, i.e. closer to zero, are preferred.)

# Check for non significant values (p-value > 0.05) and the Log-Likelihood value

### 2.2.3 Multiple Logistic Regression with only significant features

In [None]:
# Extract the features that are significant at the alpha = 0.05 level.
# The p-values must come from LogReg A (the multiple regression fitted above);
# `logReg` still refers to the single-feature model at this point.
# The p-values are labelled with the design-matrix column names, so the feature
# names are read from that index; the intercept ("const") is dropped because it
# is not a column of X2_train_s.
feature_pvalues = logRegA.pvalues.drop("const", errors="ignore")
sign_features = feature_pvalues[feature_pvalues < 0.05].index
print(sign_features)

#### Logistic Regression

In [None]:
## Multiple logistic regression restricted to the significant features found above
# Response vector first, then the reduced feature matrix with an added intercept column
y = y2_train
X = sm.add_constant(X2_train_s[sign_features])

#### Summary

In [None]:
# LogReg B: header, fit, then the first two summary tables
print("Multiple Logistic Regression with selected significant features")
print("_" * 78)
print("")
# Fit the model on the reduced design matrix prepared in the previous cell
logReg = sm.Logit(endog=y, exog=X).fit()
# Build the summary once and print its first two tables
logregB_tables = logReg.summary().tables
for table in logregB_tables[:2]:
    print(table)

In [None]:
# Compare LogReg A and LogReg B (look at the Log-Likelihood values: is there an improvement? Higher values, i.e. closer to zero, are preferred.)
# Does the logistic regression support our choice in feature selection (with random forest)?

## 3. Prediction

### 3.1 Version 1 with best LogReg Model

In [None]:
# Fit scikit-learn's logistic regression on the Version 1 training features.
# y1_train is a DataFrame (loaded with read_csv, presumably a single label column);
# flatten it to 1-D, the target shape scikit-learn expects, which avoids the
# column-vector conversion warning.
modelA = LogisticRegression()
modelA.fit(X1_train_s, np.ravel(y1_train))

In [None]:
# Class predictions of modelA for the Version 1 test set, next to the true labels
predicted = modelA.predict(X1_test_s)
expected = y1_test

In [None]:
# Print the classification report, then the confusion matrix (true labels vs. predictions)
for summarize in (metrics.classification_report, metrics.confusion_matrix):
    print(summarize(expected, predicted))

In [None]:
# Log loss (negative mean log-likelihood) of modelA on the Version 1 test set.
# log_loss expects probability estimates, so it is fed predict_proba output
# instead of the hard class labels stored in `predicted`.
from sklearn.metrics import log_loss

# np.ravel flattens the label DataFrame into the 1-D shape of true labels
log_loss(np.ravel(expected), modelA.predict_proba(X1_test_s))

### 3.2 Version 2 with best LogReg Model

In [None]:
# Fit scikit-learn's logistic regression on the Version 2 training features.
# y2_train is a DataFrame (loaded with read_csv, presumably a single label column);
# flatten it to 1-D, the target shape scikit-learn expects, which avoids the
# column-vector conversion warning.
modelB = LogisticRegression()
modelB.fit(X2_train_s, np.ravel(y2_train))

In [None]:
# Class predictions of modelB for the Version 2 test set, next to the true labels
predicted = modelB.predict(X2_test_s)
expected = y2_test

In [None]:
# Print the classification report, then the confusion matrix (true labels vs. predictions)
for summarize in (metrics.classification_report, metrics.confusion_matrix):
    print(summarize(expected, predicted))

In [None]:
# Log loss (negative mean log-likelihood) of modelB on the Version 2 test set.
# log_loss expects probability estimates, so it is fed predict_proba output
# instead of the hard class labels stored in `predicted`.
from sklearn.metrics import log_loss

# np.ravel flattens the label DataFrame into the 1-D shape of true labels
log_loss(np.ravel(expected), modelB.predict_proba(X2_test_s))