# (Multiple) Logistic Regression

In [None]:
# Load Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and later
# removed the old names; prefer the new name and fall back on older Matplotlib.
_GRID_STYLE = "seaborn-v0_8-whitegrid"
if _GRID_STYLE not in plt.style.available:
    _GRID_STYLE = "seaborn-whitegrid"
plt.style.use(_GRID_STYLE)
%matplotlib inline

## 1. Import Standardized Data

### 1.1 Version 1 with only ratios as predictive features

In [None]:
# Load the version-1 training data (ratio features only). The train/test split and the
# feature selection were done beforehand (-> bc_randomforest_feature_selection).
X1_train_s = pd.read_csv("Data/generated_splits/X1_train.csv")
y1_train = pd.read_csv("Data/generated_splits/y1_train.csv")

# Key both frames by (PERMNO, DATE) so the identifier columns are not treated as
# features. The frame just loaded is `X1_train_s`; indexing the undefined name
# `X1_train` raised a NameError.
X1_train_s = X1_train_s.set_index(["PERMNO", "DATE"])
y1_train = y1_train.set_index(["PERMNO", "DATE"])

# Quick look at the data (a notebook cell only renders its last expression)
X1_train_s.head(3)
y1_train.head(3)

### 1.2 Version 2 with ratios + seasonality and other market data

In [None]:
# Load the version-2 training data (ratios + seasonality and other market data). The
# train/test split and the feature selection were done beforehand
# (-> bc_randomforest_feature_selection).
X2_train_s = pd.read_csv("Data/generated_splits/X2_train.csv")
y2_train = pd.read_csv("Data/generated_splits/y2_train.csv")

# Key both frames by (PERMNO, DATE) so the identifier columns are not treated as
# features. The frame just loaded is `X2_train_s`; indexing the undefined name
# `X2_train` raised a NameError.
X2_train_s = X2_train_s.set_index(["PERMNO", "DATE"])
y2_train = y2_train.set_index(["PERMNO", "DATE"])

# Quick look at the data (a notebook cell only renders its last expression)
X2_train_s.head(3)
y2_train.head(3)

# 2. Logistic Regressions

## 2.1 Version 1

### 2.1.1 Logistic Regression with only one exogenous variable

#### Logistic Regression

In [None]:
## Single-feature logistic regression (most important feature from the feature selection)
# Response vector first, then the one-column design matrix with an added constant
y = y1_train
X = sm.add_constant(X1_train_s[["_"]])

# Specify the model and fit it in two separate steps
logit_spec = sm.Logit(endog=y, exog=X)
logReg = logit_spec.fit()

In [None]:
# Workaround for "AttributeError: module 'scipy.stats' has no attribute 'chisqprob'":
# re-create the missing function from the chi-squared survival function.
from scipy import stats


def _chisqprob(chisq, df):
    return stats.chi2.sf(chisq, df)


stats.chisqprob = _chisqprob

#### Summary

In [None]:
# Summary of the LogReg fitted with a single exogenous feature
single_feature_summary = logReg.summary()
print(single_feature_summary)

#### Plot

In [None]:
## Plot the fitted probability curve on top of the observed outcomes
# Pair each feature value with its in-sample prediction and sort by the feature,
# so the fitted curve is drawn from left to right
res = pd.DataFrame({"_": X1_train_s["_"], "pred": logReg.predict()}).sort_values("_")

# Scatter of the raw observations plus the fitted logistic curve
plt.figure(figsize=(8, 5))
plt.scatter(X1_train_s["_"], y1_train, marker=".")
plt.plot(res["_"], res["pred"], c="k")
# Dashed guide lines at the two probability bounds
for bound in (0, 1):
    plt.axhline(y=bound, color="gray", linestyle="dashed")
plt.ylabel("Probability of up", fontsize=12)
plt.xlabel("_", fontsize=12);

### 2.1.2 Multiple Logistic Regression with all selected features

#### Logistic Regression

In [None]:
## Multiple logistic regression on all 15 features from the feature selection
# Response vector first, then the full design matrix with an added constant
y = y1_train
X = sm.add_constant(X1_train_s)

#### Summary

In [None]:
# LogReg 1: banner, fit, then the first two summary tables
print("Multiple Logistic Regression with all selected features")
print("_" * 78)
print()
# Fit once and build the summary a single time
logReg = sm.Logit(endog=y, exog=X).fit()
full_summary = logReg.summary()
for table in full_summary.tables[:2]:
    print(table)

In [None]:
# Compare the LogReg with only one exogenous feature against LogReg 1
# (look at the Log-Likelihood values: is there an improvement? Higher values, i.e. closer to zero, indicate a better fit.)

In [None]:
# Check for non significant values (p-value > 0.05) and the Log-Likelihood value

### 2.1.3 Multiple Logistic Regression with only significant features

In [None]:
# Fit another logistic regression on a reduced dataset that contains only the significant features from LogReg 1 (above)

#### Logistic Regression

In [None]:
## Multiple logistic regression restricted to the significant features of the LogReg above
# Column names to keep, then the response vector and the reduced design matrix
significant_features = ["_", "_", "_", "_", "_"]
y = y1_train
X = sm.add_constant(X1_train_s[significant_features])

#### Summary

In [None]:
# LogReg 2: banner, fit, then the first two summary tables
print("Multiple Logistic Regression with selected features")
print("_" * 78)
print()
# Fit once and build the summary a single time
logReg = sm.Logit(endog=y, exog=X).fit()
reduced_summary = logReg.summary()
for table in reduced_summary.tables[:2]:
    print(table)

In [None]:
# Compare LogReg 1 & 2 (look at the Log-Likelihood values: is there an improvement? Higher values, i.e. closer to zero, indicate a better fit.)

In [None]:
# Does the logistic regression support our choice in feature selection (with random forest)?

## 2.2 Version 2

### 2.2.1 Logistic Regression with only one exogenous variable

#### Logistic Regression

In [None]:
## Single-feature logistic regression (most important feature from the feature selection)
# Response vector first, then the one-column design matrix with an added constant
y = y2_train
X = sm.add_constant(X2_train_s[["_"]])

# Specify the model and fit it in two separate steps
logit_spec = sm.Logit(endog=y, exog=X)
logReg = logit_spec.fit()

In [None]:
# Workaround for "AttributeError: module 'scipy.stats' has no attribute 'chisqprob'":
# re-create the missing function from the chi-squared survival function.
from scipy import stats


def _chisqprob(chisq, df):
    return stats.chi2.sf(chisq, df)


stats.chisqprob = _chisqprob

#### Summary

In [None]:
# Summary of the LogReg fitted with a single exogenous feature
single_feature_summary = logReg.summary()
print(single_feature_summary)

### 2.2.2 Multiple Logistic Regression with all selected features

#### Logistic Regression

In [None]:
## Multiple logistic regression on all 15 features from the feature selection
# Response vector first, then the full design matrix with an added constant
y = y2_train
X = sm.add_constant(X2_train_s)

#### Summary

In [None]:
# LogReg A: banner, fit, then the first two summary tables
print("Multiple Logistic Regression with all selected features")
print("_" * 78)
print()
# Fit once and build the summary a single time
logReg = sm.Logit(endog=y, exog=X).fit()
full_summary = logReg.summary()
for table in full_summary.tables[:2]:
    print(table)

In [None]:
# Compare the LogReg with only one exogenous feature against LogReg A
# (look at the Log-Likelihood values: is there an improvement? Higher values, i.e. closer to zero, indicate a better fit.)

In [None]:
# Check for non significant values (p-value > 0.05) and the Log-Likelihood value

### 2.2.3 Multiple Logistic Regression with only significant features

#### Logistic Regression

In [None]:
## Multiple logistic regression restricted to the significant features of the LogReg above
# Column names to keep, then the response vector and the reduced design matrix
significant_features = ["_", "_", "_", "_", "_"]
y = y2_train
X = sm.add_constant(X2_train_s[significant_features])

#### Summary

In [None]:
# LogReg B: banner, fit, then the first two summary tables
print("Multiple Logistic Regression with selected features")
print("_" * 78)
print()
# Fit once and build the summary a single time
logReg = sm.Logit(endog=y, exog=X).fit()
reduced_summary = logReg.summary()
for table in reduced_summary.tables[:2]:
    print(table)

In [None]:
# Compare LogReg A & B (look at the Log-Likelihood values: is there an improvement? Higher values, i.e. closer to zero, indicate a better fit.)

In [None]:
# Does the logistic regression support our choice in feature selection (with random forest)?