# Logistic Regression 

In [None]:
# Notebook-wide setup: silence warnings, configure plotting defaults, and
# import the numerical libraries whose versions are printed at the end.

# The blanket "ignore" filter hides every warning; the two simplefilter calls
# additionally name UserWarning and FutureWarning explicitly.
# NOTE(review): this also hides deprecation notices from pandas/statsmodels.
import warnings; 
warnings.filterwarnings("ignore")
warnings.simplefilter(action="ignore",category=UserWarning)
warnings.simplefilter(action="ignore",category=FutureWarning)

# Matplotlib defaults: muted seaborn style sheet, square 5x5 figures,
# dotted grid lines (when a grid is drawn), grid switched off by default.
import matplotlib.pyplot as plt
#plt.style.use('ggplot')
plt.style.use('seaborn-v0_8-muted')
plt.rcParams['figure.figsize'] = (5, 5)
plt.rcParams['grid.linestyle'] = ':'   
plt.rcParams['axes.grid'] = False

# Seaborn "whitegrid" theme with the axes grid overridden to off.
import seaborn as sns
sns.set_style("whitegrid", {'axes.grid' : False})
#sns.color_palette("RdBu", n_colors=10)

# Interactive plots embedded within the notebook
#%matplotlib notebook 
# Static images of plots embedded within the notebook
# %matplotlib inline   
# Request both PNG and retina output formats from the inline backend.
%config InlineBackend.figure_formats = {'png', 'retina'}

# Display the value of every expression statement in a cell, not only the
# last one; later cells rely on this to show several results per cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Here `sm` is the top-level statsmodels package (used for __version__ below);
# a later cell rebinds `sm` to statsmodels.api.
import statsmodels as sm


#pd.options.plotting.backend = "plotly" 
# Conflict with options in original matplotlib.

# Record library versions for reproducibility.
print('Numpy version', np.__version__)
print('Pandas version', pd.__version__)
print('Seaborn version', sns.__version__)
print('Statsmodels version', sm.__version__)

## Binary logistic regression

In [3]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

### Financial status data

In [4]:
# Read the Status label and the predictors X1-X3 from the 'Financial' sheet;
# the first row of the sheet supplies the column names.
fin_df = pd.read_excel(
    'data/supervised-learning.xlsx',
    sheet_name='Financial',
    header=0,
    usecols=['Status', 'X1', 'X2', 'X3'],
)


Look at the first few rows and the summary information using `.head()` and `.info()`.

Determine the number of classes in the data by using `.value_counts()`.

In [None]:
# Tally how many rows carry each distinct Status label.
fin_df.loc[:, 'Status'].value_counts()

Target variable encoding

In [7]:
# Encode the target as an integer column: 1 = solvent, 0 = bankrupt.
# `.map` builds the integer column directly instead of relying on
# `Series.replace` silently downcasting an object column, which newer pandas
# versions deprecate (and the notebook's warning filters would hide).
fin_df['Y'] = fin_df['Status'].map({"C1-Solvent": 1, "C2-Bankrupt": 0})

# `.map` turns any label missing from the dictionary into NaN; fail loudly
# here rather than letting the model fit on (or silently drop) such rows.
assert fin_df['Y'].notna().all(), "Unexpected label found in 'Status'"

### Model fitting using statsmodels

Fit the model by using the R-style formula.  

In [None]:
import statsmodels.formula.api as smf

# R-style model specification: Y explained by X1, X2 and X3.
formula = 'Y ~ X1+X2+X3'

# Build the logit model from the formula, then fit it.
logit_spec = smf.logit(formula=formula, data=fin_df)
solvent_model = logit_spec.fit()

# Show the estimated coefficients.
solvent_model.params

Predict the probability that a company will be solvent after 2 years.  
Calling `predict()` with no argument returns the probabilities of being solvent.

To make predictions from a model fitted using the R-style formula, the input must have columns that match the variables in the formula.

In [None]:
# Predictor-only frame: everything except the encoded target and raw label.
X_fin = fin_df.drop(columns=['Y', 'Status'])

# Pick the rows at positions 20 and 21 and display them.
sample_rows = X_fin.iloc[20:22]
sample_rows

# Predicted probabilities for those two rows from the formula-fitted model.
predictions = solvent_model.predict(sample_rows)
print(predictions)

Fit the model using the API style (all columns of the inputs are taken).

In [None]:
import statsmodels.api as sm

# Response vector and design matrix for the array-style API.
y_fin = fin_df['Y']
predictor_cols = fin_df.drop(columns=['Y', 'Status'])

# The API style does not add an intercept on its own, so add a constant column.
X_reg = sm.add_constant(predictor_cols)

# Note: this rebinds `solvent_model`, replacing the formula-fitted model.
logit_api = sm.Logit(y_fin, X_reg)
solvent_model = logit_api.fit()

# Show the estimated coefficients.
solvent_model.params

A model fitted using the API style also accepts NumPy arrays as prediction inputs; the array must include the constant column.

In [None]:
# Rows at positions 20 and 21 of the predictor frame, displayed for reference.
rows_20_21 = X_fin.iloc[20:22]
rows_20_21

# DataFrame input: add the constant column explicitly before predicting.
solvent_model.predict(sm.add_constant(rows_20_21, has_constant='add'))

# Raw array input: each row starts with the constant 1, followed by the
# predictor values in the same column order as the design matrix.
manual_rows = np.array([
    [1, 31, 4.2, 0.3],
    [1, 19.2, 1.3, 1.1],
])
solvent_model.predict(manual_rows)

### Model inference
Show the inferential statistics of the fitted coefficients using `.summary2()`.  
The model should be refitted if the intercept is not significant.

In [None]:
# Pull the fit statistics off the results object first, then report them.
loglik_full = solvent_model.llf
loglik_null = solvent_model.llnull
n_model_df = int(solvent_model.df_model)

print(f'Log-likelihood of full model: {loglik_full:.4f}')
print(f'Log-likelihood of null model: {loglik_null:.4f}')
print(f'# model parameters: {n_model_df}')

### Generate classification report and confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Fitted probabilities of being solvent for every row used in the fit
# (predict() with no argument evaluates the model on its own data).
predictions = solvent_model.predict()

# Turn probabilities into 0/1 labels with a single rule: class 1 when the
# probability exceeds 0.5. The earlier list-comprehension version was
# immediately overwritten and treated a probability of exactly 0.5
# differently, so only this vectorised form is kept and the probabilities
# are computed once.
predictions_nominal = (predictions > 0.5).astype("int32")

# Compare the predicted labels with the encoded target.
print(confusion_matrix(fin_df["Y"], predictions_nominal))
print(classification_report(fin_df["Y"], predictions_nominal, digits = 3))

## Logistic regression using Sklearn

In [None]:
# Reload the Financial sheet into fin_df (this discards the 'Y' column added
# earlier), then separate the label from the predictors.
fin_df = pd.read_excel(
    'data/supervised-learning.xlsx',
    sheet_name='Financial',
    header=0,
    usecols=['Status', 'X1', 'X2', 'X3'],
)

# Predictors: every column except the label; target: the raw Status label.
X = fin_df.drop(columns='Status')
y = fin_df['Status']


from sklearn.linear_model import LogisticRegression

# Unpenalised logistic regression with an intercept; fit() returns the
# estimator itself, so construction and fitting are chained.
logreg_clf = LogisticRegression(penalty=None, fit_intercept=True).fit(X, y)

# Display the fitted estimator.
logreg_clf

Make sample predictions

In [None]:
# Take the first ten rows once and reuse them for both calls.
first_ten = X.iloc[:10]

# Predicted class labels.
logreg_clf.predict(first_ten)

# Predicted class probabilities, rounded to three decimals.
logreg_clf.predict_proba(first_ten).round(3)

Determine the accuracy, confusion matrix, and classification report

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Accuracy on the training data, as reported by the estimator itself.
y_pred_accuracy = logreg_clf.score(X, y)

# Predicted labels for the same data, used for the tables below.
y_pred = logreg_clf.predict(X)

print(f'Accuracy of logistic regression classifier on train set: {y_pred_accuracy:.2f}')

# Confusion matrix followed by the classification report.
conf_mat = confusion_matrix(y, y_pred)
print(conf_mat)

report_text = classification_report(y, y_pred)
print(report_text)