# Logistic Regression 

In [1]:
import warnings; 
warnings.filterwarnings("ignore")
warnings.simplefilter(action="ignore",category=UserWarning)
warnings.simplefilter(action="ignore",category=FutureWarning)

import matplotlib.pyplot as plt
#plt.style.use('ggplot')
plt.style.use('seaborn-v0_8-muted')
plt.rcParams['figure.figsize'] = (5, 5)
plt.rcParams['grid.linestyle'] = ':'   
plt.rcParams['axes.grid'] = False

import seaborn as sns
sns.set_style("whitegrid", {'axes.grid' : False})
#sns.color_palette("RdBu", n_colors=10)

# Interactive plots embedded within the notebook
#%matplotlib notebook 
# Static images of plots embedded within the notebook
# %matplotlib inline   
%config InlineBackend.figure_formats = {'png', 'retina'}

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels as sm


#pd.options.plotting.backend = "plotly" 
# Conflict with options in original matplotlib.

print('Numpy version', np.__version__)
print('Pandas version', pd.__version__)
print('Seaborn version', sns.__version__)
print('Statsmodels version', sm.__version__)

Numpy version 2.0.2
Pandas version 2.2.3
Seaborn version 0.13.2
Statsmodels version 0.14.2


## Binary logistic regression

In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

### Financial status data

In [3]:
# Read the 'Financial' sheet, keeping only the class label (Status) and the
# three predictors (X1, X2, X3); the first row of the sheet holds the headers.
fin_df = pd.read_excel(
    'data/supervised-learning.xlsx',
    sheet_name='Financial',
    usecols=['Status', 'X1', 'X2', 'X3'],
    header=0,
)


Look at the first few rows and the summary information using `.head()` and `.info()`.

Determine number of classes in the data by using `.value_counts()`

In [4]:
# Frequency of each class label; two balanced classes are expected here.
status_counts = fin_df['Status'].value_counts()
status_counts

Status
C1-Solvent     33
C2-Bankrupt    33
Name: count, dtype: int64

Target variable encoding

In [5]:
# Encode the binary target: 1 = solvent, 0 = bankrupt.
# `.map` is used instead of `.replace` because `.replace` on a string column
# relies on pandas silently downcasting the object result to integers, which is
# deprecated as of pandas 2.2. `.astype('int64')` makes the dtype explicit and
# raises immediately if a label is missing from the mapping (it would map to
# NaN) instead of leaving stray strings in the target column.
fin_df['Y'] = fin_df['Status'].map({"C1-Solvent": 1, "C2-Bankrupt": 0}).astype('int64')

### Model fitting using statsmodels

Fit the model by using the R-style formula.  

In [6]:
import statsmodels.formula.api as smf

# R-style formula: response on the left of '~', additive predictors on the
# right. The intercept term is added by the formula interface itself.
formula = 'Y ~ X1+X2+X3'

# Build the model specification first, then estimate it by maximum likelihood.
logit_spec = smf.logit(formula=formula, data=fin_df)
solvent_model = logit_spec.fit()

# Fitted coefficients (Intercept, X1, X2, X3).
solvent_model.params

Optimization terminated successfully.
         Current function value: 0.044037
         Iterations 14


Intercept   -10.153452
X1            0.331247
X2            0.180876
X3            5.087466
dtype: float64

Predict the probability that a company will be solvent after 2 years.  
Calling `predict()` with no argument returns the probabilities of being solvent.

To make predictions from model fitted using the R-style formula, the input must have columns that match the variables in the formula.

In [7]:
# Predictor-only frame: the remaining column names (X1, X2, X3) are what the
# formula-based model looks up when predicting.
X_fin = fin_df.drop(columns=['Y', 'Status'])

# Show two example companies (rows 20 and 21), then their predicted
# probability of being solvent.
X_fin[20:22]
predictions = solvent_model.predict(X_fin[20:22])
print(predictions)

Unnamed: 0,X1,X2,X3
20,40.6,5.8,1.8
21,34.6,26.4,1.8


20    0.999999
21    1.000000
dtype: float64


Fit the model using the API style (all columns of the input are used as predictors).

In [8]:
import statsmodels.api as sm

# API style works on explicit arrays: the response vector and a design matrix.
# Unlike the formula interface, the intercept is not implicit, so a constant
# column is prepended with add_constant.
y_fin = fin_df['Y']
predictors = fin_df.drop(columns=['Y', 'Status'])
X_reg = sm.add_constant(predictors)

# NOTE: this rebinds `solvent_model` to the API-style fit; later cells that pass
# raw arrays (constant first) depend on this version of the model.
solvent_model = sm.Logit(y_fin, X_reg).fit()
solvent_model.params

Optimization terminated successfully.
         Current function value: 0.044037
         Iterations 14


const   -10.153452
X1        0.331247
X2        0.180876
X3        5.087466
dtype: float64

NumPy arrays can be passed directly to a model fitted with the API style; the columns must be in the same order as the design matrix used for fitting (constant first, then X1, X2, X3).

In [9]:
X_fin[20:22]

# DataFrame input: prepend the constant column explicitly.
# has_constant='add' forces the column to be added; presumably required here
# because X3 is identical in both rows and could otherwise be mistaken for an
# existing constant - TODO confirm against the statsmodels docs.
rows_with_const = sm.add_constant(X_fin[20:22], has_constant='add')
solvent_model.predict(rows_with_const)

# Raw array input: each row is [const, X1, X2, X3].
new_companies = np.array([[1, 31, 4.2, 0.3],
                          [1, 19.2, 1.3, 1.1]])
solvent_model.predict(new_companies)

Unnamed: 0,X1,X2,X3
20,40.6,5.8,1.8
21,34.6,26.4,1.8


20    0.999999
21    1.000000
dtype: float64

array([0.91691258, 0.88471307])

### Model inference
Show the inferential statistics of the fitted coefficients using `.summary2()`.  
The model should be refitted if the intercept is not significant.

In [10]:
# Goodness-of-fit ingredients taken from the fitted results object:
# llf = log-likelihood of the fitted model, llnull = log-likelihood of the
# intercept-only model, df_model = value reported by statsmodels as the model
# degrees of freedom.
full_loglik = solvent_model.llf
null_loglik = solvent_model.llnull
n_model_params = int(solvent_model.df_model)

print(f'Log-likelihood of full model: {full_loglik:.4f}')
print(f'Log-likelihood of null model: {null_loglik:.4f}')
print(f'# model parameters: {n_model_params}')

Log-likelihood of full model: -2.9065
Log-likelihood of null model: -45.7477
# model parameters: 3


### Generate classification report and confusion matrix

In [11]:
from sklearn.metrics import confusion_matrix, classification_report

# In-sample predicted probabilities of Y = 1 (solvent), computed once.
predictions = solvent_model.predict()

# Convert probabilities to class labels with a 0.5 cut-off (strictly greater
# than 0.5 -> class 1). The previous list-comprehension version was overwritten
# on the very next line and used a slightly different rule at exactly 0.5, so
# only the vectorised form that actually took effect is kept.
predictions_nominal = (predictions > 0.5).astype("int32")

# Rows of the confusion matrix are the true classes, columns the predicted ones.
print(confusion_matrix(fin_df["Y"], predictions_nominal))
print(classification_report(fin_df["Y"], predictions_nominal, digits = 3))

[[32  1]
 [ 1 32]]
              precision    recall  f1-score   support

           0      0.970     0.970     0.970        33
           1      0.970     0.970     0.970        33

    accuracy                          0.970        66
   macro avg      0.970     0.970     0.970        66
weighted avg      0.970     0.970     0.970        66



## Logistic regression using Sklearn

In [12]:
from sklearn.linear_model import LogisticRegression

# Reload the raw data into fin_df (this discards the derived 'Y' column);
# scikit-learn can use the string class labels in 'Status' directly.
fin_df = pd.read_excel(
    'data/supervised-learning.xlsx',
    sheet_name='Financial',
    usecols=['Status', 'X1', 'X2', 'X3'],
    header=0,
)

# Target = class label, features = every other column.
y = fin_df['Status']
X = fin_df.drop(columns='Status')

# Plain (unpenalised) logistic regression with an intercept.
logreg_clf = LogisticRegression(penalty=None, fit_intercept=True)
logreg_clf.fit(X, y)

Make sample predictions

In [13]:
# Predict labels and probabilities
logreg_clf.predict(X[:10])
logreg_clf.predict_proba(X[:10]).round(3)

array(['C1-Solvent', 'C1-Solvent', 'C1-Solvent', 'C1-Solvent',
       'C1-Solvent', 'C1-Solvent', 'C1-Solvent', 'C1-Solvent',
       'C1-Solvent', 'C1-Solvent'], dtype=object)

array([[1.   , 0.   ],
       [1.   , 0.   ],
       [0.961, 0.039],
       [1.   , 0.   ],
       [1.   , 0.   ],
       [1.   , 0.   ],
       [1.   , 0.   ],
       [1.   , 0.   ],
       [1.   , 0.   ],
       [1.   , 0.   ]])

Determine the accuracy, confusion matrix, and classification report

In [14]:
from sklearn.metrics import classification_report, confusion_matrix

# All metrics below are computed on the training data itself (no hold-out set).
y_pred = logreg_clf.predict(X)

# Mean accuracy of the classifier on (X, y).
y_pred_accuracy = logreg_clf.score(X, y)
print(f'Accuracy of logistic regression classifier on train set: {y_pred_accuracy:.2f}')

# Confusion matrix (rows = true class, columns = predicted class), followed by
# the per-class precision / recall / F1 table.
print(confusion_matrix(y, y_pred))
print(classification_report(y, y_pred))

Accuracy of logistic regression classifier on train set: 0.97
[[32  1]
 [ 1 32]]
              precision    recall  f1-score   support

  C1-Solvent       0.97      0.97      0.97        33
 C2-Bankrupt       0.97      0.97      0.97        33

    accuracy                           0.97        66
   macro avg       0.97      0.97      0.97        66
weighted avg       0.97      0.97      0.97        66



In [15]:
# One new company for the API-style statsmodels model, as a single row laid
# out as [const, X1, X2, X3].
single_company = np.array([[1, 7, 30, 1.2]])
solvent_model.predict(single_company)

array([0.97579056])