<a href="https://colab.research.google.com/github/manvirkaur84/manvirkaur/blob/main/docs/ml-concepts/207_ML_MSBA/SBACaseLogit%20Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [136]:
import sys
import sklearn
# Record the interpreter and scikit-learn versions this notebook was run with.
print(sys.version)
print(f'The scikit-learn version is {sklearn.__version__}.')

3.12.11 (main, Jun  4 2025, 08:56:18) [GCC 11.4.0]
The scikit-learn version is 1.6.1.


In [137]:
# Install dmba into the kernel's environment; the import cell below pulls
# classificationSummary, gainsChart, liftChart and AIC_score from it.
# NOTE(review): the version is not pinned — consider pinning for reproducibility.
%pip install dmba



# Section 1

Use the sklearn `LogisticRegression()` liblinear solver to reproduce the parameter (coefficient) estimates (up to 4 decimals) in Tables 7(a), 8, and 9 of this article, using the SBA case data `SBAcase.11.13.17.csv`.

In [138]:
%matplotlib inline
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pylab as plt
import seaborn as sns
from dmba import classificationSummary, gainsChart, liftChart
from dmba.metric import AIC_score
import math
from scipy.stats import chi2


# Directory that later cells read the input CSV from (DATA / '<file name>').
# NOTE(review): this is an absolute path — presumably the Colab runtime's
# sample_data folder; confirm the SBA case file has been uploaded there.
DATA = Path('/content/sample_data/')

In [145]:
# Code from Discussion Assignment #1 - sklearn LogisticRegression() liblinear solver
# Table 7(a) --> make this a subsection of Section 1

# Load the SBA case data and derive the binary target in one non-mutating chain.
# TARGET VARIABLE - 'Default' is a dummy variable derived from 'MIS_Status':
#   Default = 1 if MIS_Status == 'CHGOFF'
#   Default = 0 otherwise (intended for MIS_Status == 'PIF'; note that any other
#   value, including a missing one, is also coded as 0).
# 'MIS_Status' is dropped afterwards so only the numeric target remains.
sba_df = (
    pd.read_csv(DATA / 'SBAcase.11.13.17.csv')
    .assign(Default=lambda df: np.where(df['MIS_Status'] == 'CHGOFF', 1, 0))
    .drop(columns=['MIS_Status'])
)

# Training data only: rows flagged with Selected == 1.
train = sba_df[sba_df['Selected'] == 1].copy()

# Predictors for Table 7(a)
predictors = ['New', 'RealEstate', 'DisbursementGross', 'Portion', 'Recession']
X_train = train[predictors].copy()
y_train = train['Default']

# Standardize DisbursementGross only; the other predictors are left as-is.
# fit_transform returns a single-column 2-D result, so flatten it to 1-D before
# assigning it back to the column.
# NOTE(review): the fitted DisbursementGross coefficient and the intercept are
# therefore on the standardized scale, not the raw one — confirm this is intended
# when comparing against the published table.
scaler = StandardScaler()
X_train['DisbursementGross'] = np.ravel(scaler.fit_transform(X_train[['DisbursementGross']]))


In [151]:


# Fit logistic regression with the liblinear solver. scikit-learn's liblinear
# solver does not offer an unpenalized fit, so a very large C (inverse of the
# regularization strength) is used to make the L2 penalty negligible.
logit_reg = LogisticRegression(solver='liblinear', penalty='l2', C=1e42, max_iter=1000)
logit_reg.fit(X_train, y_train)

# Fit the same specification as a binomial GLM in statsmodels, which exposes the
# standard errors and p-values needed for the coefficient table below.
# add_constant(prepend=True) inserts the intercept column ('const') first.
train_X2 = sm.add_constant(X_train, prepend=True)
logit_full2 = sm.GLM(y_train, train_X2, family=sm.families.Binomial())
logit_result2 = logit_full2.fit()

params = logit_result2.params
std_errors = logit_result2.bse

# One row per model term (including the intercept). The Wald chi-square is the
# squared ratio of the estimate to its standard error.
coef_table = pd.DataFrame({
    "Variable": train_X2.columns,
    "DF": [1] * len(params),
    "Estimate": params,                              # raw values
    "Std Error": std_errors,                         # raw values
    "Wald Chi-Square": (params / std_errors) ** 2,
    "Pr > ChiSq": logit_result2.pvalues,
})

print(coef_table)


                            Variable  DF  Estimate  Std Error  \
const                          const   1  1.270355   0.343027   
New                              New   1 -0.077167   0.210088   
RealEstate                RealEstate   1 -2.033084   0.363594   
DisbursementGross  DisbursementGross   1 -0.115957   0.121068   
Portion                      Portion   1 -2.829764   0.559381   
Recession                  Recession   1  0.497105   0.241298   

                   Wald Chi-Square    Pr > ChiSq  
const                    13.714951  2.127540e-04  
New                       0.134915  7.133890e-01  
RealEstate               31.266312  2.249495e-08  
DisbursementGross         0.917349  3.381721e-01  
Portion                  25.590873  4.220309e-07  
Recession                 4.244135  3.938613e-02  
