<a href="https://colab.research.google.com/github/manvirkaur84/manvirkaur/blob/main/docs/ml-concepts/207_ML_MSBA/Assignment-logistic-regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
!pip install dmba



In [20]:
import sys
import sklearn

# Report the interpreter and scikit-learn versions used for this run so the
# results below can be reproduced.
sklearn_version_line = 'The scikit-learn version is {}.'.format(sklearn.__version__)
print(sys.version)
print(sklearn_version_line)

3.12.11 (main, Jun  4 2025, 08:56:18) [GCC 11.4.0]
The scikit-learn version is 1.6.1.


In [21]:
%matplotlib inline
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pylab as plt
import seaborn as sns
from dmba import classificationSummary, gainsChart, liftChart
from dmba.metric import AIC_score
import math

# Directory that holds the input CSV files read below (UniversalBank.csv).
# NOTE(review): this is a hard-coded absolute path (looks like the Colab
# sample_data folder) — adjust when running outside that environment.
DATA = Path('/content/sample_data/')

In [22]:
# Discussion Assignment #1 - sklearn LogisticRegression() liblinear solver
# Load the bank data, drop the ID and ZIP Code columns, and replace spaces in
# the column names with underscores.
bank_df = pd.read_csv(DATA / 'UniversalBank.csv').drop(columns=['ID', 'ZIP Code'])
bank_df.columns = [c.replace(' ', '_') for c in bank_df.columns]

# Treat education as categorical, convert to dummy variables.
# The category levels are left as 1/2/3, so with drop_first=True the dummy
# columns are Education_2 and Education_3 (level 1 is the reference level).
bank_df['Education'] = bank_df['Education'].astype('category')
bank_df = pd.get_dummies(bank_df, prefix_sep='_', drop_first=True, dtype=int)

# Outcome is Personal_Loan; every remaining column is a predictor.
y = bank_df['Personal_Loan']
X = bank_df.drop(columns=['Personal_Loan'])

# partition data: 60% training / 40% validation, fixed seed for repeatability
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.4, random_state=1)

# fit a logistic regression (set penalty=l2 and C=1e42 to avoid regularization)
# see https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
# see https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
logit_reg = LogisticRegression(penalty="l2", C=1e42, solver='liblinear')
logit_reg.fit(train_X, train_y)

# Report the fitted intercept and one coefficient per predictor column.
print('intercept ', logit_reg.intercept_[0])
print(pd.DataFrame({'coeff': logit_reg.coef_[0]}, index=X.columns).transpose())
print()
# NOTE(review): AIC_score is fed the output of predict() on the validation
# set (class labels, not probabilities) — confirm this is the intended metric.
# df counts one parameter per predictor plus the intercept.
print('AIC', AIC_score(valid_y, logit_reg.predict(valid_X), df = len(train_X.columns) + 1))

intercept  -12.493436061176814
            Age  Experience    Income    Family     CCAvg  Mortgage  \
coeff -0.037685    0.039202  0.058844  0.612251  0.240489  0.001012   

       Securities_Account  CD_Account    Online  CreditCard  Education_2  \
coeff            -1.01428    3.649097 -0.678306   -0.958283     4.202148   

       Education_3  
coeff     4.355761  

AIC -709.1524769205962


In [32]:
# Discussion Assignment #2 - sklearn LogisticRegression() liblinear solver
# Same pipeline as assignment #1, but with a much tighter solver tolerance.
# Load the bank data, drop the ID and ZIP Code columns, and replace spaces in
# the column names with underscores.
bank_df = pd.read_csv(DATA / 'UniversalBank.csv').drop(columns=['ID', 'ZIP Code'])
bank_df.columns = [c.replace(' ', '_') for c in bank_df.columns]

# Treat education as categorical, convert to dummy variables.
# The category levels are left as 1/2/3, so with drop_first=True the dummy
# columns are Education_2 and Education_3 (level 1 is the reference level).
bank_df['Education'] = bank_df['Education'].astype('category')
bank_df = pd.get_dummies(bank_df, prefix_sep='_', drop_first=True, dtype=int)

# Outcome is Personal_Loan; every remaining column is a predictor.
y = bank_df['Personal_Loan']
X = bank_df.drop(columns=['Personal_Loan'])

# partition data: 60% training / 40% validation, fixed seed for repeatability
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.4, random_state=1)

# fit a logistic regression (set penalty=l2 and C=1e42 to avoid regularization)
# tol=1e-10 tightens the stopping criterion relative to the solver default.
# see https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
# see https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
logit_reg = LogisticRegression(penalty="l2", C=1e42, solver='liblinear', tol=1e-10)
logit_reg.fit(train_X, train_y)

# Report the fitted intercept and one coefficient per predictor column.
print('intercept ', logit_reg.intercept_[0])
print(pd.DataFrame({'coeff': logit_reg.coef_[0]}, index=X.columns).transpose())
print()
# NOTE(review): AIC_score is fed the output of predict() on the validation
# set (class labels, not probabilities) — confirm this is the intended metric.
# df counts one parameter per predictor plus the intercept.
print('AIC', AIC_score(valid_y, logit_reg.predict(valid_X), df = len(train_X.columns) + 1))

intercept  -12.563385910114459
            Age  Experience    Income    Family     CCAvg  Mortgage  \
coeff -0.035391    0.036947  0.058904  0.612778  0.240773  0.001014   

       Securities_Account  CD_Account    Online  CreditCard  Education_2  \
coeff           -1.030501    3.662732 -0.679423   -0.960853     4.207628   

       Education_3  
coeff     4.358105  

AIC -733.9975169177105


In [33]:
# Discussion Assignment #3 - sklearn LogisticRegression() Default Solver 'lbfgs'
# Load the bank data, drop the ID and ZIP Code columns, and replace spaces in
# the column names with underscores.
bank_df = pd.read_csv(DATA / 'UniversalBank.csv').drop(columns=['ID', 'ZIP Code'])
bank_df.columns = [c.replace(' ', '_') for c in bank_df.columns]

# Treat education as categorical, convert to dummy variables
# (drop_first=True leaves Education_2 and Education_3 as the dummy columns).
bank_df['Education'] = bank_df['Education'].astype('category')
bank_df = pd.get_dummies(bank_df, prefix_sep='_', drop_first=True, dtype=int)

# Outcome is Personal_Loan; every remaining column is a predictor.
y = bank_df['Personal_Loan']
X = bank_df.drop(columns=['Personal_Loan'])

# partition data: 60% training / 40% validation, fixed seed for repeatability
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.4, random_state=1)

# Unpenalised fit with lbfgs; max_iter is raised to 10000 so the iteration cap
# is not what stops the optimiser.
logit_reg_default = LogisticRegression(penalty=None, solver='lbfgs', max_iter=10000)
logit_reg_default.fit(train_X, train_y)

# Report the fitted intercept and one coefficient per predictor column.
print('intercept ', logit_reg_default.intercept_[0])
print(pd.DataFrame({'coeff': logit_reg_default.coef_[0]}, index=X.columns).transpose())

intercept  -12.565721894981692
           Age  Experience    Income    Family     CCAvg  Mortgage  \
coeff -0.03552     0.03707  0.058921  0.613629  0.240863  0.001012   

       Securities_Account  CD_Account    Online  CreditCard  Education_2  \
coeff           -1.021516    3.673777 -0.681357    -0.95892     4.208849   

       Education_3  
coeff     4.356279  


In [34]:
# Discussion Assignment #4 - sklearn LogisticRegression() Default Solver 'lbfgs' and tol parameter
# Load the bank data, drop the ID and ZIP Code columns, and replace spaces in
# the column names with underscores.
bank_df = pd.read_csv(DATA / 'UniversalBank.csv').drop(columns=['ID', 'ZIP Code'])
bank_df.columns = [c.replace(' ', '_') for c in bank_df.columns]

# Treat education as categorical, convert to dummy variables
# (drop_first=True leaves Education_2 and Education_3 as the dummy columns).
bank_df['Education'] = bank_df['Education'].astype('category')
bank_df = pd.get_dummies(bank_df, prefix_sep='_', drop_first=True, dtype=int)

# Outcome is Personal_Loan; every remaining column is a predictor.
y = bank_df['Personal_Loan']
X = bank_df.drop(columns=['Personal_Loan'])

# partition data: 60% training / 40% validation, fixed seed for repeatability
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.4, random_state=1)

# Unpenalised lbfgs fit with an extremely small tolerance.
# NOTE(review): tol=1e-28 is far below float64 machine epsilon (~2.2e-16), so
# the tolerance test is presumably never what ends the run — confirm that
# "iterate as far as the solver can" is the intent here.
logit_reg_default = LogisticRegression(penalty=None, solver='lbfgs', tol=1e-28, max_iter=10000)
logit_reg_default.fit(train_X, train_y)

# Report the fitted intercept and one coefficient per predictor column.
print('intercept ', logit_reg_default.intercept_[0])
print(pd.DataFrame({'coeff': logit_reg_default.coef_[0]}, index=X.columns).transpose())


intercept  -12.563228801220024
            Age  Experience    Income    Family     CCAvg  Mortgage  \
coeff -0.035391    0.036948  0.058903  0.612773  0.240771  0.001014   

       Securities_Account  CD_Account    Online  CreditCard  Education_2  \
coeff           -1.030563    3.662657 -0.679408   -0.960795     4.207529   

       Education_3  
coeff     4.358037  
