## **Descriptive / Predictive Logistic Regression**

In [1]:
import pandas as pd
import numpy as np

## Universal Bank customer data
## Goal: model the probability that a customer takes a personal loan (marketing)
df = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/martinwg/ISA591/refs/heads/main/data/UniversalBank.csv'
)
df.head(n=5)

Unnamed: 0,ID,Personal Loan,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Securities Account,CD Account,Online,CreditCard
0,1,0,25,1,49,91107,4,1.6,1,0,1,0,0,0
1,2,0,45,19,34,90089,3,1.5,1,0,1,0,0,0
2,3,0,39,15,11,94720,1,1.0,1,0,0,0,0,0
3,4,0,35,9,100,94112,1,2.7,2,0,0,0,0,0
4,5,0,35,8,45,91330,4,1.0,2,0,0,0,0,1


In [2]:
## ID and ZIP Code are treated as uninformative, so both columns are removed
df = df.drop(columns=['ID', 'ZIP Code'])
df.head(n=5)

Unnamed: 0,Personal Loan,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Securities Account,CD Account,Online,CreditCard
0,0,25,1,49,4,1.6,1,0,1,0,0,0
1,0,45,19,34,3,1.5,1,0,1,0,0,0
2,0,39,15,11,1,1.0,1,0,0,0,0,0
3,0,35,9,100,1,2.7,2,0,0,0,0,0
4,0,35,8,45,4,1.0,2,0,0,0,0,1


In [3]:
## Count the missing values in every column
df.isnull().sum(axis=0)

Unnamed: 0,0
Personal Loan,0
Age,0
Experience,0
Income,0
Family,0
CCAvg,0
Education,0
Mortgage,0
Securities Account,0
CD Account,0


In [4]:
# prompt: print the variables with missing values

# Report each column that contains at least one missing value;
# columns without missing values are skipped silently.
for col in df.columns:
    n_missing = df[col].isna().sum()
    if n_missing == 0:
        continue
    print(f"Variable '{col}' has {n_missing} missing values.")

In [5]:
## Linear models do NOT accept PERFECT CORRELATION among predictors
## Before you fit LR, make sure to check for perfect or almost perfect collinearity
# Absolute pairwise correlations between all columns
corr_matrix = df.corr().abs()

# Screen predictor-vs-predictor pairs only. The target is excluded so that a
# predictor almost perfectly correlated with 'Personal Loan' is never
# discarded as "collinear".
predictor_corr = corr_matrix.drop(index='Personal Loan', columns='Personal Loan')

# Keep the strict upper triangle so every pair is examined once and the
# later column of a highly correlated pair is the one that gets flagged
upper = predictor_corr.where(np.triu(np.ones(predictor_corr.shape, dtype=bool), k=1))

# Flag predictors whose correlation with an earlier predictor exceeds the threshold (e.g., 0.99)
threshold = 0.99
flagged = (upper > threshold).any()
to_drop = flagged[flagged].index.tolist()

# Drop the highly correlated features
df = df.drop(columns=to_drop)

print(f"Columns removed: {to_drop}")

Columns removed: ['Experience']


In [6]:
## 70% / 30% train-test split (random_state = 13)
from sklearn.model_selection import train_test_split

# Predictors are every column except the target 'Personal Loan'
X, y = df.drop(columns='Personal Loan'), df['Personal Loan']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=13,
)

## **Logistic as a Descriptive Model**

Check for GOODNESS OF FIT metrics, e.g., p-values, hypothesis tests, ...

In [7]:
import statsmodels.api as sm

# Fit the logit model on the training data; add_constant supplies the intercept column
lr = sm.Logit(endog=y_train, exog=sm.add_constant(X_train)).fit()

# Coefficient table together with the fit statistics
print(lr.summary())

Optimization terminated successfully.
         Current function value: 0.136589
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:          Personal Loan   No. Observations:                 3500
Model:                          Logit   Df Residuals:                     3489
Method:                           MLE   Df Model:                           10
Date:                Thu, 14 Nov 2024   Pseudo R-squ.:                  0.5838
Time:                        21:54:46   Log-Likelihood:                -478.06
converged:                       True   LL-Null:                       -1148.7
Covariance Type:            nonrobust   LLR p-value:                4.549e-282
                         coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                -13.6059      0.770    -17.680      0.000     -15.114     -12.098
Age  

In [None]:
## Iterations = 9: the optimizer took 9 iterations to reach the minimum of the loss function (for logistic regression this is the negative log-likelihood, not SSE)

## LLR (Log-likelihood Ratio Test) p-value: 4.549e-282
## H0: Beta_Age = Beta_Income = ... = 0  ## no relationships with p(get a loan)
## HA: At least ONE has a relationship with p(get a loan)
## p-value is the probability of a likelihood-ratio statistic, 2*(LL - LL_Null) = 2*(-478.06 - (-1148.7)) ≈ 1341.3, this large or LARGER if H0 is true
## We reject H0 and conclude that OVERALL the model is adequate (at least 1 var is important)

## EACH PREDICTOR HAS ITS OWN TEST
## H0: Beta_Age = 0
## HA: Beta_Age != 0
## p-value: 0.109
## In the presence of the other predictors, AGE is not statistically significant (0.109 > alpha = 0.05)

## TO IMPROVE THE FIT: remove insignificant predictors (one at a time)

In [9]:
## Using alpha = 0.05, refit the model without the columns listed below
column_drop_list = ['Age', 'Mortgage']

# Same logit specification as before, minus the dropped predictors
lr = sm.Logit(
    endog=y_train,
    exog=sm.add_constant(X_train.drop(column_drop_list, axis=1)),
).fit()

# Coefficient table together with the fit statistics
print(lr.summary())

Optimization terminated successfully.
         Current function value: 0.137056
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:          Personal Loan   No. Observations:                 3500
Model:                          Logit   Df Residuals:                     3491
Method:                           MLE   Df Model:                            8
Date:                Thu, 14 Nov 2024   Pseudo R-squ.:                  0.5824
Time:                        22:05:33   Log-Likelihood:                -479.70
converged:                       True   LL-Null:                       -1148.7
Covariance Type:            nonrobust   LLR p-value:                1.378e-283
                         coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                -12.9590      0.643    -20.143      0.000     -14.220     -11.698
Incom

In [10]:
## Signs show relationship
## Income +: Income has a positive effect on prob(get a loan)
## Online -: Having an online banking acct is associated with a lower prob(get a loan)

In [None]:
## Interpretations (log(odds) or odds)
## Slope of Income Interpretation:
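## (LogOdds) As Income increases by one unit, the estimated Log(Odds) of getting a loan increases by 0.0531, controlling for OTHER factors
## (Odds) As Income increases by one unit, the estimated Odds of getting a loan are multiplied by a factor of 1.0545 (a 5.45% increase), controlling for OTHER factors
## NOTE: values such as 49 and 100 suggest Income is recorded in $1,000s, so one unit is presumably $1,000 rather than $1 — confirm with the data dictionary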

In [None]:
## Online = {1: has online banking, 0: not} -0.5625
## (LogOdds) Customers with an online banking acct have log(odds) of getting a loan 0.5625 lower than those who do not, controlling for OTHER factors
## (Odds) Customers with an online banking acct have odds of getting a loan equal to 0.5697 times the odds of those who do not (about 43% lower), controlling for OTHER factors

In [12]:
# Odds ratio for Online: exponentiate its log-odds coefficient (-0.5625)
np.exp(-0.5625)

0.569782824730923

## **Predictive Logistic Regression**

We do NOT consider goodness of fit metrics:

* p-values of ANY slope
* hypothesis test

We care about how well the model predicts NEW DATA.

In [25]:
from sklearn.linear_model import LogisticRegression

## instance
## L1 (lasso) penalty with C = 0.05; C is the inverse of the regularization
## strength, so a small C means strong shrinkage. liblinear is used because it
## supports the L1 penalty.
## random_state fixes liblinear's internal data shuffling so that re-running
## the notebook reproduces the same coefficients.
## NOTE(review): the penalty is applied to unscaled predictors, so the amount of
## shrinkage depends on each variable's units — consider standardizing first.
lr = LogisticRegression(penalty = 'l1', max_iter=1000, C = 0.05, solver = 'liblinear', random_state = 13)

## fit
lr.fit(X_train, y_train)

In [26]:
## Check for convergence: if the solver did not converge, the estimates are NOT the best-fitting model
## What if you raise max_iter and it STILL does not converge? CHANGE THE SOLVER
## solver='lbfgs' is the default. It is fast but SOMETIMES DOES NOT CONVERGE (and it does not support the L1 penalty)
## solver='liblinear' converges most of the time (it uses coordinate descent rather than a step size) and supports L1, but it can take LONGER

In [27]:
## estimates of the SLOPE (one coefficient per column of X_train, in column order)
## Coefficients that are exactly zero were shrunk out of the model by the L1 penalty
lr.coef_

array([[-2.49005076e-02,  3.41547750e-02,  2.79723351e-01,
         5.37550801e-02,  7.72571134e-01,  5.64769312e-04,
         0.00000000e+00,  1.19033002e+00,  0.00000000e+00,
        -4.42408099e-02]])