# Credit Risk Modeling – Logistic Regression

## Objective
The objective of this notebook is to build an interpretable credit risk model that estimates the probability of loan default, suitable for real-world banking decision-making.

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import roc_auc_score, classification_report

In [3]:
# Read the processed modelling table; the path is relative to the
# directory the notebook kernel is running in.
df = pd.read_csv(filepath_or_buffer="../data/processed/model_data.csv")

# Preview the first five rows to eyeball the loaded columns.
df.head(n=5)

Unnamed: 0,loan_amnt,term,int_rate,annual_inc,dti,revol_util,emp_length,default,fico_avg,loan_to_income
0,3600.0,36,13.99,55000.0,5.91,29.7,10.0,0,677.0,0.065455
1,24700.0,36,11.99,65000.0,16.06,19.2,10.0,0,717.0,0.38
2,20000.0,60,10.78,63000.0,10.78,56.2,10.0,0,697.0,0.31746
3,10400.0,60,22.45,104433.0,25.37,64.5,3.0,0,697.0,0.099585
4,11950.0,36,13.44,34000.0,10.2,68.4,4.0,0,692.0,0.351471


In [4]:
# Separate the label from the predictors: `default` is the target and
# every other column of `df` is kept as a model input.
y = df["default"]
X = df.drop("default", axis=1)

In [5]:
# Hold out 25% of the rows for evaluation. Stratifying on `y` splits the
# data in a class-stratified fashion, and the fixed seed makes the
# partition repeatable across runs.
partitions = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = partitions

In [6]:
# Standardise the features. The scaler's statistics are learned from the
# training rows only, then the same fitted transform is applied to both
# partitions so nothing from the test rows enters the fit.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
# Logistic regression fitted with the L-BFGS solver. class_weight="balanced"
# re-weights the classes during fitting, and max_iter is raised to 1000 to
# give the solver more iterations to converge.
log_reg = LogisticRegression(solver="lbfgs", class_weight="balanced", max_iter=1000)

# fit() is the cell's last expression, so the fitted estimator is displayed.
log_reg.fit(X=X_train_scaled, y=y_train)

In [8]:
# predict_proba yields one column per class; column index 1 is used as the
# probability of default (assumes the target is coded 0 = no default,
# 1 = default — TODO confirm against the data dictionary).
class_probabilities = log_reg.predict_proba(X_test_scaled)
y_test_proba = class_probabilities[:, 1]

In [9]:
# Score the held-out predictions with ROC AUC, which is computed from the
# predicted probabilities rather than from hard class labels.
roc_auc = roc_auc_score(y_true=y_test, y_score=y_test_proba)
roc_auc

np.float64(0.7002167729099027)

In [10]:
# Pair every feature name with its fitted coefficient. The model was trained
# on the standardised matrix, so each value is the change in log-odds per one
# standard deviation of that feature.
feature_weights = pd.DataFrame(
    {"feature": X.columns, "coefficient": log_reg.coef_[0]}
)

# Largest positive coefficients first, most negative last.
coefficients = feature_weights.sort_values("coefficient", ascending=False)
coefficients

Unnamed: 0,feature,coefficient
2,int_rate,0.464242
1,term,0.203831
4,dti,0.161063
8,loan_to_income,0.151675
0,loan_amnt,-0.013474
3,annual_inc,-0.03734
6,emp_length,-0.039437
5,revol_util,-0.076578
7,fico_avg,-0.251606
