In [11]:
import pandas as pd

# Load dataset
df = pd.read_csv("../data/credit_default.csv")

# Rename target column for consistency across project
df = df.rename(columns={
    "default.payment.next.month": "default_payment_next_month"
})

df.head()


Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default_payment_next_month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [12]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=["default_payment_next_month"])
y = df["default_payment_next_month"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

X_train.shape, X_test.shape


((24000, 24), (6000, 24))

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

pipe_lr = Pipeline([
    ("scaler", StandardScaler(with_mean=False)),
    ("model", LogisticRegression(max_iter=1000))
])

pipe_lr.fit(X_train, y_train)

lr_pred = pipe_lr.predict(X_test)

lr_acc = accuracy_score(y_test, lr_pred)
lr_f1 = f1_score(y_test, lr_pred)

lr_acc, lr_f1


  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  ret = a @ b
  ret = a @ b
  ret = a @ b


(0.8078333333333333, 0.3562255723059743)

Baseline Observation:
Logistic Regression demonstrates limited performance, indicating underfitting due to its linear decision boundary. Financial datasets often contain complex non-linear relationships that linear models fail to capture effectively.

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=42
)

rf.fit(X_train, y_train)

rf_pred = rf.predict(X_test)

rf_acc = accuracy_score(y_test, rf_pred)
rf_f1 = f1_score(y_test, rf_pred)

rf_acc, rf_f1


(0.8175, 0.46297204512015694)

In [15]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score

gb = GradientBoostingClassifier(random_state=42)

gb.fit(X_train, y_train)

gb_pred = gb.predict(X_test)

gb_acc = accuracy_score(y_test, gb_pred)
gb_f1 = f1_score(y_test, gb_pred)

gb_acc, gb_f1


(0.8193333333333334, 0.470703125)

In [16]:
results = pd.DataFrame({
    "Model": [
        "Logistic Regression (Baseline)",
        "Random Forest (Improved)",
        "Gradient Boosting (Improved)"
    ],
    "Accuracy": [lr_acc, rf_acc, gb_acc],
    "F1 Score": [lr_f1, rf_f1, gb_f1]
})

results


Unnamed: 0,Model,Accuracy,F1 Score
0,Logistic Regression (Baseline),0.807833,0.356226
1,Random Forest (Improved),0.8175,0.462972
2,Gradient Boosting (Improved),0.819333,0.470703


Baseline vs Improved Model Comparison:
Logistic Regression underfits the data due to its linear nature. Random Forest improves performance by modeling non-linear feature interactions. Gradient Boosting achieves the highest F1-score, demonstrating the best bias–variance trade-off and making it the strongest baseline-era model for this task.