In [13]:
import pandas as pd

df = pd.read_csv("../data/credit_default.csv")

# Rename target for consistency
df = df.rename(columns={
    "default.payment.next.month": "default_payment_next_month"
})

df.head()


Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default_payment_next_month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [14]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=["default_payment_next_month"])
y = df["default_payment_next_month"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score

param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [8, 10, 12],
    "min_samples_split": [2, 5],
}

rf_tuned = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=3,
    scoring="f1",
    n_jobs=-1
)

rf_tuned.fit(X_train, y_train)

print("Best Parameters:", rf_tuned.best_params_)

rf_best = rf_tuned.best_estimator_
rf_pred = rf_best.predict(X_test)

rf_acc_tuned = accuracy_score(y_test, rf_pred)
rf_f1_tuned = f1_score(y_test, rf_pred)

rf_acc_tuned, rf_f1_tuned


Best Parameters: {'max_depth': 12, 'min_samples_split': 2, 'n_estimators': 300}


(0.817, 0.46070726915520627)

In [16]:
from sklearn.ensemble import GradientBoostingClassifier

gb_params = {
    "learning_rate": [0.01, 0.05, 0.1],
    "n_estimators": [100, 150],
    "max_depth": [2, 3]
}

gb_tuned = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    gb_params,
    cv=3,
    scoring="f1",
    n_jobs=-1
)

gb_tuned.fit(X_train, y_train)

print("Best Parameters:", gb_tuned.best_params_)

gb_best = gb_tuned.best_estimator_
gb_pred = gb_best.predict(X_test)

gb_acc_tuned = accuracy_score(y_test, gb_pred)
gb_f1_tuned = f1_score(y_test, gb_pred)

gb_acc_tuned, gb_f1_tuned


Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150}


(0.8178333333333333, 0.46760837798343885)

In [17]:
results_tuned = pd.DataFrame({
    "Model": ["RF (Tuned)", "GB (Tuned)"],
    "Accuracy": [rf_acc_tuned, gb_acc_tuned],
    "F1 Score": [rf_f1_tuned, gb_f1_tuned]
})

results_tuned


Unnamed: 0,Model,Accuracy,F1 Score
0,RF (Tuned),0.817,0.460707
1,GB (Tuned),0.817833,0.467608


Hyperparameter Tuning Observations:
Tuning improved both Random Forest and Gradient Boosting performance.
Gradient Boosting achieved the highest tuned F1-score, confirming it as the best-performing model overall.
This shows that model performance improves significantly when hyperparameters are optimized rather than using default settings.

In [18]:
import joblib

if gb_f1_tuned > rf_f1_tuned:
    joblib.dump(gb_best, "../models/best_classifier.pkl")
    print("Saved Gradient Boosting as best model.")
else:
    joblib.dump(rf_best, "../models/best_classifier.pkl")
    print("Saved Random Forest as best model.")


Saved Gradient Boosting as best model.
