In [14]:
import pandas as pd
import numpy as np
from pathlib import Path
import joblib

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
BASE_DIR = Path("..")
DATA_PATH = BASE_DIR / "data" / "processed" / "YourLastName.csv"

df = pd.read_csv(DATA_PATH)

print("Shape:", df.shape)
print("Columns:", df.columns.tolist())

# HARD FAILS (important)
if "loan_default" not in df.columns:
    raise ValueError("loan_default column is missing")

if df.isnull().sum().sum() > 0:
    raise ValueError("Dataset contains missing values")

# Ensure numeric
df = df.apply(pd.to_numeric)

X = df.drop("loan_default", axis=1)
y = df["loan_default"]

print("Target distribution:")
print(y.value_counts())

Shape: (20000, 13)
Columns: ['income', 'loan_amount', 'loan_term', 'credit_history_years', 'debt_to_income', 'loan_default', 'loan_to_income', 'employment_status_informal', 'employment_status_self-employed', 'gender_male', 'age_group_26-35', 'age_group_36-45', 'age_group_46+']
Target distribution:
loan_default
0    14896
1     5104
Name: count, dtype: int64


In [3]:
# Train/Validation/Test Split
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42)

print("Train:", X_train.shape)
print("Val:", X_val.shape)
print("Test:", X_test.shape)

Train: (14000, 12)
Val: (3000, 12)
Test: (3000, 12)


In [4]:
# Logistic Regression
lr = LogisticRegression(
    solver="liblinear",
    class_weight="balanced",
    max_iter=1000,
    random_state=42)

lr.fit(X_train, y_train)

val_probs_lr = lr.predict_proba(X_val)[:, 1]
val_auc_lr = roc_auc_score(y_val, val_probs_lr)

print("Logistic Regression Validation AUC:", round(val_auc_lr, 4))

Logistic Regression Validation AUC: 0.7423


In [5]:
# Random forest
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=8,
    random_state=42,
    n_jobs=-1)

rf.fit(X_train, y_train)

val_probs_rf = rf.predict_proba(X_val)[:, 1]
val_auc_rf = roc_auc_score(y_val, val_probs_rf)

print("Random Forest Validation AUC:", round(val_auc_rf, 4))

Random Forest Validation AUC: 0.8015


In [8]:
# Selecting best model
best_model = rf if val_auc_rf >= val_auc_lr else lr
print("Selected model:", type(best_model).__name__)

Selected model: RandomForestClassifier


In [11]:
from sklearn.metrics import roc_auc_score, classification_report

# Test evaluation
test_probs = best_model.predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(y_test, test_probs)

print("Test ROC-AUC:", round(test_auc, 4))
print("\nClassification Report:\n")
print(classification_report(y_test, best_model.predict(X_test)))

Test ROC-AUC: 0.8019

Classification Report:

              precision    recall  f1-score   support

           0       0.80      0.92      0.86      2235
           1       0.58      0.34      0.43       765

    accuracy                           0.77      3000
   macro avg       0.69      0.63      0.64      3000
weighted avg       0.75      0.77      0.75      3000



In [12]:
from pathlib import Path
import joblib

PROJECT_DIR = Path.cwd()          # Notebook 3 is in project root
MODEL_DIR = PROJECT_DIR / "models"
MODEL_DIR.mkdir(exist_ok=True)

joblib.dump(best_model, MODEL_DIR / "credit_model.pkl")
print("Model saved at:", MODEL_DIR / "credit_model.pkl")

Model saved at: C:\Users\HP\OOP UCU\Data LifeCyle\Credit Scoring System\models\credit_model.pkl


In [13]:
from pathlib import Path

Path("../models/credit_model.pkl").exists()

True