### Import all required libraries.

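If any package is missing, install the dependencies first, e.g. `%pip install pandas numpy matplotlib scikit-learn`.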

In [49]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.metrics import (
    roc_auc_score,
    brier_score_loss,
    roc_curve,
    precision_recall_curve,
    f1_score,
    precision_score,
    recall_score
)


### Loads the dataset from the specified file path into a pandas DataFrame and cleans it by removing rows with missing values.


In [50]:
# The data folder is looked up next to the current working directory,
# i.e. under the parent of os.getcwd().
# NOTE(review): presumably the notebook is launched from a sub-folder of the project — confirm.
base_dir = os.path.dirname(os.getcwd())
file_path = os.path.join(base_dir, "data", "credit_risk_dataset.csv")

# Read the CSV and, in the same step, discard every row that has at least
# one missing value.
df = pd.read_csv(file_path).dropna()

# Preview of the cleaned frame (rendered as the cell output).
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


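### Separate the DataFrame into `X` (the features, i.e. input parameters) and `y` (the target variable, `loan_status`), then split both into four subsets: `X_train`, `X_test`, `y_train` and `y_test`.
### The training set (train) represents 80% of the data and is used to train the model, while the test set (test) represents the remaining 20% and is used to evaluate the model's performance. The split is stratified on the target so both sets keep the same class proportions.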

In [51]:
# Feature matrix: the eleven predictor columns, in a fixed order.
X = df.loc[:, [
    'person_age',
    'person_income',
    'person_home_ownership',
    'person_emp_length',
    'loan_intent',
    'loan_grade',
    'loan_amnt',
    'loan_int_rate',
    'loan_percent_income',
    'cb_person_default_on_file',
    'cb_person_cred_hist_length',
]]

# Target vector.
y = df.loc[:, 'loan_status']

# 80/20 hold-out split; stratifying on y keeps the class ratio equal in both
# parts, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

### Preprocessing pipeline: imputes missing values and scales numerical features, while imputing and one-hot encoding categorical features, using a column transformer to apply the correct processing to each feature type.

In [52]:
# Columns routed through the numeric branch of the preprocessor.
num_cols = [
    'person_age',
    'person_income',
    'person_emp_length',
    'loan_amnt',
    'loan_int_rate',
    'loan_percent_income',
    'cb_person_cred_hist_length',
]

# Columns routed through the categorical branch of the preprocessor.
cat_cols = [
    'person_home_ownership',
    'loan_intent',
    'loan_grade',
    'cb_person_default_on_file',
]

# Numeric branch: fill gaps with the column median, then standardise.
numeric_pipeline = Pipeline(steps=[
    ('imp', SimpleImputer(strategy='median')),
    ('sc', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; handle_unknown='ignore' avoids an error on categories that were
# not present when the encoder was fitted.
categorical_pipeline = Pipeline(steps=[
    ('imp', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

# Apply each branch to its own list of columns.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, num_cols),
    ('cat', categorical_pipeline, cat_cols),
])

### Trains a random forest classifier with balanced class weights (to handle class imbalance), calibrates its probabilities with 5-fold sigmoid calibration, and predicts calibrated probabilities on the test set.

In [53]:
# Underlying forest: 200 depth-limited trees, class weights balanced to
# compensate for unequal class frequencies, all CPU cores, fixed seed.
base_rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
)

# Wrap the forest in a 5-fold sigmoid calibrator.
calibrated_rf = CalibratedClassifierCV(base_rf, method='sigmoid', cv=5)

# Full model: preprocessing followed by the calibrated classifier.
clf = Pipeline(steps=[
    ('prep', preprocessor),
    ('cal', calibrated_rf),
])

clf.fit(X_train, y_train)

# Keep only the second probability column (index 1).
# NOTE(review): presumably this is the probability of loan_status == 1 — confirm the label encoding.
test_probabilities = clf.predict_proba(X_test)
proba = test_probabilities[:, 1]

### Evaluates the model's performance using AUC to measure ranking ability and Brier score to assess the accuracy of predicted probabilities.

In [54]:
# ROC AUC scores how well the probabilities rank the two classes;
# the Brier score is the mean squared error of the probabilities themselves.
auc, brier = (
    roc_auc_score(y_test, proba),
    brier_score_loss(y_test, proba),
)

# Report both metrics rounded to three decimals.
print(f"AUC = {auc:.3f} \nBrier score = {brier:.3f}")


AUC = 0.926 
Brier score = 0.065
