In [46]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import joblib
import os
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.tree import export_graphviz
import graphviz
from skopt import BayesSearchCV
from skopt.space import Integer, Categorical, Real
import gradio as gr

In [47]:
train_filepath = "../data/cr_loan_train.csv"
test_filepath = "../data/cr_loan_test.csv"
feats_filepath = "../model_features.json"

train_df = pd.read_csv(train_filepath)
test_df = pd.read_csv(test_filepath)
with open(feats_filepath, 'r') as f:
    feats = json.load(f)

target_column = "loan_status"
X_train = train_df.drop(columns=[target_column])
y_train = train_df[target_column]

X_test = test_df.drop(columns=[target_column])
y_test = test_df[target_column]

In [4]:
scaler_train = joblib.load("../models_and_scalers/scaler_train.pkl")
scaler_test = joblib.load("../models_and_scalers/scaler_test.pkl")

In [50]:
dt_feats = feats["dt_feats"]
X_train_dt = X_train[dt_feats]
X_test_dt = X_test[dt_feats]
param_grid_dt = {
    "criterion": ["gini","entropy"],
    "max_depth": [2, 5, 10, 15],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 5, 10],
    'max_features': [None, 'sqrt', 'log2'],
    "random_state": [42]
}

dt_model = DecisionTreeClassifier()
grid_search_dt = GridSearchCV(dt_model, param_grid_dt, cv=5, scoring="roc_auc")

grid_search_dt.fit(X_train_dt, y_train)

In [51]:
best_params_dt = grid_search_dt.best_params_
print(f"Best parameters for decision tree model: {best_params_dt}\n")

best_dt_model = DecisionTreeClassifier(**best_params_dt)
best_dt_model.fit(X_train_dt, y_train)

y_pred_dt_train = best_dt_model.predict(X_train_dt)
print("Evaluation on training set:")
print(classification_report(y_train, y_pred_dt_train))
print(f"ROC AUC score for decision tree: {roc_auc_score(y_train, y_pred_dt_train):.3f}")
print(f"Accuracy for decision tree: {accuracy_score(y_train, y_pred_dt_train):.3f}")
print(60 * "-")

y_pred_dt = best_dt_model.predict(X_test_dt)
print("Evaluation on test set:")
print(classification_report(y_test, y_pred_dt))
print(f"ROC AUC score for decision tree: {roc_auc_score(y_test, y_pred_dt):.3f}")
print(f"Accuracy for decision tree: {accuracy_score(y_test, y_pred_dt):.3f}")

Best parameters for decision tree model: {'criterion': 'gini', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 10, 'min_samples_split': 2, 'random_state': 42}

Evaluation on training set:
              precision    recall  f1-score   support

           0       0.92      0.99      0.96     17716
           1       0.95      0.71      0.81      4970

    accuracy                           0.93     22686
   macro avg       0.94      0.85      0.89     22686
weighted avg       0.93      0.93      0.92     22686

ROC AUC score for decision tree: 0.850
Accuracy for decision tree: 0.929
------------------------------------------------------------
Evaluation on test set:
              precision    recall  f1-score   support

           0       0.92      0.99      0.95      7605
           1       0.94      0.70      0.80      2118

    accuracy                           0.93      9723
   macro avg       0.93      0.84      0.88      9723
weighted avg       0.93      0.93      0.92 

In [53]:
rf_feats = feats["rf_feats"]
X_train_rf = X_train[rf_feats]
X_test_rf = X_test[rf_feats]
param_space_rf = {
    'n_estimators': Integer(50, 300),  
    'criterion': Categorical(['gini', 'entropy']),  
    'max_depth':  Integer(2, 6),
    'min_samples_split': Integer(10, 30), 
    'min_samples_leaf': Integer(2, 8), 
    'max_features': Categorical([0.2, 0.5, 'sqrt', 'log2']),  
    'bootstrap': [True], 
    'random_state': [42]  
}

rf_model = RandomForestClassifier()
bayes_search_rf = BayesSearchCV(
    estimator=rf_model, 
    search_spaces=param_space_rf, 
    n_iter=50, 
    scoring="roc_auc",
    cv=5, 
    n_jobs=-1, 
    random_state=42)

bayes_search_rf.fit(X_train_rf, y_train)

In [54]:
best_params_rf = bayes_search_rf.best_params_
print(f"Best parameters for random forest model: {best_params_rf}\n")

best_rf_model = RandomForestClassifier(**best_params_rf)
best_rf_model.fit(X_train_rf, y_train)

y_pred_rf_train = best_rf_model.predict(X_train_rf)
print("Evaluation on training set:")
print(classification_report(y_train, y_pred_rf_train))
print(f"ROC AUC score for random forest model: {roc_auc_score(y_train, y_pred_rf_train):.3f}")
print(f"Accuracy for random forest model: {accuracy_score(y_train, y_pred_rf_train):.3f}")
print(60 * "-")

y_pred_rf = best_rf_model.predict(X_test_rf)
print("Evaluation on test set:")
print(classification_report(y_test, y_pred_rf))
print(f"ROC AUC score for random forest model: {roc_auc_score(y_test, y_pred_rf):.3f}")
print(f"Accuracy for random forest model: {accuracy_score(y_test, y_pred_rf):.3f}")

Best parameters for random forest model: OrderedDict({'bootstrap': True, 'criterion': 'entropy', 'max_depth': 6, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 300, 'random_state': 42})

Evaluation on training set:
              precision    recall  f1-score   support

           0       0.90      0.96      0.93     17716
           1       0.81      0.60      0.69      4970

    accuracy                           0.88     22686
   macro avg       0.85      0.78      0.81     22686
weighted avg       0.88      0.88      0.88     22686

ROC AUC score for random forest model: 0.782
Accuracy for random forest model: 0.882
------------------------------------------------------------
Evaluation on test set:
              precision    recall  f1-score   support

           0       0.90      0.96      0.92      7605
           1       0.79      0.60      0.68      2118

    accuracy                           0.88      9723
   macro avg       0.84      

In [56]:
xgb_feats = feats["xgb_feats"]
X_train_xgb = X_train[xgb_feats]
X_test_xgb = X_test[xgb_feats]
param_space_xgb = {
    'n_estimators': Integer(50, 200),  
    'learning_rate': Real(0.01, 0.2, prior='log-uniform'), 
    'max_depth': Integer(1, 6),  
    'min_child_weight': Integer(1, 5),  
    'gamma': Real(0, 2),  
    'subsample': Real(0.6, 1.0),  
    'colsample_bytree': Real(0.6, 1.0),  
    'lambda': Real(0, 5),  
    'alpha': Real(0, 5), 
    'scale_pos_weight': Real(1, 5),
}

xgb_model = XGBClassifier()
bayes_search_xgb = BayesSearchCV(
    estimator=xgb_model, 
    search_spaces=param_space_xgb, 
    n_iter=50, 
    scoring='roc_auc', 
    cv=5,
    n_jobs=-1, 
    random_state=42
)

bayes_search_xgb.fit(X_train_xgb, y_train)

In [57]:
best_params_xgb = bayes_search_xgb.best_params_
print(f"Best parameters for XGBoost model: {best_params_xgb}\n")

best_xgb_model = XGBClassifier(**best_params_xgb)
best_xgb_model.fit(X_train_xgb, y_train)

y_pred_xgb_train = best_xgb_model.predict(X_train_xgb)
print("Evaluation on training set:")
print(classification_report(y_train, y_pred_xgb_train))
print(f"ROC AUC score for XGBoost model: {roc_auc_score(y_train, y_pred_xgb_train):.3f}")
print(f"Accuracy for XGBoost model: {accuracy_score(y_train, y_pred_xgb_train):.3f}")
print(60 * "-")

y_pred_xgb = best_xgb_model.predict(X_test_xgb)
print("Evaluation on test set:")
print(classification_report(y_test, y_pred_xgb))
print(f"ROC AUC score for XGBoost model: {roc_auc_score(y_test, y_pred_xgb):.3f}")
print(f"Accuracy for XGBoost model: {accuracy_score(y_test, y_pred_xgb):.3f}")

Best parameters for XGBoost model: OrderedDict({'alpha': 5.0, 'colsample_bytree': 1.0, 'gamma': 0.0, 'lambda': 0.0, 'learning_rate': 0.1404397474595105, 'max_depth': 4, 'min_child_weight': 1, 'n_estimators': 200, 'scale_pos_weight': 5.0, 'subsample': 0.6})

Evaluation on training set:
              precision    recall  f1-score   support

           0       0.95      0.84      0.89     17716
           1       0.60      0.86      0.71      4970

    accuracy                           0.84     22686
   macro avg       0.78      0.85      0.80     22686
weighted avg       0.88      0.84      0.85     22686

ROC AUC score for XGBoost model: 0.848
Accuracy for XGBoost model: 0.845
------------------------------------------------------------
Evaluation on test set:
              precision    recall  f1-score   support

           0       0.89      0.84      0.86      7605
           1       0.52      0.61      0.56      2118

    accuracy                           0.79      9723
   macro av

In [61]:
folder_path = "../models_and_scalers"

joblib.dump(best_dt_model, os.path.join(folder_path, "decision_tree.pkl"))
joblib.dump(best_rf_model, os.path.join(folder_path, "random_forest.pkl"))
joblib.dump(best_xgb_model, os.path.join(folder_path, "xgboost.pkl"))

['../models_and_scalers\\xgboost.pkl']

Among the three models, the decision tree model demonstrates the best performance in predicting default status. It achieves the highest accuracy and ROC AUC score on both the training and test sets, with minimal overfitting.

*Training Set Evaluation:*

- Precision:
  - Class 0 (Non-Default): 0.92
  - Class 1 (Default): 0.95
- Recall:
  - Class 0 (Non-Default): 0.99
  - Class 1 (Default): 0.71
- F1-Score:
  - Class 0 (Non-Default): 0.96
  - Class 1 (Default): 0.81
- Accuracy: 0.93
- ROC AUC Score: 0.850

*Test Set Evaluation:*

- Precision:
  - Class 0 (Non-Default): 0.92
  - Class 1 (Default): 0.94
- Recall:
  - Class 0 (Non-Default): 0.99
  - Class 1 (Default): 0.70
- F1-Score:
  - Class 0 (Non-Default): 0.95
  - Class 1 (Default): 0.80
- Accuracy: 0.93
- ROC AUC Score: 0.844

The training and test results for the Decision Tree model are similar, with the training results being slightly better. This indicates that the model does not display overfitting.

In [None]:
graph_folder_path = "../graphs"
if not os.path.exists(graph_folder_path):
    os.makedirs(graph_folder_path)
graph_dt_dot_filepath = os.path.join(graph_folder_path, "decision_tree.dot")

export_graphviz(best_dt_model, 
                out_file=graph_dt_dot_filepath, 
                feature_names=dt_feats,
                class_names=["Default", "Non-Default"],
                impurity=True, 
                filled=True,
                proportion=False,
                rotate=True)


with open(graph_dt_dot_filepath) as f:
    dt_dot = f.read()
graph_dt = graphviz.Source(dt_dot)

graph_dt.render(engine='dot',format='pdf', filename=os.path.splitext(graph_dt_dot_filepath)[0])
os.remove(graph_dt_dot_filepath)
os.remove(os.path.splitext(graph_dt_dot_filepath)[0])

In [44]:
def predict_default_dt(annual_income, emp_length, loan_amount, loan_grade, loan_intent, pays_rent):
    assert annual_income > 0, "Annual income must be greater than 0."
    assert emp_length >= 0, "Employment duration must be non-negative."
    assert loan_amount > 0, "Loan amount must be greater than 0."
    assert loan_grade in {'A', 'B', 'C', 'D', 'E', 'F', 'G'}, "Invalid loan grade."
    assert loan_intent in {'Education', 'Home Improvement', 'Personal', 'Venture', 'Other'}, "Invalid loan intent."
    assert pays_rent in {"Yes", "No"}, "Invalid value for rent payment status."
    
    loan_percent_income = loan_amount / (annual_income + 1e-6)
    income_emp_length_product = annual_income * emp_length
    loan_grade_encoding = {'A':1,'B':2,'C':3,'D':4,'E':5,'F':6,'G':7}
    home_ownership_RENT = 1 if pays_rent == "Yes" else 0
    loan_intent_encoding = {
        'loan_intent_EDUCATION': 0,
        'loan_intent_HOMEIMPROVEMENT': 0,
        'loan_intent_PERSONAL': 0,
        'loan_intent_VENTURE': 0,
    }
    if loan_intent != 'Other':
        loan_intent_key = f'loan_intent_{loan_intent.upper().replace(" ", "")}'
        loan_intent_encoding[loan_intent_key] = 1

    input_feats = pd.DataFrame(columns=X.columns)
    input_feats.loc[0] = {
        "person_income": annual_income,
        "loan_grade": loan_grade_encoding[loan_grade], 
        "loan_percent_income": loan_percent_income,
        "person_home_ownership_RENT": home_ownership_RENT,
        **loan_intent_encoding,
        "income_emp_length_product": income_emp_length_product
    }
    input_feats = pd.DataFrame(scaler.transform(input_feats),columns=X.columns)[dt_feats]

    predicted_loan_status = best_dt_model.predict(input_feats)[0]
    default_prob = best_dt_model.predict_proba(input_feats)[0, 1]
    default_prob = f'{default_prob:.3f}'
    return predicted_loan_status, default_prob

interface = gr.Interface(
    fn=predict_default_dt,
    inputs=[
        gr.Number(label="Annual income"),
        gr.Number(label="Employment duration"),
        gr.Number(label="Loan amount"),
        gr.Dropdown(choices=['A','B','C','D','E','F','G'], label="Loan grade"),
        gr.Dropdown(choices=['Education','Home Improvement','Personal', 'Venture', 'Other'], label="Loan intent"),
        gr.Dropdown(choices=["Yes", "No"], label="Do you pay rent for your place?"),
    ],
    outputs=[
        gr.Textbox(label="Default prediction (0 = No, 1 = Yes)"),
        gr.Textbox(label="Probability of default")
    ],
    title="Credit Risk Assessment",
    description="Predict the default risk status based on input features."
)

interface.launch()

Running on local URL:  http://127.0.0.1:7890

To create a public link, set `share=True` in `launch()`.


