# Model Comparison

Author: Marco Pellegrino<br>
Year: 2024

The overall project aims to build a simple model that predicts the probability of loan default from loan application data. These predictions help assess business risk and improve loan approval decisions.

In this notebook, metrics of previously trained models are compared.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Import paths
from config import *

In [None]:
# Make sure the output directory for the comparison plots is present.
# exist_ok=True makes this a no-op when the directory already exists and avoids
# the check-then-create race of a separate os.path.exists() test.
os.makedirs(PATH_PLOTS_COMPARISON, exist_ok=True)

# Load data

All major recorded metrics

In [None]:
# Read the evaluation CSV of each model (XGBoost, Decision Tree, Random Forest).
# index_col=False tells pandas not to use the first column as the index.
df_XGBoost, df_DT, df_RF = [
    pd.read_csv(PATH_RESULTS + csv_name, index_col=False)
    for csv_name in (
        'all/evaluation-XGBoost.csv',
        'all/evaluation-DT.csv',
        'all/evaluation-RF.csv',
    )
]

In [None]:
# Stack the three per-model tables row-wise (XGBoost, then DT, then RF) and
# renumber the rows so the combined frame gets a fresh 0..n-1 index.
concatenated_df = pd.concat(
    objs=(df_XGBoost, df_DT, df_RF),
    axis=0,
    ignore_index=True,
)
concatenated_df

ROC true and false positive rates

In [None]:
# Read the stored false-positive-rate CSV of each model.
# index_col=False tells pandas not to use the first column as the index.
fpr_XGBoost, fpr_DT, fpr_RF = [
    pd.read_csv(PATH_RESULTS + csv_name, index_col=False)
    for csv_name in (
        'fpr/evaluation_fpr-XGBoost.csv',
        'fpr/evaluation_fpr-DT.csv',
        'fpr/evaluation_fpr-RF.csv',
    )
]

In [None]:
# Read the stored true-positive-rate CSV of each model.
# index_col=False tells pandas not to use the first column as the index.
tpr_XGBoost, tpr_DT, tpr_RF = [
    pd.read_csv(PATH_RESULTS + csv_name, index_col=False)
    for csv_name in (
        'tpr/evaluation_tpr-XGBoost.csv',
        'tpr/evaluation_tpr-DT.csv',
        'tpr/evaluation_tpr-RF.csv',
    )
]

# F1 Score Comparison

In [None]:
# Reshape to long format: one row per (Model, metric) pair, which is the
# layout seaborn needs to draw grouped bars.
f1_columns = ['Model', 'F1 Weighted-averaged', 'F1 Default=1', 'F1 Default=0']
melted_df = concatenated_df[f1_columns].melt(
    id_vars=['Model'],
    var_name='metric',
    value_name='value',
)

# Grouped bar chart: one group per F1 variant, one bar per model
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=melted_df, x='metric', y='value', hue='Model', ax=ax)
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set_xlabel('Metrics')
ax.set_ylabel('Values')
ax.set_title('Comparison of F1 Score for Different Models')
fig.tight_layout()

# Write the figure to disk before displaying it
fig.savefig(PATH_PLOTS_COMPARISON+"F1_scores.png")

plt.show()

# Log Loss Comparison

In [None]:
# Reshape to long format: one row per model holding its LogLoss value
melted_df = concatenated_df[['Model', 'LogLoss']].melt(
    id_vars=['Model'],
    var_name='metric',
    value_name='value',
)

# Bar chart with one bar per model for the single LogLoss metric
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=melted_df, x='metric', y='value', hue='Model', ax=ax)
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set_xlabel('Metrics')
ax.set_ylabel('Values')
ax.set_title('Comparison of Log Loss for Different Models')
fig.tight_layout()

# Write the figure to disk before displaying it
fig.savefig(PATH_PLOTS_COMPARISON+"logloss.png")

plt.show()

# AUC-ROC Comparison

In [None]:
# Display names, listed in the order the per-model tables were concatenated
# into concatenated_df (XGBoost, Decision Tree, Random Forest).
model_names = ['XGBoost', 'Decision Tree', 'Random Forest']
roc_auc_scores = concatenated_df['ROC-AUC'].values

# The AUC shown in each legend entry is paired with its curve purely by
# position, so fail loudly if the metrics table does not hold exactly one
# ROC-AUC value per model instead of silently mislabelling the curves.
if len(roc_auc_scores) != len(model_names):
    raise ValueError(
        f"Expected {len(model_names)} ROC-AUC values (one per model), "
        f"found {len(roc_auc_scores)}"
    )

# FPR / TPR series for each model, in the same order as model_names
fpr_list = [fpr_XGBoost['XGBoost'].values, fpr_DT['Decision Tree'].values, fpr_RF['Random Forest'].values]
tpr_list = [tpr_XGBoost['XGBoost'].values, tpr_DT['Decision Tree'].values, tpr_RF['Random Forest'].values]

fig, ax = plt.subplots(figsize=(10, 8))

# One ROC curve per model, labelled with its AUC rounded to two decimals
for model_name, model_auc, model_fpr, model_tpr in zip(model_names, roc_auc_scores, fpr_list, tpr_list):
    ax.plot(model_fpr, model_tpr, label=f'{model_name} (AUC = {model_auc:.2f})')

# Reference line for random guessing (red dashed diagonal)
ax.plot([0, 1], [0, 1], 'r--', label='Random Guessing')

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])  # small headroom above TPR = 1 so the curves are not clipped
ax.set_xlabel('False Positive Rate (FPR)')
ax.set_ylabel('True Positive Rate (TPR)')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc='lower right')
ax.grid(True)

# Write the figure to disk before displaying it
fig.savefig(PATH_PLOTS_COMPARISON+"roc_auc.png")

plt.show()