In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Use a sans-serif font family by default for text in matplotlib figures
plt.rcParams['font.family'] = 'sans-serif'

from utils.preprocessing import *
from utils.visualization import *
from utils.trainer import *
from utils.config import *
from utils.helpers import *

In [2]:
# Training split, one entry per fold: feature frames read from CSV and the
# TARGET column of each label file converted to a NumPy array.
X_train_list = list(map(pd.read_csv, X_train_paths))
y_train_list = [pd.read_csv(csv_path)[TARGET].to_numpy() for csv_path in y_train_paths]

# Validation split, laid out the same way as the training split.
X_val_list = list(map(pd.read_csv, X_val_paths))
y_val_list = [pd.read_csv(csv_path)[TARGET].to_numpy() for csv_path in y_val_paths]

# Flatten the per-fold validation labels into a single list, keeping fold order.
y_val_full = [label for fold_labels in y_val_list for label in fold_labels]

# Guard: the concatenated validation labels must contain exactly 273 entries.
assert len(y_val_full) == 273

In [3]:
# Number of folds to iterate over; folds are addressed as 1..n_folds when
# loading data and models.
n_folds = 5

# Names of the trained models to evaluate, in the order they are processed.
model_names = [
    'catboost',
    'xgboost',
    'lgbm',
    'rf',
    'svm',
    'lr',
]

In [4]:
# Get metrics for all models across all folds.
#
# For every model, the validation predictions of each fold are concatenated
# (in fold order) and scored once against `y_val_full`.
pred_dict = {}        # model name -> concatenated class predictions
pred_proba_dict = {}  # model name -> concatenated predicted probabilities
metric_dict = {}      # model name -> result of calculate_metrics

for model_name in model_names:
    pred_list = []
    pred_proba_list = []

    for fold in range(n_folds):
        # Load data for the current fold (folds are numbered from 1)
        X_train, y_train, X_val, y_val = load_data_fold(fold+1)

        # Load trained model for the current fold
        model = load_model_fold(fold+1, model_name=model_name)

        # Get predictions for the current fold
        y_pred = model.predict(X_val)
        # Keep only the last column of predict_proba
        # (presumably the positive class of a binary target -- TODO confirm)
        y_pred_proba = model.predict_proba(X_val)[:,-1]

        # The metrics below are computed against `y_val_full`, which was built
        # from `y_val_list` through a separate loading path. Fail fast if this
        # fold's predictions cannot line up with those reference labels.
        n_expected = len(y_val_list[fold])
        assert len(y_pred) == n_expected, (
            f"{model_name}, fold {fold+1}: got {len(y_pred)} predictions "
            f"but y_val_list[{fold}] has {n_expected} labels"
        )

        pred_list.extend(y_pred)
        pred_proba_list.extend(y_pred_proba)

    # Save predictions for this model
    pred_dict[model_name] = pred_list
    pred_proba_dict[model_name] = pred_proba_list

    # Save metrics for this model
    metric_dict[model_name] = calculate_metrics(y_val_full, pred_list, pred_proba_list)

In [5]:
# Optional: uncomment to save the computed metrics (via save_json) to data/metrics.json.
# save_json('data/metrics.json', metric_dict)

## Table 2. Evaluation Metrics for Six Machine Learning Models Across Five Folds

In [20]:
# Build the summary table: transpose so each model is a row and each metric a
# column, round to 4 decimals, and leave out the 'mse' metric.
metric_df = (
    pd.DataFrame(metric_dict)
    .T
    .round(4)
    .drop(columns='mse')
)

# Display names are assigned by position, so they rely on the order in which
# the remaining metrics appear in metric_dict.
metric_df.columns = [
    'ACC',
    'AUC',
    'F1 Score',
    'Precision',
    'Recall',
    'Specificity',
]
metric_df

Unnamed: 0,ACC,AUC,F1 Score,Precision,Recall,Specificity
catboost,0.8791,0.9312,0.8675,0.871,0.864,0.8919
xgboost,0.8645,0.9202,0.8514,0.8548,0.848,0.8784
lgbm,0.8791,0.915,0.8675,0.871,0.864,0.8919
rf,0.8388,0.9097,0.8295,0.8045,0.856,0.8243
svm,0.8168,0.8985,0.8031,0.7907,0.816,0.8176
lr,0.8132,0.8922,0.7935,0.8033,0.784,0.8378
