In [1]:
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from utils.preprocessing import *
from utils.visualization import *
from utils.trainer import *
from utils.config import *
from utils.helpers import *

In [2]:
# Load the per-fold splits from CSV. Feature files are kept as DataFrames;
# target files are reduced to their TARGET column as NumPy arrays.
X_train_list = [pd.read_csv(csv_path) for csv_path in X_train_paths]
y_train_list = [pd.read_csv(csv_path)[TARGET].to_numpy() for csv_path in y_train_paths]

X_val_list = [pd.read_csv(csv_path) for csv_path in X_val_paths]
y_val_list = [pd.read_csv(csv_path)[TARGET].to_numpy() for csv_path in y_val_paths]

# Pool the validation labels of every fold into a single list, in fold order.
y_val_full = [label for fold_labels in y_val_list for label in fold_labels]

# Sanity check against the hard-coded expected number of pooled validation labels.
assert len(y_val_full) == 273

In [3]:
# Identifiers of the models to evaluate. Each one is passed to
# load_model_fold(..., model_name=...) and used as a key in the result dicts.
model_names = [
    'catboost',
    'xgboost',
    'lgbm',
    'rf',
    'svm',
    'lr',
]

# Number of folds iterated over when collecting predictions.
n_folds = 5

In [4]:
# Collect predictions and metrics for every model across all folds.
# Results are keyed by model name.
pred_dict, pred_proba_dict, metric_dict = {}, {}, {}

for model_name in model_names:
    # Pooled predictions for this model, appended fold after fold
    pred_list, pred_proba_list = [], []

    for fold in range(n_folds):
        fold_number = fold + 1  # the loader helpers are called with 1-based fold numbers

        # Fetch this fold's data and the model stored for (fold, model_name).
        # NOTE(review): X_train, y_train and y_val are not used below.
        X_train, y_train, X_val, y_val = load_data_fold(fold_number)
        model = load_model_fold(fold_number, model_name=model_name)

        # Predicted labels, plus the last column of predict_proba
        # (presumably the positive-class probability for a binary target — confirm).
        y_pred = model.predict(X_val)
        y_pred_proba = model.predict_proba(X_val)[:, -1]

        pred_list.extend(y_pred)
        pred_proba_list.extend(y_pred_proba)

    # Keep the pooled predictions for this model
    pred_dict[model_name] = pred_list
    pred_proba_dict[model_name] = pred_proba_list

    # Score the pooled predictions against y_val_full, which is built outside this loop.
    # NOTE(review): assumes load_data_fold yields validation rows in the same
    # fold order as y_val_full — confirm.
    metric_dict[model_name] = calculate_metrics(y_val_full, pred_list, pred_proba_list)

In [5]:
# Disabled by default; uncomment to write the collected metrics to disk:
# save_json('data/metrics.json', metric_dict)