In [4]:
import torch
import math
import os, os.path
import pandas as pd
import numpy as np
from numpy import random
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import StratifiedKFold
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)
from MainFunctions import  evalModel, buildRealLoadersFromDict, oversample, discretizeData, prepareRealData, transform_padding, objective, analyzemodel, stratifiedtraintestsplit

  from .autonotebook import tqdm as notebook_tqdm


# Main Global Variables

In [5]:
# Reproducibility and environment configuration.
SEED = 42

# NOTE: in this notebook the bare name `random` is `numpy.random` (bound by
# `from numpy import random` in the import cell), so `random.seed` below only
# seeds NumPy. Python's built-in generator is therefore imported under a
# separate alias and seeded explicitly, so any helper relying on it is
# reproducible as well.
import random as py_random
py_random.seed(SEED)
random.seed(SEED)        # numpy.random (see note above)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# cuDNN benchmark mode auto-tunes convolution kernels for speed. The selected
# kernels may differ between runs, so GPU results are not guaranteed to be
# bit-for-bit reproducible even with the seeds above.
torch.backends.cudnn.benchmark = True
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Absolute path of the current working directory.
path = os.path.abspath(os.getcwd())

# Folder names / image location used by the cells below.
log_folder = "results"
trainingData_folder = "trainingData"
imgloc = "ECGcutouts/realimgs"

# K-Folding Real Data

In [30]:
# --- Cross-validation / search budget -------------------------------------
batch_size = 32
n_outer_splits, n_inner_splits = 5, 5
n_trials = 42            # Optuna trials per inner fold
num_epochs = 30          # epochs passed to `objective` during the search
max_num_epochs = 2000    # epoch cap passed to `analyzemodel` in the best-epoch search
reduceindex = None
criterion = torch.nn.L1Loss()   # loss handed to `evalModel` on the outer test fold

# --- Output files (all inside `log_folder`) --------------------------------
gridlogname = f"{log_folder}/gridsearchlogkFold.csv"
gridresultsname = f"{log_folder}/gridresultskFold.csv"
resultsname = f"{log_folder}/crossvalidationkFold.csv"

# --- Result accumulators ---------------------------------------------------
grid_results = pd.DataFrame()
crossval_avgs = pd.DataFrame()
datatype = None

# Columns identifying one architecture/optimizer setup; the first list
# additionally carries the inner fold the setup was evaluated on.
paramColumns2 = ["blocks", "kernel_size", "optimizer", "padding", "stride"]
paramColumns = ["inner_fold", *paramColumns2]
    

## Loading DataFrame with ECG Data
### Creating bins for stratification

In [None]:
# The transform is kept both as a name (string handed to `objective`) and as
# the callable looked up under that name (handed to the loader builder).
transform = "transform_padding"
transform_func = globals()[transform]

# Load the ECG data frame and discretize it; the resulting "bins" column is
# what the stratified splitters below stratify on
# (presumably added by `discretizeData` - confirm in MainFunctions).
df = discretizeData(
    df=prepareRealData(directory=imgloc),
    start=-99,
    stop=109,
    step=10,
    seed=SEED,
)

# Splitter for the outer cross-validation folds.
skf_outer = StratifiedKFold(
    n_splits=n_outer_splits,
    shuffle=True,
    random_state=SEED,
)

## Executing the main nested cross-validation with 5 outer and 5 inner folds

In [None]:
# Nested cross-validation.
#   1. For every outer fold, run an Optuna search on each inner fold.
#   2. Pick the setup with the lowest mean validation value that was
#      evaluated on more than 3 inner folds.
#   3. Re-run the inner folds with that setup to find the average best epoch.
#   4. Retrain on the whole outer training split and evaluate on the outer
#      test split; results are written to CSV after every step.
for i, (train_index, test_index) in enumerate(skf_outer.split(df.index, df["bins"])):
    print(f"Training at Outer Fold: {i}")
    df_train = df.iloc[train_index].reset_index(drop=True)
    df_test = df.iloc[test_index].reset_index(drop=True)
    # Same seed on every outer fold; with an integer random_state the inner
    # split is reproduced identically when `split` is called again below.
    skf_inner = StratifiedKFold(n_splits=n_inner_splits, shuffle=True, random_state=SEED)

    for j, (train_index_inner, test_index_inner) in enumerate(skf_inner.split(df_train.index, df_train["bins"])):
        print(f"Training at Inner Fold: {j}")
        # creating inner train and test dataframe
        df_train_inner = df_train.iloc[train_index_inner].reset_index(drop=True)
        df_test_inner = df_train.iloc[test_index_inner].reset_index(drop=True)
        # oversampling inner train fold only; the validation part stays untouched
        df_train_inner = oversample(df_train_inner)
        data_dict = {"train": df_train_inner, "test": df_test_inner}
        train_loader, test_loader = buildRealLoadersFromDict(data_dict=data_dict, batch_size=batch_size, transform=transform_func, imgloc=imgloc, invert=False, augment=True).values()
        # execute hyper-parameter search
        search = lambda trial: objective(trial, datatype=datatype, transform=transform, loaders={"train": train_loader, "real": test_loader}, printing=False, num_epochs=num_epochs, gridlogname=gridlogname)
        study = optuna.create_study(
            direction="minimize",
            sampler=optuna.samplers.TPESampler(seed=SEED),
            pruner=optuna.pruners.MedianPruner(n_startup_trials=10, n_warmup_steps=10, interval_steps=5),
        )
        study.optimize(search, n_trials=n_trials)
        print(f"Finished Grid Search best params: {study.best_params}, best value: {study.best_value}" )
        # log results
        df_study = study.trials_dataframe()
        df_study.columns = df_study.columns.str.removeprefix("params_")
        df_study["inner_fold"] = j
        df_study["outer_fold"] = i
        # rounding learning rate to 4 decimal places
        df_study["learning_rate"] = df_study["learning_rate"].round(4)
        # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
        grid_results = pd.concat([grid_results, df_study])
        grid_results.to_csv(gridresultsname, index=False)

    # grouping all results of this outer fold and calculating the mean value
    # per (inner fold, setup); rows with value >= 10000 or NaN are dropped
    selectresult = (
        grid_results[(grid_results["value"] < 10000) & (grid_results["outer_fold"] == i)]
        .groupby(paramColumns)[["value", "learning_rate", "modelSeed"]]
        .agg(["mean"])
    )
    # averaging over inner folds and counting on how many folds each setup appeared;
    # lists (not sets) keep the resulting column order deterministic
    selectresult = (
        selectresult.groupby(paramColumns2)
        .agg(["count", "mean"])
        .sort_values(("value", "mean", "mean"), ascending=True)
    )
    # min threshold of 4 folds for a setup
    selectresult = selectresult[selectresult[("value", "mean", "count")] > 3].head(1)
    if selectresult.empty:
        raise RuntimeError(
            f"Outer fold {i}: no parameter setup has valid results on more than 3 inner folds; "
            "cannot select a best configuration."
        )

    # The columns are a 3-level MultiIndex, so record keys are tuples whose
    # first element is the parameter name.
    result = {
        item[0]: value
        for item, value in selectresult.reset_index()[paramColumns2].to_dict("records")[0].items()
    }
    result["value"] = selectresult[("value", "mean", "mean")].item()
    result["learning_rate"] = selectresult[("learning_rate", "mean", "mean")].item()
    result["modelSeed"] = math.floor(selectresult[("modelSeed", "mean", "mean")].item())

    # execute best epoch search with the inner cross validation folds
    bestepoch_records = []
    for j, (train_index_inner, test_index_inner) in enumerate(skf_inner.split(df_train.index, df_train["bins"])):
        print(f"Best Epoch Search at Inner Fold: {j}")
        # creating inner train and test dataframe
        df_train_inner = df_train.iloc[train_index_inner].reset_index(drop=True)
        df_test_inner = df_train.iloc[test_index_inner].reset_index(drop=True)
        # oversampling inner train fold
        df_train_inner = oversample(df_train_inner)
        data_dict = {"train": df_train_inner, "test": df_test_inner}
        train_loader, val_loader = buildRealLoadersFromDict(data_dict=data_dict, batch_size=batch_size, transform=transform_func, imgloc=imgloc, invert=False, augment=True).values()
        # searching for best epoch with the patience method (stop=True), capped at max_num_epochs
        log, model = analyzemodel(result, loaders={"train": train_loader, "real": val_loader}, num_epochs=max_num_epochs, stop=True)
        print(f"Found best epoch with mae { log['real_mae']}, {log['real_mae_epoch'] }")
        bestepoch_records.append(log)
    # build the frame once instead of growing it row by row
    bestepoch_log = pd.DataFrame(bestepoch_records)

    result["best_mae"] = bestepoch_log["real_mae"].mean()
    result["num_epochs"] = math.ceil(bestepoch_log["real_mae_epoch"].mean())
    result["best_r2"] = bestepoch_log['real_r2'].mean()
    result["best_mse"] = bestepoch_log['real_mse'].mean()
    print(f"Found best average with mae { result['best_mae']}, {result['num_epochs']}")

    # execute on outer test fold
    result["modelname"] = "kFoldModel"
    train_loader, test_loader = buildRealLoadersFromDict(data_dict={"train": oversample(df_train), "test": df_test}, batch_size=batch_size, transform=transform_func, imgloc=imgloc, invert=False, augment=True).values()
    # retraining model with the averaged best epoch on the whole outer training set
    log, model = analyzemodel(netparams=result, loaders={"train": train_loader}, num_epochs=result["num_epochs"], saveModel=False)
    # evaluating trained model on test fold
    real_test = evalModel(model, test_loader, criterion, evalmode=True)
    # logging results for averaging; each metric is unwrapped to a plain Python number.
    # NOTE(review): the "data" label is "artificial" although the images come
    # from `imgloc` (real images) - kept unchanged, confirm it is intended.
    finalresult = (
        pd.DataFrame(real_test, index=[0])
        .apply(lambda x: x.item().cpu().item(), axis=0)[["mae", "mse", "r2"]]
        .to_dict()
        | {"model": model.modelname, "data": "artificial"}
        | result
    )
    print(f"appending result:  { finalresult['mae']}")
    finalresult["fold"] = i
    crossval_avgs = pd.concat([crossval_avgs, pd.DataFrame([finalresult])], ignore_index=True)
    crossval_avgs.to_csv(resultsname, index=False)

In [None]:
# checking number of similar configs
selectresult = grid_results[(grid_results["value"] < 10000) ].groupby(["outer_fold"] + paramColumns)[["value"]].agg({"mean"})
selectresult = selectresult.groupby(["outer_fold"] + paramColumns2).agg({"count", "mean"})
# min threshold of 4 folds for a setup
selectresult.groupby("outer_fold").apply(lambda grp: grp.nsmallest(7,("value", "mean", "mean")))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,value,value
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,mean,mean
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,count,mean
outer_fold,outer_fold,blocks,kernel_size,optimizer,padding,stride,Unnamed: 7_level_3,Unnamed: 8_level_3
0,0,4,7,SGD,2,2,1,0.14496
0,0,5,7,SGD,2,2,1,0.156406
0,0,4,7,RMSprop,2,2,1,0.160228
0,0,5,7,SGD,6,2,1,0.163125
0,0,3,5,RMSprop,4,2,3,0.165409
0,0,4,7,SGD,4,2,5,0.165664
0,0,4,3,SGD,2,2,1,0.168792
1,1,4,7,RMSprop,2,2,1,0.158587
1,1,4,7,SGD,2,2,2,0.175799
1,1,5,7,SGD,2,2,1,0.177823


In [160]:
# Display the results table that the cross-validation loop fills with one row per outer fold.
crossval_avgs

Unnamed: 0,mae,mse,r2,model,data,blocks,kernel_size,optimizer,padding,stride,value,learning_rate,modelSeed,modelname,best_mae,num_epochs,best_r2,best_mse,fold
0,0.188095,0.098248,0.698183,kFoldModel,artificial,4,7,SGD,4,2,0.165664,0.036917,81,kFoldModel,0.150404,72,0.808077,0.060966,0
1,0.135947,0.052445,0.760805,kFoldModel,artificial,4,7,SGD,4,2,0.194274,0.039159,73,kFoldModel,0.175977,74,0.73393,0.089454,1
2,0.172304,0.074533,0.762356,kFoldModel,artificial,4,7,SGD,4,2,0.184383,0.034735,74,kFoldModel,0.161082,68,0.760543,0.079211,2
3,0.129863,0.068144,0.807141,kFoldModel,artificial,4,7,SGD,4,2,0.177051,0.032745,69,kFoldModel,0.16833,65,0.788544,0.073161,3
4,0.133398,0.047067,0.864256,kFoldModel,artificial,4,7,SGD,4,2,0.192933,0.032027,73,kFoldModel,0.17125,55,0.754559,0.082846,4
