In [1]:
import torch
from torch.utils.data import DataLoader

import os, os.path
import pandas as pd
import numpy as np
from numpy import random
import math
import cv2 as cv2
import warnings
warnings.filterwarnings("ignore")

import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)
from MainFunctions import objective, analyzemodel, buildmodelFromParams, buildDataFromParams, evalModel, buildLoaderFromIdx

  from .autonotebook import tqdm as notebook_tqdm


# Globals
## Main Variables

In [2]:
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available() : torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.benchmark = True
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
batch_size = 128 if torch.cuda.is_available() else 64
path = os.path.abspath(os.getcwd())
trainingData_folder = "trainingData"
log_folder = "results"
num_epochs = 20
reduceindex = None
criterion = torch.nn.L1Loss()

# Preliminary Experiment best Method

In [3]:
results = pd.DataFrame()
gridresultsname = f"{log_folder}/preliminaryresults.csv"
gridlogname =  f"{log_folder}/preliminarylog.csv"
n_trials = 30
datatype = "pearson"

for transform in ["transform_padding", "transform_resize"]:
    train_loader, val_loader, test_loader, real_train_loader, real_test_loader = buildDataFromParams(datatype=datatype, transform=transform, batch_size=batch_size, reduceindex=reduceindex, seed=SEED, oversamp=False, augment=True)
    search = lambda trial: objective(trial, datatype=datatype, transform=transform, loaders={"train" : train_loader, "val" : val_loader, "real" : real_train_loader}, num_epochs=num_epochs, gridlogname=gridlogname)
    study = optuna.create_study(direction="minimize", sampler=optuna.samplers.TPESampler(seed=SEED),pruner=optuna.pruners.MedianPruner(n_startup_trials=10, n_warmup_steps=10, interval_steps=5))
    study.optimize(search, n_trials=n_trials)
    results = results.append(({"datatype" : datatype, "transform" : transform, "mae" : study.best_value } | study.best_params ) , ignore_index=True)

Training Model with params: {'datatype': 'pearson', 'transform': 'transform_padding', 'modelSeed': 38, 'learning_rate': 0.06351221010640701, 'optimizer_name': 'Adam', 'blocks': 5, 'padding': 4, 'stride': 2, 'kernel_size': 7}
EPOCH RESULTS: train MAE: 0.61 train R2: -0.681 val MAE: 0.992 val R2: -2.978 real MAE: 0.558 real R2: -0.957
EPOCH RESULTS: train MAE: 0.537 train R2: -0.228 val MAE: 0.993 val R2: -3.02 real MAE: 0.552 real R2: -0.935
EPOCH RESULTS: train MAE: 0.527 train R2: -0.17 val MAE: 0.997 val R2: -3.026 real MAE: 0.515 real R2: -0.836
EPOCH RESULTS: train MAE: 0.526 train R2: -0.157 val MAE: 0.994 val R2: -2.998 real MAE: 0.557 real R2: -0.899
EPOCH RESULTS: train MAE: 0.521 train R2: -0.135 val MAE: 1.001 val R2: -3.069 real MAE: 0.565 real R2: -0.902
EPOCH RESULTS: train MAE: 0.519 train R2: -0.13 val MAE: 0.995 val R2: -3.035 real MAE: 0.574 real R2: -0.954
EPOCH RESULTS: train MAE: 0.522 train R2: -0.143 val MAE: 0.996 val R2: -3.018 real MAE: 0.571 real R2: -0.913
EP

In [4]:
results.to_csv(gridresultsname, index=False)
results

Unnamed: 0,datatype,transform,mae,modelSeed,learning_rate,optimizer,blocks,padding,stride,kernel_size
0,pearson,transform_padding,0.266086,73,0.033894,RMSprop,3,2,4,5
1,pearson,transform_resize,0.310706,35,0.000915,Adam,3,6,6,7


# Starting Grid Search

In [5]:
n_trials = 42
results = pd.DataFrame()
gridresultsname = f"{log_folder}/gridseachresults.csv"
gridlogname =  f"{log_folder}/gridsearchlog.csv"
transform = "transform_padding"

for datatype in  ["pearson", "random_pearson"]:
    train_loader, val_loader, test_loader, real_train_loader, real_test_loader = buildDataFromParams(datatype=datatype, transform=transform, batch_size=batch_size, reduceindex=reduceindex, seed=SEED, oversamp=False, augment=True)
    search = lambda trial: objective(trial, datatype=datatype, transform=transform, loaders={"train" : train_loader, "val" : val_loader, "real" : real_train_loader}, num_epochs=num_epochs, gridlogname=gridlogname)
    study = optuna.create_study(direction="minimize", sampler=optuna.samplers.TPESampler(seed=SEED),pruner=optuna.pruners.MedianPruner(n_startup_trials=10, n_warmup_steps=10, interval_steps=5))
    study.optimize(search, n_trials=n_trials)
    results = results.append(({"datatype" : datatype, "transform" : transform, "mae" : study.best_value } | study.best_params ) , ignore_index=True)

Training Model with params: {'datatype': 'pearson', 'transform': 'transform_padding', 'modelSeed': 38, 'learning_rate': 0.06351221010640701, 'optimizer_name': 'Adam', 'blocks': 5, 'padding': 4, 'stride': 2, 'kernel_size': 7}
EPOCH RESULTS: train MAE: 0.6 train R2: -0.643 val MAE: 0.994 val R2: -2.991 real MAE: 0.559 real R2: -0.961
EPOCH RESULTS: train MAE: 0.537 train R2: -0.237 val MAE: 0.992 val R2: -3.014 real MAE: 0.552 real R2: -0.933
EPOCH RESULTS: train MAE: 0.526 train R2: -0.162 val MAE: 0.997 val R2: -3.024 real MAE: 0.515 real R2: -0.835
EPOCH RESULTS: train MAE: 0.527 train R2: -0.167 val MAE: 0.995 val R2: -3.001 real MAE: 0.557 real R2: -0.899
EPOCH RESULTS: train MAE: 0.52 train R2: -0.139 val MAE: 1.002 val R2: -3.07 real MAE: 0.565 real R2: -0.902
EPOCH RESULTS: train MAE: 0.521 train R2: -0.148 val MAE: 0.995 val R2: -3.036 real MAE: 0.574 real R2: -0.954
EPOCH RESULTS: train MAE: 0.521 train R2: -0.141 val MAE: 0.996 val R2: -3.018 real MAE: 0.571 real R2: -0.913
EP

In [6]:
results.to_csv(gridresultsname, index=False)
results

Unnamed: 0,datatype,transform,mae,modelSeed,learning_rate,optimizer,blocks,padding,stride,kernel_size
0,pearson,transform_padding,0.281879,78,0.057279,SGD,4,4,2,7
1,random_pearson,transform_padding,0.144018,84,0.0193,Adam,4,4,2,7


# Choose Number of Epochs

## Selecting best Hyperparameter Setup

In [10]:
result = results.sort_values(by="mae", ascending=True).head(1).reset_index(drop=True).to_dict('r')[0]
result["modelname"] = "bestArtificialModel"
filename = f"{log_folder}/bestEpochResults.csv"
num_epochs = 2000
saved_log = pd.read_csv(filename) if os.path.exists(filename) else pd.DataFrame()

## Executing Best Epoch Search

In [11]:
bestEpochResult = pd.DataFrame()
train_loader, val_loader, test_loader, real_train_loader, real_test_loader = buildDataFromParams(datatype=result["datatype"], transform=result["transform"], batch_size=batch_size, reduceindex=reduceindex, seed=SEED, oversamp=False, augment=True)
log, model = analyzemodel(netparams=result, loaders={"train" : train_loader, "val" : val_loader, "real" : real_train_loader}, num_epochs=num_epochs, stop=True, saveModel=True)
bestEpochResult = bestEpochResult.append(log, ignore_index=True)   
bestEpochResult.to_csv(filename, index=False)  
bestEpochResult

EPOCH RESULTS: train MAE: 0.622 train R2: -0.756 val MAE: 0.992 val R2: -2.986 real MAE: 0.547 real R2: -0.871
SAVED MODEL WITH VALUES: MAE: 0.5473849773406982 R2: -0.8707857728004456
EPOCH RESULTS: train MAE: 0.535 train R2: -0.235 val MAE: 0.98 val R2: -2.901 real MAE: 0.566 real R2: -0.895
EPOCH RESULTS: train MAE: 0.506 train R2: -0.083 val MAE: 0.975 val R2: -2.883 real MAE: 0.517 real R2: -0.827
SAVED MODEL WITH VALUES: MAE: 0.5167005062103271 R2: -0.8274645805358887
EPOCH RESULTS: train MAE: 0.493 train R2: -0.052 val MAE: 0.949 val R2: -2.672 real MAE: 0.532 real R2: -0.772
EPOCH RESULTS: train MAE: 0.479 train R2: 0.005 val MAE: 0.878 val R2: -2.142 real MAE: 0.511 real R2: -0.783
SAVED MODEL WITH VALUES: MAE: 0.5107994079589844 R2: -0.7828750014305115
EPOCH RESULTS: train MAE: 0.411 train R2: 0.238 val MAE: 0.825 val R2: -1.704 real MAE: 0.519 real R2: -0.708
EPOCH RESULTS: train MAE: 0.235 train R2: 0.708 val MAE: 0.249 val R2: 0.658 real MAE: 0.366 real R2: 0.168
SAVED MODE

Unnamed: 0,datatype,transform,mae,modelSeed,learning_rate,optimizer,blocks,padding,stride,kernel_size,...,val_r2,val_mse,val_mae_epoch,val_max_epoch,real_mae,real_r2,real_mse,real_mae_epoch,real_max_epoch,tbdir
0,random_pearson,transform_padding,0.144018,84,0.0193,Adam,4,4,2,7,...,0.992068,0.002551,223,223,0.136075,0.837441,0.056259,74,74,runs\Jul08_00-13-02_CAD


## Updating result log with new found best epoch and best metric values

In [12]:
result["mae"],result["mse"],result["r2"], result["num_epochs"] =  bestEpochResult["real_mae"].item(), bestEpochResult["real_mse"].item(), bestEpochResult["real_r2"].item(),  bestEpochResult["real_mae_epoch"].item()
result = pd.DataFrame(result, index=[1])
result

Unnamed: 0,datatype,transform,mae,modelSeed,learning_rate,optimizer,blocks,padding,stride,kernel_size,modelname,mse,r2,num_epochs
1,random_pearson,transform_padding,0.136075,84,0.0193,Adam,4,4,2,7,bestArtificialModel,0.056259,0.837441,74


# Choose number of epochs for retraining with real Data using KFold Cross Validation

In [13]:
from MainFunctions import prepareRealData, discretizeData, stratifiedtraintestsplit, loadTrainLoaders, transform_padding, oversample, buildRealLoadersFromDict, kfoldLoader
from sklearn.model_selection import StratifiedKFold

filename = f"{log_folder}/findbestEpochForRetraining.csv"
num_epochs = 2000
n_splits = 5
saved_log = pd.read_csv(filename) if os.path.exists(filename) else pd.DataFrame()
bestEpochRetrainingResult = pd.DataFrame()
result = result.to_dict('r')[0]
transform_func = transform_padding
realimgloc = "ECGcutouts/realimgs"

In [14]:
train_loader, val_loader, test_loader = loadTrainLoaders(batch_size=batch_size, datatype=result["datatype"], transform=transform_func, percentages=np.array([0.70,0.15,0.15]), reduceindex=reduceindex, augment=True, invert=False, seed=SEED).values()
skf_ai, df_ai = kfoldLoader(loader=val_loader, n_splits=n_splits, start=-99, stop=101, step=1, seed=SEED) 

df = prepareRealData(directory=realimgloc )
df = discretizeData(df=df, start=-99, stop=109, step=10, seed=SEED)
idxs = stratifiedtraintestsplit(df=df, percentages=np.array([0.6, 0.4]), reduceindex=None, seed=SEED)
df_real = df.iloc[idxs["train"]]
skf_real = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

In [15]:
for (i, (train_idx_ai, val_idx_ai) ), (j, (train_idx_real, val_idx_real) ) in zip(enumerate(skf_ai.split(df_ai.index, df_ai["bins"])), enumerate(skf_real.split(df_real.index, df_real["bins"]))):
    print(f"Training at Fold: {i}")
    model = buildmodelFromParams(result, load=True, seed=SEED)

    df_train_real = oversample(df.iloc[train_idx_real].reset_index(drop=True))
    df_val_real = oversample(df.iloc[val_idx_real].reset_index(drop=True))

    real_loaders  = buildRealLoadersFromDict(data_dict= {"train" : df_train_real , "val" : df_val_real}, batch_size=batch_size, transform=transform_func, imgloc=realimgloc , invert=False, augment=True)
    ai_loaders = buildLoaderFromIdx(loader=val_loader, idx_dict={"train" : train_idx_ai, "val" : val_idx_ai}, transform=result["transform"], augment=True, batch_size=batch_size)
    
    combined_train_loader = DataLoader(torch.utils.data.ConcatDataset([ai_loaders["train"].dataset, real_loaders["train"].dataset]), batch_size=batch_size, shuffle=True, pin_memory=True, drop_last = True)  
    combined_real_loader = DataLoader(torch.utils.data.ConcatDataset([ai_loaders["val"].dataset, real_loaders["val"].dataset]), batch_size=batch_size, shuffle=True, pin_memory=True, drop_last = True)  

    log, model = analyzemodel(netparams=result, loaders={"train" : combined_train_loader, "real" : combined_real_loader}, num_epochs=2000, model=model, saveModel=False, stop=True)
    bestEpochRetrainingResult = bestEpochRetrainingResult.append(log, ignore_index=True) 
    bestEpochRetrainingResult.to_csv(filename, index=False)



Training at Fold: 0
EPOCH RESULTS: train MAE: 0.099 train R2: 0.933 real MAE: 0.144 real R2: 0.793
EPOCH RESULTS: train MAE: 0.078 train R2: 0.963 real MAE: 0.126 real R2: 0.818
EPOCH RESULTS: train MAE: 0.07 train R2: 0.969 real MAE: 0.125 real R2: 0.83
EPOCH RESULTS: train MAE: 0.067 train R2: 0.971 real MAE: 0.135 real R2: 0.816
EPOCH RESULTS: train MAE: 0.062 train R2: 0.975 real MAE: 0.133 real R2: 0.82
EPOCH RESULTS: train MAE: 0.06 train R2: 0.976 real MAE: 0.121 real R2: 0.85
EPOCH RESULTS: train MAE: 0.059 train R2: 0.978 real MAE: 0.132 real R2: 0.829
EPOCH RESULTS: train MAE: 0.058 train R2: 0.978 real MAE: 0.123 real R2: 0.837
EPOCH RESULTS: train MAE: 0.056 train R2: 0.98 real MAE: 0.126 real R2: 0.827
EPOCH RESULTS: train MAE: 0.057 train R2: 0.978 real MAE: 0.135 real R2: 0.81
EPOCH RESULTS: train MAE: 0.055 train R2: 0.98 real MAE: 0.132 real R2: 0.824
EPOCH RESULTS: train MAE: 0.053 train R2: 0.981 real MAE: 0.122 real R2: 0.833
EPOCH RESULTS: train MAE: 0.053 train R2

In [16]:
samplesdist = pd.DataFrame({"AI Samples" : val_loader.dataset.labels.shape, "Real Samples" : df_real["label"].shape, "Real Train Split" : real_loaders["train"].dataset.labels.shape, "AI Train Split" : ai_loaders["train"].dataset.labels.shape, "Real Val Split" : real_loaders["val"].dataset.labels.shape, "AI Val Split" :  ai_loaders["val"].dataset.labels.shape})
samplesdist.to_csv("results/samplesdist.csv", index=False) 

In [17]:
bestEpochRetrainingResult


Unnamed: 0,datatype,transform,mae,modelSeed,learning_rate,optimizer,blocks,padding,stride,kernel_size,modelname,mse,r2,num_epochs,real_mae,real_r2,real_mse,real_mae_epoch,real_max_epoch,tbdir
0,random_pearson,transform_padding,0.136075,84,0.0193,Adam,4,4,2,7,bestArtificialModel,0.056259,0.837441,74,0.115863,0.862298,0.045721,14,14,runs\Jul08_02-02-05_CAD
1,random_pearson,transform_padding,0.136075,84,0.0193,Adam,4,4,2,7,bestArtificialModel,0.056259,0.837441,74,0.107309,0.889642,0.036762,4,4,runs\Jul08_02-37-40_CAD
2,random_pearson,transform_padding,0.136075,84,0.0193,Adam,4,4,2,7,bestArtificialModel,0.056259,0.837441,74,0.076109,0.958157,0.012131,48,48,runs\Jul08_02-46-02_CAD
3,random_pearson,transform_padding,0.136075,84,0.0193,Adam,4,4,2,7,bestArtificialModel,0.056259,0.837441,74,0.08261,0.948766,0.016279,8,8,runs\Jul08_03-04-33_CAD
4,random_pearson,transform_padding,0.136075,84,0.0193,Adam,4,4,2,7,bestArtificialModel,0.056259,0.837441,74,0.078402,0.952205,0.014881,71,71,runs\Jul08_03-27-13_CAD


### Take mean of all best split epochs as new epoch for retraining

In [18]:
result["num_retraining_epochs"] = math.ceil(bestEpochRetrainingResult["real_mae_epoch"].mean())

# Execute with all Data

In [19]:
filename = f"{log_folder}/bestNetResult.csv"
saved_log = pd.read_csv(filename) if os.path.exists(filename) else pd.DataFrame()
saved_modelparams = pd.read_csv("neuralnets/netparams.csv") if os.path.exists("neuralnets/netparams.csv") else pd.DataFrame()

## Retraining model on all data

In [20]:
allDataResult = pd.DataFrame()
train_loader, val_loader, test_loader, real_train_loader, real_test_loader = buildDataFromParams(datatype=result["datatype"], transform=result["transform"], batch_size=batch_size, reduceindex=reduceindex, seed=SEED, oversamp=True, augment=True)
model = buildmodelFromParams(result, load=True, seed=SEED)

train_loader = DataLoader(torch.utils.data.ConcatDataset([val_loader.dataset, real_train_loader.dataset]), batch_size=batch_size, shuffle=True, pin_memory=True, drop_last = True)  
model.modelname = "allDataModel"
result["modelname"] = model.modelname 
log, model = analyzemodel(netparams=result, loaders={"train" : train_loader}, num_epochs=result["num_retraining_epochs"], model=model, saveModel=False)
allDataResult = allDataResult.append(log, ignore_index=True) 
allDataResult.to_csv(filename, index=False)

torch.save(model.state_dict(), f"neuralnets/{model.modelname}")  
saved_modelparams = saved_modelparams[saved_modelparams["model"] != model.modelname].append( pd.DataFrame({"model" : model.modelname} | result, index=[0]), ignore_index=True)
saved_modelparams.to_csv("neuralnets/netparams.csv", index=False) 

allDataResult

EPOCH RESULTS: train MAE: 0.113 train R2: 0.893
EPOCH RESULTS: train MAE: 0.09 train R2: 0.932
EPOCH RESULTS: train MAE: 0.081 train R2: 0.945
EPOCH RESULTS: train MAE: 0.076 train R2: 0.951
EPOCH RESULTS: train MAE: 0.068 train R2: 0.96
EPOCH RESULTS: train MAE: 0.066 train R2: 0.965
EPOCH RESULTS: train MAE: 0.063 train R2: 0.967
EPOCH RESULTS: train MAE: 0.06 train R2: 0.97
EPOCH RESULTS: train MAE: 0.06 train R2: 0.971
EPOCH RESULTS: train MAE: 0.06 train R2: 0.972
EPOCH RESULTS: train MAE: 0.057 train R2: 0.976
EPOCH RESULTS: train MAE: 0.057 train R2: 0.976
EPOCH RESULTS: train MAE: 0.055 train R2: 0.978
EPOCH RESULTS: train MAE: 0.056 train R2: 0.977
EPOCH RESULTS: train MAE: 0.055 train R2: 0.979
EPOCH RESULTS: train MAE: 0.053 train R2: 0.98
EPOCH RESULTS: train MAE: 0.052 train R2: 0.982
EPOCH RESULTS: train MAE: 0.05 train R2: 0.983
EPOCH RESULTS: train MAE: 0.049 train R2: 0.983
EPOCH RESULTS: train MAE: 0.051 train R2: 0.983
EPOCH RESULTS: train MAE: 0.049 train R2: 0.984


Unnamed: 0,datatype,transform,mae,modelSeed,learning_rate,optimizer,blocks,padding,stride,kernel_size,modelname,mse,r2,num_epochs,num_retraining_epochs,tbdir
0,random_pearson,transform_padding,0.136075,84,0.0193,Adam,4,4,2,7,allDataModel,0.056259,0.837441,74,29,runs\Jul08_03-41-15_CAD


## Final Evaluation on both test holdouts

In [None]:
finalresultsname = f"{log_folder}/finalresults.csv"
finalresults = pd.DataFrame()
ai_test = evalModel (model, test_loader, criterion, evalmode=True) 
real_test = evalModel (model, real_test_loader, criterion, evalmode=True) 

finalresults = finalresults.append(pd.DataFrame(ai_test, index=[0]).apply(lambda x : x.item().cpu().item(), axis = 0)[["mae", "mse", "r2"]].to_dict() | {"model" : model.modelname, "data" : "artificial"}, ignore_index=True)
finalresults = finalresults.append(pd.DataFrame(real_test, index=[0]).apply(lambda x : x.item().cpu().item(), axis = 0)[["mae", "mse", "r2"]].to_dict() | {"model" : model.modelname ,"data" : "real"}, ignore_index=True)
finalresults.to_csv(finalresultsname, index=False)
finalresults 

Unnamed: 0,mae,mse,r2,model,data
0,0.053079,0.005958,0.981705,allDataModel,artificial
1,0.079263,0.018816,0.950012,allDataModel,real
