In [None]:
import tensorflow as tf
import numpy as np
import random
import pandas as pd
import sklearn
import seaborn as sns

import matplotlib.pyplot as plt

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import pickle

import sys
sys.path.append("../../../libraries/")
import utils
import TFT_library_masking as TFT

sys.path.append("../../../classification_architectures/")
import fhsi

# Functions

In [None]:
def load_static_data(df_static, keys, categorical_keys):
    """Select, clean and integer-encode the static features.

    Parameters
    ----------
    df_static : pandas.DataFrame
        Raw static table. It is not modified; a new frame is returned.
    keys : list of str
        Columns to keep. Rows that are identical across all of these columns
        are collapsed into a single row.
    categorical_keys : list of str
        Columns to replace with integer codes. Each one must be present in
        ``keys``, otherwise pandas raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        The de-duplicated frame restricted to ``keys`` with:
        * missing ``SAPSIIIScore`` values replaced by the column mean,
        * the listed ``Origin`` / ``ReasonAdmission`` values merged into the
          single category ``"others"``,
        * every column in ``categorical_keys`` encoded as integers.
    """
    # Values of each column that are merged into the single "others" category.
    merged_origins = [
        "rehabilitation", "hemodynamics", "cma", "ICU", "paediatrics",
        "obstetrics", "anaesthesia", "other floor", "gynaecology",
        "psychiatry", "dermatology",
    ]
    merged_reasons = [
        "endocrine other", "coagulopatía", "obstetric pathology",
        "infection other", "other", "hydroelectrolytic alteration",
        "respiratory other", "severe trauma", "hepatic insufficiency",
        "diabetic decompensation", "neuromuscular", "severe arrhythmia",
    ]

    # Keep only the requested columns and drop fully duplicated rows.
    # The explicit copy makes the column assignments below operate on an
    # independent frame, so they never warn about (or write through to) the
    # caller's data.
    df_static = df_static[keys].drop_duplicates().copy()

    # Fill the NaNs with the mean of the de-duplicated column.
    if "SAPSIIIScore" in df_static:
        sapsiii = df_static["SAPSIIIScore"]
        df_static["SAPSIIIScore"] = sapsiii.fillna(sapsiii.mean())

    # Reduce the cardinality of two categorical features.
    if "Origin" in df_static:
        origin = df_static["Origin"]
        df_static["Origin"] = origin.where(~origin.isin(merged_origins), "others")
    if "ReasonAdmission" in df_static:
        reason = df_static["ReasonAdmission"]
        df_static["ReasonAdmission"] = reason.where(~reason.isin(merged_reasons), "others")

    # Convert the categories into integers. pd.factorize numbers categories
    # from 0 in order of first appearance and uses -1 for missing values, so
    # after the +1 shift real categories are 1..K and missing values are 0.
    for key in categorical_keys:
        df_static[key] = pd.factorize(df_static[key])[0] + 1

    return df_static

# Raw static table read from disk.
df_MR = pd.read_csv("../../../ORIGINAL_DATA/MDR/df_estaticas_v2_covid.csv", low_memory=False)

# Columns retained from the raw table, in the order they appear in df_static.
keys = [
    'Admissiondboid',
    'Age',
    'Gender',
    'Origin',
    'ReasonAdmission',
    'PatientCategory',
    'SAPSIIIScore',
    'MonthOfAdmission',
    'YearOfAdmission',
]
# Subset of `keys` that load_static_data converts to integer codes.
categorical_keys = ["Origin", "ReasonAdmission", "PatientCategory"]

df_static = load_static_data(df_MR, keys, categorical_keys)
df_static.head()

# For each categorical feature: number of distinct codes found in df_static, plus one.
# NOTE(review): the extra slot presumably reserves an index for the downstream
# embedding layers -- confirm against the TFT/fhsi code.
categorical_features = ['Origin', 'ReasonAdmission', 'PatientCategory']
category_counts = [
    len(np.unique(df_static[feature])) + 1
    for feature in categorical_features
]

# Hyperparameters

In [None]:
# One random seed per data split; the training loop indexes this list by split number.
seeds = [20, 30, 45, 70]

# Input dimensions.
n_categorical_features = 3
n_numerical_features = 5
n_static_features = n_categorical_features + n_numerical_features
n_dynamic_features = 56
n_timesteps = 14

# Hyperparameters of the network.
balance = True
epochs = 10000
batch_size = 128
num_heads = 1

# Candidate values handed to fhsi.myCVGrid for the hyperparameter search.
layers = [3, 5, 8, 10, 15, 20, 25, 30, 35, 40, 50]
lr_scheduler = [0.0001, 0.001, 0.01, 0.1]
dropout_rate = [0.0, 0.1, 0.2, 0.3]

# NOTE(review): w1/w2 are forwarded to fhsi as-is; presumably loss/class weights -- confirm.
w1 = 0.82
w2 = 0.18

tensor = True
debug = True

# Base configuration shared by the grid search and the final training run.
hyperparameters = {
    # Input dimensions and architecture.
    "n_categorical_features": n_categorical_features,
    "n_numerical_features": n_numerical_features,
    "n_static_features": n_static_features,
    "n_dynamic_features": n_dynamic_features,
    "n_timesteps": n_timesteps,
    "num_heads": num_heads,
    "w1": w1,
    "w2": w2,
    "category_counts": category_counts,
    # Training loop.
    "epochs": epochs,
    "batch_size": batch_size,
    "maskValue": 666,  # NOTE(review): presumably the padding sentinel in the dynamic tensors -- confirm
    # Early-stopping criteria.
    "monitor": "val_loss",
    "mindelta": 0,
    "patience": 30,
    "balance": balance,
    "optimizer": "adam",
    "kfold": 5,
    "level": 3,
    "verbose": 0,
}

# Code

In [None]:
# Per-split results kept in memory (the same artefacts are also written to disk below).
v_early = []                      # third value returned by fhsi.run_network, one per split
loss_dev = []                     # validation-loss history of each final training run
v_models = []                     # trained model of each split
bestHyperparameters_bySplit = {}  # str(split index) -> best hyperparameters from the grid search
y_pred_by_split = {}              # str(split index) -> predictions on that split's test set

# Entries of the base configuration that are forwarded unchanged to the final
# training run. 'optimizer' is deliberately not in this list, so run_network
# receives exactly the keys it received before this refactor.
shared_keys = (
    "n_categorical_features", "n_numerical_features", "n_static_features",
    "n_dynamic_features", "w1", "w2", "n_timesteps", "category_counts",
    "epochs", "num_heads", "batch_size", "maskValue", "kfold", "monitor",
    "mindelta", "patience", "balance", "level", "verbose",
)

# One iteration per split; the split index selects both the data folder and the seed.
for i, seed in enumerate(seeds):

    path = f'../../../ORIGINAL_DATA/MDR/splits_14_days/notbalanced/split_{i}/'

    X_test_dynamic = np.load(os.path.join(path, "X_test_tensor.npy"))
    X_test_static = pd.read_csv(os.path.join(path, "X_test_static.csv"), index_col=0)
    y_test = pd.read_csv(os.path.join(path, "y_test.csv"), index_col=0)

    # Grid search over dropout / learning rate / layers. The base configuration
    # is passed as-is on every split; it is never rebound inside this loop, so
    # no split sees values chosen for a previous one and the cell can be re-run.
    (bestHyperparameters,
     X_train_dyn, y_train, X_train_static,
     X_val_dyn, y_val, X_val_static) = fhsi.myCVGrid(
        hyperparameters,
        dropout_rate,
        lr_scheduler,
        layers,
        i,
        seed,
        path,
    )
    bestHyperparameters_bySplit[str(i)] = bestHyperparameters

    # Save best hyperparameters for current split
    split_directory = './Results_FHSI/split_' + str(i)
    os.makedirs(split_directory, exist_ok=True)

    with open(os.path.join(split_directory, f"bestHyperparameters_split_{i}.pkl"), 'wb') as f:
        pickle.dump(bestHyperparameters, f)

    # Configuration of the final run for this split: the shared base entries
    # plus early stopping and the values selected by the grid search.
    split_hyperparameters = {key: hyperparameters[key] for key in shared_keys}
    split_hyperparameters.update({
        'earlyStopping': True,
        "dropout_rate": bestHyperparameters["dropout_rate"],
        "layers": bestHyperparameters["layers"],
        "lr_scheduler": bestHyperparameters["lr_scheduler"],
    })

    # Train with the selected hyperparameters, then evaluate on the test set.
    utils.reset_keras()

    model, hist, early = fhsi.run_network(
        X_train_dyn, X_train_static, y_train.individualMRGerm.values,
        X_val_dyn, X_val_static, y_val.individualMRGerm.values,
        split_hyperparameters,
        seed,
    )

    v_models.append(model)
    v_early.append(early)
    loss_dev.append(hist.history['val_loss'])

    y_pred = model.predict(x=[X_test_static.values, X_test_dynamic])
    y_pred_by_split[str(i)] = y_pred

    with open(os.path.join(split_directory, f"y_pred_split_{i}.pkl"), 'wb') as f:
        pickle.dump(y_pred, f)

    # Save model for current split
    model_filename = os.path.join(split_directory, f"model_split_{i}.h5")
    model.save(model_filename)

    # Calculate metrics
    metrics_dict = utils.calculate_and_save_metrics(
        y_test.individualMRGerm.values,
        y_pred,
        split_directory,
        split_index=i,
    )