In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import datetime
import warnings
warnings.filterwarnings('ignore')

import sklearn
from sklearn import metrics
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import KFold

import random, os, json
# Configurar variables de entorno
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # "-1" significa deshabilitar todas las GPUs


import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Masking, GRU, Dropout, Dense
from tensorflow.keras import backend as K

from joblib import Parallel, delayed
import multiprocessing

# FUNCTIONS OF THE MODEL

In [None]:
def calculateKPI(parameter):
    """
    Summarise the values of one performance indicator as percentages.

    The mean and the deviation are both multiplied by 100 and rounded to
    two decimals. The deviation is normalised by the number of values
    (population form), not by n - 1.

    Returns: (mean, deviation) tuple
    """
    average = np.mean(parameter)
    squared_offsets = np.power(parameter - average, 2)
    variance = np.sum(squared_offsets / len(parameter))

    mean = round(average * 100, 2)
    deviation = round(np.sqrt(variance) * 100, 2)
    return mean, deviation

#### Reset Keras

In [None]:
def reset_keras(seed=42):
    """Clear the Keras session and re-seed every random number generator in use.

    Intended to make Keras runs repeatable: the same `seed` is applied to the
    hash-seed environment variable, Python's `random`, NumPy and TensorFlow.
    """
    K.clear_session()

    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
    # this assignment presumably only affects processes spawned afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Same order as before: python built-in, numpy, tensorflow.
    for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
        apply_seed(seed)

#### Building GRU

In [None]:
def build_model(hyperparameters):
    """
    Builds and compiles a single-layer GRU binary classifier with optional regularization.

    Args:
        - hyperparameters: Dictionary from which the following keys are read:
            * "n_time_steps": size of the time axis of the input.
            * "layers": sequence; element 0 is the size of the feature axis of the
              input and element 1 the number of GRU units.
            * "mask_value": value handed to the Masking layer.
            * "dropout": dropout rate handed to the GRU layer.
            * "activation": activation handed to the GRU layer.
            * "lr_scheduler": learning rate handed to the Adam optimizer.
            * "regularizer" (optional): dict with "type" ('l1', 'l2' or 'l1_l2')
              and "value". A missing/None entry or any other type disables
              regularization.
    Returns:
        - model: tf.keras.Model compiled with binary cross-entropy loss and the
          'accuracy' and "AUC" metrics.
    """
    # `or {}` also covers an explicit `"regularizer": None` entry.
    reg_config = hyperparameters.get('regularizer') or {}
    reg_type = reg_config.get('type', None)
    reg_value = reg_config.get('value', 0.0)

    if reg_type == 'l1':
        regularizer = tf.keras.regularizers.l1(reg_value)
    elif reg_type == 'l2':
        regularizer = tf.keras.regularizers.l2(reg_value)
    elif reg_type == 'l1_l2':
        # l1_l2 takes two separate factors; a single positional argument would
        # only set `l1` and leave `l2` at the library default.
        regularizer = tf.keras.regularizers.l1_l2(l1=reg_value, l2=reg_value)
    else:
        regularizer = None

    input_layer = tf.keras.layers.Input(
        shape=(hyperparameters["n_time_steps"], hyperparameters["layers"][0])
    )
    masked = tf.keras.layers.Masking(mask_value=hyperparameters['mask_value'])(input_layer)

    # return_sequences=False: only the last GRU output feeds the dense layer.
    gru = tf.keras.layers.GRU(
        hyperparameters["layers"][1],
        dropout=hyperparameters['dropout'],
        return_sequences=False,
        activation=hyperparameters['activation'],
        kernel_regularizer=regularizer,
        bias_regularizer=regularizer,
        use_bias=True
    )(masked)

    # Single sigmoid unit for the binary target.
    output = tf.keras.layers.Dense(
        1,
        activation="sigmoid",
        use_bias=True,
        kernel_regularizer=regularizer,
        bias_regularizer=regularizer
    )(gru)

    model = tf.keras.Model(input_layer, [output])
    model.compile(
        loss='binary_crossentropy',
        optimizer=tf.keras.optimizers.Adam(learning_rate=hyperparameters["lr_scheduler"]),
        metrics=['accuracy', "AUC"]
    )

    return model

#### Running the model

In [None]:
def run_network(X_train, X_val, y_train, y_val, hyperparameters, seed):
    """
    Trains the GRU model produced by build_model, using early stopping on the validation loss.

    Args:
        - X_train, y_train: Training inputs and labels passed to model.fit.
        - X_val, y_val: Validation inputs and labels, passed as validation_data.
        - hyperparameters: Dictionary forwarded to build_model. This function
          additionally reads 'batch_size', 'n_epochs_max', "mindelta",
          "patience" and 'verbose'.
        - seed: Not used inside this function; kept so existing callers keep
          working. Seeding has to happen in the caller (e.g. through
          reset_keras) before this function is invoked.
    Returns:
        - model: The tf.keras.Model after fitting.
        - hist:  The History object returned by model.fit.
        - earlystopping: The EarlyStopping callback used during training.
    """
    batch_size = hyperparameters['batch_size']
    n_epochs_max = hyperparameters['n_epochs_max']

    model = build_model(hyperparameters)

    # Stop when val_loss has not improved by more than "mindelta" for
    # "patience" epochs; the callback is configured to restore the best weights.
    earlystopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        min_delta=hyperparameters["mindelta"],
        patience=hyperparameters["patience"],
        restore_best_weights=True,
        mode="min"
    )

    hist = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        callbacks=[earlystopping],
        batch_size=batch_size,
        epochs=n_epochs_max,
        verbose=hyperparameters['verbose']
    )

    return model, hist, earlystopping


#### GridSearch

In [None]:
def evaluate_combination(k, l, m, b, r, hyperparameters, dropout, layers, lr_scheduler, activation, regularizer, seed, split, norm, n_time_steps):
    """
    Trains one hyperparameter combination on a data split and reports its best validation loss.

    Args:
        - k, l, m, b, r: Indices into `dropout`, `layers`, `lr_scheduler`,
          `activation` and `regularizer` selecting the combination to evaluate.
        - hyperparameters: Base dictionary; it is copied (shallow) and the five
          selected values are written into the copy, so the caller's dict is
          left untouched.
        - dropout, layers, lr_scheduler, activation, regularizer: Candidate lists.
        - seed: Seed handed to reset_keras before training and forwarded to run_network.
        - split: Name of the split sub-directory under ../../DATA (e.g. "s1").
        - norm: Normalisation tag embedded in the data file names.
        - n_time_steps: Not used here; the value inside `hyperparameters` is
          the one that reaches the model.
    Returns:
        - Tuple (metric_dev, k, l, m, b, r, X_train, y_train, X_val, y_val)
          where metric_dev is the minimum validation loss seen during training.
    """
    hyperparameters_copy = hyperparameters.copy()
    hyperparameters_copy['dropout'] = dropout[k]
    hyperparameters_copy['layers'] = layers[l]
    hyperparameters_copy['lr_scheduler'] = lr_scheduler[m]
    hyperparameters_copy['activation'] = activation[b]
    hyperparameters_copy['regularizer'] = regularizer[r]

    # Build the paths from the `split` argument instead of relying on a
    # notebook-global loop variable being set to the right value.
    data_dir = "../../DATA/" + split

    X_train = np.load(data_dir + "/X_train_tensor_" + norm + ".npy")
    y_train = pd.read_csv(data_dir + "/y_train_tensor_" + norm + ".csv")['individualMRGerm_stac'].values

    X_val = np.load(data_dir + "/X_val_tensor_" + norm + ".npy")
    y_val = pd.read_csv(data_dir + "/y_val_tensor_" + norm + ".csv")['individualMRGerm_stac'].values

    # Honour the seed supplied by the caller rather than reset_keras' default.
    reset_keras(seed)

    model, hist, early = run_network(
        X_train, X_val,
        y_train,
        y_val,
        hyperparameters_copy,
        seed
    )

    # Selection criterion for the grid search: best (lowest) validation loss.
    metric_dev = np.min(hist.history["val_loss"])
    return (metric_dev, k, l, m, b, r, X_train, y_train, X_val, y_val)

def myCVGridParallel(hyperparameters, dropout, lr_scheduler, layers, activation, regularizer, seed, split, norm, n_time_steps=14):
    """
    Evaluates every hyperparameter combination in parallel and returns the best one.

    Each combination of `dropout` x `layers` x `lr_scheduler` x `activation` x
    `regularizer` is handed to evaluate_combination through joblib (3 jobs).
    The combination with the strictly lowest metric wins; on ties the first
    one evaluated is kept.

    Args:
        - hyperparameters: Base dictionary forwarded to evaluate_combination.
        - dropout, lr_scheduler, layers, activation, regularizer: Candidate lists.
        - seed, split, norm, n_time_steps: Forwarded unchanged to evaluate_combination.
    Returns:
        - bestHyperparameters: Dictionary with the winning 'dropout', 'layers',
          'lr_scheduler', 'activation' and 'regularizer', plus the 'X_train',
          'y_train', 'X_val' and 'y_val' arrays returned for that combination.
          It is empty when no combination yields a metric below infinity
          (e.g. empty candidate list or only NaN losses).
    """
    results = Parallel(n_jobs=3)(
        delayed(evaluate_combination)(k, l, m, b, r, hyperparameters, dropout, layers, lr_scheduler, activation, regularizer, seed, split, norm, n_time_steps)
        for k in range(len(dropout))
        for l in range(len(layers))
        for m in range(len(lr_scheduler))
        for b in range(len(activation))
        for r in range(len(regularizer))
    )

    bestHyperparameters = {}
    bestMetricDev = np.inf

    for metric_dev, k, l, m, b, r, X_train, y_train, X_val, y_val in results:
        # Strict comparison: NaN metrics never win and ties keep the earlier entry.
        if metric_dev < bestMetricDev:
            bestMetricDev = metric_dev
            bestHyperparameters = {
                'dropout': dropout[k],
                'layers': layers[l],
                'lr_scheduler': lr_scheduler[m],
                'activation': activation[b],
                'regularizer': regularizer[r],
                'X_train': X_train,
                'y_train': y_train,
                'X_val': X_val,
                'y_val': y_val
            }

    return bestHyperparameters

## Hyperparameters
The dictionary below groups the hyperparameters related to data, training, evaluation, and regularization.

In [None]:
# Candidate seeds for the runs.
seeds = [42, 76, 124, 163, 192, 205]

# Fixed sizes shared by every configuration.
input_shape = 80
n_time_steps = 14
batch_size = 32
n_epochs_max = 1000

# One entry per candidate architecture. build_model reads element 0 as the
# size of the input feature axis and element 1 as the number of GRU units.
layer_list = [[input_shape, gru_units, 1] for gru_units in range(10, 61, 10)]

# Search grid.
dropout = [0.1, 0.15, 0.2]
lr_scheduler = [0.1]

regularizer = [{'type': 'l2', 'value': strength} for strength in (0, 0.01)]

activation = ['LeakyReLU']

# Tag embedded in the names of the data files that are loaded.
norm = "normPower2"

# Base configuration; the grid search overrides the searched entries per combination.
hyperparameters = dict(
    n_time_steps=n_time_steps,
    mask_value=666,
    batch_size=batch_size,
    n_epochs_max=n_epochs_max,
    monitor="val_loss",
    mindelta=0,
    patience=15,
    dropout=0.0,
    verbose=0,
)

tab = "\t"

## Predictions

In [None]:
import os
import pickle
import time
import numpy as np
import pandas as pd


# Set run_model to False to skip the (expensive) grid search and training.
run_model = True
# When debug is True the confusion-matrix counts of each split are appended to `results`.
debug = True

if run_model:
    # Per-split accumulators; one entry is appended per processed split.
    loss_train = []
    loss_dev = []
    v_models = []
    v_accuracy_test = []
    v_specificity = []
    v_precision = []
    v_recall = []
    v_f1score = []
    v_roc = []
    v_early = []
    v_probs = []
    results = ""

    bestHyperparameters_bySplit = {}
    y_pred_by_split = {}

    def save_to_pickle(data, filename):
        """Serialise `data` with pickle into the file `filename`."""
        with open(filename, 'wb') as f:
            pickle.dump(data, f)

    # NOTE(review): `i` is a notebook-global loop variable; verify that no
    # helper reads it before renaming it.
    for i in [1, 2, 3]:
        init = time.time()

        # One seed per split, shared by the grid search and the final fit.
        split_seed = seeds[i - 1]
        split_data_dir = "../../DATA/s" + str(i)

        X_test = np.load(split_data_dir + "/X_test_tensor_" + norm + ".npy")
        y_test = pd.read_csv(split_data_dir + "/y_test_tensor_" + norm + ".csv")['individualMRGerm_stac'].values

        # GridSearch of hyperparameters
        bestHyperparameters = myCVGridParallel(hyperparameters,
                                               dropout,
                                               lr_scheduler,
                                               layer_list,
                                               activation,
                                               regularizer,
                                               split_seed,
                                               "s" + str(i),
                                               norm)
        fin = time.time()
        X_train = bestHyperparameters["X_train"]
        y_train = bestHyperparameters["y_train"]
        X_val = bestHyperparameters["X_val"]
        y_val = bestHyperparameters["y_val"]

        bestHyperparameters_bySplit[str(i)] = bestHyperparameters

        # Save best hyperparameters for current split
        # NOTE(review): bestHyperparameters also holds the X_train/y_train/
        # X_val/y_val arrays, so they end up inside this pickle as well.
        split_directory = './Results_GRU/split_' + str(i)
        os.makedirs(split_directory, exist_ok=True)
        save_to_pickle(bestHyperparameters,
                       os.path.join(split_directory, f"bestHyperparameters_split_{i}.pkl"))

        # Base configuration plus the winning values. A separate name is used
        # so the base `hyperparameters` dict is not rebound between splits.
        split_hyperparameters = {
            **hyperparameters,
            "dropout": bestHyperparameters["dropout"],
            "layers": bestHyperparameters["layers"],
            "lr_scheduler": bestHyperparameters["lr_scheduler"],
            "activation": bestHyperparameters["activation"],
            "regularizer": bestHyperparameters["regularizer"],
            'verbose': 0
        }

        # --- TRY ON TEST ----------------------------------------------------------------------
        reset_keras(split_seed)

        model, hist, early = run_network(
            X_train, X_val,
            y_train,
            y_val,
            split_hyperparameters,
            split_seed
        )

        v_models.append(model)
        loss_train.append(hist.history['loss'])
        loss_dev.append(hist.history['val_loss'])

        y_pred = model.predict(x=X_test)
        y_pred_by_split[str(i)] = y_pred

        # Save y_pred for current split
        save_to_pickle(y_pred, os.path.join(split_directory, f"y_pred_split_{i}.pkl"))

        # Save model for current split
        model_filename = os.path.join(split_directory, f"model_split_{i}.h5")
        model.save(model_filename)

        # Threshold the predicted probabilities at 0.5 for the label-based metrics.
        y_pred_labels = np.round(y_pred)
        accuracy_test = sklearn.metrics.accuracy_score(y_test, y_pred_labels)
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred_labels).ravel()
        roc = sklearn.metrics.roc_auc_score(y_test, y_pred)

        # Computed from this split's own counts, so the result does not depend
        # on split ids lining up with list positions.
        specificity = tn / (tn + fp)
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)

        v_accuracy_test.append(accuracy_test)
        v_specificity.append(specificity)
        v_precision.append(precision)
        v_recall.append(recall)
        v_f1score.append((2 * recall * precision) / (recall + precision))
        v_roc.append(roc)

        if debug:
            results = results + tab + "\tPositivos bien predichos" + str(tp) + "\n"
            results = results + tab + "\tPositivos mal predichos" + str(fp) + "\n"
            results = results + tab + "\tNegativos bien predichos" + str(tn) + "\n"
            results = results + tab + "\tNegativos mal predichos" + str(fn) + "\n"

    # END EXECUTION - SAVE AGGREGATED RESULTS
    directory = './Results_GRU'
    os.makedirs(directory, exist_ok=True)

    save_to_pickle(bestHyperparameters_bySplit, os.path.join(directory, "bestHyperparameters_bySplit.pkl"))
    save_to_pickle(y_pred_by_split, os.path.join(directory, "y_pred_by_split.pkl"))

    # Aggregated copies are numbered by position in v_models (0-based);
    # dedicated loop names keep `i` and `model` from being clobbered.
    for model_position, trained_model in enumerate(v_models):
        model_filename = os.path.join(directory, f"model_{model_position}.h5")
        trained_model.save(model_filename)


In [None]:
def format_metric_line(metric_name, mean_value, deviation_value):
    """Render one report line: '<name>: <mean> +- <deviation>' with two decimals and a trailing newline."""
    line_template = "{}: {:.2f} +- {:.2f}\n"
    return line_template.format(metric_name, mean_value, deviation_value)

# Aggregate each per-split metric list into (mean, deviation) percentages
mean_test, deviation_test = calculateKPI(v_accuracy_test)
mean_specificity, deviation_specificity = calculateKPI(v_specificity)
mean_recall, deviation_recall = calculateKPI(v_recall)
mean_f1, deviation_f1 = calculateKPI(v_f1score)
mean_precision, deviation_precision = calculateKPI(v_precision)
mean_roc, deviation_roc = calculateKPI(v_roc)

# Report with accuracy first
results = "".join(
    format_metric_line(label, mean_value, deviation_value)
    for label, mean_value, deviation_value in (
        ("Test Accuracy", mean_test, deviation_test),
        ("Specificity", mean_specificity, deviation_specificity),
        ("Sensitivity", mean_recall, deviation_recall),
        ("Precision", mean_precision, deviation_precision),
        ("F1-score", mean_f1, deviation_f1),
        ("ROC-AUC", mean_roc, deviation_roc),
    )
)

# Same metrics in the order used for the final printed summary
final_results = "".join(
    format_metric_line(label, mean_value, deviation_value)
    for label, mean_value, deviation_value in (
        ("Sensitivity", mean_recall, deviation_recall),
        ("Specificity", mean_specificity, deviation_specificity),
        ("Precision", mean_precision, deviation_precision),
        ("F1-score", mean_f1, deviation_f1),
        ("ROC-AUC", mean_roc, deviation_roc),
        ("Test Accuracy", mean_test, deviation_test),
    )
)

# Show the summary
print(final_results)