In [None]:
import tensorflow as tf
physical_devices = tf.config.list_physical_devices('GPU')
print(F"Devices: {physical_devices}")
# Let TensorFlow allocate GPU memory on demand instead of reserving it all up front.
# The memory-growth flag has to be identical on every visible GPU, so it is applied to
# each device rather than only the first one (an empty list simply skips the loop).
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gc
import time

import optuna
from tensorflow.keras.models import Model
from tensorflow.keras import optimizers
from tensorflow.keras.layers import Input, Dense, Embedding, Concatenate, \
                        Dropout, Average, Maximum, Dot, Reshape, TimeDistributed,\
                        Flatten, Conv1D, MaxPooling1D, AveragePooling1D, \
                        LeakyReLU, LSTM, Lambda,UpSampling1D,BatchNormalization, AdditiveAttention, Attention
from keras.utils.vis_utils import plot_model
from tensorflow.keras.backend import clear_session
from sklearn.model_selection import KFold
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from IPython.display import HTML, display
import tabulate

import pipelineLibrary
import utilsCode

import seaborn as sns

# Silence all Python warnings for the rest of the notebook.
# `np.warnings` was only an incidental re-export of the standard-library `warnings`
# module and is no longer available in recent NumPy releases (1.24+), so the
# standard-library module is used directly; the installed filter is the same.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Parameters
# Seed handed to utilsCode.set_seed in the next cell.
seed = 1
# Mini-batch size passed to model.fit.
BATCHSIZE = 1024

# n_studies: how many independent Optuna studies are run (outer loop of the experiment).
# n_trials:  number of trials given to study.optimize for each study.
# n_splits:  number of folds intended for the K-fold cross-validation splitter.
n_studies = 3
n_trials = 10
n_splits = 5

# Config
# HTML file that receives the optimisation-history figures (utilsCode.figures_to_html).
output_filename = "../../../MCL-dashboard.html"
# CSV file read with pandas.read_csv as the full dataset.
input_filename = '../../../dados_anon_fulldataset.csv'
# Path where ModelCheckpoint stores the best weights and from which they are reloaded.
checkpoint_filepath = '/tmp/checkpoint-MCL'

# NOTE(review): validation_freq is not referenced by any other visible cell — confirm it is still needed.
validation_freq = 10
# Metric name given to ModelCheckpoint as `monitor` during the final evaluation.
monitor_evaluation = "val_acc"

In [None]:
# Seed the run through the project helper before any random sampling happens below.
# NOTE(review): which random generators set_seed covers (python / numpy / tensorflow)
# is defined in utilsCode and not visible here — confirm.
utilsCode.set_seed(seed)

In [None]:
# Load the dataset, drop the 'Unnamed: 0' column and remove exact duplicate rows.
df = (
    pd.read_csv(input_filename)
    .drop(columns=['Unnamed: 0'])
    .drop_duplicates()
    .copy()
)

# Hold-out split performed on unique ID_CURSO_ALUNO values, so every row of a given id
# ends up on the same side of the split.
# new_state=True draws a fresh split and saves it; False reloads the split saved earlier.
new_state = True
alunos_list = df['ID_CURSO_ALUNO'].unique()
if new_state:
    train_validation_perc = 0.7
    alunos_train_validation = np.random.choice(
        len(alunos_list),
        size=int(len(alunos_list)*train_validation_perc),
        replace=False,
    )
    np.save('random_state_.npy', alunos_train_validation)
else:
    alunos_train_validation = np.load('random_state_.npy')

# Compute the membership mask once and use it (and its negation) for both partitions.
in_train_validation = df['ID_CURSO_ALUNO'].isin(alunos_list[alunos_train_validation])
data_train_validation_ = df[in_train_validation]
data_test_ = df[~in_train_validation]

alunos_list_train_validation = data_train_validation_['ID_CURSO_ALUNO'].unique()
alunos_list_test = data_test_['ID_CURSO_ALUNO'].unique()

# Honour the configured fold count instead of a hard-coded 5, and pin the shuffle so every
# call to kf.split on the same ids yields the same folds (comparable trials, reproducible runs).
kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

# Probability cut-off used to turn the sigmoid output into a class decision.
threshold = 0.5

In [None]:
def getAUC(predicted, label):
    """Return the ROC AUC of the scores `predicted` measured against the true `label` values.

    Note the argument order: scores first, ground truth second; they are passed to
    `metrics.roc_auc_score` as (label, predicted).
    """
    return metrics.roc_auc_score(label, predicted)

In [None]:
def customRecall(predicted, label, cutoff=None):
    """Recall (sensitivity) of the thresholded predictions: TP / (TP + FN).

    The previous implementation passed (predictions, labels) to `confusion_matrix`, whose
    signature is (y_true, y_pred); the unpacked "fn" was therefore the false-positive count
    and the returned value was precision. The counts are now taken directly from boolean
    masks, which also works when only one class is present.

    Parameters
    ----------
    predicted : array-like of scores, any shape (flattened internally).
    label : array-like of 0/1 ground-truth values with the same number of elements.
    cutoff : optional decision threshold; defaults to the module-level `threshold`.

    Returns
    -------
    float
        Recall; NaN when `label` contains no positive sample.
    """
    if cutoff is None:
        cutoff = threshold
    predicted_positive = np.asarray(predicted).ravel() >= cutoff
    actual_positive = np.asarray(label).ravel() == 1
    tp = np.sum(predicted_positive & actual_positive)
    fn = np.sum(~predicted_positive & actual_positive)
    return tp/(tp+fn)

def customAccuracy(predicted, label, cutoff=None):
    """Fraction of samples whose thresholded prediction equals the label.

    Both inputs are flattened before comparison. Without that, a `(n, 1)` prediction
    array (the shape returned by `model.predict` for a single output unit) compared with
    a `(n,)` label vector broadcasts to an `(n, n)` matrix and yields a meaningless score.

    Parameters
    ----------
    predicted : array-like of scores, any shape (flattened internally).
    label : array-like of 0/1 ground-truth values with the same number of elements.
    cutoff : optional decision threshold; defaults to the module-level `threshold`.

    Returns
    -------
    float
        Accuracy in [0, 1].

    Raises
    ------
    ValueError
        If `predicted` and `label` do not contain the same number of elements.
    """
    if cutoff is None:
        cutoff = threshold
    predicted_class = np.asarray(predicted).ravel() >= cutoff
    actual = np.asarray(label).ravel()
    if predicted_class.size != actual.size:
        raise ValueError(
            f"predicted has {predicted_class.size} elements but label has {actual.size}"
        )
    return float(np.mean(predicted_class == actual))

In [None]:
def model_cnn(parameters, np_estatico, np_series_cnn):
    """Build and compile the two-input binary classifier (static features + series).

    Architecture, in creation order:
      * series branch: TimeDistributed Conv1D -> TimeDistributed MaxPooling1D -> Dropout
        -> Reshape((4, 100)) -> LSTM (return_sequences=True) -> Flatten
      * static branch: two Dense(relu) -> BatchNormalization -> Dropout stages
      * head: Concatenate -> two Dense(relu) -> BatchNormalization -> Dropout stages
        -> Dense(1, sigmoid)

    Parameters
    ----------
    parameters : dict
        Hyperparameters; the keys read here are 'cnn_dropout_1', 'lstm_output',
        'lstm_dropout', 'recurrent_dropout', 'dense_1_static', 'dense_dropout_1',
        'dense_2_static', 'dense_dropout_2', 'merged_dense_1', 'merged_dense_dropout_1',
        'merged_dense_2', 'merged_dense_dropout_2', 'learning_rate' and 'rho'.
    np_estatico : 2-D array; only its second dimension (feature count) is used.
    np_series_cnn : 4-D array; only its last three dimensions are used for the input shape.

    Returns
    -------
    A compiled Keras `Model` (binary cross-entropy loss, Adadelta optimizer, "acc" metric).
    """
    _, n_static_features = np_estatico.shape
    _, n_steps, step_width, step_height = np_series_cnn.shape

    def dense_stage(tensor, units_key, dropout_key, layer_name):
        # Dense(relu) -> BatchNormalization -> Dropout, instantiated in exactly that order.
        hidden = Dense(parameters[units_key], activation='relu', name=layer_name)(tensor)
        normalised = BatchNormalization()(hidden)
        return Dropout(parameters[dropout_key])(normalised)

    in_static = Input(name='static_input', shape=(n_static_features,))
    in_series = Input(name='series_input', shape=(n_steps, step_width, step_height))

    # Series branch.
    convolved = TimeDistributed(Conv1D(filters=10, kernel_size=5), name='conv_layer_1')(in_series)
    pooled = TimeDistributed(MaxPooling1D(3), name='maxpooling_layer_1')(convolved)
    pooled_dropped = Dropout(parameters['cnn_dropout_1'])(pooled)
    # NOTE(review): the target shape is hard-coded, so the pooled output must hold exactly
    # 4 * 100 values per sample; other input sizes will fail here.
    sequence = Reshape((4, 100))(pooled_dropped)

    recurrent = LSTM(
        parameters['lstm_output'],
        dropout=parameters['lstm_dropout'],
        recurrent_dropout=parameters['recurrent_dropout'],
        return_sequences=True,
        name='lstm_layer',
    )(sequence)
    series_features = Flatten()(recurrent)

    # Static branch.
    static_stage_1 = dense_stage(in_static, 'dense_1_static', 'dense_dropout_1', 'dense_static')
    static_features = dense_stage(static_stage_1, 'dense_2_static', 'dense_dropout_2', 'dense_static2')

    # Merge both branches and classify.
    merged = Concatenate(name='concanate_layer')([series_features, static_features])
    head_stage_1 = dense_stage(merged, 'merged_dense_1', 'merged_dense_dropout_1', 'dense1')
    head_stage_2 = dense_stage(head_stage_1, 'merged_dense_2', 'merged_dense_dropout_2', 'dense2')
    probability = Dense(1, activation='sigmoid', name='output_layer')(head_stage_2)

    adadelta = optimizers.Adadelta(learning_rate=parameters['learning_rate'], rho=parameters['rho'])

    classifier = Model(inputs=[in_static, in_series], outputs=probability)
    classifier.compile(loss=tf.keras.losses.BinaryCrossentropy(), optimizer=adadelta, metrics=["acc"])
    return classifier

#  CNN ----------------------------------------------------------

In [None]:
def objective_cnn(trial):
    """Optuna objective: mean K-fold validation accuracy of `model_cnn` for one trial.

    Hyperparameters are sampled from `trial`, expanded into the per-layer dictionary
    expected by `model_cnn` (one dropout rate and one width are shared by several layers),
    and the model is trained from scratch on every fold produced by the module-level `kf`
    over `alunos_list_train_validation`.

    Parameters
    ----------
    trial : optuna trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean of the per-fold accuracies (value maximised by the study).
    """
    parameters = {'learning_rate': trial.suggest_loguniform('learning_rate', 0.00095, 0.005),
                    'rho': trial.suggest_uniform('rho', 0.9, 1),
                    'epoach': trial.suggest_int('epoach', 250, 750),

                    'dropout': trial.suggest_uniform('dropout', 0, 0.5),
                    'recurrent_dropout': trial.suggest_uniform('recurrent_dropout', 0, 0.5),

                    'lstm_output': trial.suggest_int('lstm_output', 100, 500),

                    'dense_static': trial.suggest_int('dense_static', 100, 500),

                    'merged_dense': trial.suggest_int('merged_dense', 100, 500),
                  }

    # Expand the sampled values into the per-layer keys read by model_cnn.
    parameters_cnn = {'learning_rate': parameters['learning_rate'],
                    'rho': parameters['rho'],
                    'epoach': parameters['epoach'],

                    'cnn_dropout_1': parameters['dropout'],

                    'lstm_output': parameters['lstm_output'],
                    'lstm_dropout': parameters['dropout'],
                    'recurrent_dropout': parameters['recurrent_dropout'],

                    'dense_1_static': parameters['dense_static'],
                    'dense_dropout_1': parameters['dropout'],
                    'dense_2_static': parameters['dense_static'],
                    'dense_dropout_2': parameters['dropout'],

                    'merged_dense_1': parameters['merged_dense'],
                    'merged_dense_dropout_1': parameters['dropout'],
                    'merged_dense_2': parameters['merged_dense'],
                    'merged_dense_dropout_2': parameters['dropout'],
                    }

    acc_list = []
    for train_index, test_index in kf.split(alunos_list_train_validation):
        # The pipelines are fitted on the training fold only and then applied to the validation fold.
        data_train = df[df['ID_CURSO_ALUNO'].isin(alunos_list_train_validation[train_index])]
        time_series_dataframe_train, np_estatico_train, label_train = pipelineLibrary.pipe_default.fit_transform(data_train)
        np_series_cnn_train = pipelineLibrary.pipe_cnn.fit_transform(time_series_dataframe_train)

        data_validation = df[df['ID_CURSO_ALUNO'].isin(alunos_list_train_validation[test_index])]
        time_series_dataframe_validation, np_estatico_validation, label_validation = pipelineLibrary.pipe_default.transform(data_validation)
        np_series_cnn_validation = pipelineLibrary.pipe_cnn.transform(time_series_dataframe_validation)

        gc.collect()
        model = model_cnn(parameters_cnn, np_estatico_train, np_series_cnn_train)

        # Train. The returned History object is intentionally not kept: it holds a
        # reference to the model and would keep it alive after `del model` below.
        model.fit([np_estatico_train, np_series_cnn_train],
                  label_train,
                  epochs=parameters_cnn['epoach'],
                  verbose=0,
                  batch_size=BATCHSIZE,
                  )

        # Free the training arrays before running inference.
        del time_series_dataframe_train, np_estatico_train, label_train, np_series_cnn_train, data_train
        gc.collect()

        # Evaluate with the training batch size, silently, instead of the default
        # batch of 32 with progress output.
        predicted = model.predict([np_estatico_validation, np_series_cnn_validation],
                                  batch_size=BATCHSIZE,
                                  verbose=0)
        acc_list.append(customAccuracy(predicted, label_validation))

        clear_session()
        del model, time_series_dataframe_validation, np_estatico_validation, label_validation, np_series_cnn_validation, data_validation
        gc.collect()
    return np.mean(acc_list)

In [None]:
def evaluation():
    """Retrain with the best hyperparameters of the current `study` and score on the test ids.

    Reads the module-level `study` (its `best_params`), `kf`, `df`, `alunos_list_test`,
    `alunos_list_train_validation`, `threshold` and the checkpoint configuration.
    For every fold of `kf.split(alunos_list_test)` a fresh model is trained on all
    train/validation ids (30% of the rows held out by Keras for checkpoint selection),
    the best checkpoint is restored, and the model is scored on that fold of test ids.

    Fix over the previous version: `confusion_matrix` expects (y_true, y_pred). With the
    arguments swapped, the unpacked "fp"/"fn" were exchanged, so the reported sensitivity
    was actually precision and the reported specificity was actually the negative
    predictive value.

    Returns
    -------
    tuple of four floats
        Mean accuracy, mean sensitivity, mean specificity and mean AUC over the folds.
    """
    # EVALUATION
    accuracyList = []
    sensibilidadeList = []
    especificidadeList = []
    aucList = []

    parameters = study.best_params
    # Expand the best trial's values into the per-layer keys read by model_cnn.
    parameters_cnn = {'learning_rate': parameters['learning_rate'],
                    'rho': parameters['rho'],
                    'epoach': parameters['epoach'],

                    'cnn_dropout_1': parameters['dropout'],

                    'lstm_output': parameters['lstm_output'],
                    'lstm_dropout': parameters['dropout'],
                    'recurrent_dropout': parameters['recurrent_dropout'],

                    'dense_1_static': parameters['dense_static'],
                    'dense_dropout_1': parameters['dropout'],
                    'dense_2_static': parameters['dense_static'],
                    'dense_dropout_2': parameters['dropout'],

                    'merged_dense_1': parameters['merged_dense'],
                    'merged_dense_dropout_1': parameters['dropout'],
                    'merged_dense_2': parameters['merged_dense'],
                    'merged_dense_dropout_2': parameters['dropout'],
                    }

    # Only the held-out part of each split is used; the "train" indices of the test ids are ignored.
    for _, test_index in kf.split(alunos_list_test):
        # Train model using all training data
        data_train = df[df['ID_CURSO_ALUNO'].isin(alunos_list_train_validation)]
        time_series_dataframe_train, np_estatico_train, label_train = pipelineLibrary.pipe_default.fit_transform(data_train)
        np_series_cnn_train = pipelineLibrary.pipe_cnn.fit_transform(time_series_dataframe_train)

        model = model_cnn(parameters_cnn, np_estatico_train, np_series_cnn_train)

        # Save best model (weights only) according to the monitored validation metric
        monitor = monitor_evaluation
        model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
            filepath=checkpoint_filepath,
            save_weights_only=True,
            monitor=monitor,
            save_best_only=True)

        callbacks = [model_checkpoint_callback]

        # Train
        model.fit([np_estatico_train, np_series_cnn_train],
                  label_train,
                  epochs=parameters_cnn['epoach'],
                  verbose=0,
                  batch_size=BATCHSIZE,
                  validation_split=0.3,
                  callbacks=callbacks,
                  )
        del time_series_dataframe_train, np_estatico_train, label_train, np_series_cnn_train
        gc.collect()

        # Evaluate using k-fold test dataset partition
        data_test = df[df['ID_CURSO_ALUNO'].isin(alunos_list_test[test_index])]
        time_series_dataframe_test, np_estatico_test, label_test = pipelineLibrary.pipe_default.transform(data_test)
        np_series_cnn_test = pipelineLibrary.pipe_cnn.transform(time_series_dataframe_test)

        # Restore the best checkpoint before scoring.
        model.load_weights(checkpoint_filepath)
        predicted = model.predict([np_estatico_test, np_series_cnn_test],
                                  batch_size=BATCHSIZE,
                                  verbose=0)
        predicted_class = (predicted>=threshold).astype(int).ravel()

        # Ground truth first, predictions second; labels=[0, 1] guarantees a 2x2 matrix
        # even when a fold or the predictions contain a single class.
        tn, fp, fn, tp = confusion_matrix(np.ravel(label_test), predicted_class, labels=[0, 1]).ravel()

        unique, counts = np.unique(label_test, return_counts=True)
        print(F"Labels: {dict(zip(unique, counts))}")
        unique, counts = np.unique(predicted_class, return_counts=True)
        print(F"Predicted: {dict(zip(unique, counts))}")

        sensibilidadeList.append(tp/(tp+fn))   # sensitivity / recall
        especificidadeList.append(tn/(tn+fp))  # specificity
        aucList.append(getAUC(predicted, label_test))
        accuracyList.append(customAccuracy(predicted, label_test))

        clear_session()
        del model, time_series_dataframe_test, np_estatico_test, label_test, np_series_cnn_test
        gc.collect()

    print(accuracyList)
    print(sensibilidadeList)
    print(especificidadeList)
    print(aucList)
    return np.array(accuracyList).mean(), np.array(sensibilidadeList).mean(), np.array(especificidadeList).mean(), np.array(aucList).mean()

In [None]:
# Per-study test metrics (one entry per Optuna study).
acc = []
sens = []
esp = []
auc = []

# Optimisation-history figure of every study, exported to HTML in a later cell.
figs = []
for i in range(n_studies):
    # Seed the sampler explicitly: an unseeded TPE sampler draws its own entropy, so the
    # search would differ on every run regardless of the notebook seed. `seed + i` keeps
    # the studies distinct from each other while making each one repeatable.
    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=seed + i),
    )

    study.optimize(objective_cnn, n_trials=n_trials)

    print(F"Study: {i}")
    print(F"Best Value: {study.best_value}")
    print(F"Best Parameters: {study.best_params}")

    fig = optuna.visualization.plot_optimization_history(study)
    figs.append(fig)

    # evaluation() reads the module-level `study` assigned above.
    study_acc, study_sens, study_esp, study_auc = evaluation()
    acc.append(study_acc)
    sens.append(study_sens)
    esp.append(study_esp)
    auc.append(study_auc)

In [None]:
# Export the optimisation-history figures collected for each study to `output_filename`
# through the project helper.
utilsCode.figures_to_html(figs, output_filename)

In [None]:
# Summary table: mean and standard deviation of every metric across the studies,
# rounded to four decimals and rendered as HTML.
metric_rows = (
    ('Acurácia', acc),
    ('Sensibilidade', sens),
    ('Especificidade', esp),
    ('AUC', auc),
)
table = [['', 'média', 'desvio padrão']]
for metric_label, metric_values in metric_rows:
    metric_array = np.array(metric_values)
    table.append([metric_label, round(metric_array.mean(), 4), round(metric_array.std(), 4)])
display(HTML(tabulate.tabulate(table, tablefmt='html')))

In [None]:
# Raw per-study values behind the summary table, one list per line:
# accuracy, sensitivity, specificity, AUC.
for per_study_values in (acc, sens, esp, auc):
    print(per_study_values)