In [1]:
import tensorflow as tf
physical_devices = tf.config.list_physical_devices('GPU')
print(F"Devices: {physical_devices}")
if physical_devices:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gc
import time

import optuna
from tensorflow.keras.models import Model
from tensorflow.keras import optimizers
from tensorflow.keras.layers import Input, Dense, Embedding, Concatenate, \
                        Dropout, Average, Maximum, Dot, Reshape, TimeDistributed,\
                        Flatten, Conv1D, MaxPooling1D, AveragePooling1D, \
                        LeakyReLU, LSTM, Lambda,UpSampling1D,BatchNormalization, AdditiveAttention, Attention
from keras.utils.vis_utils import plot_model
from tensorflow.keras.backend import clear_session
from sklearn.model_selection import KFold
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from IPython.display import HTML, display
import tabulate

import pipelineLibrary
import utilsCode

import seaborn as sns

np.warnings.filterwarnings('ignore')

Devices: []


In [2]:
# Parameters
seed = 1
BATCHSIZE = 1024

n_studies = 3
n_trials = 10
n_splits = 5

# Config
output_filename = "../../../ML-dashboard.html"
input_filename = '../../../dados_anon_fulldataset.csv'
checkpoint_filepath = '/tmp/checkpoint-ML'

validation_freq = 10
monitor_evaluation = "val_acc"

In [3]:
utilsCode.set_seed(seed)

In [4]:
df = pd.read_csv(input_filename)

df = df.drop(columns=['Unnamed: 0'])\
        .drop_duplicates()\
        .copy()

new_state = True
alunos_list = df['ID_CURSO_ALUNO'].unique()
if (new_state):
    train_validation_perc = 0.7
    alunos_train_validation = np.random.choice(len(alunos_list), size=int(len(alunos_list)*train_validation_perc), replace=False)
    np.save('random_state_.npy', alunos_train_validation)
else:
    alunos_train_validation = np.load('random_state_.npy')
data_train_validation_ = df[df['ID_CURSO_ALUNO'].isin(alunos_list[alunos_train_validation])]
data_test_ = df[~df['ID_CURSO_ALUNO'].isin(alunos_list[alunos_train_validation])]

alunos_list_train_validation = data_train_validation_['ID_CURSO_ALUNO'].unique()
alunos_list_test = data_test_['ID_CURSO_ALUNO'].unique()

kf = KFold(n_splits=n_splits, shuffle=True)

threshold = 0.5

In [5]:
def getAUC(predicted, label):
    auc = metrics.roc_auc_score(label, predicted)    
    return auc

1.0

In [6]:
def customRecall(predicted, label):
    tn, fp, fn, tp = confusion_matrix(predicted>=threshold, label).ravel()
    return tp/(tp+fn)

def customAccuracy(predicted, label):
    return sum((predicted>=threshold)==label)/len(label)

In [7]:
def model_default_lstm(parameters, np_estatico, np_series_lstm):
    _, ncolumns_static = np_estatico.shape
    _, timestep_lstm, width_lstm = np_series_lstm.shape
    
    static_input = Input(name='static_input', shape=(ncolumns_static,))
    series_input = Input(name='series_input', shape=(timestep_lstm, width_lstm,))

    lstm = LSTM(parameters['lstm_output'],
                dropout=parameters['lstm_dropout'],
                recurrent_dropout=parameters['recurrent_dropout'],
                return_sequences=True,
                name='lstm_layer')(series_input)
    
    flat = Flatten()(lstm)
    
    
    dense_static = Dense(parameters['dense_1_static'], activation="relu", name='dense_static')(static_input)
    bnorm = BatchNormalization()(dense_static)
    drop = Dropout(parameters['dense_dropout_1'])(bnorm)
    dense_static2 = Dense(parameters['dense_2_static'], activation="relu", name='dense_static2')(drop)
    bnorm2 = BatchNormalization()(dense_static2)
    drop2 = Dropout(parameters['dense_dropout_2'])(bnorm2)
    
    concanate_layer = Concatenate(name='concanate_layer')([flat, drop2])

    dense1 = Dense(parameters['merged_dense_1'], activation='relu', name='dense1')(concanate_layer)
    bnorm3 = BatchNormalization()(dense1)
    drop3 = Dropout(parameters['merged_dense_dropout_1'])(bnorm3)
    dense2 = Dense(parameters['merged_dense_2'], activation='relu', name='dense2')(drop3)
    bnorm4 = BatchNormalization()(dense2)
    drop4 = Dropout(parameters['merged_dense_dropout_2'])(bnorm4)
    output = Dense(1, activation='sigmoid', name='output_layer')(drop4)
    
    optimizer = optimizers.Adadelta(learning_rate=parameters['learning_rate'], rho=parameters['rho'])


    
    model = Model(inputs=[static_input, series_input] , outputs=output)
    model.compile(loss=tf.keras.losses.BinaryCrossentropy(), optimizer=optimizer, metrics=["acc"])

    return model

#  LSTM ----------------------------------------------------------

In [8]:
# OPTIMIZATION
def objective_lstm(trial):
    parameters = {'learning_rate': trial.suggest_loguniform('learning_rate', 0.00095, 0.005),
                    'rho': trial.suggest_uniform('rho', 0.9, 1),
                    'epoach': trial.suggest_int('epoach', 250, 750),

                    'dropout': trial.suggest_uniform('dropout', 0, 0.5),
                    'recurrent_dropout': trial.suggest_uniform('recurrent_dropout', 0, 0.5),

                    'lstm_output': trial.suggest_int('lstm_output', 100, 500),

                    'dense_static': trial.suggest_int('dense_static', 100, 500),

                    'merged_dense': trial.suggest_int('merged_dense', 100, 500),
                  }

    parameters_lstm = {'learning_rate': parameters['learning_rate'],
                    'rho': parameters['rho'],
                    'epoach': parameters['epoach'],
                                              
                    'lstm_output': parameters['lstm_output'],
                    'lstm_dropout': parameters['dropout'],
                    'recurrent_dropout': parameters['recurrent_dropout'],
                        
                    'dense_1_static': parameters['dense_static'],
                    'dense_dropout_1': parameters['dropout'],
                    'dense_2_static': parameters['dense_static'],
                    'dense_dropout_2': parameters['dropout'],
                      
                    'merged_dense_1': parameters['merged_dense'],
                    'merged_dense_dropout_1': parameters['dropout'],
                    'merged_dense_2': parameters['merged_dense'],
                    'merged_dense_dropout_2': parameters['dropout'],
                    }
        
    acc_list = []
    for train_index, test_index in kf.split(alunos_list_train_validation):
        data_train = df[df['ID_CURSO_ALUNO'].isin(alunos_list_train_validation[train_index])]
        time_series_dataframe_train, np_estatico_train, label_train = pipelineLibrary.pipe_default.fit_transform(data_train)
        np_series_lstm_train = pipelineLibrary.pipe_lstm.fit_transform(time_series_dataframe_train)
        
        data_validation = df[df['ID_CURSO_ALUNO'].isin(alunos_list_train_validation[test_index])]
        time_series_dataframe_validation, np_estatico_validation, label_validation = pipelineLibrary.pipe_default.transform(data_validation)
        np_series_lstm_validation = pipelineLibrary.pipe_lstm.transform(time_series_dataframe_validation)

        model = model_default_lstm(parameters_lstm, np_estatico_train, np_series_lstm_train)
        
        # Train
        history = model.fit([np_estatico_train, np_series_lstm_train],
              label_train,
              epochs=parameters_lstm['epoach'],
              verbose=0,
              batch_size=BATCHSIZE,
                )
        
#         # Evaluate
        predicted = model.predict([np_estatico_validation, np_series_lstm_validation])
        acc_list.append(customAccuracy(predicted, label_validation))

        clear_session()
        del model, time_series_dataframe_train, np_series_lstm_train, label_validation, np_series_lstm_validation, \
            np_estatico_train, label_train, time_series_dataframe_validation, np_estatico_validation
            
        gc.collect()

    return np.mean(acc_list)

In [9]:
def evaluation():
    # EVALUATION
    accuracyList = []
    sensibilidadeList = []
    especificidadeList = []
    aucList = []

    parameters = study.best_params
    parameters_lstm = {'learning_rate': parameters['learning_rate'],
                    'rho': parameters['rho'],
                    'epoach': parameters['epoach'],

                    'lstm_output': parameters['lstm_output'],
                    'lstm_dropout': parameters['dropout'],
                    'recurrent_dropout': parameters['recurrent_dropout'],

                    'dense_1_static': parameters['dense_static'],
                    'dense_dropout_1': parameters['dropout'],
                    'dense_2_static': parameters['dense_static'],
                    'dense_dropout_2': parameters['dropout'],

                    'merged_dense_1': parameters['merged_dense'],
                    'merged_dense_dropout_1': parameters['dropout'],
                    'merged_dense_2': parameters['merged_dense'],
                    'merged_dense_dropout_2': parameters['dropout'],
                    }

    for train_index, test_index in kf.split(alunos_list_test):
        # Train model using all training data
        data_train = df[df['ID_CURSO_ALUNO'].isin(alunos_list_train_validation)]
        time_series_dataframe_train, np_estatico_train, label_train = pipelineLibrary.pipe_default.fit_transform(data_train)
        np_series_lstm_train = pipelineLibrary.pipe_lstm.fit_transform(time_series_dataframe_train)

        model = model_default_lstm(parameters_lstm, np_estatico_train, np_series_lstm_train)

        # Save best model
        monitor = monitor_evaluation
        model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
            filepath=checkpoint_filepath,
            save_weights_only=True,
            monitor=monitor,
            save_best_only=True)
        
        callbacks = [model_checkpoint_callback]
        
        # Train
        model.fit([np_estatico_train, np_series_lstm_train],
              label_train,
              epochs=study.best_params['epoach'],
              verbose=0,
              batch_size=BATCHSIZE,
              validation_split=0.3,
              callbacks=callbacks,
                 )    
        del time_series_dataframe_train, np_series_lstm_train, np_estatico_train, label_train
        gc.collect()

        # Evaluate
        # Evaluate using k-fold test dataset partition
        data_test = df[df['ID_CURSO_ALUNO'].isin(alunos_list_test[test_index])]
        time_series_dataframe_test, np_estatico_test, label_test = pipelineLibrary.pipe_default.transform(data_test)
        np_series_lstm_test = pipelineLibrary.pipe_lstm.transform(time_series_dataframe_test) 

        model.load_weights(checkpoint_filepath)
        predicted = model.predict([np_estatico_test, np_series_lstm_test])
        tn, fp, fn, tp = confusion_matrix(predicted>=threshold, label_test).ravel()

        unique, counts = np.unique(label_test, return_counts=True)
        print(F"Labels: {dict(zip(unique, counts))}")
        unique, counts = np.unique((predicted>=threshold).astype(int), return_counts=True)
        print(F"Predicted: {dict(zip(unique, counts))}")

        sensibilidadeList.append(tp/(tp+fn))
        especificidadeList.append(tn/(tn+fp))
        aucList.append(getAUC(predicted, label_test))
        accuracyList.append(customAccuracy(predicted, label_test))

        clear_session()
        del model, label_test, np_series_lstm_test, time_series_dataframe_test, np_estatico_test
        gc.collect()

    print(accuracyList)
    print(sensibilidadeList)
    print(especificidadeList)
    print(aucList)
    return np.array(accuracyList).mean(), np.array(sensibilidadeList).mean(), np.array(especificidadeList).mean(), np.array(aucList).mean()

In [10]:
acc = []
sens = []
esp = []
auc = []

figs = []
for i in range(n_studies):
    study = optuna.create_study(direction='maximize')

    study.optimize(objective_lstm, n_trials=n_trials)
    
    print(F"Study: {i}")
    print(F"Best Value: {study.best_value}")
    print(F"Best Parameters: {study.best_params}")
    
    fig = optuna.visualization.plot_optimization_history(study)
    figs.append(fig)
    
    a, b, c, d = evaluation()
    acc.append(a)
    sens.append(b)
    esp.append(c)
    auc.append(d)

[32m[I 2021-01-08 23:19:37,347][0m A new study created in memory with name: no-name-91a219e3-67ad-4b74-909a-c7a470809350[0m
[32m[I 2021-01-08 23:20:14,112][0m Trial 0 finished with value: 0.6954427659511566 and parameters: {'learning_rate': 0.001627121194946629, 'rho': 0.9271964095852862, 'epoach': 647, 'lstm_output': 361, 'lstm_dropout': 0.3420757846157449, 'recurrent_dropout': 0.3421411267904899, 'dense_1_static': 126, 'dense_dropout_1': 0.40624974973504163, 'dense_2_static': 103, 'dense_dropout_2': 0.24358400929889257, 'merged_dense_1': 370, 'merged_dense_dropout_1': 0.12183746330839668, 'merged_dense_2': 320, 'merged_dense_dropout_2': 0.0586619878364536}. Best is trial 0 with value: 0.6954427659511566.[0m


Mean runned epochs: 66.0


[32m[I 2021-01-08 23:21:59,546][0m Trial 1 finished with value: 0.5181376188993454 and parameters: {'learning_rate': 0.0027010860958801113, 'rho': 0.9840728368519109, 'epoach': 401, 'lstm_output': 432, 'lstm_dropout': 0.20564589244172649, 'recurrent_dropout': 0.44917416363800716, 'dense_1_static': 207, 'dense_dropout_1': 0.2798976389387046, 'dense_2_static': 101, 'dense_dropout_2': 0.1960181575425382, 'merged_dense_1': 121, 'merged_dense_dropout_1': 0.05408993704747306, 'merged_dense_2': 174, 'merged_dense_dropout_2': 0.1006139867554337}. Best is trial 0 with value: 0.6954427659511566.[0m


Mean runned epochs: 206.0
Study: 0
Best Value: 0.6954427659511566
Best Parameters: {'learning_rate': 0.001627121194946629, 'rho': 0.9271964095852862, 'epoach': 647, 'lstm_output': 361, 'lstm_dropout': 0.3420757846157449, 'recurrent_dropout': 0.3421411267904899, 'dense_1_static': 126, 'dense_dropout_1': 0.40624974973504163, 'dense_2_static': 103, 'dense_dropout_2': 0.24358400929889257, 'merged_dense_1': 370, 'merged_dense_dropout_1': 0.12183746330839668, 'merged_dense_2': 320, 'merged_dense_dropout_2': 0.0586619878364536}
(7, 11, 0, 0)



invalid value encountered in long_scalars



(6, 3, 7, 3)


In [11]:
utilsCode.figures_to_html(figs, output_filename)

In [12]:
table = [
    ['', 'média', 'desvio padrão'],
    ['Acurácia', round(np.array(acc).mean(),4),  round(np.array(acc).std(),4)],
    ['Sensibilidade', round(np.array(sens).mean(),4),  round(np.array(sens).std(),4)],
    ['Especificidade', round(np.array(esp).mean(),4),  round(np.array(esp).std(),4)],
    ['AUC', round(np.array(auc).mean(),4),  round(np.array(auc).std(),4)]
]
display(HTML(tabulate.tabulate(table, tablefmt='html')))

0,1,2
,média,desvio padrão
Acurácia,0.4313,0.0
Sensibilidade,,
Especificidade,0.5278,0.0
AUC,0.3483,0.0


In [13]:
print(acc)
print(sens)
print(esp)
print(auc)

[0.43128654970760233]
[nan]
[0.5277777777777778]
[0.3483183483183483]
