## Inception CNN (based on Di Mauro et al, 2019)

Adapted from the original code by: 

- Di Mauro, N., Appice, A., & Basile, T. M. A. (2019). Activity Prediction of Business Process Instances with Inception CNN Models. Lecture Notes in Computer Science (Including Subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics), 11946 LNAI(November), 348–361. https://doi.org/10.1007/978-3-030-35166-3_25 

github link: https://github.com/TaXxER/rnnalpha

This notebook contains the code that tests an Inception CNN model, originally created by Di Mauro et al., on the data provided by the Catharina Hospital.

## General imports



In [1]:
#general imports
import sys
import numpy
import pandas as pd
import numpy as np
from itertools import product
import datetime
from datetime import datetime
import time
import os
from keras.preprocessing.sequence import pad_sequences
from glob import glob
pd.options.mode.chained_assignment = None
import warnings
warnings.filterwarnings("ignore")

#hyperas imports for hyperparameter optimization
from hyperas import optim
from hyperas.distributions import choice, uniform
from hyperopt import Trials, STATUS_OK, tpe

#tensorflow imports for building neural networks
import tensorflow as tf
from tensorflow.keras.layers import Input, Concatenate, Conv1D, GlobalAveragePooling1D, GlobalMaxPooling1D, Reshape, MaxPooling1D, Flatten, Dense, Embedding, Dropout
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.utils import to_categorical, custom_object_scope
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.metrics import Precision, Recall, AUC, Accuracy, MeanSquaredError, MeanAbsoluteError
from tensorflow_addons.metrics import F1Score
import tensorflow.keras.backend as K
import tensorflow.keras.optimizers
tf.config.run_functions_eagerly(True)

#sklearn imports for preprocessing, measuring performance and cross validation
from sklearn import metrics 
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.preprocessing import StandardScaler, normalize
from sklearn import feature_selection
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import RandomOverSampler
from other_lib import globalvar
from other_lib.auk_score import AUK
from other_lib.general_functions import prepare_dataset_for_model, find_all_csv_locations


## Building the model
Code used to build the prediction model.

In [11]:
#hyperas model template: builds, compiles, trains and scores one Inception-style 1D CNN configuration
#
#this function is handed to optim.minimize (hyperas) as model=create_model in di_mauro_et_al.
#The double-brace choice(...) expressions are hyperas search-space placeholders rather than regular
#Python values, so the function is not meant to be called directly.
#
#parameters:
#   x_train, x_val, x_test: model inputs for the train/validation/test splits; only x_train.shape[1]
#                           is read here, as the number of timesteps of the input layer
#   y_train, y_val, y_test: targets matching the three input splits
#   binary: truthy -> single sigmoid output trained with binary cross-entropy
#           falsy  -> single linear output trained with mean absolute error
#returns:
#   dict with 'loss' (the value to minimise), 'status' (STATUS_OK) and 'model' (the fitted model)
def create_model(x_train, x_val, x_test, y_train, y_val, y_test, binary):
    
    #create input layer
    inputs = [] #NOTE(review): dead assignment, overwritten two lines below before it is ever read
    input_layer = Input(shape=(x_train.shape[1], 1)) #input layer, width of data = timesteps with each timestep having 1 feature (1 value per column)
    inputs = [input_layer]
    
    #add first inception module: parallel branches over the same input, concatenated on the channel axis
    filters = []
    for i in range({{choice([3, 4])}}): #number of different conv modules in the inception layer
        filters.append(Conv1D(filters=32, strides=1, kernel_size=1+i, activation='relu', padding='same')(input_layer)) #conv branch with kernel size 1+i, i.e. sizes 1..3 or 1..4
    filters.append(MaxPooling1D(pool_size=3, strides=1, padding='same')(input_layer)) #add the max pool layer
    concat_layer = Concatenate(axis=2)(filters) #concatenate the output of the different conv modules and max pool layer to get output of inception module
    
    for m in range({{choice([0,1,2])}}): #number of inception modules you want to stack on top of the first one (for a total of either 1, 2 or 3)
        filters = []
        for i in range({{choice([3, 4])}}): #number of different conv modules in the inception layer
            filters.append(Conv1D(filters=32, strides=1, kernel_size=1+i, activation='relu', padding='same')(concat_layer)) #same branches as above, but fed by the previous module's output
        filters.append(MaxPooling1D(pool_size=3, strides=1, padding='same')(concat_layer)) #add the max pool layer
        concat_layer = Concatenate(axis=2)(filters) #concatenate the output of the different conv modules and max pool layer to get output of inception module

    #collapse the time axis so the dense output layer receives one vector per sample
    pool = GlobalMaxPooling1D()(concat_layer)
    
    #pick the optimizer; each branch has its own learning-rate placeholder and clips gradients to norm 1
    #NOTE(review): the local name optim shadows the hyperas optim module imported at file level (inside this function only)
    choiceval = {{choice(['adam', 'sgd', 'rmsprop'])}}
    if choiceval == 'adam':
        optim = tensorflow.keras.optimizers.Adam(learning_rate={{choice([10 ** -4, 10 ** -3, 10 ** -2])}}, clipnorm=1.)
    elif choiceval == 'rmsprop':
        optim = rmsprop = tensorflow.keras.optimizers.RMSprop(learning_rate={{choice([10 ** -4, 10 ** -3, 10 ** -2])}}, clipnorm=1.) #NOTE(review): the extra rmsprop alias is never used
    else:
        optim = tensorflow.keras.optimizers.SGD(learning_rate={{choice([10 ** -4, 10 ** -3, 10 ** -2])}}, clipnorm=1.)
    
    #determine output shape based on prediction task, either for binary/length of stay prediction
    if binary:
        output_layer = Dense(1, activation='sigmoid')(pool)
        model = Model(inputs=inputs, outputs=output_layer)
        #the order of the metrics list fixes the positions in the evaluate() result: [loss, accuracy, f1, precision, recall, auc]
        model.compile(optimizer=optim, loss='binary_crossentropy',
                      metrics=['accuracy', globalvar.f1, globalvar.precision, globalvar.recall, globalvar.auc])
        print('Created binary model!')
    else:
        output_layer = Dense(1, activation='linear')(pool)
        model = Model(inputs=inputs, outputs=output_layer)
        #evaluate() result order here: [loss (mae), mae, mse, mape]
        model.compile(optimizer=optim, loss='mae', metrics=['mae', 'mse', 'mape'])
    
    #stop training once val_loss has not improved by more than min_delta for 15 consecutive epochs
    earlystop = EarlyStopping(monitor='val_loss', min_delta=0.000001, patience=15, verbose=0, mode='min')
    callbacks_list = [earlystop]
    
    model.summary()
    model.fit(x_train, y_train, epochs={{choice([50, 100])}}, 
              validation_data=(x_val, y_val), callbacks=callbacks_list, 
              batch_size={{choice([2**7, 2**8])}}, verbose=0)
    
    #NOTE(review): the optimisation objective is computed on the test split, so hyperparameter selection sees the test data - confirm this is intended (x_val/y_val would avoid it)
    score = model.evaluate(x_test, y_test, verbose=0)
    
    print('score evaluated: ', score)
    print('binary: ', binary)
    
    if binary:
        f1 = score[2] #index 2 = globalvar.f1, see the metrics order in compile() above
        return {'loss': -f1, 'status': STATUS_OK, 'model': model} #take the negative of f1 here since objective is to minimize and f1 usually means higher is better
    else:
        mae = score[1] #index 1 = the 'mae' metric (index 0 is the loss, which is also mae)
        return {'loss': mae, 'status': STATUS_OK, 'model': model} #dont take negative value here since you want to minimize the mae

## With best model, calculate cv scores
The function below is used to cross-validate the results of the best model.

In [10]:
def cross_validate_best_model(X, y, best_model, best_run, binary, output_dir, model_name, model_type):
    """Run 5-fold cross validation with the tuned model and write the results to disk.

    For every fold the data is imputed and (depending on ``model_name``) scaled using
    statistics of the training part only, the model is fitted on the training part and
    evaluated on the held-out part. Mean and standard deviation of every metric over the
    folds are written to ``output_dir + 'results\\'``, the model (or only its weights for
    transformer models) to ``output_dir + 'models\\'`` and ``best_run`` to
    ``output_dir + 'results\\hyperparameters\\'``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; rows are selected by position for every fold.
    y : pandas.Series or pandas.DataFrame
        Targets aligned row-by-row with ``X``.
    best_model : compiled Keras model
        Model to fit and evaluate. Its ``evaluate`` result is expected to be a list ordered
        ``[loss, accuracy, f1, precision, recall, auc]`` for the binary task and
        ``[loss, mae, mse, mape]`` otherwise.
    best_run : dict
        Chosen hyperparameters; ``'epochs'`` and ``'batch_size'`` are read here and the
        whole dict is written to the hyperparameter file.
    binary : bool
        True for the classification task (stratified folds, AUK added), False for regression.
    output_dir : str
        Output root; must end with a path separator because paths are built by concatenation.
    model_name : str
        Dataset/model identifier; used in file names and to decide on scaling/oversampling.
    model_type : str
        ``'lstm'`` and ``'transformer'`` get a trailing feature axis added to the inputs;
        ``'transformer'`` models are saved as weights only.

    Returns
    -------
    None
    """
    #stratification needs class labels, so it is only available for the binary task
    if binary:
        kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        metric_names = ['acc', 'f1', 'precision', 'recall', 'auc', 'auk']
    else:
        kfold = KFold(n_splits=5, shuffle=True, random_state=42)
        metric_names = ['mae', 'mse', 'mape']
    fold_scores = {name: [] for name in metric_names} #one list of per-fold values per metric

    #columns that are scaled for the "additional features" datasets
    additional_features = ['MedicationCode_B01AA04', 'MedicationCode_B01AA07', 'MedicationCode_B01AE07', 'MedicationCode_B01AF01', 
                           'MedicationCode_B01AF02', 'MedicationCode_B01AF03', 'MedicationCode_N02AJ13', 'MedicationCode_N02BE01',
                           'PlannedDuration', 'Duration', 'MedicationType', 'NOAC', 'MedicationStatus', 'temperature', 
                           'bloodPressure', 'Test_Hemoglobine', 'Test_eGFR', 'Test_INR', 'Test_Trombocyten']

    callbacks_list = [globalvar.earlystop]

    #NOTE(review): the same best_model instance is fitted in every fold and no re-initialisation is
    #visible here, so weights learned in earlier folds (and during tuning) appear to carry over into
    #later folds - confirm whether the model should be rebuilt or reset per fold.
    for fold_counter, (train, test) in enumerate(kfold.split(X, y), start=1):
        print('Now starting fold: {} for model: {}'.format(fold_counter, model_name))

        #kfold.split yields positional indices, so select by position (.iloc) rather than by label
        x_train = X.iloc[train].copy()
        x_test = X.iloc[test].copy()
        y_train = y.iloc[train]
        y_test = y.iloc[test]

        #fill NaN value with mean of training data for both train and test data. Cant do mean per group since many groups have no data at all
        train_means = x_train.mean()
        x_train = x_train.fillna(train_means)
        x_test = x_test.fillna(train_means)

        #scaling: fit on the training fold only and re-use those statistics for the test fold to prevent data leakage
        scaler = StandardScaler()
        if 'tokenized' in model_name and 'transformer' not in model_type: #means all columns need to be encoded, regardless of additional or not
            x_train = pd.DataFrame(scaler.fit_transform(x_train))
            x_test = pd.DataFrame(scaler.transform(x_test))
        elif 'additional' in model_name.lower() and 'ae_agg' not in model_name.lower(): #means only the additionally added columns need to be scaled
            x_train[additional_features] = scaler.fit_transform(x_train[additional_features])
            x_test[additional_features] = scaler.transform(x_test[additional_features])

        #For lstm models, the input needs to be 3d instead of 2d. Therefore, add another dimension to the data
        if model_type == 'lstm' or model_type == 'transformer':
            x_train = np.expand_dims(x_train, -1)
            x_test = np.expand_dims(x_test, -1)

        #oversample train data for cancellation datasets
        if 'can' in model_name:
            oversampler = RandomOverSampler(sampling_strategy='minority')
            x_train, y_train = oversampler.fit_resample(x_train, y_train)

        best_model.fit(x_train,
                       y_train,
                       epochs=best_run['epochs'],
                       callbacks=callbacks_list,
                       batch_size=best_run['batch_size'],
                       verbose=0)

        scores = best_model.evaluate(x_test, y_test, verbose=0)

        if binary:
            y_pred = best_model.predict(x_test, verbose=0)
            scores.append(AUK(y_test, y_pred.flatten()).calculate_auk()) #add AUK scores
            print("%s: %.2f%%" % (best_model.metrics_names[1], scores[1] * 100)) #accuracy of the test prediction
        else:
            print('{} score: {}'.format(best_model.metrics_names[1], scores[1]))

        #scores[0] is the loss; the tracked metrics start at index 1 in the order of metric_names
        for position, name in enumerate(metric_names, start=1):
            fold_scores[name].append(scores[position])

    #calculate measures: mean and standard deviation per metric, averaged over all splits
    measures = []
    for name in metric_names:
        measures.append(numpy.mean(fold_scores[name]))
        measures.append(numpy.std(fold_scores[name]))

    primary_scores = fold_scores[metric_names[0]] #accuracy (binary) or mae (regression)
    if binary:
        print("%.2f%% (+/- %.2f%%)" % (numpy.mean(primary_scores)*100, numpy.std(primary_scores)*100))
    else:
        print('average mae score over all splits: {} (+/- {}%)'.format(numpy.mean(primary_scores), numpy.std(primary_scores)))

    #save and write results + model; the rounded mean of the primary metric is part of the file names
    score_tag = str(numpy.mean(primary_scores).round(2))
    header = ', '.join('{0}, {0}_std'.format(name) for name in metric_names)
    numpy.savetxt(output_dir + 'results\\' + model_name + '-' + score_tag + '.csv', numpy.atleast_2d(measures),
                  delimiter=',', fmt='%6f', header=header) #write the model scores to a csv file

    if model_type == 'transformer':
        best_model.save_weights(output_dir + 'models\\' + model_name + '_model-weights.h5', save_format='h5') #transformer models can only save weights, not complete models
    else:
        best_model.save(output_dir + 'models\\' + model_name + '.h5')

    #write hyperparameters of best run; the context manager closes the file even if the write fails
    with open(output_dir + 'results\\hyperparameters\\' + model_name + '-' + score_tag + '.txt', 'w') as text_file:
        text_file.write(str(best_run))



## Loop for all combinations
The function below combines all of the steps above into a single function.

In [17]:
def di_mauro_et_al(file_location, output_dir):
    """Tune, evaluate and cross-validate the Inception CNN for a single dataset file.

    Steps: prepare the dataset, search hyperparameters with hyperas (5 evaluations of
    ``create_model``), evaluate the best model on the test split, write those scores to
    ``output_dir + 'opt_results\\'`` and finally call ``cross_validate_best_model``.

    Parameters
    ----------
    file_location : str
        Backslash-separated path of the dataset csv; the part of the file name before its
        first dot is used as the model name.
    output_dir : str
        Output root; paths are built by string concatenation.
    """
    #get filename (without.csv): last backslash-separated component, cut at its first dot
    model_name = file_location.rsplit("\\", 1)[-1].partition('.')[0]
    print('Now starting with dataset: {}'.format(model_name))

    #preprocess and split training/test data; the train/validation parts are not needed in this function
    (_, _, x_test,
     _, _, y_test,
     binary, X, y, model_type) = prepare_dataset_for_model(file_location, model_type='cnn')

    #optimize the model hyperparameters through hyperas
    search_settings = dict(
        model=create_model,
        data=prepare_dataset_for_model,
        algo=tpe.suggest,
        max_evals=5, #number of "random" parameter configurations that are tested
        trials=Trials(),
        data_args=(file_location, model_type), #supply the arguments for the prepare_dataset_for_model function here
        eval_space=True,
        notebook_name='(Di Mauro et al. - CNN)',
        verbose=False,
    )
    best_run, best_model = optim.minimize(**search_settings)

    print("Evalutation of best performing model:")
    best_scores = best_model.evaluate(x_test, y_test, verbose=0)
    print(best_scores)
    print(best_model.metrics_names)

    print("Best performing model chosen hyper-parameters:")
    print(best_run)

    #add AUK & Kappa scores (binary task only) before saving the scores of the optimized model
    if binary:
        y_pred = best_model.predict(x_test, verbose=0)
        best_scores.append(AUK(y_test, y_pred.flatten()).calculate_auk())
        best_scores.append(AUK(y_test, y_pred.flatten()).kappa_curve())

    #one row holding all scores
    scores_path = output_dir + 'opt_results\\' + model_name + '.csv'
    pd.DataFrame(best_scores).T.to_csv(scores_path)

    #cross validate to obtain reliable performance of best performing model
    cross_validate_best_model(
        X=X,
        y=y,
        best_model=best_model,
        best_run=best_run,
        binary=binary,
        output_dir=output_dir,
        model_name=model_name,
        model_type=model_type,
    )



Finally, generate the results using the code below

In [19]:
#collect the dataset files to process for this model family
file_locations = find_all_csv_locations('di_mauro_cnn')

#machine-specific output root; the trailing separator is required because paths are built by concatenation
output_dir = 'C:\\Users\\20190337\\Downloads\\Tracebook_v2 (Projectfolder)\\model_results\\di_mauro_cnn\\'

#tune, evaluate and cross-validate the model once per dataset file
for file_location in file_locations:
    di_mauro_et_al(file_location=file_location, output_dir=output_dir)

0 csv files left
