## Inception CNN (based on Di Mauro et al, 2019) - Embedding encoding version

Adapted from the original code by: 

- Di Mauro, N., Appice, A., & Basile, T. M. A. (2019). Activity Prediction of Business Process Instances with Inception CNN Models. Lecture Notes in Computer Science (Including Subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics), 11946 LNAI(November), 348–361. https://doi.org/10.1007/978-3-030-35166-3_25 

github link: https://github.com/TaXxER/rnnalpha

This notebook contains the code used to calculate the embedding + time differences encoding. Note that this notebook has only been used to retrieve the embedding + time differences encoding, as described in the original paper by Di Mauro et al. The performance of the Di Mauro model on the other encoding strategies is evaluated in the '(Di Mauro et al. - CNN)' notebook.

## General imports and data preparation


In [3]:
#general imports
import sys
import numpy
import pandas as pd
import numpy as np
from itertools import product
import datetime
from datetime import datetime
import time
import os
from keras.preprocessing.sequence import pad_sequences
from glob import glob
pd.options.mode.chained_assignment = None
import warnings
warnings.filterwarnings("ignore")

#hyperas imports for hyperparameter optimization
from hyperas import optim
from hyperas.distributions import choice, uniform
from hyperopt import Trials, STATUS_OK, tpe

#tensorflow imports for building neural networks
import tensorflow as tf
from tensorflow.keras.layers import Input, Concatenate, Conv1D, GlobalAveragePooling1D, GlobalMaxPooling1D, Reshape, MaxPooling1D, Flatten, Dense, Embedding, Dropout
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.utils import to_categorical, custom_object_scope
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.metrics import Precision, Recall, AUC, Accuracy, MeanSquaredError, MeanAbsoluteError
from tensorflow_addons.metrics import F1Score
import tensorflow.keras.backend as K
import tensorflow.keras.optimizers
tf.config.run_functions_eagerly(True)

#sklearn imports for preprocessing, measuring performance and cross validation
from sklearn import metrics
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.preprocessing import StandardScaler, normalize
from sklearn import feature_selection
from sklearn.compose import ColumnTransformer
from other_lib import globalvar
from other_lib.auk_score import AUK


In [4]:
#general data preparation function
def prepare_dataset_for_model(file_location, model_type):
    """Load a pre-encoded event log and split it into (optionally scaled) train/test sets.

    Parameters
    ----------
    file_location : str
        Path of the CSV file. The file name without extension decides the
        task: a name containing 'los' is treated as a regression target,
        every other name as a binary target. Both '\\' and '/' are accepted
        as path separators.
    model_type : str
        Model family the data is prepared for. 'lstm' and 'transformer' get
        an extra trailing axis; any value containing 'transformer' skips the
        scaling of 'tokenized' datasets.

    Returns
    -------
    tuple
        (x_train, x_test, y_train, y_test, binary, X, y, model_type), where
        X and y are the complete, unscaled features and labels.
    """
    df = pd.read_csv(file_location)

    #get filename (without .csv), independent of the path separator that was used
    model_name = file_location.replace('\\', '/').split('/')[-1].split('.')[0]
    #binary prediction (paracetamol/cancel datasets) unless the file is a length-of-stay dataset
    binary = 'los' not in model_name.lower()

    #define label (aka outcome) and prediction data
    label_column = 'Label' if 'Label' in df else 'outcome'
    y = df[label_column]
    X = df.loc[:, df.columns != label_column]

    #remove TraceID (aka case_id) from the training and testing data
    for id_column in ('TraceID', 'case_id'):
        if id_column in X.columns:
            X = X.drop(columns=id_column) #keyword form: the positional axis argument is rejected by newer pandas
            break

    #train/test set split, must be done before scaling to prevent data leakage between train/test data
    #stratification is only possible for the binary targets
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, shuffle=True,
        stratify=y if binary else None)

    #fill NaN values with the mean of the training data for both train and test data.
    #Cant do mean per group since many groups have no data at all
    train_means = x_train.mean()
    x_train = x_train.fillna(train_means)
    x_test = x_test.fillna(train_means)

    #scaling for non-additional features, only on train/test data to prevent data leakage, complete X returned without scaling
    additional_features = ['PlannedDuration', 'Duration', 'MedicationType', 'NOAC', 'MedicationStatus', 'temperature',
                           'bloodPressure', 'Test_Hemoglobine', 'Test_eGFR', 'Test_INR', 'Test_Trombocyten']

    #the scaler is fitted on the training data only and merely applied to the test data;
    #fitting it again on the test data would put both sets on different scales
    scaler = StandardScaler()

    if 'tokenized' in model_name and 'transformer' not in model_type: #means all columns need to be encoded, regardless of additional or not
        x_train = pd.DataFrame(scaler.fit_transform(x_train))
        x_test = pd.DataFrame(scaler.transform(x_test))
    elif 'additional' in model_name.lower() and 'ae_agg' not in model_name.lower(): #means only the additionally added columns need to be scaled
        x_train[additional_features] = scaler.fit_transform(x_train[additional_features])
        x_test[additional_features] = scaler.transform(x_test[additional_features])

    #For lstm models, the input needs to be 3d instead of 2d. Therefore, add another dimension to the data
    if model_type == 'lstm' or model_type == 'transformer':
        x_train = np.expand_dims(x_train, -1)
        x_test = np.expand_dims(x_test, -1)

    return x_train, x_test, y_train, y_test, binary, X, y, model_type


In [15]:
#Select the processed event log to work with (uncomment exactly one of the three)
#file_location = 'C:\\Users\\20190337\\Downloads\\Tracebook_v2 (Projectfolder)\\EventLog_Processed_Cancel.csv'
#file_location = 'C:\\Users\\20190337\\Downloads\\Tracebook_v2 (Projectfolder)\\EventLog_Processed_Paracetamol.csv'
file_location = 'C:\\Users\\20190337\\Downloads\\Tracebook_v2 (Projectfolder)\\EventLog_Processed_LOS.csv'

#Folder that receives the results of the Di Mauro CNN
output_dir = 'C:\\Users\\20190337\\Downloads\\Tracebook_v2 (Projectfolder)\\model_results\\di_mauro_cnn\\'
#Dataset name: last path component with everything from the first dot onwards removed
model_name = file_location.split("\\")[-1].split('.')[0]

#Load, split and (where applicable) scale the data for the CNN
(x_train, x_test,
 y_train, y_test,
 binary, X, y, model_type) = prepare_dataset_for_model(file_location=file_location, model_type='cnn')

#Report the sizes of the splits as a quick sanity check
print('x_train shape: ', x_train.shape,
      '| x_test shape: ', x_test.shape,
      '| X shape: ', X.shape)
print('y_train shape: ', y_train.shape,
      '| y_test shape: ', y_test.shape,
      '| y shape: ', y.shape)

x_train shape:  (15023, 14) | x_test shape:  (3756, 14) | X shape:  (18779, 14)
y_train shape:  (15023,) | y_test shape:  (3756,) | y shape:  (18779,)


In [16]:
#further data preparation function that is used to calculate the time differences
def prepare_data(file_location):
    """Turn an event log into padded activity-token and time-difference arrays.

    The CSV must contain the columns 'TraceID', 'Timestamp' and 'Label'; its
    first three columns are read positionally as case id, timestamp
    ("%Y-%m-%d %H:%M:%S") and activity name.

    NOTE(review): this function is also passed to hyperas as ``data=``.
    hyperas appears to inline the source of the data function, so the final
    return is kept on a single line and no helper functions outside this body
    are used -- confirm against the installed hyperas version.

    NOTE(review): activity tokens start at 0 and the sequences are pre-padded
    with 0 as well, so padding cannot be told apart from the first activity.
    Left unchanged because shifting the tokens would alter vocab_size for the
    callers.

    Parameters
    ----------
    file_location : str
        Path of the event-log CSV. A file name containing 'los' marks a
        regression target, every other name a binary target.

    Returns
    -------
    tuple
        (padded_X_train, padded_X1_train, padded_X_test, padded_X1_test,
        y_train, y_test, input_length, vocab_size, embedding_size, binary,
        pre_encoded, padded_X, padded_X1, y). The ``padded_X*`` arrays hold
        the activity tokens, the ``padded_X1*`` arrays the normalised log time
        differences; ``y`` holds one label per case.

    Raises
    ------
    ValueError
        If the log has no events, or if the number of labels does not match
        the number of cases.
    """
    logreader = pd.read_csv(file_location)
    if logreader.empty:
        raise ValueError('Event log contains no events: {}'.format(file_location))

    logreader.sort_values(['TraceID', 'Timestamp'], axis=0, ascending=True, inplace=True)
    logreader.reset_index(drop=True, inplace=True)

    lines = [] #these are all the activity sequences, one list per case
    timeseqs = [] #time sequences (differences between two events: current and previous event)
    activities = set() #all distinct activity names

    lastcase = object() #sentinel that compares unequal to every real case id
    lasteventtime = None
    line = times = None

    #loop that calculates the time difference between activities
    for case_id, timestamp, activity in logreader.iloc[:, :3].itertuples(index=False, name=None):
        t = datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S")
        if case_id != lastcase: #first event of a new case: open fresh sequences
            lastcase = case_id
            lasteventtime = t
            line, times = [], []
            lines.append(line)
            timeseqs.append(times)

        activities.add(activity)
        line.append(activity)
        times.append((t - lasteventtime).total_seconds() + 1) # +1 to avoid zero values
        lasteventtime = t

    numcases = len(lines)
    print("Num cases: ", numcases)

    #sorted, so that every run maps the same activity to the same token;
    #enumerating the bare set gives an order that can change between interpreter runs
    vocabulary = {activity: idx for idx, activity in enumerate(sorted(activities, key=str))}

    X = [[vocabulary[activity] for activity in seq] for seq in lines] #tokenized activities
    X1 = [[np.log(delta + 1) for delta in deltas] for deltas in timeseqs] #log time differences (the stored values already include +1)
    max_length = max(len(seq) for seq in lines) #longest trace

    print("Activities: ", activities)
    vocab_size = len(vocabulary)
    embedding_size = (vocab_size + 1) // 2

    #one outcome per case, either binary values or LoS values; groupby sorts by TraceID like the loop above
    y = logreader.groupby('TraceID')['Label'].max().to_numpy()
    if len(y) != len(lines):
        raise ValueError('Found {} labels for {} cases; check the TraceID column'.format(len(y), len(lines)))

    # padding
    padded_X = pad_sequences(X, maxlen=max_length, padding='pre', dtype='float64')
    padded_X1 = pad_sequences(X1, maxlen=max_length, padding='pre', dtype='float64')

    padded_X1 = padded_X1 / np.max(padded_X1) # normalizing time diff values

    #a single call splits tokens, time differences and labels with the same row selection
    padded_X_train, padded_X_test, padded_X1_train, padded_X1_test, y_train, y_test = train_test_split(padded_X, padded_X1, y, test_size=0.2, random_state=42)

    #get filename (without .csv), independent of the path separator that was used
    model_name = file_location.replace('\\', '/').split('/')[-1].split('.')[0]

    #check if a binary prediction (for paracetamol/cancel datasets) or a regression prediction (for length of stay) is being made
    binary = 'los' not in model_name.lower()
    pre_encoded = False

    input_length = max_length
    return padded_X_train, padded_X1_train, padded_X_test, padded_X1_test, y_train, y_test, input_length, vocab_size, embedding_size, binary, pre_encoded, padded_X, padded_X1, y

In [17]:
#Event log to encode (uncomment exactly one of the three)
#file_location='C:/Users/20190337/Downloads/Tracebook_v2 (Projectfolder)/EventLog_Processed_Cancel.csv'
#file_location='C:/Users/20190337/Downloads/Tracebook_v2 (Projectfolder)/EventLog_Processed_Paracetamol.csv'
file_location = 'C:/Users/20190337/Downloads/Tracebook_v2 (Projectfolder)/EventLog_Processed_LOS.csv'

#Build the padded token / time-difference arrays together with the settings the model needs
(padded_X_train, padded_X1_train,
 padded_X_test, padded_X1_test,
 y_train, y_test,
 input_length, vocab_size, embedding_size,
 binary, pre_encoded,
 padded_X, padded_X1, y) = prepare_data(file_location=file_location)


Num cases:  1150
Activities:  {'End_Operation', 'Prepare', 'Scheduled', 'Leave_Cathlab', 'Paracetamol', 'Discharge', 'Waitfor_Schedule', 'MeasureBPs', 'Test_Trombocyten', 'MeasureTemps', 'Arrive_Cathlab', 'Stop_AC', 'Recovery', 'Start_Operation', 'Test_INR', 'Admission', 'Test_Hemoglobine', 'Cancellation', 'Restart_NOAC', 'Test_eGFR', 'End_Introduction', 'Start_AC', 'Start_Introduction'}


## Building the model
Code used to build the prediction model.

In [19]:
#function that builds, trains and scores one Inception CNN candidate (model template for the hyperparameter search)
def create_model(padded_X_train, padded_X1_train, padded_X_test, padded_X1_test, y_train, y_test, input_length=input_length, vocab_size=vocab_size, embedding_size=embedding_size, binary=binary, pre_encoded=pre_encoded):
    """Build, fit and evaluate one Inception CNN configuration.

    The double-brace expressions are hyperas placeholders; they are replaced
    by sampled values before this code is executed, so the function is a
    template for the search and is not meant to be called directly.

    NOTE(review): the signature is kept on one physical line because hyperas
    appears to rewrite only the first source line of the model function --
    confirm against the installed hyperas version.

    NOTE(review): the returned loss is computed on the test split, so the
    hyperparameters are selected on the same data that is later reported.

    Parameters
    ----------
    padded_X_train, padded_X_test
        Padded activity tokens (or the already encoded features when
        pre_encoded is true).
    padded_X1_train, padded_X1_test
        Padded time differences; unused when pre_encoded is true.
    y_train, y_test
        Targets of the train and test split.
    input_length : int
        Length of the padded sequences.
    vocab_size : int
        Input dimension of the embedding layer.
    embedding_size : int
        Output dimension of the embedding layer.
    binary : bool
        True for a sigmoid/binary cross-entropy model, False for a linear
        output trained on mean absolute error.
    pre_encoded : bool
        False lets the model embed the tokens and append the time
        differences itself; True feeds the given features straight into the
        inception modules.

    Returns
    -------
    dict
        'loss' (negative test accuracy for binary targets, test MAE
        otherwise), 'status' and the trained 'model'.
    """
    if not pre_encoded: #the model calculates the encoding itself (tokenized activities + time differences)
        activity_input = Input(shape=(input_length,)) #input layer for activity tokens
        activity_embedding = Embedding(input_dim=vocab_size,
                                       output_dim=embedding_size,
                                       input_length=input_length)(activity_input) #embedding layer for activity tokens

        timedif_input = Input(shape=(input_length,)) #input layer for time differences
        timedif_reshape = Reshape((input_length, 1))(timedif_input) #add a feature axis to the time differences

        concat_layer = Concatenate(axis=2)([activity_embedding, timedif_reshape]) #concatenate the timedif and encoding values to finish the encoding
        inputs = [activity_input, timedif_input] #store input layers
        train_inputs = [padded_X_train, padded_X1_train]
        test_inputs = [padded_X_test, padded_X1_test]
    else: #a pre-existing encoded dataset is used, such as the aggregated or regular tokenized datasets
        input_layer = Input(shape=(input_length,))
        concat_layer = Reshape((input_length, 1))(input_layer) #Conv1D needs a feature axis; previously concat_layer was undefined on this path
        inputs = [input_layer]
        train_inputs = [padded_X_train] #a one-input model can only be fed one array
        test_inputs = [padded_X_test]

    #After this point the model creation continues past the encoded data stage
    for _ in range({{choice([1,2,3])}}): #number of inception modules you want
        filters = []
        for i in range(3):
            filters.append(Conv1D(filters=32, strides=1, kernel_size=1+i, activation='relu', padding='same')(concat_layer)) #add the conv layers of different sizes
        filters.append(MaxPooling1D(pool_size=3, strides=1, padding='same')(concat_layer)) #add the max pool layer
        concat_layer = Concatenate(axis=2)(filters) #concatenate the output of the different conv modules and max pool layer to get output of inception module

    pool = GlobalMaxPooling1D()(concat_layer)

    #named 'optimizer' so the hyperas module imported as 'optim' is not shadowed
    choiceval = {{choice(['adam', 'sgd', 'rmsprop'])}}
    if choiceval == 'adam':
        optimizer = tensorflow.keras.optimizers.Adam(learning_rate={{choice([10 ** -4, 10 ** -3, 10 ** -2])}}, clipnorm=1.)
    elif choiceval == 'rmsprop':
        optimizer = tensorflow.keras.optimizers.RMSprop(learning_rate={{choice([10 ** -4, 10 ** -3, 10 ** -2])}}, clipnorm=1.)
    else:
        optimizer = tensorflow.keras.optimizers.SGD(learning_rate={{choice([10 ** -4, 10 ** -3, 10 ** -2])}}, clipnorm=1.)

    #determine output shape based on prediction task, either for binary/length of stay prediction
    if binary:
        output_layer = Dense(1, activation='sigmoid')(pool)
        model = Model(inputs=inputs, outputs=output_layer)
        model.compile(optimizer=optimizer, loss='binary_crossentropy',
                      metrics=['accuracy', globalvar.f1, globalvar.precision, globalvar.recall, globalvar.auc])
        print('Created binary model!')
    else:
        output_layer = Dense(1, activation='linear')(pool)
        model = Model(inputs=inputs, outputs=output_layer)
        model.compile(optimizer=optimizer, loss='mean_absolute_error',
                      metrics=['mean_absolute_error', 'mean_squared_error'])

    earlystop = EarlyStopping(monitor='val_loss', min_delta=0.000001, patience=15, verbose=1, mode='min')
    callbacks_list = [earlystop]

    model.summary()

    model.fit(train_inputs,
              y_train,
              epochs={{choice([50, 100])}},
              validation_split=0.2,
              callbacks=callbacks_list,
              batch_size={{choice([2**9, 2**10])}})

    #score[0] is the loss, score[1] the first compiled metric (accuracy or mean absolute error)
    score = model.evaluate(test_inputs, y_test, verbose=0)

    if binary:
        accuracy = score[1]
        return {'loss': -accuracy, 'status': STATUS_OK, 'model': model} #negative accuracy, since the search minimizes the loss and a higher accuracy is better
    else:
        mae = score[1]
        return {'loss': mae, 'status': STATUS_OK, 'model': model} #dont take negative value here since you want to minimize the mae

In [None]:
#Event log that the hyperparameter search runs on (uncomment exactly one of the three)
#file_location = 'C:\\Users\\20190337\\Downloads\\Tracebook_v2 (Projectfolder)\\EventLog_Processed_Cancel.csv'
#file_location = 'C:\\Users\\20190337\\Downloads\\Tracebook_v2 (Projectfolder)\\EventLog_Processed_Paracetamol.csv'
file_location = 'C:\\Users\\20190337\\Downloads\\Tracebook_v2 (Projectfolder)\\EventLog_Processed_LOS.csv'

#Hyperparameter search with hyperas: create_model is the model template, prepare_data supplies its data.
#Returns the sampled values of the best run (best_run) and the corresponding trained model (best_model).
best_run, best_model = optim.minimize(model=create_model,
                                  data=prepare_data,
                                  algo=tpe.suggest,
                                  max_evals=5, #number of "random" parameter configurations that are tested
                                  trials=Trials(),
                                  data_args=(file_location,), #supply the arguments for the prepare_data function here
                                  eval_space=True,
                                  notebook_name='Di_Mauro_et_al_encoder',
                                  verbose=False)

### With best model, calculate cross validated scores

In [None]:
#choose one of three prediction targets
#file_location = 'C:\\Users\\20190337\\Downloads\\Tracebook_v2 (Projectfolder)\\EventLog_Processed_Cancel.csv'
#file_location = 'C:\\Users\\20190337\\Downloads\\Tracebook_v2 (Projectfolder)\\EventLog_Processed_Paracetamol.csv'
file_location = 'C:\\Users\\20190337\\Downloads\\Tracebook_v2 (Projectfolder)\\EventLog_Processed_LOS.csv'

model_name = file_location.split("\\")[-1:][0].split('.')[0] #get filename (without.csv)
padded_X_train, padded_X1_train, padded_X_test, padded_X1_test, y_train, y_test, input_length, vocab_size, embedding_size, binary, pre_encoded, padded_X, padded_X1, y = prepare_data(file_location)

#one score list per metric, in the order in which evaluate() reports them after the loss
if binary:
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) #cannot do stratifiedkfold for regression tasks
    cv_accuracy_scores = []
    cv_f1_scores = []
    cv_precision_scores = []
    cv_recall_scores = []
    cv_auc_scores = []
else:
    kfold = KFold(n_splits=5, shuffle=True, random_state=42) #regular kfold here
    cv_mae_scores = []
    cv_mse_scores = []

callbacks_list = [globalvar.earlystop]

#NOTE(review): the same best_model instance is fitted again in every fold without being re-initialised,
#so from the second fold onwards it has already been trained on rows of the current test fold.
#The reported scores are therefore likely optimistic; rebuilding the model per fold would avoid this.
for train, test in kfold.split(padded_X, y): #cross validation to obtain stable results, only have to do padded_X since padded_X1 has same indexes
    best_model.fit([padded_X[train], padded_X1[train]], #use the same [train] indexes for both padded_X and padded_X1 to get correct values
                   y[train],
                   epochs=best_run['epochs'],
                   callbacks=callbacks_list,
                   batch_size=best_run['batch_size'],
                   verbose=0)

    #scores[0] is the loss, the remaining entries follow the compiled metrics
    scores = best_model.evaluate([padded_X[test], padded_X1[test]], y[test], verbose=0)

    if binary:
        print("%s: %.2f%%" % (best_model.metrics_names[1], scores[1] * 100)) #accuracy of the test prediction
        cv_accuracy_scores.append(scores[1])
        cv_f1_scores.append(scores[2])
        cv_precision_scores.append(scores[3])
        cv_recall_scores.append(scores[4])
        cv_auc_scores.append(scores[5])
    else:
        print('{} score: {}'.format(best_model.metrics_names[1], scores[1]))
        cv_mae_scores.append(scores[1])
        cv_mse_scores.append(scores[2])

if binary:
    print("%.2f%% (+/- %.2f%%)" % (numpy.mean(cv_accuracy_scores)*100, numpy.std(cv_accuracy_scores)*100))
    measures = [numpy.mean(cv_accuracy_scores), numpy.mean(cv_f1_scores), numpy.mean(cv_precision_scores),
                numpy.mean(cv_recall_scores), numpy.mean(cv_auc_scores)] #average over all splits
else:
    #the standard deviation is in the same unit as the MAE, not a percentage
    print('average mae score over all splits: {} (+/- {})'.format(numpy.mean(cv_mae_scores), numpy.std(cv_mae_scores)))
    measures = [numpy.mean(cv_mae_scores), numpy.mean(cv_mse_scores)]

#save results
output_dir = 'C:\\Users\\20190337\\Downloads\\Tracebook_v2 (Projectfolder)\\model_results\\di_mauro_cnn\\'
model_name = file_location.split("\\")[-1:][0].split('.')[0] #get filename (without.csv)

#both tasks store the same three artefacts; only the headline metric and the csv header differ
primary_scores = cv_accuracy_scores if binary else cv_mae_scores
header = "acc, f1, precision, recall, auc" if binary else "mae, mse"
result_tag = model_name + "-" + str(numpy.mean(primary_scores).round(2)) #file name = dataset + rounded headline score

numpy.savetxt(output_dir + 'results\\' + result_tag + ".csv", numpy.atleast_2d(measures),
              delimiter=',', fmt='%6f', header=header) #write the model scores to a csv file

best_model.save(output_dir + 'models\\' + model_name + '.h5')

#write hyperparameters of best run; the with-block closes the file even if writing fails
with open(output_dir + 'results\\hyperparameters\\' + result_tag + ".txt", "w") as text_file:
    text_file.write(str(best_run))



## Retrieve embedding + timediff encoding

Take and store the output of the embedding + timedif layers, using the best scoring prediction model determined above, so that it can be used as encoded input for other models.

In [24]:
#function that reduces every activity embedding of every trace to its mean value
def average_feature_vector(embeddings):
    """Average each activity vector of each trace.

    Parameters
    ----------
    embeddings : iterable
        Iterable of traces; every trace is an iterable of array-like activity
        vectors that offer a ``mean()`` method.

    Returns
    -------
    list
        One numpy array per trace, holding the mean of every activity vector
        of that trace in the original order.
    """
    return [
        np.array([activity_vector.mean() for activity_vector in trace_vectors])
        for trace_vectors in embeddings
    ]

#function for saving the embedded traces
def export_and_save_traces(df, filename, folder_path = 'C:/Users/20190337/Downloads/Tracebook_v2 (Projectfolder)/'):
    """Write a dataframe as <filename>.csv and <filename>.xlsx into folder_path.

    When the csv file is already present the user is asked on the console
    whether both files may be overwritten; any answer other than 'y'
    (case-insensitive) leaves the existing files untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to store, written without its index.
    filename : str
        File name without extension.
    folder_path : str
        Target folder, including the trailing separator.
    """
    csv_target = folder_path + filename + '.csv'
    xlsx_target = folder_path + filename + '.xlsx'
    already_present = os.path.exists(csv_target) #only the csv file decides whether the export counts as existing

    if already_present:
        answer = input('Warning, files already exist. Do you want to overwrite? type y/n: ')
        if answer.lower() != 'y': #keep the existing files
            print('Files not overwritten.')
            return

    #the same two writes are used for new files and for overwriting
    df.to_csv(path_or_buf = csv_target, sep=',', index=False)
    df.to_excel(excel_writer = xlsx_target, index=False)

    if already_present:
        print('Files succesfully overwritten')
    else:
        print('New preprocessed logs created: ', filename)

    return

In [None]:
#Load the three saved models, cut each one off at a concatenation layer and use the truncated
#models to encode the complete logs. Every prepare_data call below rebinds the notebook-level
#variables (padded_X, padded_X1, y, ...), so the order of the statements matters.

#define custom metrics so the model can be loaded
custom_objects = {'f1': globalvar.f1, 'precision': globalvar.precision, 'recall': globalvar.recall, 'auc': globalvar.auc} #define custom metrics

#load the best_model from the .h5 file
with custom_object_scope(custom_objects):
    best_model_can = load_model('C:\\Users\\20190337\\Downloads\\Tracebook_v2 (Projectfolder)\\model_results\\di_mauro_cnn\\models\\EventLog_Processed_Cancel.h5')
    best_model_par = load_model('C:\\Users\\20190337\\Downloads\\Tracebook_v2 (Projectfolder)\\model_results\\di_mauro_cnn\\models\\EventLog_Processed_Paracetamol.h5')
    best_model_los = load_model('C:\\Users\\20190337\\Downloads\\Tracebook_v2 (Projectfolder)\\model_results\\di_mauro_cnn\\models\\EventLog_Processed_LOS.h5')

#get only the model to the part where the concatenation layer is located, might have to use a different concatenate layer name here
#NOTE(review): the layer names are hard-coded per saved model; presumably they refer to the layer that joins
#the embedding and the time differences -- verify with best_model_*.summary() before running
can_embedding_model = Model(best_model_can.input,best_model_can.get_layer('concatenate_26').output)
par_embedding_model = Model(best_model_par.input,best_model_par.get_layer('concatenate_41').output)
los_embedding_model = Model(best_model_los.input,best_model_los.get_layer('concatenate_12').output)

#cancelation datasets:
file_location = 'C:\\Users\\20190337\\Downloads\\Tracebook_v2 (Projectfolder)\\EventLog_Processed_Cancel.csv'
padded_X_train, padded_X1_train, padded_X_test, padded_X1_test, y_train, y_test, input_length, vocab_size, embedding_size, binary, pre_encoded, padded_X, padded_X1, y = prepare_data(file_location)
encoded_can = can_embedding_model.predict([padded_X, padded_X1]) #make prediction to generate embedding
avg_encoded_can = average_feature_vector(encoded_can)#calculate the average over the embedding dimension
avg_encoded_can_df = pd.DataFrame(avg_encoded_can).add_prefix('feature_') #make a dataframe of embedding
avg_encoded_can_df['Label'] = y #add label

#paracetamol datasets:
file_location = 'C:\\Users\\20190337\\Downloads\\Tracebook_v2 (Projectfolder)\\EventLog_Processed_Paracetamol.csv'
padded_X_train, padded_X1_train, padded_X_test, padded_X1_test, y_train, y_test, input_length, vocab_size, embedding_size, binary, pre_encoded, padded_X, padded_X1, y = prepare_data(file_location)
encoded_par = par_embedding_model.predict([padded_X, padded_X1]) #make prediction to generate embedding
avg_encoded_par = average_feature_vector(encoded_par)#calculate the average over the embedding dimension
avg_encoded_par_df = pd.DataFrame(avg_encoded_par).add_prefix('feature_') #make a dataframe of embedding
avg_encoded_par_df['Label'] = y #add label

#los datasets:
file_location = 'C:\\Users\\20190337\\Downloads\\Tracebook_v2 (Projectfolder)\\EventLog_Processed_LOS.csv'
padded_X_train, padded_X1_train, padded_X_test, padded_X1_test, y_train, y_test, input_length, vocab_size, embedding_size, binary, pre_encoded, padded_X, padded_X1, y = prepare_data(file_location)
encoded_los = los_embedding_model.predict([padded_X, padded_X1]) #make prediction to generate embedding
avg_encoded_los = average_feature_vector(encoded_los) #calculate the average over the embedding dimension
avg_encoded_los_df = pd.DataFrame(avg_encoded_los).add_prefix('feature_') #make a dataframe of embedding
avg_encoded_los_df['Label'] = y #add label


Finally, store the embedding + time differences encodings


In [28]:
#Folder that receives the averaged embedding + time difference encodings
folder_path_avg = 'C:/Users/20190337/Downloads/Tracebook_v2 (Projectfolder)/encoded_logs/tokenized_timedif/avg/'

#cancellation encoding: load the additional features (without their own label column) ...
additional_data_df_can = pd.read_csv('C:/Users/20190337/Downloads/Tracebook_v2 (Projectfolder)/encoded_logs/additional_data/additional_can.csv')
additional_data_df_can = additional_data_df_can.drop(columns='Label')
#... paste them column-wise next to the encoding ...
avg_encoded_can_df_additional = pd.concat([avg_encoded_can_df, additional_data_df_can], axis='columns')
#... and store the encoding once without and once with the additional features
export_and_save_traces(avg_encoded_can_df, 'embed_timedif_can_avg', folder_path_avg)
export_and_save_traces(avg_encoded_can_df_additional, 'embed_timedif_can_avg_additional', folder_path_avg)

#paracetamol encoding: same three steps
additional_data_df_par = pd.read_csv('C:/Users/20190337/Downloads/Tracebook_v2 (Projectfolder)/encoded_logs/additional_data/additional_par.csv')
additional_data_df_par = additional_data_df_par.drop(columns='Label')
avg_encoded_par_df_additional = pd.concat([avg_encoded_par_df, additional_data_df_par], axis='columns')
export_and_save_traces(avg_encoded_par_df, 'embed_timedif_par_avg', folder_path_avg)
export_and_save_traces(avg_encoded_par_df_additional, 'embed_timedif_par_avg_additional', folder_path_avg)

#los encoding: same three steps
additional_data_df_los = pd.read_csv('C:/Users/20190337/Downloads/Tracebook_v2 (Projectfolder)/encoded_logs/additional_data/additional_los.csv')
additional_data_df_los = additional_data_df_los.drop(columns='Label')
avg_encoded_los_df_additional = pd.concat([avg_encoded_los_df, additional_data_df_los], axis='columns')
export_and_save_traces(avg_encoded_los_df, 'embed_timedif_los_avg', folder_path_avg)
export_and_save_traces(avg_encoded_los_df_additional, 'embed_timedif_los_avg_additional', folder_path_avg)




Files succesfully overwritten




Files succesfully overwritten




Files succesfully overwritten




Files succesfully overwritten
