# Preprocessing script


### Imports

In [1]:
#imports
import pandas as pd
import numpy as np
from datetime import datetime
import os
import json
from multiprocessing import  Pool
import csv
from time import time
from glob import glob

#for making embedding matrix
import io
import re
import shutil
import string
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, TextVectorization
from gensim.models import Word2Vec
from skmultiflow.utils import calculate_object_size
from tqdm import tqdm
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.objects.log.util import dataframe_utils
from pm4py.objects.log.exporter.xes import exporter as xes_exporter
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.util import constants

from sklearn import preprocessing
from sklearn.base import TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from pandas.api.types import is_string_dtype, is_numeric_dtype

#For processtransformer preprocessing steps
from transformer_lib import constants
from transformer_lib.models import transformer
from transformer_lib.data.loader import LogsDataLoader
from transformer_lib.data.processor import LogsDataProcessor

In [49]:
# Optional installs, maybe needed if errors occur
#!pip install os
#!pip install gensim
#!pip install scikit-multiflow
#!pip install pm4py

## Preparing, cleaning and labeling data

In [50]:
def preprocess_data(file):
    """Load the raw event log from an Excel file and clean it.

    Steps, in order: fix two misspelled column names, drop traces with
    fewer than 3 events, map the NOAC / MedicationStatus / MedicationType
    columns to floats, spread ``testValue`` over one column per test type,
    and repair broken temperature / hemoglobine / INR values.

    Parameters
    ----------
    file : path (or file-like object) handed straight to ``pd.read_excel``.

    Returns
    -------
    pandas.DataFrame
        The cleaned event log, one row per event.
    """
    df = pd.read_excel(file)

    # NOTE(review): nrows / ncols are never used again inside this function.
    nrows = df.shape[0] #total number of rows
    ncols = df.shape[1] #total number of columns
    
    #Rename a number of columns to fix typos
    df = df.rename(columns={'EvantName': 'EventName',
                            'PlanedDuration': 'PlannedDuration'})
    
    #Remove traces with less than 3 activities (keep rows whose TraceID occurs more than twice)
    df = df[df['TraceID'].map(df['TraceID'].value_counts()) > 2]
    
    #NOAC to binary
    df.loc[df['NOAC'] == 'T', 'NOAC'] = 1
    df.loc[df['NOAC'] == 'F', 'NOAC'] = 0
    
    #MedicationStatus to binary
    df.loc[df['MedicationStatus'] == 'continue', 'MedicationStatus'] = 1
    df.loc[df['MedicationStatus'] == 'stop', 'MedicationStatus'] = 0
    
    #MedicationType to binary
    # NOTE(review): uses the same 'continue'/'stop' values as MedicationStatus - confirm these are the real MedicationType values
    df.loc[df['MedicationType'] == 'continue', 'MedicationType'] = 1
    df.loc[df['MedicationType'] == 'stop', 'MedicationType'] = 0

    #Change datatypes (any value that was not mapped above must be convertible by float())
    df['NOAC'] = df['NOAC'].apply(float)
    df['MedicationStatus'] = df['MedicationStatus'].apply(float)
    df['MedicationType'] = df['MedicationType'].apply(float)
    
    #give every test type its own column holding the measured value (0 on all other rows), and remove the testValue column
    df['Test_Hemoglobine'] = np.where(df['EventName'] == 'Test_Hemoglobine', df['testValue'], 0)
    df['Test_eGFR'] = np.where(df['EventName'] == 'Test_eGFR', df['testValue'], 0)
    df['Test_INR'] = np.where(df['EventName'] == 'Test_INR', df['testValue'], 0)
    df['Test_Trombocyten'] = np.where(df['EventName'] == 'Test_Trombocyten', df['testValue'], 0)
    df = df.drop(columns = ['testValue'], axis=1)
    
    #fix the broken temperature values, removing scientific notation and putting the decimal at the correct place in the values
    new_temp_vals = [x for x in df[df['temperature']>1]['temperature'].astype(str)] #save the messed up temperature values (everything above 1) as strings
    new_temp_vals = [''.join(c for c in value if c.isdigit()) for value in new_temp_vals] #remove the '.' that is often located at the wrong spot and keep just the digits
    new_temp_vals = [round(float(''.join(value[:2] + '.' + value[2:])), 2) for value in new_temp_vals] #put a '.' after the first two digits, and convert to a number rounded to 2 decimals
    df.loc[df['temperature']>1, 'temperature'] = new_temp_vals
    
    #replace the 0 (measuring errors) values located in the temperature with the mean, as well as a single huge outlier
    #the mean is recomputed for the second replace, i.e. after the zeros have already been replaced
    df['temperature'] = df['temperature'].replace(0, round(df['temperature'].mean(), 2))
    df['temperature'] = df['temperature'].replace(27.12, round(df['temperature'].mean(), 2))
    
    #also replace 1 very significant outlier in the duration column with the mean value, further outliers are fixed later in code
    df['Duration'] = df['Duration'].replace(1588, round(df['Duration'].mean(), 0))
    
    # fix the broken test_hemoglobine and test_INR values
    new_hemo_vals = [x for x in df[df['Test_Hemoglobine']>0]['Test_Hemoglobine'].astype(str)] #save the messed up hemoglobine values (everything above 0) as strings
    new_hemo_vals = [''.join(c for c in value if c.isdigit()) for value in new_hemo_vals] #remove the '.' that is often located at the wrong spot and keep just the digits
    new_hemo_vals = [round(float(''.join(value[:1] + '.' + value[1:])), 2) for value in new_hemo_vals] #put a '.' after the first digit, and convert to a number rounded to 2 decimals
    df.loc[df['Test_Hemoglobine']>0, 'Test_Hemoglobine'] = new_hemo_vals
    
    new_inr_vals = [x for x in df[df['Test_INR']>0]['Test_INR'].astype(str)] #save the messed up inr values (everything above 0) as strings
    new_inr_vals = [''.join(c for c in value if c.isdigit()) for value in new_inr_vals] #remove the '.' that is often located at the wrong spot and keep just the digits
    new_inr_vals = [round(float(''.join(value[:1] + '.' + value[1:])), 2) for value in new_inr_vals] #put a '.' after the first digit, and convert to a number rounded to 2 decimals
    df.loc[df['Test_INR']>0, 'Test_INR'] = new_inr_vals
    
    return df

def _label_event_occurrence(df, event_name):
    """Add a 0/1 'Label' per trace telling whether event_name occurs in it, then drop those events."""
    occurs = (df['EventName'] == event_name).groupby(df['TraceID']).any()
    label_df = occurs.astype(int).rename('Label').reset_index()

    #merge the per-trace labels back onto every event row of the trace
    labeled = df.merge(label_df, on='TraceID', how='left')

    #remove the rows of the target event itself, since they contain the answer to the label
    return labeled[labeled['EventName'] != event_name]


def _label_length_of_stay(df, max_minutes=500):
    """Add a 'Label' holding the Admission->Discharge time in minutes; drop traces without one or at/above max_minutes."""
    admitted_ids = df.loc[df['EventName'] == 'Admission', 'TraceID']
    is_stay_event = df['EventName'].isin(['Admission', 'Discharge'])

    #explicit copy so the new column is not written onto a slice of the input frame
    stay_events = df[df['TraceID'].isin(admitted_ids) & is_stay_event].copy()
    stay_events['Label'] = stay_events.groupby('TraceID')['Timestamp'].diff().dt.total_seconds() / 60
    los_df = stay_events.loc[stay_events['EventName'] == 'Discharge', ['TraceID', 'Label']]

    labeled = df.merge(los_df, on='TraceID', how='left')

    #traces without an admission/discharge pair have no label and are removed, as are the outliers
    labeled = labeled.dropna(subset=['Label'])
    return labeled[labeled['Label'] < max_minutes]


def label_traces(df):
    """Create one labelled copy of the event log per prediction target.

    Targets:
    - Cancellation: Label is 1 when the trace contains a 'Cancellation'
      event, else 0; the 'Cancellation' rows are removed.
    - Paracetamol: Label is 1 when the trace contains a 'Paracetamol'
      event, else 0; the 'Paracetamol' rows are removed.
    - LOS: Label is the time in minutes between the 'Admission' and
      'Discharge' events of the trace. Traces without such a label, and
      traces with a label of 500 minutes or more, are removed. (An earlier
      '< 2500' filter was redundant next to the '< 500' filter and is gone.)

    Parameters
    ----------
    df : pandas.DataFrame
        Event log with at least 'TraceID', 'EventName' and a datetime
        'Timestamp' column. It is not modified.

    Returns
    -------
    tuple of pandas.DataFrame
        (df_can, df_par, df_los), each the event log plus a 'Label' column.
    """
    print('Now labeling with label: ', 'Cancellation')
    df_can = _label_event_occurrence(df, 'Cancellation')

    #the printed target name keeps its historical spelling, the event itself is called 'Paracetamol'
    print('Now labeling with label: ', 'Paracetemol')
    df_par = _label_event_occurrence(df, 'Paracetamol')

    print('Now labeling with label: ', 'LOS')
    df_los = _label_length_of_stay(df)

    return df_can, df_par, df_los

def export_and_save_traces(df, filename, folder_path = 'C:/Users/20190337/Downloads/Tracebook_v2 (Projectfolder)/'):
    """Write df to <folder_path><filename>.csv and <folder_path><filename>.xlsx.

    When the csv file already exists the user is asked on stdin whether it
    may be overwritten; any answer other than 'y' (case-insensitive) leaves
    both files untouched.
    """
    csv_target = folder_path + filename + '.csv'
    xlsx_target = folder_path + filename + '.xlsx'

    #only the csv file is checked to decide whether an export already exists
    already_exported = os.path.exists(csv_target)

    if already_exported:
        overwrite = input('Warning, file already exist. Do you want to overwrite? type y/n: ')
        if overwrite.lower() != 'y':
            print('Files not overwritten.')
            return

    df.to_csv(path_or_buf = csv_target, sep=',', index=False)
    df.to_excel(excel_writer = xlsx_target, index=False)

    if already_exported:
        print('Files succesfully overwritten')
    else:
        print('New preprocessed logs created: ', filename)

    return


In [51]:
#clean the raw event log first
raw_log_file = 'C:/Users/20190337/Downloads/Tracebook_v2 (Projectfolder)/newEventLog.xlsx'
df = preprocess_data(file=raw_log_file)

#then build one labelled dataset per prediction target (cancellation, paracetamol, length of stay)
labelled_sets = label_traces(df)
df_can, df_par, df_los = labelled_sets

Now labeling with label:  Cancellation
Now labeling with label:  Paracetemol
Now labeling with label:  LOS


In [45]:
folder_path = 'C:/Users/20190337/Downloads/Tracebook_v2 (Projectfolder)/'

#save every labelled event log under its own file name, in the order cancel / paracetamol / LOS
for labelled_df, export_name in (
    (df_can, 'EventLog_Processed_Cancel'),
    (df_par, 'EventLog_Processed_Paracetamol'),
    (df_los, 'EventLog_Processed_LOS'),
):
    export_and_save_traces(labelled_df, export_name, folder_path)



Files succesfully overwritten




Files succesfully overwritten




Files succesfully overwritten


## One hot encoding

One-hot encode the traces

In [47]:
#simple function that one hot encodes a column of a dataframe
def one_hot_encoding(df):
    """One-hot encode every non-numeric column of df except the first one.

    The first column (the case id) is never encoded. Each encoded column is
    dropped and replaced by integer 0/1 indicator columns named
    '<column>_<value>'; rows holding NaN get 0 in all indicators.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column is the case id.

    Returns
    -------
    (pandas.DataFrame, list of str)
        The encoded frame and the names of all indicator columns that were
        created (an empty list when nothing needed encoding).
    """
    column_names = []
    for column in df.columns[1:]:
        values = df[column]
        # bool columns count as categorical here, exactly like every other non-number dtype
        is_number = (pd.api.types.is_numeric_dtype(values)
                     and not pd.api.types.is_bool_dtype(values))
        if is_number:
            continue

        # dtype=int keeps the indicators 0/1 on every pandas version (pandas >= 2 defaults to bool),
        # which the sum-based merge of events into traces relies on
        one_hot = pd.get_dummies(values, prefix=column, prefix_sep='_', dtype=int)

        # Swap the raw column for its indicator columns
        df = df.drop(column, axis=1).join(one_hot)
        column_names.extend(one_hot.columns.tolist())

    return df, column_names

#function that takes the one-hot-encoded traces and combines them into a single one hot combined trace
#duplicates (if events occur twice) are only registered once
def combine_sequences(df):
    """Collapse consecutive rows that share a case id into one combined row.

    The first column of df is the case id, all remaining columns are the
    one-hot encoded values of a single event. Rows are merged element-wise:
    values are summed as long as the sum stays below 2, otherwise the value
    collected so far is kept, so an event occurring twice is registered once.

    Only adjacent rows with the same case id are merged, so df must be
    sorted by case id.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per event; first column is the case id.

    Returns
    -------
    list of list
        One combined row per case, in order of appearance. An empty frame
        gives an empty list.
    """
    if df.empty:
        return []

    # purely positional access: the frame is split once instead of building a Series per row
    case_ids = df.iloc[:, 0].tolist()
    rows = df.iloc[:, 1:].values.tolist()

    data = [] #one combined row per finished case
    current_case = case_ids[0]
    current_row = rows[0]

    for case, row in zip(case_ids[1:], rows[1:]):
        if case == current_case:
            #add the encoding through sum, if the event already occurred keep the existing value
            current_row = [x + y if x + y < 2 else x for x, y in zip(current_row, row)]
        else:
            data.append(current_row) #previous case is complete
            current_case, current_row = case, row

    # last case
    data.append(current_row)
    return data


def one_hot_and_combine(df, additional=False):
    """Build one one-hot encoded row per trace from an event log.

    Parameters
    ----------
    df : pandas.DataFrame
        Labelled event log with 'TraceID', 'Timestamp', 'EventName' and
        'Label' columns (plus 'MedicationCode' when additional=True).
    additional : bool
        When True, also append the one-hot encoded 'MedicationCode' values
        and the per-trace mean of every numeric column.

    Returns
    -------
    pandas.DataFrame
        One row per trace, ordered by TraceID: one column per event name,
        the 'Label' column and, optionally, the additional features.
    """
    df = df.sort_values(by=['TraceID', 'Timestamp']).reset_index(drop=True)

    #save labels so they can be added later; only the Label column is averaged because
    #averaging the whole frame raises on text columns in pandas >= 2
    labels = df.groupby('TraceID')['Label'].mean().tolist()

    #one-hot encode activities (each row = 1 event), then aggregate the events to a single row per trace
    one_hot_df, column_names = one_hot_encoding(df[['TraceID', 'EventName']])
    data = combine_sequences(one_hot_df)
    one_hot_df = pd.DataFrame(data, columns=column_names) #make df of encoded data
    one_hot_df['Label'] = labels #add labels back

    #one-hot encode additional features and add to dataframe if required, also add numerical features
    if additional:
        one_hot_df_additional, column_names_additional = one_hot_encoding(df[['TraceID', 'MedicationCode']])
        data_additional = combine_sequences(one_hot_df_additional)
        one_hot_df_additional = pd.DataFrame(data_additional, columns=column_names_additional)

        #per-trace mean of the numeric columns; numeric_only skips text/timestamp columns explicitly
        additional_numerical_df = (df.groupby('TraceID')
                                     .mean(numeric_only=True)
                                     .reset_index()
                                     .drop(columns=['Label', 'TraceID']))

        #all three frames hold one row per trace in TraceID order, so they are merged on their index
        one_hot_df = one_hot_df.merge(one_hot_df_additional, left_index=True, right_index=True) #first merge with one-hot additional features
        one_hot_df = one_hot_df.merge(additional_numerical_df, left_index=True, right_index=True) #then merge with numerical features

    return one_hot_df


In [52]:
#read the three labelled event logs back from the excel exports
processed_dir = 'C:/Users/20190337/Downloads/Tracebook_v2 (Projectfolder)/'
processed_df_can = pd.read_excel(processed_dir + 'EventLog_Processed_Cancel.xlsx')
processed_df_par = pd.read_excel(processed_dir + 'EventLog_Processed_Paracetamol.xlsx')
processed_df_los = pd.read_excel(processed_dir + 'EventLog_Processed_LOS.xlsx')

In [48]:
#one hot encode data (event names only)
one_hot_can, one_hot_par, one_hot_los = (
    one_hot_and_combine(processed_df, additional=False)
    for processed_df in (processed_df_can, processed_df_par, processed_df_los)
)

#one hot encode data with additional data
one_hot_can_additional, one_hot_par_additional, one_hot_los_additional = (
    one_hot_and_combine(processed_df, additional=True)
    for processed_df in (processed_df_can, processed_df_par, processed_df_los)
)

#save results, plain versions first and the versions with additional data afterwards
one_hot_folder_path = 'C:/Users/20190337/Downloads/Tracebook_v2 (Projectfolder)/encoded_logs/one_hot_encoded_logs/'
for encoded_df, export_name in (
    (one_hot_can, 'one_hot_can'),
    (one_hot_par, 'one_hot_par'),
    (one_hot_los, 'one_hot_los'),
    (one_hot_can_additional, 'one_hot_can_additional'),
    (one_hot_par_additional, 'one_hot_par_additional'),
    (one_hot_los_additional, 'one_hot_los_additional'),
):
    export_and_save_traces(encoded_df, export_name, one_hot_folder_path)



Files succesfully overwritten




Files succesfully overwritten




Files succesfully overwritten




Files succesfully overwritten




Files succesfully overwritten




Files succesfully overwritten


## Additional data split

A separate split is made that keeps just the additional data columns. This is for an experiment to see how the models perform in a more traditional ML approach, without use of any trace event data.

To achieve this, I simply take the already one-hot encoded data and remove the trace event columns, so I don't have to regroup the data and re-add the label. I save the additional features separately using the code below.

In [21]:
#load the previously one-hot-encoded logs that include the additional data
one_hot_dir = 'C:/Users/20190337/Downloads/Tracebook_v2 (Projectfolder)/encoded_logs/one_hot_encoded_logs/'
additional_can = pd.read_csv(one_hot_dir + 'one_hot_can_additional.csv')
additional_par = pd.read_csv(one_hot_dir + 'one_hot_par_additional.csv')
additional_los = pd.read_csv(one_hot_dir + 'one_hot_los_additional.csv')

#remove the trace event columns, i.e. every column with 'EventName' in its name
additional_can = additional_can[[name for name in additional_can.columns if 'EventName' not in name]]
additional_par = additional_par[[name for name in additional_par.columns if 'EventName' not in name]]
additional_los = additional_los[[name for name in additional_los.columns if 'EventName' not in name]]

#Save the dfs filled with solely the additional data
agg_folder_path = 'C:/Users/20190337/Downloads/Tracebook_v2 (Projectfolder)/encoded_logs/additional_data/'
for additional_df, export_name in (
    (additional_can, 'additional_can'),
    (additional_par, 'additional_par'),
    (additional_los, 'additional_los'),
):
    export_and_save_traces(additional_df, export_name, agg_folder_path)



Files succesfully overwritten




Files succesfully overwritten




Files succesfully overwritten


## Word2Vec embedding

Adapted from: https://github.com/gbrltv/business_process_encoding/blob/master/compute_encoding/word2vec_cbow.py

The Gensim word2vec model uses a list of lists as input, where each list is filled with all activities in a trace. Therefore, each trace is represented by a list that includes all activities that have happened in that particular trace. Each of these lists is embedded into an embedding the size of the vector_size variable.

In [27]:
#Function to read an event log. Parameters: file path, file name, replace "space" with "-"
def read_log(path, log, replace_space: str = '-'):
    """Read the csv event log '<path>/<log>'.

    Spaces inside the 'EventName' values are replaced by replace_space.
    Returns a tuple (df_proc, df_raw): df_proc holds only the 'TraceID',
    'EventName' and 'Label' columns, df_raw holds every column.
    """
    full_log = pd.read_csv(f'{path}/{log}')
    full_log['EventName'] = full_log['EventName'].str.replace(' ', replace_space)

    return full_log[['TraceID', 'EventName', 'Label']], full_log

#Function that creates a txt model. Paremeters: text-based model containing encodings, list of cases treated as sentences by the model
def train_text_model(model, cases):
    """Build the vocabulary of model from cases and train it for 10 epochs.

    cases is the list of traces, each trace being treated as one sentence.
    The same (now trained) model object is returned.
    """
    model.build_vocab(cases)

    sentence_count = len(cases)
    model.train(cases, total_examples=sentence_count, epochs=10)
    return model

#Function that creates a list of cases for model training.
def retrieve_traces(df):
    """Split an event log into parallel lists, one entry per trace.

    Returns (ids, traces, y): the TraceID of each trace, the list of its
    event names, and the Label of its first row. Traces come in the order
    produced by grouping on 'TraceID'.
    """
    ids, traces, y = [], [], []
    for _, trace_rows in df.groupby('TraceID'):
        event_names = trace_rows['EventName'].tolist()
        traces.append([''.join(name) for name in event_names])
        y.append(trace_rows['Label'].tolist()[0])
        ids.append(trace_rows['TraceID'].tolist()[0])

    return ids, traces, y

#The Gensim Word2Vec model requires a list of lists, where each list contains a list of all activities in 1 trace. 
#This function converts the raw input to that mapping
def convert_traces_mapping(traces_raw, mapping):
    """Translate every activity of every trace through mapping.

    traces_raw is a list of traces (each a list of activities); the result
    has the same nesting, with each activity replaced by mapping[activity].
    An activity missing from mapping raises KeyError.
    """
    return [[mapping[act] for act in trace] for trace in traces_raw]

#Function that orders a list alphanumerically
def sort_alphanumeric(data):
    """Return data sorted so that digit runs compare as numbers and letters case-insensitively.

    For example 'a2' sorts before 'a10'.
    """
    def natural_key(item):
        #split into alternating text / digit chunks; digit chunks become ints, text is lower-cased
        chunks = re.split('([0-9]+)', item)
        return [int(chunk) if chunk.isdigit() else chunk.lower() for chunk in chunks]

    return sorted(data, key=natural_key)

#Function that computes the average/max feature vector for each trace. Paremeters: text-based model containing encodings and list of traces treated as sequences
def average_feature_vector(model, traces):
    """Aggregate the token vectors of every trace into a mean and a max vector.

    For each trace the vector of every token is looked up in model.wv;
    tokens that raise KeyError are skipped. Returns two lists with one
    entry per trace: the element-wise mean and the element-wise max of the
    vectors that were found.
    """
    vectors_average, vectors_max = [], []
    for trace in traces:
        found_vectors = []
        for token in trace:
            try:
                token_vector = model.wv[token] #numpy vector of the activity (token)
            except KeyError:
                continue
            found_vectors.append(token_vector)

        vectors_average.append(np.array(found_vectors).mean(axis=0)) #mean over all activities, per dimension
        vectors_max.append(np.array(found_vectors).max(axis=0)) #same aggregation with max instead of mean

    return vectors_average, vectors_max


In [44]:
#define paths and embedding dimensions, also note time
t = time()

dimension = 8 #embedding dimension, 8 showed best performance in early experimentation
csv_path =  'C:/Users/20190337/Downloads/Tracebook_v2 (Projectfolder)/' #path where the preprocessed csv files are stored
save_path = 'C:/Users/20190337/Downloads/Tracebook_v2 (Projectfolder)/encoded_logs/word2vec' #path where results are saved
csv_list = ['EventLog_Processed_Cancel.csv', 'EventLog_Processed_LOS.csv', 'EventLog_Processed_Paracetamol.csv'] #file names for csv files
additional_csv_locs = ['C:/Users/20190337/Downloads/Tracebook_v2 (Projectfolder)/encoded_logs/additional_data/additional_can.csv',
                       'C:/Users/20190337/Downloads/Tracebook_v2 (Projectfolder)/encoded_logs/additional_data/additional_los.csv',
                       'C:/Users/20190337/Downloads/Tracebook_v2 (Projectfolder)/encoded_logs/additional_data/additional_par.csv'] #file paths to additional data, same order as csv_list

#make directories to store results in
#(loop variable is not called 'type' so the builtin type() stays usable in later cells)
for agg_type in ['average']:
    os.makedirs(f'{save_path}/{agg_type}/{dimension}', exist_ok=True)

#for all csv files that need to be encoded
for file, additional_file in zip(csv_list, additional_csv_locs):
    print('Now encoding file: ', file)
    
    # read event log and import case id, traces and labels
    df_proc, df_raw = read_log(csv_path, file) # create two dataframes, one with and one without additional data
    ids, traces, y = retrieve_traces(df_proc) # retrieve the traces from the non-additional data dataframe
    
    # generate model
    # workers must be a positive thread count: with workers=-1 gensim starts no worker
    # threads, so the vectors would stay at their random initial values
    model = Word2Vec(vector_size=dimension, window=3, min_count=1, sg=0, workers=os.cpu_count() or 1)
    model = train_text_model(model, traces)

    # calculating the feature vector for each sentence (trace)
    vectors_average, vectors_max = average_feature_vector(model, traces)

    # generate file names including and excluding additional data
    filename_avg = file.split('.')[0] + '_' + str(dimension) + '_word2vec_avg.csv' 
    filename_additional_avg = file.split('.')[0] + '_' + str(dimension) + '_word2vec_avg_additional.csv'
    
    #generate the output df, also add back traceIDs and labels
    out_df = pd.DataFrame(vectors_average, columns=[f'feature_{i}' for i in range(dimension)]) #add feature prefix
    out_df['TraceID'] = ids
    out_df['Label'] = y
    
    #create version including additional data
    #NOTE(review): rows are pasted together by position, this assumes the additional file lists the traces in the same TraceID order - confirm
    additional_df = pd.read_csv(additional_file)
    additional_df = additional_df.drop('Label', axis=1)#drop redundant 2nd label
    out_df_additional = pd.concat([out_df, additional_df], axis=1) #axis = 1 since we want to paste columns next to each other instead of rows

    #save results
    out_df.to_csv(f'{save_path}/average/{dimension}/{filename_avg}', index=False)
    out_df_additional.to_csv(f'{save_path}/average/{dimension}/{filename_additional_avg}', index=False)

print('Embedding the traces using Word2Vec took: {} mins'.format(round((time() - t) / 60, 2)))


Now encoding file:  EventLog_Processed_Cancel.csv
Now encoding file:  EventLog_Processed_LOS.csv
Now encoding file:  EventLog_Processed_Paracetamol.csv
Embedding the traces using Word2Vec took: 0.03 mins


## Aggregation encoding

This code calculates the aggregation encoding

In short:
- create separate aggregated measures for all numeric values (min, max, mean, sum, std)
- measure frequencies of categorical values (activity/resources)

In [46]:
def aggregate_encoder(df, skip_columns):
    """Aggregate an event log to one row per trace.

    For every numeric column that is not listed in skip_columns the
    per-trace max, min, sum, mean and std are computed (columns
    '<stat>_<column>'). In addition the number of occurrences of every
    'EventName' value, and for the second result of every 'MedicationCode'
    value, is added as 'freq_<value>' columns (NaN when the value does not
    occur in the trace). The per-trace maximum of 'Label' is appended last.

    Parameters
    ----------
    df : pandas.DataFrame
        Event log with 'TraceID', 'EventName', 'MedicationCode' and
        'Label' columns.
    skip_columns : list of str
        Columns that must not be aggregated (e.g. trace id, timestamp, label).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        agg_df without and agg_df_additional with the medication code
        frequencies.
    """
    numeric_cols = [column for column in df.columns
                    if column not in skip_columns and is_numeric_dtype(df[column])]

    #one named aggregation per (statistic, column) pair; string names avoid the
    #deprecated passing of builtin / numpy callables to agg
    named_aggregations = {
        '{}_{}'.format(stat, col): (col, stat)
        for col in numeric_cols
        for stat in ('max', 'min', 'sum', 'mean', 'std')
    }

    if named_aggregations:
        agg_df = df.groupby('TraceID').agg(**named_aggregations).reset_index()
    else:
        #nothing numeric to aggregate: start from the bare list of trace ids
        agg_df = df.groupby('TraceID').size().reset_index()[['TraceID']]

    #frequency of every activity per trace
    event_freq = df.groupby(['TraceID', 'EventName'])['Label'].count().unstack().add_prefix('freq_')
    agg_df = agg_df.merge(event_freq, on='TraceID', how='left')

    #frequency of every medication code per trace, kept in a separate df since it is considered additional data
    medication_freq = df.groupby(['TraceID', 'MedicationCode'])['Label'].count().unstack().add_prefix('freq_')
    agg_df_additional = agg_df.merge(medication_freq, on='TraceID', how='left')

    #finally re-add the label column to both df's, timestamp is not added since the data has been aggregated
    trace_labels = df[['TraceID', 'Label']].groupby('TraceID').max()
    agg_df = agg_df.merge(trace_labels, on='TraceID', how='left')
    agg_df_additional = agg_df_additional.merge(trace_labels, on='TraceID', how='left')

    return agg_df, agg_df_additional
                      

def split_agg(df):
    """Return df without the numeric aggregate columns.

    A column is dropped when its name starts with one of the statistic
    prefixes 'max_', 'min_', 'sum_', 'mean_' or 'std_'. Matching on the
    prefix (instead of anywhere in the name) keeps frequency columns whose
    event name merely contains such a term, e.g. 'freq_Examination'.
    """
    stat_prefixes = ('max_', 'min_', 'sum_', 'mean_', 'std_')
    kept_columns = [col for col in df.columns if not str(col).startswith(stat_prefixes)]

    return df[kept_columns]

In [66]:
#create two versions per target: the additional one is kept as returned, the plain one is stripped of its numeric aggregates
agg_skip_columns = ['TraceID', 'Timestamp', 'Label']

agg_df_can, agg_df_can_additional = aggregate_encoder(processed_df_can, agg_skip_columns)
agg_df_can = split_agg(agg_df_can)

agg_df_par, agg_df_par_additional = aggregate_encoder(processed_df_par, agg_skip_columns)
agg_df_par = split_agg(agg_df_par)

agg_df_los, agg_df_los_additional = aggregate_encoder(processed_df_los, agg_skip_columns)
agg_df_los = split_agg(agg_df_los)

#Save the agg dataframes
agg_folder_path = 'C:/Users/20190337/Downloads/Tracebook_v2 (Projectfolder)/encoded_logs/aggregated/'

for aggregated_df, export_name in (
    (agg_df_can, 'aggregated_can'),
    (agg_df_can_additional, 'aggregated_can_additional'),
    (agg_df_par, 'aggregated_par'),
    (agg_df_par_additional, 'aggregated_par_additional'),
    (agg_df_los, 'aggregated_los'),
    (agg_df_los_additional, 'aggregated_los_additional'),
):
    export_and_save_traces(aggregated_df, export_name, agg_folder_path)



Files succesfully overwritten




Files succesfully overwritten




Files succesfully overwritten




Files succesfully overwritten




Files succesfully overwritten




Files succesfully overwritten


## autoencoder
Since the aggregated encodings are highly dimensional, an autoencoder is used to significantly reduce dimensionality in the data. The result of this autoencoder will be passed to the prediction models.

In [67]:
from keras.layers import Input, Dense
from keras.models import Model
# Seed NumPy's global random number generator.
# NOTE(review): the TensorFlow seeding below is commented out, so presumably the Keras
# weight initialisation is not covered by this seed - confirm if reproducible runs are needed.
np.random.seed(42)
#from tensorflow import set_random_seed
#set_random_seed(seed)

def autoencoder(file_location,
                output_dir='C:\\Users\\20190337\\Downloads\\Tracebook_v2 (Projectfolder)\\encoded_logs\\ae_agg\\'):
    """Compress one aggregated-encoding csv with a dense autoencoder and save the result.

    The csv is read, NaN values are set to 0, and an autoencoder is trained
    on 64% of the rows (16% validation, 20% held out). Every row is then
    pushed through the encoder half, the encoded features are min-max
    scaled to [0, 1] and written, together with the 'Label' column, to
    '<output_dir>ae_agg_<rest of the file name>.csv'.

    Parameters
    ----------
    file_location : str
        Path to a csv named like 'aggregated_<target>[...].csv' that holds
        'TraceID' and 'Label' columns; '/' and '\\' are both accepted as
        path separators.
    output_dir : str
        Folder prefix (including the trailing separator) for the output file.

    Returns
    -------
    None
    """
    #file name without folder and without extension, e.g. 'aggregated_can_additional'
    model_name = re.split(r'[\\/]', file_location)[-1].split('.')[0]
    new_model_name = 'ae_agg_' + '_'.join(model_name.split('_')[1:])

    print('Now autoencoding: ', model_name, '...')

    agg_df = pd.read_csv(file_location)
    agg_df = agg_df.fillna(0)

    agg_X = agg_df[agg_df.columns.difference(['TraceID', 'Label'])]
    agg_y = agg_df['Label']

    input_len = len(agg_X.columns)

    #train test val split; the LOS label is continuous so it cannot be used for stratification
    if 'los' in model_name:
        x_train, x_test, y_train, y_test = train_test_split(agg_X, agg_y, test_size = 0.2, random_state = 42, shuffle = True)
        x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 0.2, random_state = 42, shuffle = True)
    else:
        x_train, x_test, y_train, y_test = train_test_split(agg_X, agg_y, test_size = 0.2, random_state = 42, shuffle = True, stratify = agg_y)
        x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 0.2, random_state = 42, shuffle = True, stratify = y_train)

    x_train = x_train.values.astype(float)
    x_val = x_val.values.astype(float)
    x_test = x_test.values.astype(float)

    #input layer
    input_layer = Input(shape=(input_len,))

    #encoder, every layer keeps roughly 2/3 of the units of the previous one
    encoded = Dense(units=input_len, activation='tanh')(input_layer)
    encoded = Dense(units=(round(input_len*0.667)), activation='tanh')(encoded)
    encoded = Dense(units=(round(input_len*0.445)), activation='tanh')(encoded)
    encoded = Dense(units=(round(input_len*0.297)), activation='tanh')(encoded)

    #latent-space, around 1/5th of the original size
    encoded = Dense(units=(round(input_len*0.198)), activation='tanh')(encoded)

    #decoder, mirrors the encoder back to the original size
    decoded = Dense(units=(round(input_len*0.297)), activation='tanh')(encoded)
    decoded = Dense(units=(round(input_len*0.445)), activation='tanh')(decoded)
    decoded = Dense(units=(round(input_len*0.667)), activation='tanh')(decoded)
    decoded = Dense(units=input_len, activation='tanh')(decoded)

    #local names differ from the function name so the function is not shadowed inside its own body
    autoencoder_model = Model(input_layer, decoded)
    encoder_model = Model(input_layer, encoded)

    #the network is trained to reproduce its own input
    autoencoder_model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])
    autoencoder_model.fit(x_train, x_train,
                          epochs=50,
                          batch_size=128,
                          shuffle=True,
                          validation_data=(x_val, x_val),
                          verbose=0)

    #encode all rows (train, validation and test) with the trained encoder half
    encoded_agg = encoder_model.predict(agg_X)

    scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
    scaler.fit(encoded_agg)

    encoded_agg_norm = scaler.transform(encoded_agg)
    encoded_agg_norm = pd.DataFrame(encoded_agg_norm)
    encoded_agg_norm.insert(len(encoded_agg_norm.columns), 'Label', agg_y)

    #save new df
    encoded_agg_norm.to_csv(output_dir + new_model_name + '.csv', index=False)
    


In [68]:
def csv_file_list(
    results_folder='C:\\Users\\20190337\\Downloads\\Tracebook_v2 (Projectfolder)\\encoded_logs\\aggregated\\',
    file_extension='*.csv',
):
    """Collect all files matching a pattern in a folder and its sub-folders.

    Args:
        results_folder: Root directory that is walked recursively. The default
            is the aggregated encoded-logs folder used by this notebook.
        file_extension: Glob pattern applied inside every visited directory.

    Returns:
        A list with the paths of all matching files, in the order in which
        ``os.walk`` visits the directories. The list is empty when the folder
        does not exist or contains no matching file.
    """
    all_csv_files = []  #store matching files in this list
    #only the directory path is needed; glob does the matching inside each directory
    for path, _subdirs, _files in os.walk(results_folder):
        all_csv_files.extend(glob(os.path.join(path, file_extension)))

    return all_csv_files

# Run the autoencoder step on every csv file returned by csv_file_list().
# autoencoder() must already be defined by an earlier cell; it is called once
# per file path, in the order of the returned list.
agg_csv_files= csv_file_list()
for file in agg_csv_files:
    autoencoder(file)
    

Now autoencoding:  aggregated_can ...
Now autoencoding:  aggregated_can_additional ...
Now autoencoding:  aggregated_los ...
Now autoencoding:  aggregated_los_additional ...
Now autoencoding:  aggregated_par ...
Now autoencoding:  aggregated_par_additional ...


## Tokenized encoding
This part of the code uses a custom adaptation of the functions originally created by Bukhsh et al. (https://github.com/Zaharah/processtransformer/tree/178f4c0bde5efb6bb25834d3493841482ad58227). New functions have been added to their Python package to support additional prediction tasks, in this case outcome prediction. The adapted functions are stored in the transformer_lib folder.

In [75]:
def tokenize_traces(data_loader):
    """Build a dataframe of tokenized traces for the outcome prediction task.

    Args:
        data_loader: Object providing ``load_data('outcome')`` (returning an
            8-item tuple) and ``prepare_data_outcome(df, word_dict, length)``.

    Returns:
        A dataframe with the columns of the full dataframe from ``load_data``
        except 'activities', followed by one 'token_<i>' column per column of
        the tokenized data, with 'outcome' moved to the last position.
    """
    #load_data returns eight items; only the 3rd, 4th and 6th are used here
    (_, _, log_df, x_vocab, _, case_len_max,
            _, _) = data_loader.load_data('outcome')

    #tokenize all data at once
    token_matrix = data_loader.prepare_data_outcome(log_df, x_vocab, case_len_max)

    #put the tokens in their own dataframe, then paste them next to the original columns
    token_columns = pd.DataFrame(token_matrix).add_prefix('token_').reset_index(drop=True)
    merged = pd.concat([log_df.reset_index(drop=True), token_columns], axis=1)
    merged = merged.drop(columns='activities')

    #move outcome to the end of the dataframe for continuity with other datasets
    column_order = merged.columns.tolist()
    column_order.remove('outcome')
    column_order.append('outcome')

    return merged[column_order]
    
def prepare_tokenize(name, file_location, dir_path, additional_df_loc):
    """Process one event log with transformer_lib, tokenize it and attach additional data.

    Args:
        name: Dataset name passed to both LogsDataProcessor and LogsDataLoader.
        file_location: Location of the event log, passed as ``filepath``.
        dir_path: Directory passed to LogsDataProcessor as ``dir_path``.
        additional_df_loc: csv file with the additional data; its 'Label'
            column is dropped before merging.

    Returns:
        Tuple ``(token_full, token_full_agg)``: the tokenized dataframe and the
        same dataframe with the additional columns pasted to its right.
    """
    event_log_columns = ['TraceID', 'EventName', 'Timestamp', 'Label']

    #set up the processor; pool is the number of CPU's to use in processing (changed from 4 to 1)
    processor = LogsDataProcessor(name=name,
                                  filepath=file_location,
                                  columns=event_log_columns,
                                  dir_path=dir_path,
                                  pool=1)

    #process the data for the outcome task
    processor.process_logs(task='OUTCOME', sort_temporally=False)

    #load the processed data and tokenize it
    token_full = tokenize_traces(LogsDataLoader(name=name))

    #read the additional data, without its redundant 2nd label
    extra_columns = pd.read_csv(additional_df_loc).drop(columns='Label')

    #axis=1: paste the columns next to each other instead of stacking rows (rows are aligned on index)
    token_full_agg = pd.concat([token_full, extra_columns], axis=1)

    return token_full, token_full_agg

#simple function to save tokenized results, input 2 dataframes and directory information
def save_tokenized(token_full, token_full_agg, dir_path, foldername, filename):
    """Write both tokenized dataframes as csv files into ``<dir_path><foldername>/processed/``.

    Args:
        token_full: Dataframe saved as ``<filename>_full.csv``.
        token_full_agg: Dataframe saved as ``<filename>_full_additional.csv``.
        dir_path: Base directory; concatenated as-is, so it must end with a separator.
        foldername: Sub-folder of ``dir_path`` that contains the 'processed' folder.
        filename: Prefix shared by both output files.
    """
    target_prefix = dir_path + foldername + '/processed/' + filename
    outputs = ((token_full, '_full.csv'),
               (token_full_agg, '_full_additional.csv'))
    for frame, suffix in outputs:
        #index=False: the row index is not written to the file
        frame.to_csv(target_prefix + suffix, index=False)

In [None]:
# Tokenize the three event logs (cancellation, paracetamol, length of stay) and
# save the results. dir_path is handed to prepare_tokenize and is also the base
# directory under which save_tokenized writes '<foldername>/processed/<filename>_*.csv'.
dir_path = 'C:/Users/20190337/Downloads/Tracebook_v2 (Projectfolder)/encoded_logs/transformer/'
# Additional-data csv files, in the order: [0] can, [1] los, [2] par
additional_csv_locs = ['C:/Users/20190337/Downloads/Tracebook_v2 (Projectfolder)/encoded_logs/additional_data/additional_can.csv',
                       'C:/Users/20190337/Downloads/Tracebook_v2 (Projectfolder)/encoded_logs/additional_data/additional_los.csv',
                       'C:/Users/20190337/Downloads/Tracebook_v2 (Projectfolder)/encoded_logs/additional_data/additional_par.csv'] 

#tokenize all three dataset prediction targets
# each call returns the tokenized dataframe and the same dataframe with the additional columns attached
token_full_can, token_full_can_agg = prepare_tokenize(name='processed_df_can', 
                                                      file_location='C:/Users/20190337/Downloads/Tracebook_v2 (Projectfolder)/EventLog_Processed_Cancel.csv',
                                                      dir_path=dir_path, additional_df_loc=additional_csv_locs[0])

# note: the calls run in the order can, par, los, so par uses index 2 and los index 1 of additional_csv_locs
token_full_par, token_full_par_agg = prepare_tokenize(name='processed_df_par', 
                                                    file_location='C:/Users/20190337/Downloads/Tracebook_v2 (Projectfolder)/EventLog_Processed_Paracetamol.csv',
                                                    dir_path=dir_path, additional_df_loc=additional_csv_locs[2])

token_full_los, token_full_los_agg = prepare_tokenize(name='processed_df_los', 
                                                      file_location='C:/Users/20190337/Downloads/Tracebook_v2 (Projectfolder)/EventLog_Processed_LOS.csv',
                                                      dir_path=dir_path, additional_df_loc=additional_csv_locs[1])


#save results
# foldername equals the name given to prepare_tokenize for the same dataset
save_tokenized(token_full_can, token_full_can_agg, dir_path, foldername='processed_df_can', filename='tokenized_can')
save_tokenized(token_full_par, token_full_par_agg, dir_path, foldername='processed_df_par', filename='tokenized_par')
save_tokenized(token_full_los, token_full_los_agg, dir_path, foldername='processed_df_los', filename='tokenized_los')
