# Learning of Process Representations Using Recurrent Neural Networks

In [5]:
import numpy as np
import pandas as pd

from replearn.eventlog import EventLog

from replearn.embedding_predict import EmbeddingPredict
from replearn.autoencoder import AutoencoderRepresentation
from replearn.doc2vec import Doc2VecRepresentation

from replearn.clustering import Clustering

from replearn.evaluation import Evaluation

# !pip install levenshtein
from Levenshtein import distance as led
from tqdm import tqdm
import os
import re

### General Parameters

In [6]:
# Global experiment settings shared by the cells below.
# Tune values here instead of editing the individual experiment cells.
event_log_path = '../logs/'  # directory that is scanned for event log files
case_attributes = None # None = let EventLog auto-detect the case attributes
event_attributes = ['concept:name', 'user'] # event attributes to use: activity name and user
true_cluster_label = 'cluster'  # passed to EventLog as the name of the ground-truth cluster label

n_epochs = 25         # number of training epochs; candidate values noted by the author: [10, 25]
n_batch_size = 64     # batch size used when fitting the model
n_clusters = 5        # number of clusters requested from the clustering and the evaluation
vector_size = 32      # size of the learned representation; candidate values: [2, 3, 4, 8, 16, 32, 64, 128]

clustering_method = "agglomerative" # one of ["k_means", "agglomerative"]

### Event Logs

In [7]:
# Group the event log files found in `event_log_path` by the noise level that is
# encoded in their file name (e.g. '...-0.3-1.json.gz' -> noise level 0.3).
# `event_logs` maps each noise level 0.0, 0.1, ..., 1.0 to a list of file names.
event_logs = {}
for i in range(11):
    noise = i / 10.0
    event_logs[noise] = []

# prepare all event log files
pattern = r'-(\d+\.\d+)'
# os.listdir returns entries in arbitrary order; sort them so that the per-noise
# file lists (and everything derived from them) are reproducible across runs
for file in tqdm(sorted(os.listdir(event_log_path)), unit='eventlog'):
    if not os.path.isfile(os.path.join(event_log_path, file)):
        continue  # skip sub-directories

    match = re.search(pattern, file)
    if not match:
        continue  # file name carries no noise level -> not one of our event logs

    noise = float(match.group(1))
    if noise not in event_logs:
        # fail with a message that names the offending file instead of a bare KeyError
        raise ValueError(
            f"Unexpected noise level {noise} in file name '{file}'; "
            f"expected one of {sorted(event_logs)}"
        )
    event_logs[noise].append(file)

print(event_logs.keys())
print("First three:", event_logs[0.0][0:3])

100%|██████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 1227.09eventlog/s]

dict_keys([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
First three: ['small_500_10_20_5_1_1-0.0-1.json.gz']





## Autoencoder

In [8]:
# Autoencoder experiment: for every event log of every noise level, train an
# autoencoder, cluster the learned case representations and evaluate the clustering.
# `results_autoencoder` maps each noise level to a list with one metrics dict per
# event log (keys: f1_bcubed, f_score, fitness, precision, simplicity).
# The resulting statistics are also kept in the spreadsheet linked in the README.
results_autoencoder = {}
for i in range(11):
    noise = i / 10.0
    results_autoencoder[noise] = []

for noise in tqdm(event_logs.keys(), unit="noise_level"):
    for file in tqdm(event_logs[noise], unit='event_log'):
        # load file
        event_log = EventLog(file, case_attributes=case_attributes, event_attributes=event_attributes, true_cluster_label=true_cluster_label)
        # os.path.join works whether or not event_log_path ends with a separator
        event_log.load(os.path.join(event_log_path, file), False)
        event_log.preprocess()

        # get sequences from event log as one-hot feature vector
        sequences = event_log.event_attributes_flat_onehot_features_2d

        # init and train autoencoder
        autoencoder = AutoencoderRepresentation(event_log)
        autoencoder.build_model(input_dim=sequences.shape[1], encoder_dim=vector_size)
        autoencoder.fit(batch_size=n_batch_size, epochs=n_epochs, verbose=True)

        # get feature vector (hidden representation)
        feature_vector = autoencoder.predict()

        # cluster feature vector (cluster all case embeddings into n_clusters)
        cluster_analysis = Clustering(event_log)
        cluster_analysis.cluster(feature_vector, clustering_method, n_clusters, 'cosine')

        # evaluate clustered embeddings (by comparing it to event_log 'cluster' entry)
        cluster_result = cluster_analysis.evaluate() # 1. Metric (F1-BCubed)

        evaluation = Evaluation(event_log)
        (fitness, precision, simplicity) = evaluation.evaluate_clusters(n_clusters, cluster_analysis.pred_labels) # Heuristics Miner + 2. Metric

        # idea: fitness <=> recall & precision <=> precision
        # The harmonic mean is undefined when both values are 0; report 0.0 in that
        # case instead of failing with a division by zero (or producing NaN).
        denominator = fitness + precision
        f_score = 2 * (fitness * precision) / denominator if denominator else 0.0

        # setdefault keeps this working for any noise level present in event_logs
        results_autoencoder.setdefault(noise, []).append({"f1_bcubed":cluster_result[2], "f_score":f_score, "fitness": fitness, "precision":precision, "simplicity":simplicity})

  0%|                                                                                  | 0/11 [00:00<?, ?noise_level/s]


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25



100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:06<00:00,  6.62s/event_log][A
  9%|██████▋                                                                   | 1/11 [00:06<01:06,  6.63s/noise_level]


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25



100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:07<00:00,  7.28s/event_log][A
 18%|█████████████▍                                                            | 2/11 [00:13<01:03,  7.03s/noise_level]


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25



100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:08<00:00,  8.87s/event_log][A
 27%|████████████████████▏                                                     | 3/11 [00:22<01:03,  7.88s/noise_level]


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25



100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:09<00:00,  9.97s/event_log][A
 36%|██████████████████████████▉                                               | 4/11 [00:32<01:00,  8.71s/noise_level]


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25



100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:11<00:00, 11.11s/event_log][A
 45%|█████████████████████████████████▋                                        | 5/11 [00:43<00:57,  9.58s/noise_level]


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25



100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:12<00:00, 12.57s/event_log][A
 55%|████████████████████████████████████████▎                                 | 6/11 [00:56<00:53, 10.60s/noise_level]


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25



100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:11<00:00, 11.85s/event_log][A
 64%|███████████████████████████████████████████████                           | 7/11 [01:08<00:44, 11.02s/noise_level]


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25



100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:13<00:00, 13.97s/event_log][A
 73%|█████████████████████████████████████████████████████▊                    | 8/11 [01:22<00:35, 11.96s/noise_level]


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25



100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:14<00:00, 14.04s/event_log][A
 82%|████████████████████████████████████████████████████████████▌             | 9/11 [01:36<00:25, 12.62s/noise_level]


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25



100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:14<00:00, 14.08s/event_log][A
 91%|██████████████████████████████████████████████████████████████████▎      | 10/11 [01:50<00:13, 13.08s/noise_level]


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25



100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:15<00:00, 15.56s/event_log][A
100%|█████████████████████████████████████████████████████████████████████████| 11/11 [02:06<00:00, 11.47s/noise_level]


In [9]:
# Statistics for Autoencoder
# Prints summary statistics (pandas `describe`) of the collected metrics, first
# over all noise levels together and then separately for each noise level.

# complete: gather the result records of all noise levels into one DataFrame
complete = []
for i in range(11):
    complete += results_autoencoder[i/10]
complete = pd.DataFrame(complete)
# describe() raises on a frame without columns, so handle the "no results" case explicitly
if complete.empty:
    print("no results")
else:
    print(complete.describe())

print("---")

# noise: one summary per noise level
for i in range(11):
    print(f"noise: {i/10}")
    noise_results = pd.DataFrame(results_autoencoder[i/10])
    if noise_results.empty:
        # no event log was evaluated for this noise level
        print("no results")
    else:
        print(noise_results.describe())
    print("---")

       f1_bcubed    f_score    fitness  precision  simplicity
count  11.000000  11.000000  11.000000  11.000000   11.000000
mean    0.637349   0.621429   0.734415   0.553133    0.737917
std     0.227838   0.229868   0.187193   0.231761    0.145319
min     0.366397   0.095874   0.328295   0.056134    0.422000
25%     0.407210   0.497597   0.666505   0.408160    0.713954
50%     0.697928   0.724865   0.806548   0.658205    0.803832
75%     0.839746   0.776864   0.850255   0.718304    0.820306
max     0.901997   0.860843   0.928573   0.802322    0.880000
---
noise: 0.0
       f1_bcubed  f_score  fitness  precision  simplicity
count   1.000000  1.00000     1.00   1.000000        1.00
mean    0.896061  0.79904     0.88   0.731721        0.88
std          NaN      NaN      NaN        NaN         NaN
min     0.896061  0.79904     0.88   0.731721        0.88
25%     0.896061  0.79904     0.88   0.731721        0.88
50%     0.896061  0.79904     0.88   0.731721        0.88
75%     0.896061  0.7