# Learning of Process Representations Using Recurrent Neural Networks

In [11]:
# added more imports like tqdm, evaluation
import numpy as np
import pandas as pd

from replearn.eventlog import EventLog

from replearn.embedding_predict import EmbeddingPredict
from replearn.autoencoder import AutoencoderRepresentation
from replearn.doc2vec import Doc2VecRepresentation

from replearn.clustering import Clustering
from replearn.evaluation import Evaluation

from tqdm import tqdm
import os
import re

## General Parameters

In [12]:
# --- Input data -------------------------------------------------------------
event_log_path = "../logs/"
case_attributes = None  # None -> attributes are auto-detected
event_attributes = ["concept:name", "user"]  # activity name and user
true_cluster_label = "cluster"

# --- Training ---------------------------------------------------------------
n_epochs = 25  # candidate values: 10, 25
n_batch_size = 64
vector_size = 32  # candidate values: 2, 3, 4, 8, 16, 32, 64, 128

# --- Clustering -------------------------------------------------------------
n_clusters = 5
clustering_method = "agglomerative"  # one of: "k_means", "agglomerative"

## Load event log

In [13]:
# Collect the event log file names found in `event_log_path`, grouped by the
# noise level encoded in each file name.

# One (initially empty) bucket per expected noise level: 0.0, 0.1, ..., 1.0.
event_logs = {i / 10.0: [] for i in range(11)}

# The noise level is the first "-<digits>.<digits>" group in the file name,
# e.g. "small_500_10_20_5_1_1-0.0-1.json.gz" -> 0.0.
pattern = r'-(\d+\.\d+)'

# os.listdir() returns entries in arbitrary order; sorting keeps the order in
# which logs are processed (and reported) reproducible across runs.
for file in tqdm(sorted(os.listdir(event_log_path)), unit='eventlog'):
    if not os.path.isfile(os.path.join(event_log_path, file)):
        continue  # ignore sub-directories
    match = re.search(pattern, file)
    if not match:
        continue  # file name carries no noise level
    noise = float(match.group(1))
    if noise not in event_logs:
        # An unexpected level used to abort the cell with a bare KeyError;
        # report the file and keep going instead.
        print(f"Skipping {file!r}: unexpected noise level {noise}")
        continue
    event_logs[noise].append(file)

print(event_logs.keys())
print("First three:", event_logs[0.0][0:3])

100%|██████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 3674.23eventlog/s]

dict_keys([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
First three: ['small_500_10_20_5_1_1-0.0-1.json.gz']





## Representation Learning

In [14]:
# Hyperparameters read by the training cells below.
# NOTE: these names are already assigned in the "General Parameters" cell;
# n_epochs = 10 here replaces the 25 set there, the other values are unchanged.
vector_size = 32

n_clusters = 5
n_batch_size = 64
n_epochs = 10

### GRU

In [15]:
# GRUClust: for every event log at every noise level, train a GRU-based
# EmbeddingPredict model, cluster the resulting feature vectors and record the
# evaluation metrics (BCubed F1, f-score, fitness, precision, simplicity).
# The statistics are also kept in the Google Docs spreadsheet linked in the
# README so they can be reused for the presentation.

# One result list per noise level, keyed exactly like `event_logs`.
results_gruclust = {noise: [] for noise in event_logs}

for noise in tqdm(event_logs.keys(), unit="noise_level"):
    for file in tqdm(event_logs[noise], unit='event_log'):
        # load file
        event_log = EventLog(file, case_attributes=case_attributes, event_attributes=event_attributes, true_cluster_label=true_cluster_label)
        # os.path.join works whether or not event_log_path ends with a separator
        event_log.load(os.path.join(event_log_path, file), False)
        event_log.preprocess()

        # get sequences from event log as one-hot feature vector
        # NOTE(review): `sequences` is not used further in this cell; kept in case
        # the attribute access or the name is relied on elsewhere — TODO confirm.
        sequences = event_log.event_attributes_flat_onehot_features_2d

        predictor = EmbeddingPredict(event_log)
        predictor.build_model(embedding_dim=vector_size, gru_dim=vector_size, rnn='gru')
        predictor.fit(epochs=n_epochs, batch_size=n_batch_size, verbose=True)

        # infer the vector from the model
        pred_model, feature_vector, embedding_vector = predictor.predict()

        cluster_analysis = Clustering(event_log)
        cluster_analysis.cluster(feature_vector, clustering_method, n_clusters, 'cosine')

        cluster_result = cluster_analysis.evaluate()
        evaluation = Evaluation(event_log)
        (fitness, precision, simplicity) = evaluation.evaluate_clusters(n_clusters, cluster_analysis.pred_labels) # Heuristics Miner + 2. Metric

        # f-score = harmonic mean, treating fitness as recall and precision as precision.
        # Guard fitness + precision == 0: the unguarded division would abort the
        # whole run after minutes of training.
        denominator = fitness + precision
        f_score = 2 * (fitness * precision) / denominator if denominator else 0.0

        # element 2 of the clustering evaluation is what gets reported as BCubed F1
        results_gruclust[noise].append({"f1_bcubed":cluster_result[2], "f_score":f_score, "fitness": fitness, "precision":precision, "simplicity":simplicity})

  0%|                                                                                  | 0/11 [00:00<?, ?noise_level/s]
  0%|                                                                                     | 0/1 [00:00<?, ?event_log/s][A

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:19<00:00, 19.60s/event_log][A
  9%|██████▋                                                                   | 1/11 [00:19<03:16, 19.61s/noise_level]
  0%|                                                                                     | 0/1 [00:00<?, ?event_log/s][A

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:19<00:00, 19.68s/event_log][A
 18%|█████████████▍                                                            | 2/11 [00:39<02:56, 19.66s/noise_level]
  0%|                                                                                     | 0/1 [00:00<?, ?event_log/s][A

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:18<00:00, 18.78s/event_log][A
 27%|████████████████████▏                                                     | 3/11 [00:58<02:34, 19.27s/noise_level]
  0%|                                                                                     | 0/1 [00:00<?, ?event_log/s][A

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:18<00:00, 18.62s/event_log][A
 36%|██████████████████████████▉                                               | 4/11 [01:16<02:13, 19.02s/noise_level]
  0%|                                                                                     | 0/1 [00:00<?, ?event_log/s][A

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:19<00:00, 19.21s/event_log][A
 45%|█████████████████████████████████▋                                        | 5/11 [01:35<01:54, 19.09s/noise_level]
  0%|                                                                                     | 0/1 [00:00<?, ?event_log/s][A

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:15<00:00, 15.17s/event_log][A
 55%|████████████████████████████████████████▎                                 | 6/11 [01:51<01:28, 17.77s/noise_level]
  0%|                                                                                     | 0/1 [00:00<?, ?event_log/s][A

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:23<00:00, 23.05s/event_log][A
 64%|███████████████████████████████████████████████                           | 7/11 [02:14<01:18, 19.50s/noise_level]
  0%|                                                                                     | 0/1 [00:00<?, ?event_log/s][A

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:21<00:00, 21.12s/event_log][A
 73%|█████████████████████████████████████████████████████▊                    | 8/11 [02:35<01:00, 20.02s/noise_level]
  0%|                                                                                     | 0/1 [00:00<?, ?event_log/s][A

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:21<00:00, 21.09s/event_log][A
 82%|████████████████████████████████████████████████████████████▌             | 9/11 [02:56<00:40, 20.36s/noise_level]
  0%|                                                                                     | 0/1 [00:00<?, ?event_log/s][A

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:20<00:00, 20.75s/event_log][A
 91%|██████████████████████████████████████████████████████████████████▎      | 10/11 [03:17<00:20, 20.49s/noise_level]
  0%|                                                                                     | 0/1 [00:00<?, ?event_log/s][A

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:20<00:00, 20.17s/event_log][A
100%|█████████████████████████████████████████████████████████████████████████| 11/11 [03:37<00:00, 19.77s/noise_level]


In [18]:
# Statistics GRU
# Prints summary statistics of the GRU results: first over all noise levels
# combined, then separately for each noise level 0.0 ... 1.0.

# complete: gather the per-log result records of every noise level
all_records = []
for i in range(11):
    all_records += results_gruclust[i/10]
complete = pd.DataFrame(all_records)
# describe() raises ValueError on a frame without columns, so guard the
# case where no event log produced a result.
if complete.empty:
    print("no results")
else:
    print(complete.describe())

print("---")

# noise: one summary per noise level
for i in range(11):
    print(f"noise: {i/10}")
    level_records = results_gruclust[i/10]
    if level_records:
        print(pd.DataFrame(level_records).describe())
    else:
        # a noise level without any event log file has nothing to summarise
        print("no results for this noise level")
    print("---")

       f1_bcubed    f_score    fitness  precision  simplicity
count  11.000000  11.000000  11.000000  11.000000   11.000000
mean    0.924485   0.648969   0.707216   0.604185    0.644677
std     0.037669   0.090937   0.084260   0.108425    0.084689
min     0.854812   0.488746   0.532383   0.451721    0.484000
25%     0.906433   0.580065   0.665202   0.488733    0.608873
50%     0.919701   0.667821   0.723871   0.634864    0.636857
75%     0.949292   0.720178   0.767371   0.699338    0.698663
max     0.978967   0.762060   0.806000   0.725518    0.806000
---
noise: 0.0
       f1_bcubed   f_score  fitness  precision  simplicity
count   1.000000  1.000000    1.000    1.00000       1.000
mean    0.854812  0.611144    0.806    0.49216       0.806
std          NaN       NaN      NaN        NaN         NaN
min     0.854812  0.611144    0.806    0.49216       0.806
25%     0.854812  0.611144    0.806    0.49216       0.806
50%     0.854812  0.611144    0.806    0.49216       0.806
75%     0.8548

### LSTM

In [19]:
# LSTMClust: for every event log at every noise level, train an LSTM-based
# EmbeddingPredict model, cluster the resulting feature vectors and record the
# evaluation metrics (BCubed F1, f-score, fitness, precision, simplicity).
# The statistics are also kept in the Google Docs spreadsheet linked in the
# README so they can be reused for the presentation.

# One result list per noise level, keyed exactly like `event_logs`.
results_lstmclust = {noise: [] for noise in event_logs}

for noise in tqdm(event_logs.keys(), unit="noise_level"):
    for file in tqdm(event_logs[noise], unit='event_log'):
        # load file
        event_log = EventLog(file, case_attributes=case_attributes, event_attributes=event_attributes, true_cluster_label=true_cluster_label)
        # os.path.join works whether or not event_log_path ends with a separator
        event_log.load(os.path.join(event_log_path, file), False)
        event_log.preprocess()

        # get sequences from event log as one-hot feature vector
        # NOTE(review): `sequences` is not used further in this cell; kept in case
        # the attribute access or the name is relied on elsewhere — TODO confirm.
        sequences = event_log.event_attributes_flat_onehot_features_2d

        predictor = EmbeddingPredict(event_log)
        # the recurrent layer size is passed via the `gru_dim` keyword here as well
        predictor.build_model(embedding_dim=vector_size, gru_dim=vector_size, rnn='LSTM')
        predictor.fit(epochs=n_epochs, batch_size=n_batch_size, verbose=True)

        # infer the vector from the model
        pred_model, feature_vector, embedding_vector = predictor.predict()

        cluster_analysis = Clustering(event_log)
        cluster_analysis.cluster(feature_vector, clustering_method, n_clusters, 'cosine')

        cluster_result = cluster_analysis.evaluate()
        evaluation = Evaluation(event_log)
        (fitness, precision, simplicity) = evaluation.evaluate_clusters(n_clusters, cluster_analysis.pred_labels) # Heuristics Miner + 2. Metric

        # f-score = harmonic mean, treating fitness as recall and precision as precision.
        # Guard fitness + precision == 0: the unguarded division would abort the
        # whole run after minutes of training.
        denominator = fitness + precision
        f_score = 2 * (fitness * precision) / denominator if denominator else 0.0

        # element 2 of the clustering evaluation is what gets reported as BCubed F1
        results_lstmclust[noise].append({"f1_bcubed":cluster_result[2], "f_score":f_score, "fitness": fitness, "precision":precision, "simplicity":simplicity})

  0%|                                                                                  | 0/11 [00:00<?, ?noise_level/s]
  0%|                                                                                     | 0/1 [00:00<?, ?event_log/s][A

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:18<00:00, 18.16s/event_log][A
  9%|██████▋                                                                   | 1/11 [00:18<03:01, 18.18s/noise_level]
  0%|                                                                                     | 0/1 [00:00<?, ?event_log/s][A

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:20<00:00, 20.78s/event_log][A
 18%|█████████████▍                                                            | 2/11 [00:38<02:57, 19.71s/noise_level]
  0%|                                                                                     | 0/1 [00:00<?, ?event_log/s][A

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:19<00:00, 19.58s/event_log][A
 27%|████████████████████▏                                                     | 3/11 [00:58<02:37, 19.66s/noise_level]
  0%|                                                                                     | 0/1 [00:00<?, ?event_log/s][A

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:20<00:00, 20.69s/event_log][A
 36%|██████████████████████████▉                                               | 4/11 [01:19<02:20, 20.07s/noise_level]
  0%|                                                                                     | 0/1 [00:00<?, ?event_log/s][A

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:20<00:00, 20.89s/event_log][A
 45%|█████████████████████████████████▋                                        | 5/11 [01:40<02:02, 20.38s/noise_level]
  0%|                                                                                     | 0/1 [00:00<?, ?event_log/s][A

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:20<00:00, 20.72s/event_log][A
 55%|████████████████████████████████████████▎                                 | 6/11 [02:00<01:42, 20.50s/noise_level]
  0%|                                                                                     | 0/1 [00:00<?, ?event_log/s][A

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:20<00:00, 20.25s/event_log][A
 64%|███████████████████████████████████████████████                           | 7/11 [02:21<01:21, 20.43s/noise_level]
  0%|                                                                                     | 0/1 [00:00<?, ?event_log/s][A

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:21<00:00, 21.56s/event_log][A
 73%|█████████████████████████████████████████████████████▊                    | 8/11 [02:42<01:02, 20.79s/noise_level]
  0%|                                                                                     | 0/1 [00:00<?, ?event_log/s][A

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:20<00:00, 20.63s/event_log][A
 82%|████████████████████████████████████████████████████████████▌             | 9/11 [03:03<00:41, 20.75s/noise_level]
  0%|                                                                                     | 0/1 [00:00<?, ?event_log/s][A

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:20<00:00, 20.37s/event_log][A
 91%|██████████████████████████████████████████████████████████████████▎      | 10/11 [03:23<00:20, 20.64s/noise_level]
  0%|                                                                                     | 0/1 [00:00<?, ?event_log/s][A

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:20<00:00, 20.89s/event_log][A
100%|█████████████████████████████████████████████████████████████████████████| 11/11 [03:44<00:00, 20.43s/noise_level]


In [None]:
# Statistics LSTM
# Prints summary statistics of the LSTM results: first over all noise levels
# combined, then separately for each noise level 0.0 ... 1.0.

# complete: gather the per-log result records of every noise level
all_records = []
for i in range(11):
    all_records += results_lstmclust[i/10]
complete = pd.DataFrame(all_records)
# describe() raises ValueError on a frame without columns, so guard the
# case where no event log produced a result.
if complete.empty:
    print("no results")
else:
    print(complete.describe())

print("---")

# noise: one summary per noise level
for i in range(11):
    print(f"noise: {i/10}")
    level_records = results_lstmclust[i/10]
    if level_records:
        print(pd.DataFrame(level_records).describe())
    else:
        # a noise level without any event log file has nothing to summarise
        print("no results for this noise level")
    print("---")