# Learning of Process Representations Using Recurrent Neural Networks

In [1]:
import numpy as np
import pandas as pd

from replearn.eventlog import EventLog

from replearn.embedding_predict import EmbeddingPredict
from replearn.autoencoder import AutoencoderRepresentation
from replearn.doc2vec import Doc2VecRepresentation

from replearn.clustering import Clustering

from replearn.evaluation import Evaluation

# !pip install levenshtein
from Levenshtein import distance as led
from tqdm import tqdm
import os
import re

### General Parameters

In [2]:
# Shared configuration used by every experiment cell in this notebook.

# --- input data ---
event_log_path = '../logs/'                    # directory that holds the event log files
case_attributes = None                         # None -> auto-detect attributes
event_attributes = ['concept:name', 'user']    # activity name and user
true_cluster_label = 'cluster'                 # handed to EventLog as true_cluster_label

# --- representation learning ---
n_epochs, n_batch_size = 25, 64                # n_epochs candidates: 10, 25
vector_size = 32                               # candidates: 2, 3, 4, 8, 16, 32, 64, 128

# --- clustering ---
n_clusters = 5
clustering_method = "agglomerative"            # one of "k_means", "agglomerative"

### Event Logs

In [3]:
# Group the event log files found in `event_log_path` by the noise level that
# is encoded in their file name (e.g. 'small_500_10_20_5_1_1-0.0-1.json.gz').

# one bucket per expected noise level: 0.0, 0.1, ..., 1.0
event_logs = {i / 10.0: [] for i in range(11)}

# the first '-<digits>.<digits>' token of the file name is read as the noise level
pattern = r'-(\d+\.\d+)'

# os.listdir() returns entries in arbitrary order; sorting keeps the per-level
# file lists (and the "First three" output below) reproducible across runs
for file in tqdm(sorted(os.listdir(event_log_path)), unit='eventlog'):
    if not os.path.isfile(os.path.join(event_log_path, file)):
        continue  # ignore sub-directories

    match = re.search(pattern, file)
    if not match:
        continue  # file name carries no noise level

    noise = float(match.group(1))
    if noise not in event_logs:
        # The experiment cells only create result lists for the eleven levels
        # above, so an unexpected level is reported and skipped instead of
        # failing with a KeyError.
        print(f"Skipping {file}: unexpected noise level {noise}")
        continue

    event_logs[noise].append(file)

print(event_logs.keys())
print("First three:", event_logs[0.0][0:3])

100%|██████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 2268.19eventlog/s]

dict_keys([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
First three: ['small_500_10_20_5_1_1-0.0-1.json.gz']





## Trace2Vec

In [4]:
# Trace2Vec experiment: for every event log of every noise level, train a
# Doc2Vec representation without case or event attributes, cluster the inferred
# vectors and record the evaluation metrics.
# The resulting statistics are kept in the Google Docs spreadsheet linked in the
# README and are used to build the presentation graphs.

# one result list per noise level: 0.0, 0.1, ..., 1.0
results_trace2vec = {i / 10.0: [] for i in range(11)}

for noise in tqdm(event_logs.keys(), unit="noise_level"):
    for file in tqdm(event_logs[noise], unit='event_log'):
        # load file
        event_log = EventLog(file, case_attributes=case_attributes, event_attributes=event_attributes, true_cluster_label=true_cluster_label)
        event_log.load(os.path.join(event_log_path, file), False)
        event_log.preprocess()

        # get sequences from event log as one-hot feature vector
        # NOTE(review): `sequences` is not read again in this cell; the access is
        # kept in case the attribute has side effects - confirm and remove.
        sequences = event_log.event_attributes_flat_onehot_features_2d

        doc2vec = Doc2VecRepresentation(event_log)
        doc2vec.build_model(append_case_attr=False, append_event_attr=False, vector_size=vector_size, concat=True, epochs=n_epochs)
        doc2vec.fit()

        # infer the vector from the model
        feature_vector = doc2vec.predict(epochs=50)

        # 'cosine' is presumably the distance metric - confirm against Clustering.cluster
        cluster_analysis = Clustering(event_log)
        cluster_analysis.cluster(feature_vector, clustering_method, n_clusters, 'cosine')

        cluster_result = cluster_analysis.evaluate()
        evaluation = Evaluation(event_log)
        (fitness, precision, simplicity) = evaluation.evaluate_clusters(n_clusters, cluster_analysis.pred_labels) # Heuristics Miner + 2. Metric

        # harmonic mean, idea: fitness <=> recall & precision <=> precision;
        # defined as 0.0 when both are zero so one degenerate clustering
        # cannot abort the whole run with a division by zero
        denominator = fitness + precision
        f_score = 2 * (fitness * precision) / denominator if denominator else 0.0

        # the third element of the clustering evaluation is stored as "f1_bcubed"
        results_trace2vec[noise].append({"f1_bcubed":cluster_result[2], "f_score":f_score, "fitness": fitness, "precision":precision, "simplicity":simplicity})

  0%|                                                                                  | 0/11 [00:00<?, ?noise_level/s]
  0%|                                                                                     | 0/1 [00:00<?, ?event_log/s][A
100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.29s/event_log][A
  9%|██████▋                                                                   | 1/11 [00:02<00:23,  2.30s/noise_level]
  0%|                                                                                     | 0/1 [00:00<?, ?event_log/s][A
100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.92s/event_log][A
 18%|█████████████▍                                                            | 2/11 [00:04<00:18,  2.09s/noise_level]
  0%|                                                                                     | 0/1 [00:00<?, ?event_log/s][A
100%|████████████████████

In [5]:
# Statistics: summary of the Trace2Vec metrics, pooled and per noise level

noise_levels = [i / 10 for i in range(11)]

# complete - the runs of all noise levels in one frame
complete = pd.DataFrame(
    [run for noise in noise_levels for run in results_trace2vec[noise]]
)
# describe() raises a ValueError on a frame without columns, which is what an
# empty result list produces - report that instead of crashing
if complete.empty:
    print("no results")
else:
    print(complete.describe())

print("---")

# noise - one summary per noise level
for noise in noise_levels:
    print(f"noise: {noise}")
    runs = results_trace2vec[noise]
    if runs:
        print(pd.DataFrame(runs).describe())
    else:
        print("no results for this noise level")
    print("---")

       f1_bcubed    f_score    fitness  precision  simplicity
count  11.000000  11.000000  11.000000  11.000000   11.000000
mean    0.895001   0.664172   0.724493   0.616018    0.712083
std     0.002212   0.192996   0.181781   0.198006    0.122266
min     0.888960   0.360574   0.430563   0.303143    0.517412
25%     0.894778   0.562183   0.628562   0.504756    0.633898
50%     0.896061   0.761703   0.806283   0.733319    0.751487
75%     0.896061   0.778172   0.824067   0.738465    0.776049
max     0.896467   0.810564   0.906000   0.749821    0.906000
---
noise: 0.0
       f1_bcubed   f_score  fitness  precision  simplicity
count   1.000000  1.000000    1.000   1.000000       1.000
mean    0.896061  0.810564    0.906   0.733319       0.906
std          NaN       NaN      NaN        NaN         NaN
min     0.896061  0.810564    0.906   0.733319       0.906
25%     0.896061  0.810564    0.906   0.733319       0.906
50%     0.896061  0.810564    0.906   0.733319       0.906
75%     0.8960

## Case2vec (event)
   

In [6]:
# Case2vec (event) experiment: for every event log of every noise level, train
# a Doc2Vec representation with event attributes appended (no case attributes),
# cluster the inferred vectors and record the evaluation metrics.
# The resulting statistics are kept in the Google Docs spreadsheet linked in the
# README and are used to build the presentation graphs.

# one result list per noise level: 0.0, 0.1, ..., 1.0
results_case2vec_event = {i / 10.0: [] for i in range(11)}

for noise in tqdm(event_logs.keys(), unit="noise_level"):
    for file in tqdm(event_logs[noise], unit='event_log'):
        # load file
        event_log = EventLog(file, case_attributes=case_attributes, event_attributes=event_attributes, true_cluster_label=true_cluster_label)
        event_log.load(os.path.join(event_log_path, file), False)
        event_log.preprocess()

        # get sequences from event log as one-hot feature vector
        # NOTE(review): `sequences` is not read again in this cell; the access is
        # kept in case the attribute has side effects - confirm and remove.
        sequences = event_log.event_attributes_flat_onehot_features_2d

        doc2vec = Doc2VecRepresentation(event_log)
        doc2vec.build_model(append_case_attr=False, append_event_attr=True, vector_size=vector_size, concat=True, epochs=n_epochs)
        doc2vec.fit()

        # infer the vector from the model
        feature_vector = doc2vec.predict(epochs=50)

        # 'cosine' is presumably the distance metric - confirm against Clustering.cluster
        cluster_analysis = Clustering(event_log)
        cluster_analysis.cluster(feature_vector, clustering_method, n_clusters, 'cosine')

        cluster_result = cluster_analysis.evaluate()
        evaluation = Evaluation(event_log)
        (fitness, precision, simplicity) = evaluation.evaluate_clusters(n_clusters, cluster_analysis.pred_labels) # Heuristics Miner + 2. Metric

        # harmonic mean, idea: fitness <=> recall & precision <=> precision;
        # defined as 0.0 when both are zero so one degenerate clustering
        # cannot abort the whole run with a division by zero
        denominator = fitness + precision
        f_score = 2 * (fitness * precision) / denominator if denominator else 0.0

        # the third element of the clustering evaluation is stored as "f1_bcubed"
        results_case2vec_event[noise].append({"f1_bcubed":cluster_result[2], "f_score":f_score, "fitness": fitness, "precision":precision, "simplicity":simplicity})

  0%|                                                                                  | 0/11 [00:00<?, ?noise_level/s]
  0%|                                                                                     | 0/1 [00:00<?, ?event_log/s][A
100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.83s/event_log][A
  9%|██████▋                                                                   | 1/11 [00:01<00:18,  1.84s/noise_level]
  0%|                                                                                     | 0/1 [00:00<?, ?event_log/s][A
100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.96s/event_log][A
 18%|█████████████▍                                                            | 2/11 [00:03<00:17,  1.92s/noise_level]
  0%|                                                                                     | 0/1 [00:00<?, ?event_log/s][A
100%|████████████████████

In [7]:
# Statistics: summary of the Case2vec (event) metrics, pooled and per noise level

noise_levels = [i / 10 for i in range(11)]

# complete - the runs of all noise levels in one frame
complete = pd.DataFrame(
    [run for noise in noise_levels for run in results_case2vec_event[noise]]
)
# describe() raises a ValueError on a frame without columns, which is what an
# empty result list produces - report that instead of crashing
if complete.empty:
    print("no results")
else:
    print(complete.describe())

print("---")

# noise - one summary per noise level
for noise in noise_levels:
    print(f"noise: {noise}")
    runs = results_case2vec_event[noise]
    if runs:
        print(pd.DataFrame(runs).describe())
    else:
        print("no results for this noise level")
    print("---")

       f1_bcubed    f_score    fitness  precision  simplicity
count  11.000000  11.000000  11.000000  11.000000   11.000000
mean    0.893404   0.780099   0.834658   0.732940    0.783374
std     0.003447   0.018063   0.034372   0.017797    0.046235
min     0.884865   0.742551   0.786472   0.682051    0.742653
25%     0.891696   0.771214   0.814213   0.732874    0.758817
50%     0.893851   0.782628   0.822214   0.735453    0.765412
75%     0.896061   0.789982   0.857787   0.741138    0.797214
max     0.896061   0.810564   0.906000   0.750984    0.906000
---
noise: 0.0
       f1_bcubed   f_score  fitness  precision  simplicity
count   1.000000  1.000000    1.000   1.000000       1.000
mean    0.896061  0.810564    0.906   0.733319       0.906
std          NaN       NaN      NaN        NaN         NaN
min     0.896061  0.810564    0.906   0.733319       0.906
25%     0.896061  0.810564    0.906   0.733319       0.906
50%     0.896061  0.810564    0.906   0.733319       0.906
75%     0.8960

## Case2vec (event + case)

In [8]:
# Case2vec (event+case) experiment: for every event log of every noise level,
# train a Doc2Vec representation with both case and event attributes appended,
# cluster the inferred vectors and record the evaluation metrics.
# The resulting statistics are kept in the Google Docs spreadsheet linked in the
# README and are used to build the presentation graphs.

# one result list per noise level: 0.0, 0.1, ..., 1.0
results_case2vec_event_case = {i / 10.0: [] for i in range(11)}

for noise in tqdm(event_logs.keys(), unit="noise_level"):
    for file in tqdm(event_logs[noise], unit='event_log'):
        # load file
        event_log = EventLog(file, case_attributes=case_attributes, event_attributes=event_attributes, true_cluster_label=true_cluster_label)
        event_log.load(os.path.join(event_log_path, file), False)
        event_log.preprocess()

        # get sequences from event log as one-hot feature vector
        # NOTE(review): `sequences` is not read again in this cell; the access is
        # kept in case the attribute has side effects - confirm and remove.
        sequences = event_log.event_attributes_flat_onehot_features_2d

        doc2vec = Doc2VecRepresentation(event_log)
        doc2vec.build_model(append_case_attr=True, append_event_attr=True, vector_size=vector_size, concat=True, epochs=n_epochs)
        doc2vec.fit()

        # infer the vector from the model
        feature_vector = doc2vec.predict(epochs=50)

        # Use the configured clustering_method (not a hard-coded name) so this
        # experiment follows the General Parameters setting.
        # 'cosine' is presumably the distance metric - confirm against Clustering.cluster
        cluster_analysis = Clustering(event_log)
        cluster_analysis.cluster(feature_vector, clustering_method, n_clusters, 'cosine')

        cluster_result = cluster_analysis.evaluate()
        evaluation = Evaluation(event_log)
        (fitness, precision, simplicity) = evaluation.evaluate_clusters(n_clusters, cluster_analysis.pred_labels) # Heuristics Miner + 2. Metric

        # harmonic mean, idea: fitness <=> recall & precision <=> precision;
        # defined as 0.0 when both are zero so one degenerate clustering
        # cannot abort the whole run with a division by zero
        denominator = fitness + precision
        f_score = 2 * (fitness * precision) / denominator if denominator else 0.0

        # the third element of the clustering evaluation is stored as "f1_bcubed"
        results_case2vec_event_case[noise].append({"f1_bcubed":cluster_result[2], "f_score":f_score, "fitness": fitness, "precision":precision, "simplicity":simplicity})

  0%|                                                                                  | 0/11 [00:00<?, ?noise_level/s]
  0%|                                                                                     | 0/1 [00:00<?, ?event_log/s][A
100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.26s/event_log][A
  9%|██████▋                                                                   | 1/11 [00:02<00:22,  2.28s/noise_level]
  0%|                                                                                     | 0/1 [00:00<?, ?event_log/s][A
100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.49s/event_log][A
 18%|█████████████▍                                                            | 2/11 [00:04<00:21,  2.41s/noise_level]
  0%|                                                                                     | 0/1 [00:00<?, ?event_log/s][A
100%|████████████████████

In [9]:
# Statistics (added by ourselves): summary of the Case2vec (event+case) metrics,
# pooled and per noise level

noise_levels = [i / 10 for i in range(11)]

# complete - the runs of all noise levels in one frame
complete = pd.DataFrame(
    [run for noise in noise_levels for run in results_case2vec_event_case[noise]]
)
# describe() raises a ValueError on a frame without columns, which is what an
# empty result list produces - report that instead of crashing
if complete.empty:
    print("no results")
else:
    print(complete.describe())

print("---")

# noise - one summary per noise level
for noise in noise_levels:
    print(f"noise: {noise}")
    runs = results_case2vec_event_case[noise]
    if runs:
        print(pd.DataFrame(runs).describe())
    else:
        print("no results for this noise level")
    print("---")

       f1_bcubed    f_score    fitness  precision  simplicity
count  11.000000  11.000000  11.000000  11.000000   11.000000
mean    0.471022   0.266240   0.394146   0.216064    0.443401
std     0.055325   0.272956   0.316619   0.245014    0.305634
min     0.407086   0.007675   0.028000   0.004447    0.032000
25%     0.419993   0.058769   0.129700   0.037995    0.181000
50%     0.458697   0.144237   0.291988   0.088720    0.426000
75%     0.522048   0.498724   0.677313   0.404368    0.703706
max     0.545067   0.718850   0.844571   0.664356    0.862000
---
noise: 0.0
       f1_bcubed   f_score   fitness  precision  simplicity
count   1.000000  1.000000  1.000000   1.000000       1.000
mean    0.410871  0.478921  0.844571   0.334222       0.862
std          NaN       NaN       NaN        NaN         NaN
min     0.410871  0.478921  0.844571   0.334222       0.862
25%     0.410871  0.478921  0.844571   0.334222       0.862
50%     0.410871  0.478921  0.844571   0.334222       0.862
75%    