In [1]:
import numpy as np
import pandas as pd

from replearn.eventlog import EventLog

from replearn.embedding_predict import EmbeddingPredict
from replearn.autoencoder import AutoencoderRepresentation
from replearn.doc2vec import Doc2VecRepresentation

from replearn.clustering import Clustering

from replearn.evaluation import Evaluation

# !pip install levenshtein
from Levenshtein import distance as led
from tqdm import tqdm
import os
import re

### General Parameters

In [2]:
# Central configuration for the whole notebook; alternatives that were
# considered for a setting are noted in brackets next to it.

# --- event log input ---
event_log_path = '../logs/'
true_cluster_label = 'cluster'
case_attributes = None  # auto-detect attributes
event_attributes = ['concept:name', 'user']  # use activity name and user

# --- hyper-parameters ---
vector_size = 32      # [2, 3, 4, 8, 16, 32, 64, 128]
n_epochs = 25         # [10, 25]
n_batch_size = 64

# --- clustering ---
n_clusters = 5
clustering_method = "agglomerative"  # ["k_means", "agglomerative"]

### Event Logs

In [3]:
# noise event logs - self implemented
# One bucket per expected noise level: 0.0, 0.1, ..., 1.0.
event_logs = {i / 10.0: [] for i in range(11)}

# prepare all event log files
# The noise level is read from the first "-<int>.<frac>" token of the file name,
# e.g. "small_500_10_20_5_1_1-0.0-1.json.gz" -> 0.0.
pattern = r'-(\d+\.\d+)'
skipped_files = []  # files whose noise level is not one of the predefined buckets

# os.listdir returns entries in arbitrary order; sorting keeps the bucket
# contents (and therefore the order of later results) reproducible.
for file in tqdm(sorted(os.listdir(event_log_path)), unit='eventlog'):
    if not os.path.isfile(os.path.join(event_log_path, file)):
        continue
    match = re.search(pattern, file)
    if not match:
        continue
    noise = float(match.group(1))
    if noise in event_logs:
        event_logs[noise].append(file)
    else:
        # Previously this case raised a KeyError; report the file instead of crashing.
        skipped_files.append(file)

if skipped_files:
    print("Skipped (unexpected noise level):", skipped_files)

print(event_logs.keys())
print("First three:", event_logs[0.0][0:3])

100%|██████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 2202.79eventlog/s]

dict_keys([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
First three: ['small_500_10_20_5_1_1-0.0-1.json.gz']





## Bag of Activities (BOA)

In [4]:
# completely self-implemented approach by us for BOA (not given in paper nor code)
# For every event log: load it, build one feature vector per case, cluster the
# cases with the cosine metric and store the evaluation scores per noise level.

# One result list per noise level, keyed exactly like event_logs.
results_boa = {noise: [] for noise in event_logs}

for noise in tqdm(event_logs.keys(), unit="noise_level"):
    for file in tqdm(event_logs[noise], unit='event_log'):
        # load file
        event_log = EventLog(file, case_attributes=case_attributes, event_attributes=event_attributes, true_cluster_label=true_cluster_label)
        # os.path.join works whether or not event_log_path ends with a separator
        event_log.load(os.path.join(event_log_path, file), False)
        event_log.preprocess()

        # vector representation (BOA): one entry of the first event-attribute encoding per case
        # NOTE(review): reads the private attribute _event_attribute_encodes; index 0
        # presumably corresponds to 'concept:name' (first entry of event_attributes) - confirm.
        activities = event_log._event_attribute_encodes[0]
        feature_vector = np.array([activities[case_idx] for case_idx in range(len(event_log.case_lens))])

        # clustering
        cluster_analysis = Clustering(event_log)
        cluster_analysis.cluster(feature_vector, clustering_method, n_clusters, 'cosine')

        # evaluation
        cluster_result = cluster_analysis.evaluate()
        evaluation = Evaluation(event_log)
        (fitness, precision, simplicity) = evaluation.evaluate_clusters(n_clusters, cluster_analysis.pred_labels) # Heuristics Miner + 2. Metric

        # idea: fitness <=> recall & precision <=> precision
        # Harmonic mean of fitness and precision; defined as 0.0 when both are 0
        # so a degenerate clustering does not abort the run with a ZeroDivisionError.
        denominator = fitness + precision
        f_score = 0.0 if denominator == 0 else 2 * (fitness * precision) / denominator

        results_boa[noise].append({"f1_bcubed":cluster_result[2], "f_score":f_score, "fitness": fitness, "precision":precision, "simplicity":simplicity})

  0%|                                                                                  | 0/11 [00:00<?, ?noise_level/s]
  0%|                                                                                     | 0/1 [00:00<?, ?event_log/s][A
100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.21event_log/s][A
  9%|██████▋                                                                   | 1/11 [00:00<00:08,  1.18noise_level/s]
  0%|                                                                                     | 0/1 [00:00<?, ?event_log/s][A
100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.94event_log/s][A
 18%|█████████████▍                                                            | 2/11 [00:01<00:05,  1.51noise_level/s]
  0%|                                                                                     | 0/1 [00:00<?, ?event_log/s][A
100%|████████████████████

In [5]:
# Statistics (BOA)
# Prints summary statistics of the BOA scores: first pooled over all noise
# levels, then separately for each noise level.

# complete
complete = pd.DataFrame([row for rows in results_boa.values() for row in rows])
if complete.empty:
    # describe() raises ValueError on a frame without columns
    print("no results available")
else:
    print(complete.describe())

print("---")

# noise
for noise, rows in results_boa.items():
    print(f"noise: {noise}")
    if rows:
        print(pd.DataFrame(rows).describe())
    else:
        # A noise level without any event log has no rows to describe.
        print("no results for this noise level")
    print("---")

       f1_bcubed    f_score    fitness  precision  simplicity
count  11.000000  11.000000  11.000000  11.000000   11.000000
mean    0.663127   0.733692   0.857490   0.645703    0.796394
std     0.125396   0.125092   0.108571   0.137370    0.085042
min     0.432309   0.469185   0.586304   0.391067    0.583277
25%     0.593778   0.719674   0.859090   0.600394    0.775149
50%     0.657393   0.765091   0.894292   0.676967    0.800582
75%     0.747354   0.813408   0.912569   0.732738    0.824264
max     0.896061   0.867967   0.973200   0.822355    0.925200
---
noise: 0.0
       f1_bcubed  f_score  fitness  precision  simplicity
count   1.000000  1.00000     1.00   1.000000        1.00
mean    0.896061  0.79904     0.88   0.731721        0.88
std          NaN      NaN      NaN        NaN         NaN
min     0.896061  0.79904     0.88   0.731721        0.88
25%     0.896061  0.79904     0.88   0.731721        0.88
50%     0.896061  0.79904     0.88   0.731721        0.88
75%     0.896061  0.7

## Levenshtein Distance (LED)

In [6]:
# LED approach - self-implemented
# For every event log: load it, build one feature vector per case, cluster the
# cases with the Levenshtein distance (led) as metric and store the evaluation
# scores per noise level.

# One result list per noise level, keyed exactly like event_logs.
results_led = {noise: [] for noise in event_logs}

for noise in tqdm(event_logs.keys(), unit="noise_level"):
    for file in tqdm(event_logs[noise], unit='event_log'):
        # load file
        event_log = EventLog(file, case_attributes=case_attributes, event_attributes=event_attributes, true_cluster_label=true_cluster_label)
        # os.path.join works whether or not event_log_path ends with a separator
        event_log.load(os.path.join(event_log_path, file), False)
        event_log.preprocess()

        # vector representation: one entry of the first event-attribute encoding per case
        # NOTE(review): reads the private attribute _event_attribute_encodes; index 0
        # presumably corresponds to 'concept:name' (first entry of event_attributes) - confirm.
        activities = event_log._event_attribute_encodes[0]
        feature_vector = np.array([activities[case_idx] for case_idx in range(len(event_log.case_lens))])

        # clustering
        cluster_analysis = Clustering(event_log)
        cluster_analysis.cluster(feature_vector, clustering_method, n_clusters, metric=led)

        # evaluation
        cluster_result = cluster_analysis.evaluate() # 1. Metric (F1-BCubed)
        evaluation = Evaluation(event_log)
        (fitness, precision, simplicity) = evaluation.evaluate_clusters(n_clusters, cluster_analysis.pred_labels) # Heuristics Miner + 2. Metric

        # idea: fitness <=> recall & precision <=> precision
        # Harmonic mean of fitness and precision; defined as 0.0 when both are 0
        # so a degenerate clustering does not abort the run with a ZeroDivisionError.
        denominator = fitness + precision
        f_score = 0.0 if denominator == 0 else 2 * (fitness * precision) / denominator

        results_led[noise].append({"f1_bcubed":cluster_result[2], "f_score":f_score, "fitness": fitness, "precision":precision, "simplicity":simplicity})

  0%|                                                                                  | 0/11 [00:00<?, ?noise_level/s]
  0%|                                                                                     | 0/1 [00:00<?, ?event_log/s][A
100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.68s/event_log][A
  9%|██████▋                                                                   | 1/11 [00:01<00:16,  1.69s/noise_level]
  0%|                                                                                     | 0/1 [00:00<?, ?event_log/s][A
100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.86s/event_log][A
 18%|█████████████▍                                                            | 2/11 [00:03<00:16,  1.80s/noise_level]
  0%|                                                                                     | 0/1 [00:00<?, ?event_log/s][A
100%|████████████████████

In [7]:
# Statistics (LED) - self-implemented
# Prints summary statistics of the LED scores: first pooled over all noise
# levels, then separately for each noise level.

# complete
complete = pd.DataFrame([row for rows in results_led.values() for row in rows])
if complete.empty:
    # describe() raises ValueError on a frame without columns
    print("no results available")
else:
    print(complete.describe())

print("---")

# noise
for noise, rows in results_led.items():
    print(f"noise: {noise}")
    if rows:
        print(pd.DataFrame(rows).describe())
    else:
        # A noise level without any event log has no rows to describe.
        print("no results for this noise level")
    print("---")

       f1_bcubed    f_score    fitness  precision  simplicity
count  11.000000  11.000000  11.000000  11.000000   11.000000
mean    0.832452   0.705113   0.772172   0.650236    0.766743
std     0.066855   0.086473   0.080307   0.092656    0.059721
min     0.718287   0.523219   0.625550   0.449662    0.679163
25%     0.791262   0.651857   0.723313   0.602716    0.723029
50%     0.844610   0.714830   0.767808   0.673392    0.765201
75%     0.894169   0.778800   0.825629   0.733583    0.802828
max     0.896298   0.804538   0.887958   0.738267    0.880000
---
noise: 0.0
       f1_bcubed  f_score  fitness  precision  simplicity
count   1.000000  1.00000     1.00   1.000000        1.00
mean    0.896061  0.79904     0.88   0.731721        0.88
std          NaN      NaN      NaN        NaN         NaN
min     0.896061  0.79904     0.88   0.731721        0.88
25%     0.896061  0.79904     0.88   0.731721        0.88
50%     0.896061  0.79904     0.88   0.731721        0.88
75%     0.896061  0.7