# Learning of Process Representations Using Recurrent Neural Networks

In [None]:
import numpy as np
import pandas as pd

from replearn.eventlog import EventLog

from replearn.embedding_predict import EmbeddingPredict
from replearn.autoencoder import AutoencoderRepresentation
from replearn.doc2vec import Doc2VecRepresentation

from replearn.clustering import Clustering

from replearn.evaluation import Evaluation

# !pip install levenshtein
from Levenshtein import distance as led
import os

from tqdm import tqdm

### General Parameters

In [None]:
# Directory that is scanned for event log files.
event_log_path = '../logs/'
case_attributes = None # auto-detect attributes
event_attributes = ['concept:name', 'user'] # use activity name and user
# Name of the attribute holding the ground-truth cluster of a case.
true_cluster_label = 'cluster'

clustering_method = "agglomerative" # ["k_means", "agglomerative"]

# Number of clusters requested from the clustering step. The BOA and LED cells
# read this name, so it has to exist before they run; the value matches the one
# assigned in the hyperparameter cells.
n_clusters = 5

## Bag of Activities (BOA)

In [None]:
# BOA
# For every event log file in `event_log_path`: load it, cluster the cases on
# the first encoded event attribute and store the evaluation metrics per file.
# NOTE: `n_clusters` must already be defined when this cell runs (it is
# assigned in the hyperparameters cell).
boa_results = {}

for file in tqdm(os.listdir(event_log_path), unit='eventlog'):
    # build the path once so the file check and the load agree on the location
    log_path = os.path.join(event_log_path, file)
    if not os.path.isfile(log_path):
        continue  # skip directories and other non-file entries

    # load file
    event_log = EventLog(file, case_attributes=case_attributes, event_attributes=event_attributes, true_cluster_label=true_cluster_label)
    event_log.load(log_path, False)
    event_log.preprocess()

    # vector representation: one entry of the first encoded event attribute
    # per element of `case_lens`
    # (presumably index 0 is 'concept:name', the first of `event_attributes` -- confirm)
    activities = event_log._event_attribute_encodes[0]
    feature_vector = np.array([activities[i] for i in range(len(event_log.case_lens))])

    # clustering
    cluster_analysis = Clustering(event_log)
    cluster_analysis.cluster(feature_vector, clustering_method, n_clusters, 'cosine')

    # evaluation
    cluster_result = cluster_analysis.evaluate() # 1. Metric (F1-BCubed)
    evaluation = Evaluation(event_log)
    (fitness, precision, simplicity) = evaluation.evaluate_clusters(n_clusters, cluster_analysis.pred_labels) # Heuristics Miner + 2. Metric

    # index 2 of the clustering result is the F1-BCubed score
    boa_results[file] = {"f1_bcubed":cluster_result[2], "fitness": fitness, "precision":precision, "simplicity":simplicity}

## Levenshtein Distance (LED)

In [None]:
# LED
# For every event log file in `event_log_path`: load it, cluster the cases
# using the Levenshtein distance (`led`) as metric and store the evaluation
# metrics per file.
# NOTE: `n_clusters` must already be defined when this cell runs (it is
# assigned in the hyperparameters cell).
led_results = {}

for file in tqdm(os.listdir(event_log_path), unit='eventlog'):
    # build the path once so the file check and the load agree on the location
    log_path = os.path.join(event_log_path, file)
    if not os.path.isfile(log_path):
        continue  # skip directories and other non-file entries

    # load file
    event_log = EventLog(file, case_attributes=case_attributes, event_attributes=event_attributes, true_cluster_label=true_cluster_label)
    event_log.load(log_path, False)
    event_log.preprocess()

    # vector representation: one entry of the first encoded event attribute
    # per element of `case_lens`
    activities = event_log._event_attribute_encodes[0]
    feature_vector = np.array([activities[i] for i in range(len(event_log.case_lens))])

    # marker printed once the feature vectors are built
    print("ok")

    # clustering
    cluster_analysis = Clustering(event_log)
    cluster_analysis.cluster(feature_vector, clustering_method, n_clusters, metric=led)

    # marker printed once the clustering call has returned
    print("end")

    # evaluation
    cluster_result = cluster_analysis.evaluate() # 1. Metric (F1-BCubed)
    evaluation = Evaluation(event_log)
    (fitness, precision, simplicity) = evaluation.evaluate_clusters(n_clusters, cluster_analysis.pred_labels) # Heuristics Miner + 2. Metric

    # index 2 of the clustering result is the F1-BCubed score
    led_results[file] = {"f1_bcubed":cluster_result[2], "fitness": fitness, "precision":precision, "simplicity":simplicity}

## Autoencoder

In [None]:
# hyperparameters

# -- clustering --
clustering_method = "agglomerative"   # supported: ["k_means", "agglomerative"]
n_clusters = 5

# -- autoencoder --
vector_size = 32                      # passed as encoder_dim; candidate values: [2, 3, 4, 8, 16, 32, 64, 128]
n_batch_size = 64
n_epochs = 10                         # candidate values: [10, 25]

In [None]:
# Autoencoder
# For every event log file in `event_log_path`: train an autoencoder on the
# flattened one-hot sequences, cluster the resulting feature vectors and store
# the evaluation metrics per file.
results_autoencoder = {}

for file in tqdm(os.listdir(event_log_path), unit='eventlog'):
    # build the path once so the file check and the load agree on the location
    log_path = os.path.join(event_log_path, file)
    if not os.path.isfile(log_path):
        continue  # skip directories and other non-file entries

    # load file
    event_log = EventLog(file, case_attributes=case_attributes, event_attributes=event_attributes, true_cluster_label=true_cluster_label)
    event_log.load(log_path, False)
    event_log.preprocess()

    # get sequences from event log as one-hot feature vector
    sequences = event_log.event_attributes_flat_onehot_features_2d

    # init and train autoencoder; the input width is the second dimension of
    # the 2d feature matrix, the encoder width is `vector_size`
    autoencoder = AutoencoderRepresentation(event_log)
    autoencoder.build_model(input_dim=sequences.shape[1], encoder_dim=vector_size)
    autoencoder.fit(batch_size=n_batch_size, epochs=n_epochs, verbose=True)

    # get feature vector (hidden representation)
    feature_vector = autoencoder.predict()

    # cluster feature vector (cluster all case embeddings into n_clusters)
    cluster_analysis = Clustering(event_log)
    cluster_analysis.cluster(feature_vector, clustering_method, n_clusters, 'cosine')

    # evaluate clustered embeddings (by comparing it to event_log 'cluster' entry)
    cluster_result = cluster_analysis.evaluate() # 1. Metric (F1-BCubed)

    evaluation = Evaluation(event_log)
    (fitness, precision, simplicity) = evaluation.evaluate_clusters(n_clusters, cluster_analysis.pred_labels) # Heuristics Miner + 2. Metric

    # index 2 of the clustering result is the F1-BCubed score
    results_autoencoder[file] = {"f1_bcubed":cluster_result[2], "fitness": fitness, "precision":precision, "simplicity":simplicity}

In [None]:
# Statistics

# Per-approach results, each a dict {file name -> metric dict}. The BOA and LED
# cells store their results as `boa_results` / `led_results`.
results = {"autoencoder": results_autoencoder, "boa": boa_results, "led": led_results}

# `results` is a plain dict, so the entry is read by key. A dict of dicts
# becomes one column per file; transposing gives one row per event log and one
# column per metric.
statistics = pd.DataFrame(results["autoencoder"]).transpose()
print(statistics.describe())


# Mean and standard deviation of every metric over all event logs. Columns are
# read by key so that a column name can never collide with a DataFrame attribute.
f1_bcubed_mean, f1_bcubed_std = statistics["f1_bcubed"].mean(), statistics["f1_bcubed"].std()
fitness_mean, fitness_std = statistics["fitness"].mean(), statistics["fitness"].std()
precision_mean, precision_std = statistics["precision"].mean(), statistics["precision"].std()
simplicity_mean, simplicity_std = statistics["simplicity"].mean(), statistics["simplicity"].std()
# number of non-missing values per metric column
count = statistics.count()

In [None]:
# plot
from matplotlib import pyplot as plt

# TODO: plotting code goes here, e.g. plt.plot()

## Load event log

In [None]:
# event log configuration
event_log_path = '../logs/'
file_name = 'huge_500_10_20_5_1_1-0.0-1.json.gz'

case_attributes = None # auto-detect attributes
event_attributes = ['concept:name', 'user'] # use activity name and user
true_cluster_label = 'cluster'

print(file_name)

# load file
event_log = EventLog(file_name, case_attributes=case_attributes, event_attributes=event_attributes, true_cluster_label=true_cluster_label)
# os.path.join inserts a separator only when needed; `event_log_path` already
# ends with '/', so concatenating another '/' would produce a double slash
event_log.load(os.path.join(event_log_path, file_name), False)
event_log.preprocess()

## Representation Learning

In [None]:
# hyperparameters

# size of the learned representation (passed as encoder_dim to the autoencoder)
vector_size = 32

# number of clusters requested from the clustering step
n_clusters = 5

# training settings handed to autoencoder.fit
n_batch_size = 64
n_epochs = 10

### Autoencoder

In [None]:
# get sequences from event log as one-hot feature vector
# (a 2d structure: its second dimension is used as the model input width below)
sequences = event_log.event_attributes_flat_onehot_features_2d

# init and train autoencoder
autoencoder = AutoencoderRepresentation(event_log)
# first argument: input width; encoder_dim: size of the encoded representation
autoencoder.build_model(sequences.shape[1], encoder_dim=vector_size)
# no training data is passed here -- presumably fit() takes it from the
# event log given to the constructor; confirm against AutoencoderRepresentation
autoencoder.fit(batch_size=n_batch_size, epochs=n_epochs, verbose=True)

In [None]:
# get feature vector from the trained autoencoder; it is the input of the
# clustering step
# (presumably the hidden representation, one vector per case -- confirm)
feature_vector = autoencoder.predict()

### Clustering

In [None]:
# cluster feature vector
cluster_analysis = Clustering(event_log)
# arguments: features, clustering method, number of clusters, distance metric
cluster_analysis.cluster(feature_vector, 'agglomerative', n_clusters, 'cosine')

# the result is indexed afterwards as
# [0] Adjusted Rand Index, [1] Normalized Mutual Information, [2] F1-BCubed
cluster_result = cluster_analysis.evaluate()

In [None]:
# report the three clustering scores, in the order they are stored in cluster_result
print(f'Adjusted Rand Index: {cluster_result[0]}')
print(f'Normalized Mutual Information: {cluster_result[1]}')
print(f'F1-BCubed: {cluster_result[2]}')