# Installations

In [None]:
# Installations

# install once necessary libraries

!pip install pandas
!pip install -U scikit-learn
!pip install pm4py
!pip install kneed
!pip install torch

# Import Event Log

In [1]:
import pandas as pd
import numpy as np
import pm4py
from pm4py.objects.conversion.log import converter as log_converter


if __name__ == "__main__":
    # Read the CSV file
    dataframe = pd.read_csv('model_A.csv', sep=',')  

    # Drop the first column without knowing its name
    dataframe = dataframe.drop(dataframe.columns[0], axis=1)

    # Format the dataframe
    dataframe = pm4py.format_dataframe(
        dataframe, 
        case_id='case:concept:name', 
        activity_key='concept:name', 
        timestamp_key='time:timestamp'
    )

    # Convert the dataframe to event log
    log = log_converter.apply(dataframe)
    
dataframe

  df[col] = pd.to_datetime(df[col], utc=True)


Unnamed: 0,org:resource,lifecycle:transition,concept:name,time:timestamp,case:REG_DATE,case:concept:name,case:AMOUNT_REQ,@@index,@@case_index
0,112,COMPLETE,A_SUBMITTED,2011-10-01 00:38:44.546000+02:00,2011-10-01 00:38:44.546000+02:00,173688,20000,0,0
1,112,COMPLETE,A_PARTLYSUBMITTED,2011-10-01 00:38:44.880000+02:00,2011-10-01 00:38:44.546000+02:00,173688,20000,1,0
2,112,COMPLETE,A_PREACCEPTED,2011-10-01 00:39:37.906000+02:00,2011-10-01 00:38:44.546000+02:00,173688,20000,2,0
3,10862,COMPLETE,A_ACCEPTED,2011-10-01 11:42:43.308000+02:00,2011-10-01 00:38:44.546000+02:00,173688,20000,3,0
4,10862,COMPLETE,A_FINALIZED,2011-10-01 11:45:09.243000+02:00,2011-10-01 00:38:44.546000+02:00,173688,20000,4,0
...,...,...,...,...,...,...,...,...,...
60844,10933,COMPLETE,A_ACCEPTED,2012-03-01 20:17:22.457000+01:00,2012-02-29 23:43:09.766000+01:00,214373,8500,60844,13085
60845,10933,COMPLETE,A_FINALIZED,2012-03-01 20:22:38.593000+01:00,2012-02-29 23:43:09.766000+01:00,214373,8500,60845,13085
60846,112,COMPLETE,A_SUBMITTED,2012-02-29 23:51:16.799000+01:00,2012-02-29 23:51:16.799000+01:00,214376,15000,60846,13086
60847,112,COMPLETE,A_PARTLYSUBMITTED,2012-02-29 23:51:17.423000+01:00,2012-02-29 23:51:16.799000+01:00,214376,15000,60847,13086


# Data Preprocessing

## Integer Encoding 

In [2]:
from sklearn.preprocessing import LabelEncoder

# Create an instance of LabelEncoder
label_encoder = LabelEncoder()

# Fit label encoder and return encoded labels
dataframe['activity_encoded'] = label_encoder.fit_transform(dataframe['concept:name'])

# Now the 'dataframe' has a new column 'activity_encoded' with integer encoded values of the 'activity' column
dataframe

Unnamed: 0,org:resource,lifecycle:transition,concept:name,time:timestamp,case:REG_DATE,case:concept:name,case:AMOUNT_REQ,@@index,@@case_index,activity_encoded
0,112,COMPLETE,A_SUBMITTED,2011-10-01 00:38:44.546000+02:00,2011-10-01 00:38:44.546000+02:00,173688,20000,0,0,9
1,112,COMPLETE,A_PARTLYSUBMITTED,2011-10-01 00:38:44.880000+02:00,2011-10-01 00:38:44.546000+02:00,173688,20000,1,0,6
2,112,COMPLETE,A_PREACCEPTED,2011-10-01 00:39:37.906000+02:00,2011-10-01 00:38:44.546000+02:00,173688,20000,2,0,7
3,10862,COMPLETE,A_ACCEPTED,2011-10-01 11:42:43.308000+02:00,2011-10-01 00:38:44.546000+02:00,173688,20000,3,0,0
4,10862,COMPLETE,A_FINALIZED,2011-10-01 11:45:09.243000+02:00,2011-10-01 00:38:44.546000+02:00,173688,20000,4,0,5
...,...,...,...,...,...,...,...,...,...,...
60844,10933,COMPLETE,A_ACCEPTED,2012-03-01 20:17:22.457000+01:00,2012-02-29 23:43:09.766000+01:00,214373,8500,60844,13085,0
60845,10933,COMPLETE,A_FINALIZED,2012-03-01 20:22:38.593000+01:00,2012-02-29 23:43:09.766000+01:00,214373,8500,60845,13085,5
60846,112,COMPLETE,A_SUBMITTED,2012-02-29 23:51:16.799000+01:00,2012-02-29 23:51:16.799000+01:00,214376,15000,60846,13086,9
60847,112,COMPLETE,A_PARTLYSUBMITTED,2012-02-29 23:51:17.423000+01:00,2012-02-29 23:51:16.799000+01:00,214376,15000,60847,13086,6


## Extract Traces 

In [3]:
# Group by case index to get individual traces
traces = dataframe.groupby('@@case_index')['activity_encoded'].apply(list)
traces

@@case_index
0        [9, 6, 7, 0, 5, 8, 2, 1]
1        [9, 6, 7, 0, 5, 2, 8, 1]
2        [9, 6, 7, 0, 5, 2, 8, 1]
3                       [9, 6, 4]
4                       [9, 6, 4]
                   ...           
13082             [9, 6, 7, 0, 5]
13083                   [9, 6, 4]
13084                   [9, 6, 4]
13085             [9, 6, 7, 0, 5]
13086                   [9, 6, 4]
Name: activity_encoded, Length: 13087, dtype: object

## Padding

In [4]:
# Post-Padding
import torch
from torch.nn.utils.rnn import pad_sequence

# First, we convert the pandas Series 'traces' to a list of PyTorch tensors
traces_tensors = [torch.tensor(trace) for trace in traces]

# Then, we apply padding to this list of tensors
# We are assuming here that your traces are sequences of integers that represent encoded activities
# pad_sequence requires the input to be a list (or other iterable) of tensors
padded_traces = pad_sequence(traces_tensors, batch_first=True)

# Now 'padded_traces' is a tensor with each row representing a trace and zeros padding the sequences to equal length
padded_traces

tensor([[9, 6, 7,  ..., 8, 2, 1],
        [9, 6, 7,  ..., 2, 8, 1],
        [9, 6, 7,  ..., 2, 8, 1],
        ...,
        [9, 6, 4,  ..., 0, 0, 0],
        [9, 6, 7,  ..., 0, 0, 0],
        [9, 6, 4,  ..., 0, 0, 0]])

# Build Input X and output Y

In [5]:
# Assuming 'padded_traces' is already created as per the previous discussion.

# Determine the integer for 'END' token, assuming it's the max(encoded_activities) + 1
end_token = padded_traces.max() + 1

# Initialize lists to hold the input sequences X and the targets Y
X = []
Y = []

# Iterate through each trace
for trace in padded_traces:
    # Initialize the sequence with the 'null' token
    seq = [0]  
    for activity in trace:
        # Exclude padding (0) and end token (if already included)
        if activity.item() not in (0, end_token):
            # Append the current sequence to X
            X.append(seq.copy())
            # Append the next activity as the target to Y
            Y.append(activity.item())
            # Add the current activity to the sequence
            seq.append(activity.item())
    # After the last activity in the trace, append the 'END' token to the sequence and to Y
    X.append(seq.copy())
    Y.append(end_token)

# Convert X and Y to PyTorch tensors
X_tensor = pad_sequence([torch.tensor(x) for x in X], batch_first=True, padding_value=0)
Y_tensor = torch.tensor(Y)

In [6]:
X_tensor.shape

torch.Size([68823, 8])

In [None]:
Y_tensor.shape

# Optional: Prepare Test and Train Data

# LSTM Model Architecture

In [None]:
import torch
import torch.nn as nn

class LSTMModel(nn.Module):
    def __init__(self, num_activities, embedding_dim, hidden_dim, output_dim, dropout_prob=0.2):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(num_activities, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, dropout=dropout_prob)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, (h_n, c_n) = self.lstm(embedded)
        out = self.fc(lstm_out[:, -1, :])  # Output of the last LSTM cell
        return out, h_n

# Model parameters
num_activities = end_token + 1 # define this based on your data
embedding_dim = len(dataframe['concept:name'].unique())
hidden_dim = 100
output_dim = num_activities

# Initialize model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
lstm_model = LSTMModel(num_activities, embedding_dim, hidden_dim, output_dim, dropout_prob=0.2).to(device)

# LSTM Training

In [None]:
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(lstm_model.parameters())

# Data preparation
train_data = TensorDataset(X_tensor, Y_tensor)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)

# Training loop
num_epochs = 50
for epoch in range(num_epochs):
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs, _ = lstm_model(inputs)  # Ignore hidden states during training
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

# Optional: Evaluation of LSTM Predictive Performance

# Extract Trace Representations Using LSTM Network

In [None]:
# To collect hidden states for the entire dataset
lstm_model.eval()  # Set the model to evaluation mode
hidden_states = []

with torch.no_grad():
    for inputs in DataLoader(X_tensor, batch_size=1):  # Process one sequence at a time
        inputs = inputs.to(device)
        _, h_n = lstm_model(inputs)  # Get the hidden state
        hidden_states.append(h_n.squeeze(0).cpu().numpy())  # Remove batch dimension and convert to numpy

hidden_states = np.array(hidden_states)

In [None]:
hidden_states = hidden_states.squeeze(1)  # Removes the singleton dimension

# Check the new shape
print(hidden_states.shape)

In [None]:
# Group by case index to get individual traces
traces = dataframe.groupby('@@case_index')['concept:name'].apply(list)
traces

In [None]:
hidden_state_pos = traces.apply(len).values
hidden_state_pos = hidden_state_pos - 1
hidden_state_pos

In [None]:
trace_representations = [hidden_states[i-1] for i in hidden_state_pos]
trace_representations = np.array(trace_representations)
len(trace_representations)

In [None]:
trace_representations.shape

# Clustering

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from kneed import KneeLocator

# Elbow method to determine the optimal number of clusters
distortions = []
K = range(1, 10)
for k in K:
    kmeanModel = KMeans(n_clusters=k)
    kmeanModel.fit(trace_representations)
    distortions.append(kmeanModel.inertia_)

# Plot the elbow graph
plt.figure(figsize=(8,6))
plt.plot(K, distortions, 'bx-')
plt.xlabel('Number of Clusters')
plt.ylabel('SSE')
plt.title('The Elbow Method showing the optimal k')
plt.show()

# Find the elbow point
kl = KneeLocator(range(1, 10), distortions, curve="convex", direction="decreasing")
print("Optimal number of clusters:", kl.elbow)

In [None]:
from sklearn.cluster import KMeans

# Number of clusters
k = kl.elbow

# Create a KMeans instance with k clusters
kmeans = KMeans(n_clusters=k, random_state=42)

# Fit the model to the data
kmeans.fit(trace_representations)

# Predict the clusters for each trace
clusters = kmeans.predict(trace_representations)

# Cluster Evaluation

In [None]:
# the value of the silhouette coefficient ranges between -1 and 1
# value close to 1 is considered as good

from sklearn.metrics import silhouette_score

# Calculate silhouette score
silhouette_avg = silhouette_score(trace_representations, clusters)

print(f"Silhouette Coefficient: {silhouette_avg:.2f}")

# Token-Based Replay for Conformance Checking

In [None]:
import pm4py
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.objects.bpmn.importer import importer as bpmn_importer
from pm4py.algo.conformance.alignments.petri_net import algorithm as alignments_petri

# 2. Import the given BPMN model
bpmn_graph = bpmn_importer.apply("Model_A_corrected.bpmn")

# 3. Convert the BPMN to a Petri net
net, im, fm = pm4py.convert_to_petri_net(bpmn_graph)

# 4. Perform alignment-based conformance checking
alignments = alignments_petri.apply(log, net, im, fm)

# Calculate and print diagnostics
fit_traces = sum(1 for trace in alignments if trace['fitness'] == 1.0)

print(f"Total traces: {len(log)}")
print(f"Conform traces: {fit_traces}")
print(f"Non-Conform traces: {len(log) - fit_traces}")

# Happy Cluster

In [None]:
labels = kmeans.labels_

# Identify the dominant cluster
from collections import Counter
counter = Counter(labels)
happy_cluster = counter.most_common(1)[0][0]
print(f"Happy Cluster: {happy_cluster}")

In [None]:
# Get the indices of the data points belonging to the happy cluster
happy_cluster_indices = np.where(clusters == happy_cluster)[0]

# Use these indices to fetch the corresponding data points from the original data X
happy_cluster_data_points = trace_representations[happy_cluster_indices, :]

# Calculate the centroid as the mean of these data points
happy_cluster_centroid = np.mean(happy_cluster_data_points, axis=0)

print("Centroid of happy cluster:", happy_cluster_centroid)

# Distance Measurement

In [None]:
from scipy.spatial.distance import euclidean

# Calculate the Euclidean distance from each data point in X to the happy_cluster_centroid
distances_to_centroid = [euclidean(trace_representations[i], happy_cluster_centroid) for i in range(len(trace_representations))]

# Results overview

In [None]:
# create dataframe with necessary information for distance measurement

grouped = dataframe.groupby('@@case_index')['concept:name'].apply(list).reset_index(name='trace')

results = pd.DataFrame(grouped['trace'])

results['cluster'] = clusters

conformity_array = [int(trace['fitness']) for trace in alignments]
results['conform'] = conformity_array

results['distance'] = distances_to_centroid

In [None]:
summary = results.groupby('cluster').agg(
    count=pd.NamedAgg(column='trace', aggfunc='size'),
    conform_count=pd.NamedAgg(column='conform', aggfunc='sum')
).reset_index()

print("cluster\tcount\tconform_count")
for _, row in summary.iterrows():
    print(f"{row['cluster']}\t{row['count']}\t{row['conform_count']}")

# Evaluation

In [None]:
from sklearn.cluster import KMeans

# Filter the DataFrame into conforming and non-conforming subsets
conforming_distances = results[results['conform'] == 1]['distance']
non_conforming_distances = results[results['conform'] == 0]['distance']

# Determine common bin edges
min_distance = min(results['distance'])
max_distance = max(results['distance'])
bin_edges = np.linspace(min_distance, max_distance, num=30)

# Combine the data and reshape for k-means
all_distances = results['distance']
all_distances = np.array(all_distances)
all_distances_reshaped = all_distances.reshape(-1, 1)

# Apply k-means clustering
kmeans = KMeans(n_clusters=2, random_state=0).fit(all_distances_reshaped)
kmeans_labels = kmeans.labels_

# Find the threshold as the average of the two cluster centers
threshold_value = np.mean(kmeans.cluster_centers_)

# Plot histograms and the threshold
plt.hist(conforming_distances, bins=bin_edges, alpha=0.5, label='Conforming', color='green')
plt.hist(non_conforming_distances, bins=bin_edges, alpha=0.5, label='Non-Conforming', color='red')
plt.axvline(threshold_value, color='blue', linestyle='dashed', linewidth=1, label='Threshold')
plt.xlabel('Distance')
plt.ylabel('Frequency')
plt.legend(loc='upper right')
plt.title(f'Threshold at {threshold_value:.2f}')
plt.show()

In [None]:
true_positive = np.sum(results[results['conform'] == 1]['distance'] < threshold_value)
true_negative = np.sum(results[results['conform'] == 0]['distance'] > threshold_value)
false_positive = np.sum(results[results['conform'] == 0]['distance'] < threshold_value)
false_negative = np.sum(results[results['conform'] == 1]['distance'] > threshold_value)

In [None]:
# Calculate accuracy
accuracy = (true_positive + true_negative) / (true_positive + true_negative + false_positive + false_negative)
print(f"Accuracy: {accuracy:.3f}")

In [None]:
# Calculate precision
precision = true_positive / (true_positive + false_positive)
print(f"Precision: {precision:.3f}")

In [None]:
# Calculate recall
recall = true_positive / (true_positive + false_negative)
print(f"Recall: {recall:.3f}")

In [None]:
# Calculate f1
f1 = 2 * ((precision * recall) / (precision + recall))
print(f"F1: {f1:.3f}")

# Dev (Non Conform Traces)

In [None]:
# Calculate precision for Dev
precision = true_negative / (true_negative + false_negative)
print(f"Precision: {precision:.3f}")

In [None]:
# Calculate recall for Dev
recall = true_negative / (true_negative + false_positive)
print(f"Recall: {recall:.3f}")

# No Dev (Conform Traces)

In [None]:
# Calculate precision for No Dev
precision = true_positive / (true_positive + false_positive)
print(f"Precision: {precision:.3f}")

In [None]:
# Calculate recall for No Dev
recall = true_positive / (true_positive + false_negative)
print(f"Recall: {recall:.3f}")

# AUC_ROC

In [None]:
# UPDATED

import numpy as np
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

# Assuming 'results' is your DataFrame and 'distance' is the score to predict conformity

# Inverting the 'distance' scores because higher scores indicate non-conformity
# We invert the scores for ROC AUC calculation because roc_auc_score expects higher values
# to indicate higher likelihood of the positive class
inverted_scores = 1 - results['distance']

# Calculate the ROC curve and AUC using inverted scores
fpr, tpr, thresholds = roc_curve(results['conform'], inverted_scores, pos_label=1)
roc_auc = roc_auc_score(results['conform'], inverted_scores)

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='orange', label=f'ROC curve (area = {roc_auc:0.2f})')
plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

# Print the AUC
print(f"ROC AUC: {roc_auc:.3f}")