# Import Event Log

In [1]:
import pandas as pd
import numpy as np
import pm4py
from pm4py.objects.conversion.log import converter as log_converter

if __name__ == "__main__":
    # Load the XES event log; everything below treats the result as a pandas DataFrame
    dataframe_log = pm4py.read_xes('../../data/logs/DomesticDeclarations.xes')

    # Number the cases 0, 1, 2, ... in order of first appearance and store that number
    # on every event of the case (zip stops once the unique case names are exhausted)
    case_indices = dict(zip(dataframe_log['case:concept:name'].unique(), range(len(dataframe_log))))
    dataframe_log['@@case_index'] = dataframe_log['case:concept:name'].map(case_indices)

    # Build the pm4py event-log object from the DataFrame; it is used later for the alignments
    log = log_converter.apply(dataframe_log)

dataframe_log



parsing log, completed traces ::   0%|          | 0/10500 [00:00<?, ?it/s]

Unnamed: 0,id,org:resource,concept:name,time:timestamp,org:role,case:id,case:concept:name,case:BudgetNumber,case:DeclarationNumber,case:Amount,@@case_index
0,st_step 86794_0,STAFF MEMBER,Declaration SUBMITTED by EMPLOYEE,2017-01-09 08:49:50+00:00,EMPLOYEE,declaration 86791,declaration 86791,budget 86566,declaration number 86792,26.851205,0
1,st_step 86793_0,STAFF MEMBER,Declaration FINAL_APPROVED by SUPERVISOR,2017-01-09 10:27:48+00:00,SUPERVISOR,declaration 86791,declaration 86791,budget 86566,declaration number 86792,26.851205,0
2,dd_declaration 86791_19,SYSTEM,Request Payment,2017-01-10 08:34:44+00:00,UNDEFINED,declaration 86791,declaration 86791,budget 86566,declaration number 86792,26.851205,0
3,dd_declaration 86791_20,SYSTEM,Payment Handled,2017-01-12 16:31:22+00:00,UNDEFINED,declaration 86791,declaration 86791,budget 86566,declaration number 86792,26.851205,0
4,st_step 86798_0,STAFF MEMBER,Declaration SUBMITTED by EMPLOYEE,2017-01-09 09:26:14+00:00,EMPLOYEE,declaration 86795,declaration 86795,budget 86566,declaration number 86796,182.464172,1
...,...,...,...,...,...,...,...,...,...,...,...
56432,st_step 138363_0,STAFF MEMBER,Declaration SUBMITTED by EMPLOYEE,2018-12-29 16:50:14+00:00,EMPLOYEE,declaration 138359,declaration 138359,budget 86566,declaration number 138360,190.404576,10499
56433,st_step 138361_0,STAFF MEMBER,Declaration APPROVED by ADMINISTRATION,2018-12-29 16:56:13+00:00,ADMINISTRATION,declaration 138359,declaration 138359,budget 86566,declaration number 138360,190.404576,10499
56434,st_step 138362_0,STAFF MEMBER,Declaration FINAL_APPROVED by SUPERVISOR,2019-01-03 07:55:52+00:00,SUPERVISOR,declaration 138359,declaration 138359,budget 86566,declaration number 138360,190.404576,10499
56435,dd_declaration 138359_19,SYSTEM,Request Payment,2019-01-08 07:20:28+00:00,UNDEFINED,declaration 138359,declaration 138359,budget 86566,declaration number 138360,190.404576,10499


# Drop unnecessary columns

In [2]:
# In the rows previewed above 'case:id' repeats the value of 'case:concept:name', so it is dropped
dataframe_log = dataframe_log.drop(columns=['case:id'])

In [3]:
# Cases are identified by the integer '@@case_index' from here on, so the string case name can go
dataframe_log = dataframe_log.drop(columns=['case:concept:name'])

In [4]:
# Case-level attribute that is not one of the model inputs built later in this notebook
dataframe_log = dataframe_log.drop(columns=['case:BudgetNumber'])

In [5]:
# Case-level attribute that is not one of the model inputs built later in this notebook
dataframe_log = dataframe_log.drop(columns=['case:DeclarationNumber'])

In [6]:
# Case-level attribute that is not one of the model inputs built later in this notebook
dataframe_log = dataframe_log.drop(columns=['case:Amount'])

In [7]:
dataframe_log

Unnamed: 0,id,org:resource,concept:name,time:timestamp,org:role,@@case_index
0,st_step 86794_0,STAFF MEMBER,Declaration SUBMITTED by EMPLOYEE,2017-01-09 08:49:50+00:00,EMPLOYEE,0
1,st_step 86793_0,STAFF MEMBER,Declaration FINAL_APPROVED by SUPERVISOR,2017-01-09 10:27:48+00:00,SUPERVISOR,0
2,dd_declaration 86791_19,SYSTEM,Request Payment,2017-01-10 08:34:44+00:00,UNDEFINED,0
3,dd_declaration 86791_20,SYSTEM,Payment Handled,2017-01-12 16:31:22+00:00,UNDEFINED,0
4,st_step 86798_0,STAFF MEMBER,Declaration SUBMITTED by EMPLOYEE,2017-01-09 09:26:14+00:00,EMPLOYEE,1
...,...,...,...,...,...,...
56432,st_step 138363_0,STAFF MEMBER,Declaration SUBMITTED by EMPLOYEE,2018-12-29 16:50:14+00:00,EMPLOYEE,10499
56433,st_step 138361_0,STAFF MEMBER,Declaration APPROVED by ADMINISTRATION,2018-12-29 16:56:13+00:00,ADMINISTRATION,10499
56434,st_step 138362_0,STAFF MEMBER,Declaration FINAL_APPROVED by SUPERVISOR,2019-01-03 07:55:52+00:00,SUPERVISOR,10499
56435,dd_declaration 138359_19,SYSTEM,Request Payment,2019-01-08 07:20:28+00:00,UNDEFINED,10499


In [8]:
from sklearn.preprocessing import StandardScaler

# Make sure the timestamp column has a datetime dtype before doing arithmetic on it
dataframe_log['time:timestamp'] = pd.to_datetime(dataframe_log['time:timestamp'])

# Seconds elapsed between each event and the earliest event of the same case
dataframe_log['start_time'] = dataframe_log.groupby('@@case_index')['time:timestamp'].transform('min')
dataframe_log['elapsed_time'] = (dataframe_log['time:timestamp'] - dataframe_log['start_time']).dt.total_seconds()

# Standardize the elapsed seconds over all events (StandardScaler: zero mean, unit variance)
scaler = StandardScaler()
dataframe_log['standardized_elapsed_time'] = scaler.fit_transform(dataframe_log[['elapsed_time']])

# Only the standardized value is kept; the helper columns and the raw timestamp are removed
dataframe_log = dataframe_log.drop(columns=['start_time', 'elapsed_time', 'time:timestamp'])

# Insert Start & End markers

In [9]:
# Define a function to insert start and end markers
def add_markers(df):
    """Surround the events of every case with a 'Start' row and an 'End' row.

    Cases are emitted in order of first appearance of their '@@case_index'
    value, and the events of a case keep their original relative order.
    In the marker rows every column except '@@case_index' holds the string
    'Start' (or 'End'), including columns that are otherwise numeric.

    Parameters:
    - df: DataFrame with a '@@case_index' column identifying the case of each row.

    Returns:
    - A new DataFrame with a fresh 0..n-1 index containing, per case,
      the start marker, the case's rows and the end marker.
    """
    columns = list(df.columns)
    new_rows = []

    # A single groupby pass replaces re-filtering the whole frame once per case;
    # sort=False keeps the cases in order of first appearance.
    for case_index, case_events in df.groupby('@@case_index', sort=False):
        start_row = {col: case_index if col == '@@case_index' else 'Start' for col in columns}
        end_row = {col: case_index if col == '@@case_index' else 'End' for col in columns}

        new_rows.append(start_row)
        new_rows.extend(case_events.to_dict('records'))
        new_rows.append(end_row)

    # Convert the list of rows into a DataFrame
    return pd.DataFrame(new_rows)

# Apply the function to add start and end markers to the dataframe
modified_dataframe = add_markers(dataframe_log)

In [10]:
modified_dataframe

Unnamed: 0,id,org:resource,concept:name,org:role,@@case_index,standardized_elapsed_time
0,Start,Start,Start,Start,0,Start
1,st_step 86794_0,STAFF MEMBER,Declaration SUBMITTED by EMPLOYEE,EMPLOYEE,0,-0.372192
2,st_step 86793_0,STAFF MEMBER,Declaration FINAL_APPROVED by SUPERVISOR,SUPERVISOR,0,-0.367752
3,dd_declaration 86791_19,SYSTEM,Request Payment,UNDEFINED,0,-0.307612
4,dd_declaration 86791_20,SYSTEM,Payment Handled,UNDEFINED,0,-0.155482
...,...,...,...,...,...,...
77432,st_step 138361_0,STAFF MEMBER,Declaration APPROVED by ADMINISTRATION,ADMINISTRATION,10499,-0.371921
77433,st_step 138362_0,STAFF MEMBER,Declaration FINAL_APPROVED by SUPERVISOR,SUPERVISOR,10499,-0.07009
77434,dd_declaration 138359_19,SYSTEM,Request Payment,UNDEFINED,10499,0.254628
77435,dd_declaration 138359_20,SYSTEM,Payment Handled,UNDEFINED,10499,0.410114


# Preprocess

In [11]:
# The marker rows put strings into the time column; give 'Start' the value 0 and 'End' the value 1
modified_dataframe['standardized_elapsed_time'] = modified_dataframe['standardized_elapsed_time'].replace(['Start', 'End'], [0, 1])

In [12]:
# Integer-encode 'id'; codes follow order of first appearance, so the first row's 'Start' marker becomes 0
codes, uniques = modified_dataframe['id'].factorize()
modified_dataframe['id'] = codes

In [13]:
# Integer-encode 'org:resource'; codes follow order of first appearance, so 'Start' becomes 0
codes, uniques = modified_dataframe['org:resource'].factorize()
modified_dataframe['org:resource'] = codes

In [14]:
# Integer-encode 'concept:name'; codes follow order of first appearance, so 'Start' becomes 0
codes, uniques = modified_dataframe['concept:name'].factorize()
modified_dataframe['concept:name'] = codes

In [15]:
# Integer-encode 'org:role'; codes follow order of first appearance, so 'Start' becomes 0
codes, uniques = modified_dataframe['org:role'].factorize()
modified_dataframe['org:role'] = codes

In [16]:
modified_dataframe

Unnamed: 0,id,org:resource,concept:name,org:role,@@case_index,standardized_elapsed_time
0,0,0,0,0,0,0.000000
1,1,1,1,1,0,-0.372192
2,2,1,2,2,0,-0.367752
3,3,2,3,3,0,-0.307612
4,4,2,4,3,0,-0.155482
...,...,...,...,...,...,...
77432,56435,1,12,7,10499,-0.371921
77433,56436,1,2,2,10499,-0.070090
77434,56437,2,3,3,10499,0.254628
77435,56438,2,4,3,10499,0.410114


### Padding for Cases with fewer than 5 events

In [17]:
# Count how many rows (events plus the Start/End marker rows) each case has
frequency = modified_dataframe['@@case_index'].value_counts()

# Size of the shortest case; cases shorter than the window size need padding
min_occurrence = frequency.min()

min_occurrence

3

In [18]:
# Assuming your dataframe might have different types, let's create a generic function to add rows
def add_rows(group, num_rows, case_index_value):
    """Append `num_rows` filler rows to `group` and return the combined frame.

    Filler rows hold 0 in numeric columns and '' in all other columns, except
    that '@@case_index' is set to `case_index_value`. The result has a fresh
    0..n-1 index.
    """
    # Pick the filler value per column from that column's dtype
    fill_values = {}
    for column in group.columns:
        fill_values[column] = 0 if pd.api.types.is_numeric_dtype(group[column]) else ''

    padding = pd.DataFrame(fill_values, index=range(num_rows))

    # The filler rows must belong to the same case as the rows they extend
    padding['@@case_index'] = case_index_value

    return pd.concat([group, padding], ignore_index=True)

# Function to pad cases with less than 5 events
def pad_cases(df, min_events=5):
    """Pad every case so that it has at least `min_events` rows.

    Cases that are already long enough are left untouched; shorter ones get
    filler rows appended via `add_rows`. Because the frame is grouped with
    pandas' default sorting, the cases come back ordered by '@@case_index'.

    Parameters:
    - df: DataFrame with a '@@case_index' column.
    - min_events: minimum number of rows per case (default 5, the value
      previously hard-coded here).

    Returns:
    - A new DataFrame with a fresh 0..n-1 index. If `df` contains no cases,
      an empty frame with the same columns is returned.
    """
    padded_groups = []

    for case_index, case_events in df.groupby('@@case_index'):
        # Number of filler rows this case still needs
        events_to_add = min_events - len(case_events)

        if events_to_add > 0:
            case_events = add_rows(case_events, events_to_add, case_index)

        padded_groups.append(case_events)

    # pd.concat raises on an empty list, so handle a frame without cases explicitly
    if not padded_groups:
        return df.iloc[0:0].copy()

    return pd.concat(padded_groups, ignore_index=True)

In [19]:
# Apply the padding function
modified_dataframe = pad_cases(modified_dataframe)
modified_dataframe

Unnamed: 0,id,org:resource,concept:name,org:role,@@case_index,standardized_elapsed_time
0,0,0,0,0,0,0.000000
1,1,1,1,1,0,-0.372192
2,2,1,2,2,0,-0.367752
3,3,2,3,3,0,-0.307612
4,4,2,4,3,0,-0.155482
...,...,...,...,...,...,...
77705,56435,1,12,7,10499,-0.371921
77706,56436,1,2,2,10499,-0.070090
77707,56437,2,3,3,10499,0.254628
77708,56438,2,4,3,10499,0.410114


# Generate sliding windows

In [20]:
# One two-column frame per attribute; each keeps '@@case_index' so that the
# sliding windows can be built case by case
df_id = modified_dataframe[['id', '@@case_index']]
df_resource = modified_dataframe[['org:resource', '@@case_index']]
df_activity = modified_dataframe[['concept:name', '@@case_index']]
df_role = modified_dataframe[['org:role', '@@case_index']]
df_timestamp = modified_dataframe[['standardized_elapsed_time', '@@case_index']]

In [21]:
def generate_sliding_windows(df, case_id_column='@@case_index', window_size=5):
    """Cut every case into overlapping (input window, next-event target) pairs.

    `window_size` counts the target event: each input window holds
    `window_size - 1` consecutive rows of a case and the target is the row
    that immediately follows them. Cases with fewer than `window_size` rows
    contribute no windows. Cases are processed in order of first appearance.

    Parameters:
    - df: DataFrame holding the feature column(s) plus `case_id_column`.
    - case_id_column: name of the column identifying the case of each row.
    - window_size: number of rows per window including the target (>= 1).

    Returns:
    - windows_array: array of input windows, one entry per window.
    - targets_array: array with the target row of each window.
    - case_indices_array: the case id each window was taken from.

    Raises:
    - ValueError: if `window_size` is smaller than 1 (the slicing below
      would otherwise wrap around and return wrong targets).
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1.")

    windows = []
    targets = []
    case_indices = []
    history_length = window_size - 1

    # One groupby pass instead of filtering the whole frame once per case;
    # sort=False keeps the cases in order of first appearance.
    for case_id, case_data in df.groupby(case_id_column, sort=False):
        case_data_array = case_data.drop(columns=[case_id_column]).to_numpy()

        # The range is empty when the case is shorter than window_size
        for start in range(len(case_data_array) - window_size + 1):
            target_position = start + history_length
            windows.append(case_data_array[start:target_position])
            targets.append(case_data_array[target_position])
            case_indices.append(case_id)  # remember which case the window came from

    windows_array = np.array(windows)
    targets_array = np.array(targets)
    case_indices_array = np.array(case_indices)

    return windows_array, targets_array, case_indices_array

In [22]:
# Build windows per attribute. All five frames come from the same rows of
# modified_dataframe, so the returned case_indices (overwritten on every call) is the
# same array each time and window i refers to the same events across attributes.
windows_id, targets_id, case_indices = generate_sliding_windows(df_id)
windows_resource, targets_resource, case_indices = generate_sliding_windows(df_resource)
windows_activity, targets_activity, case_indices = generate_sliding_windows(df_activity)
windows_role, targets_role, case_indices = generate_sliding_windows(df_role)
windows_timestamp, targets_timestamp, case_indices = generate_sliding_windows(df_timestamp)

# LSTM

### Architecture

In [23]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Concatenate, Embedding, BatchNormalization

# Vocabulary sizes for the embeddings and softmax heads, counted on the integer-encoded
# columns of modified_dataframe
num_resources = modified_dataframe['org:resource'].nunique()
num_activities = modified_dataframe['concept:name'].nunique()
num_roles = modified_dataframe['org:role'].nunique()
embedding_dim_resource = 50
embedding_dim_activity = 50
embedding_dim_role = 50

# Number of events per input window: the sliding windows were built with the default
# window_size=5, i.e. 4 input events followed by 1 target event
time_steps = 4

# Input layers: one integer sequence per categorical attribute, one float sequence for time
input_resource = Input(shape=(time_steps,), name='input_resource')
input_activity = Input(shape=(time_steps,), name='input_activity')
input_role = Input(shape=(time_steps,), name='input_role')
input_timestamp = Input(shape=(time_steps, 1), name='input_timestamp')

# Embedding layers
embedding_resource = Embedding(input_dim=num_resources, output_dim=embedding_dim_resource, input_length=time_steps)(input_resource)
embedding_activity = Embedding(input_dim=num_activities, output_dim=embedding_dim_activity, input_length=time_steps)(input_activity)
embedding_role = Embedding(input_dim=num_roles, output_dim=embedding_dim_role, input_length=time_steps)(input_role)

# LSTM layers: each attribute gets its own LSTM that returns only its final state
lstm_resource = LSTM(25, return_sequences=False)(embedding_resource)
lstm_activity = LSTM(25, return_sequences=False)(embedding_activity)
lstm_role = LSTM(25, return_sequences=False)(embedding_role)

# Normalize the numeric input for better performance
batch_norm_timestamp = BatchNormalization()(input_timestamp)
lstm_timestamp = LSTM(25, return_sequences=False)(batch_norm_timestamp)

# Concatenate the four LSTM outputs into one shared representation for all heads
concatenated = Concatenate(axis=-1)([lstm_resource, lstm_activity, lstm_role, lstm_timestamp])

# Output layers for classification
output_resource = Dense(num_resources, activation='softmax', name='output_resource')(concatenated)
output_activity = Dense(num_activities, activation='softmax', name='output_activity')(concatenated)
output_role = Dense(num_roles, activation='softmax', name='output_role')(concatenated)

# Additional output layer for the timestamp (regression)
output_timestamp = Dense(1, activation='linear', name='output_timestamp')(concatenated)  # For predicting timestamp as a continuous variable

# Create and compile the model
model = Model(inputs=[input_resource, input_activity, input_role, input_timestamp], 
              outputs=[output_resource, output_activity, output_role, output_timestamp])

# The losses are keyed by output layer name: cross-entropy for the three softmax heads
# (their targets must therefore be one-hot encoded) and MSE for the time regression head
model.compile(optimizer='adam', 
              loss={'output_resource': 'categorical_crossentropy',
                    'output_activity': 'categorical_crossentropy',
                    'output_role': 'categorical_crossentropy',
                    'output_timestamp': 'mean_squared_error'},  # Adding MSE loss for the timestamp
              metrics={'output_resource': 'accuracy',
                       'output_activity': 'accuracy',
                       'output_role': 'accuracy',
                       'output_timestamp': None})  # Metrics for regression might include MSE, MAE, etc., but it's optional here

model.summary()

2024-08-19 22:09:39.842063: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_resource (InputLayer  [(None, 4)]                  0         []                            
 )                                                                                                
                                                                                                  
 input_activity (InputLayer  [(None, 4)]                  0         []                            
 )                                                                                                
                                                                                                  
 input_role (InputLayer)     [(None, 4)]                  0         []                            
                                                                                              

# Data Splitting

In [24]:
from sklearn.model_selection import train_test_split

# Split all window/target arrays in ONE call so that every attribute is guaranteed to use
# exactly the same 70/30 shuffle (window i stays aligned across resource, activity, role
# and timestamp). train_test_split also checks that all arrays have the same length.
# For each input array it returns the train part followed by the test part, in input order.
(train_resource, test_resource, train_targets_resource, test_targets_resource,
 train_activity, test_activity, train_targets_activity, test_targets_activity,
 train_role, test_role, train_targets_role, test_targets_role,
 train_timestamp, test_timestamp, train_targets_timestamp, test_targets_timestamp) = train_test_split(
    windows_resource, targets_resource,
    windows_activity, targets_activity,
    windows_role, targets_role,
    windows_timestamp, targets_timestamp,
    test_size=0.3, random_state=42)

# Training

In [25]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import LearningRateScheduler

# EarlyStopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

def cyclic_lr(epoch, lr):
    """Triangular cyclic learning-rate schedule.

    The rate rises linearly from 0.001 to 0.01 over 10 epochs, falls back
    over the next 10, and then repeats. The incoming `lr` is ignored; the
    returned rate depends on `epoch` only.
    """
    lowest_rate, highest_rate = 0.001, 0.01
    half_cycle_epochs = 10

    # Index (1-based) of the cycle that `epoch` falls into
    cycle_number = np.floor(1 + epoch / (2 * half_cycle_epochs))
    # Distance from the peak of the current cycle, measured in half-cycles
    distance_from_peak = np.abs(epoch / half_cycle_epochs - 2 * cycle_number + 1)

    return lowest_rate + (highest_rate - lowest_rate) * np.maximum(0, (1 - distance_from_peak))

lr_scheduler = LearningRateScheduler(cyclic_lr)

In [26]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the categorical targets, as required by the categorical_crossentropy losses
train_targets_resource_cat = to_categorical(train_targets_resource, num_classes=num_resources)
test_targets_resource_cat = to_categorical(test_targets_resource, num_classes=num_resources)

train_targets_activity_cat = to_categorical(train_targets_activity, num_classes=num_activities)
test_targets_activity_cat = to_categorical(test_targets_activity, num_classes=num_activities)

train_targets_role_cat = to_categorical(train_targets_role, num_classes=num_roles)
test_targets_role_cat = to_categorical(test_targets_role, num_classes=num_roles)

# The timestamp is a regression target, so it is passed through unchanged
# (it was already standardized during preprocessing).

# NOTE(review): the `early_stopping` callback defined in the previous cell is not passed
# to fit(); only the learning-rate scheduler is active, so all 25 epochs always run.
# NOTE(review): the test split doubles as validation data here, so the later
# model.evaluate() on the same split is not an independent estimate.
history = model.fit([train_resource, train_activity, train_role, train_timestamp], 
                    [train_targets_resource_cat, train_targets_activity_cat, train_targets_role_cat, train_targets_timestamp],
                    epochs=25,
                    batch_size=64,
                    validation_data=([test_resource, test_activity, test_role, test_timestamp], 
                                     [test_targets_resource_cat, test_targets_activity_cat, test_targets_role_cat, test_targets_timestamp]),
                    verbose=1,
                    callbacks=[lr_scheduler])

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [27]:
# Evaluate on the test split. The printed list has 8 numbers; presumably the total loss,
# the four per-output losses (resource, activity, role, timestamp) and the three
# accuracies (resource, activity, role) -- confirm against model.metrics_names.
evaluation = model.evaluate(
    [test_resource, test_activity, test_role, test_timestamp],
    [test_targets_resource_cat, test_targets_activity_cat, test_targets_role_cat, test_targets_timestamp]
)

print(evaluation)

[0.7043089270591736, 0.024570632725954056, 0.06807364523410797, 0.0438777320086956, 0.5677869915962219, 0.9910389184951782, 0.9750770330429077, 0.9803043007850647]


# Anomaly Score Computation

In [29]:
# Generate predictions for ALL windows (train and test alike)
predictions = model.predict([windows_resource, windows_activity, windows_role, windows_timestamp])

# The model returns one array per output, in the order the outputs were passed to Model():
# resource, activity, role (softmax probabilities) and timestamp (regression value)
predictions_resource = predictions[0] # Resource predictions
predictions_activity = predictions[1] # Activity predictions
predictions_role = predictions[2]     # Role predictions

# Predicted standardized elapsed time of the next event
predictions_timestamp = predictions[3]  # Assuming 'timestamp' is a regression target and the model is adjusted accordingly



In [30]:
import numpy as np
from tensorflow.keras.utils import to_categorical

def compute_anomaly_scores(predictions, actuals):
    """Score how unexpected the observed class is under the predicted distribution.

    For each sample the score is (p_max - p_actual) / p_max, where p_max is
    the highest predicted probability and p_actual the probability assigned
    to the class that actually occurred. The score is 0 when the actual class
    is the most likely one and approaches 1 as p_actual goes to 0.

    Parameters:
    - predictions: array of class probabilities with the classes on the last axis.
    - actuals: integer class labels, one per sample; a trailing axis of
      length 1 (e.g. shape (n, 1)) is accepted.

    Returns:
    - numpy array of anomaly scores, one per sample.

    Raises:
    - ValueError: if the number of labels does not match the number of samples.
    - IndexError: if a label is outside the range of predicted classes.
    """
    predictions = np.asarray(predictions)

    # Look up the probability of the actual class directly instead of building a dense
    # one-hot matrix; reshape fails loudly when labels and predictions do not line up.
    actual_index = np.asarray(actuals).astype(np.int64).reshape(predictions.shape[:-1] + (1,))
    actual_predictions = np.take_along_axis(predictions, actual_index, axis=-1)[..., 0]

    max_predictions = np.max(predictions, axis=-1)

    anomaly_scores = (max_predictions - actual_predictions) / max_predictions

    return anomaly_scores

# targets_resource, targets_activity and targets_role hold the true next-event class of
# every window, in the same order as the predictions
anomaly_scores_resource = compute_anomaly_scores(predictions_resource, targets_resource)
anomaly_scores_activity = compute_anomaly_scores(predictions_activity, targets_activity)
anomaly_scores_role = compute_anomaly_scores(predictions_role, targets_role)

In [31]:
def compute_anomaly_scores_continuous(predictions, actuals, normalization_factor):
    """
    Score continuous predictions by their scaled absolute error.

    Parameters:
    - predictions: numpy array of predicted values.
    - actuals: numpy array of actual values (combined with predictions elementwise).
    - normalization_factor: value the absolute error is divided by
      (e.g., standard deviation of the attribute).

    Returns:
    - numpy array holding |predictions - actuals| / normalization_factor.
    """
    return np.abs(predictions - actuals) / normalization_factor

In [32]:
# Scale the absolute time error by the standard deviation of the true target values over all windows
normalization_factor = np.std(targets_timestamp)       # Example normalization factor (standard deviation)
anomaly_scores_timestamp = compute_anomaly_scores_continuous(predictions_timestamp, targets_timestamp, normalization_factor)

In [33]:
import numpy as np

def classify_cases(anomaly_scores_resource, anomaly_scores_activity, anomaly_scores_timestamp, anomaly_scores_role,  threshold=0.98):
    """Flag every entry whose largest per-attribute anomaly score exceeds `threshold`.

    Parameters:
    - anomaly_scores_resource, anomaly_scores_activity, anomaly_scores_timestamp,
      anomaly_scores_role: array-likes of anomaly scores; each is flattened to
      one dimension and all four must then have the same length.
    - threshold: an entry is flagged when its maximum score is strictly greater
      than this value (default 0.98).

    Returns:
    - boolean numpy array, True where the entry is classified as anomalous.

    Raises:
    - ValueError: if the four score arrays do not have the same length.
    """
    # Flatten everything to 1-D so that e.g. (n, 1) regression scores line up with (n,) scores
    flattened_scores = [
        np.array(anomaly_scores_resource).flatten(),
        np.array(anomaly_scores_activity).flatten(),
        np.array(anomaly_scores_timestamp).flatten(),
        np.array(anomaly_scores_role).flatten(),
    ]

    # All four arrays must have the same length. (The previous check had a misplaced
    # parenthesis and never compared the length of the role scores.)
    if len({len(scores) for scores in flattened_scores}) != 1:
        raise ValueError("All input anomaly scores must have the same length.")

    # Element-wise maximum across the four attributes
    max_scores = np.maximum.reduce(flattened_scores)

    # Anomalous when the maximum anomaly score exceeds the threshold
    anomalous_cases = max_scores > threshold

    return anomalous_cases

# Flag every window using its resource, activity, timestamp and role anomaly scores
# (the result has one entry per sliding window, not yet one per case)
anomalous_cases = classify_cases(anomaly_scores_resource, anomaly_scores_activity, anomaly_scores_timestamp, anomaly_scores_role)

# Mapping

### True: anomaly, False: no anomaly

In [34]:
# One row per sliding window: the case the window was taken from and whether it was flagged
mapping = pd.DataFrame({'case': case_indices})
mapping['predicted'] = anomalous_cases
mapping

Unnamed: 0,case,predicted
0,0,False
1,0,False
2,1,False
3,1,False
4,1,False
...,...,...
35705,10498,False
35706,10498,False
35707,10499,False
35708,10499,False


In [35]:
# A case is predicted anomalous as soon as any one of its windows was flagged
case_prediction = mapping.groupby('case')['predicted'].any()
case_prediction


case
0        False
1        False
2        False
3        False
4        False
         ...  
10495    False
10496    False
10497    False
10498    False
10499    False
Name: predicted, Length: 10500, dtype: bool

# Ground Truth

In [36]:
def generate_alignments_adjusted_tracecost_pkl(log, net, initial_marking, final_marking):
    """Align every trace of `log` against the Petri net using position-dependent costs.

    Two cost lists of length 2 * (longest trace length) are passed to pm4py's
    state-equation A* alignment variant: the sync cost at position i is 0.1 * i,
    and the trace cost at position i is STD_MODEL_LOG_MOVE_COST - 0.1 * i.

    Parameters:
    - log: iterable of traces, each an iterable of events.
    - net, initial_marking, final_marking: the Petri net and its markings.

    Returns:
    - whatever `alignments.apply_log` returns for the log (one alignment per trace,
      as consumed by the conformance extraction later in this notebook).
    """
    from pm4py.algo.conformance.alignments.petri_net import algorithm as alignments
    from pm4py.algo.conformance.alignments.petri_net import variants
    from pm4py.objects.petri_net.utils import align_utils

    # Length of the longest trace, counted by iterating its events (0 for an empty log)
    max_events = max((sum(1 for _ in trace) for trace in log), default=0)

    cost_positions = range(max_events * 2)
    a_star_parameters = alignments.Variants.VERSION_STATE_EQUATION_A_STAR.value.Parameters
    parameters = {
        a_star_parameters.PARAM_SYNC_COST_FUNCTION: [.1 * i for i in cost_positions],
        a_star_parameters.PARAM_TRACE_COST_FUNCTION: [align_utils.STD_MODEL_LOG_MOVE_COST - .1 * i for i in cost_positions],
    }

    return alignments.apply_log(log, net, initial_marking, final_marking, variant=variants.state_equation_a_star, parameters=parameters)

In [37]:
import pm4py
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.objects.bpmn.importer import importer as bpmn_importer
from pm4py.algo.conformance.alignments.petri_net import algorithm as alignments_petri

# Import the reference BPMN model (note: the file name contains a space after "Model_")
bpmn_graph = bpmn_importer.apply("../../data/model/Model_ DomesticDeclarations.bpmn")

# Convert the BPMN model to a Petri net with its initial and final markings
net, initial_marking, final_marking = pm4py.convert_to_petri_net(bpmn_graph)

# Align the event log built in the first cell against the Petri net
aligned_traces = generate_alignments_adjusted_tracecost_pkl(log, net, initial_marking, final_marking)

aligning log, completed variants ::   0%|          | 0/99 [00:00<?, ?it/s]

In [38]:
def extract_conformance_status_by_fitness(aligned_traces):
    """Turn alignment results into binary deviation labels.

    Each alignment must provide a 'fitness' entry. A fitness of exactly 1.0
    yields 0 (conforming); any other fitness yields 1 (deviating).

    Returns:
    - list of 0/1 labels in the same order as `aligned_traces`.
    """
    return [0 if alignment['fitness'] == 1.0 else 1 for alignment in aligned_traces]

# Get the conformance status list from the aligned traces
conformance = extract_conformance_status_by_fitness(aligned_traces)

In [39]:
# One row per aligned trace; despite its name, 'conformity' is 1 for a DEVIATING trace
# (alignment fitness other than 1.0) and 0 for a perfectly fitting one
ground_truth = pd.DataFrame({'conformity': conformance})

# Assignment aligns on the index: row i receives the prediction of case index i
ground_truth['predicted'] = case_prediction

# Convert False to 0 and True to 1
ground_truth['predicted'] = ground_truth['predicted'].map(int)
ground_truth

Unnamed: 0,conformity,predicted
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0
...,...,...
10495,0,0
10496,0,0
10497,0,0
10498,0,0


# Evaluation

In [40]:
# Confusion-matrix counts; the positive class (1) is "deviating / anomalous"
TP = (ground_truth['conformity'].eq(1) & ground_truth['predicted'].eq(1)).sum()
TN = (ground_truth['conformity'].eq(0) & ground_truth['predicted'].eq(0)).sum()
FP = (ground_truth['conformity'].eq(0) & ground_truth['predicted'].eq(1)).sum()
FN = (ground_truth['conformity'].eq(1) & ground_truth['predicted'].eq(0)).sum()

In [41]:
# Share of the cases flagged as anomalous that really deviate from the model
precision_dev = TP / (TP + FP)
print(f"Precision Dev: {precision_dev:.2f}")

Precision Dev: 0.50


In [42]:
# Share of the truly deviating cases that were flagged as anomalous
recall_dev = TP / (TP + FN)
print(f"Recall Dev: {recall_dev:.2f}")

Recall Dev: 0.08


In [43]:
# Share of the cases predicted as normal that really conform to the model
precision_no_dev = TN / (TN + FN)
print(f"Precision No Dev: {precision_no_dev:.2f}")

Precision No Dev: 0.71


In [44]:
# Share of the truly conforming cases that were predicted as normal
recall_no_dev = TN / (TN + FP)
print(f"Recall No Dev: {recall_no_dev:.2f}")

Recall No Dev: 0.97


In [45]:
from sklearn.metrics import roc_auc_score

# NOTE(review): 'predicted' holds hard 0/1 labels, not continuous scores, so this AUC is
# computed from a single operating point; passing the per-case maximum anomaly score
# instead would give a threshold-independent AUC.
auc_roc = roc_auc_score(ground_truth['conformity'], ground_truth['predicted'])
print(f"AUC-ROC: {auc_roc:.2f}")

AUC-ROC: 0.52
