# Import Event Log

In [2]:
import pandas as pd
import numpy as np
import pm4py
from pm4py.objects.conversion.log import converter as log_converter

if __name__ == "__main__":
    # Load the XES event log (used as a pandas DataFrame below)
    dataframe_log = pm4py.read_xes('../../data/logs/RequestForPayment.xes')

    # Number the cases 0..n-1 in order of first appearance and store that
    # number on every event as @@case_index
    case_indices = {
        name: position
        for position, name in enumerate(dataframe_log['case:concept:name'].unique())
    }
    dataframe_log['@@case_index'] = dataframe_log['case:concept:name'].map(case_indices)

    # Build the pm4py event-log object from the DataFrame (needed for the alignments later on)
    log = log_converter.apply(dataframe_log)

dataframe_log



parsing log, completed traces ::   0%|          | 0/6886 [00:00<?, ?it/s]

Unnamed: 0,id,org:resource,concept:name,time:timestamp,org:role,case:Rfp_id,case:Project,case:Task,case:concept:name,case:OrganizationalEntity,case:Cost Type,case:RequestedAmount,case:Activity,case:RfpNumber,@@case_index
0,st_step 148220_0,STAFF MEMBER,Request For Payment SUBMITTED by EMPLOYEE,2017-01-09 08:17:18+00:00,EMPLOYEE,request for payment 148214,project 148216,UNKNOWN,request for payment 148214,organizational unit 65463,0,34.336343,UNKNOWN,request for payment number 148215,0
1,st_step 148221_0,STAFF MEMBER,Request For Payment FINAL_APPROVED by SUPERVISOR,2017-01-09 08:18:00+00:00,SUPERVISOR,request for payment 148214,project 148216,UNKNOWN,request for payment 148214,organizational unit 65463,0,34.336343,UNKNOWN,request for payment number 148215,0
2,st_step 148222_0,STAFF MEMBER,Request For Payment REJECTED by MISSING,2017-01-10 11:42:32+00:00,MISSING,request for payment 148214,project 148216,UNKNOWN,request for payment 148214,organizational unit 65463,0,34.336343,UNKNOWN,request for payment number 148215,0
3,st_step 148219_0,STAFF MEMBER,Request For Payment SUBMITTED by EMPLOYEE,2017-03-03 08:51:13+00:00,EMPLOYEE,request for payment 148214,project 148216,UNKNOWN,request for payment 148214,organizational unit 65463,0,34.336343,UNKNOWN,request for payment number 148215,0
4,st_step 148218_0,STAFF MEMBER,Request For Payment APPROVED by PRE_APPROVER,2017-03-03 08:51:42+00:00,PRE_APPROVER,request for payment 148214,project 148216,UNKNOWN,request for payment 148214,organizational unit 65463,0,34.336343,UNKNOWN,request for payment number 148215,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36791,st_step 185004_0,STAFF MEMBER,Request For Payment APPROVED by ADMINISTRATION,2018-12-29 11:35:02+00:00,ADMINISTRATION,request for payment 185000,project 147860,task 152704,request for payment 185000,organizational unit 65468,0,15.409660,activity 505,request for payment number 185001,6885
36792,st_step 185003_0,STAFF MEMBER,Request For Payment APPROVED by BUDGET OWNER,2019-01-03 08:27:20+00:00,BUDGET OWNER,request for payment 185000,project 147860,task 152704,request for payment 185000,organizational unit 65468,0,15.409660,activity 505,request for payment number 185001,6885
36793,st_step 185005_0,STAFF MEMBER,Request For Payment FINAL_APPROVED by SUPERVISOR,2019-01-08 08:00:39+00:00,SUPERVISOR,request for payment 185000,project 147860,task 152704,request for payment 185000,organizational unit 65468,0,15.409660,activity 505,request for payment number 185001,6885
36794,rp_request for payment 185000_15,SYSTEM,Request Payment,2019-01-08 08:29:14+00:00,UNDEFINED,request for payment 185000,project 147860,task 152704,request for payment 185000,organizational unit 65468,0,15.409660,activity 505,request for payment number 185001,6885


# Drop unnecessary columns

In [3]:
# Case-level attributes that are not used as model inputs. 'case:concept:name'
# can be dropped as well because '@@case_index' (derived from it) already
# identifies the case of every event.
case_columns_to_drop = [
    'case:Rfp_id',
    'case:Project',
    'case:Task',
    'case:concept:name',
    'case:OrganizationalEntity',
    'case:Cost Type',
    'case:RequestedAmount',
    'case:Activity',
    'case:RfpNumber',
]

# One drop call instead of nine copy-pasted cells; a misspelled or already
# removed column still raises KeyError, exactly like the individual calls did.
dataframe_log = dataframe_log.drop(columns=case_columns_to_drop)

In [12]:
from sklearn.preprocessing import StandardScaler

# Parse the event timestamps
timestamps = pd.to_datetime(dataframe_log['time:timestamp'])

# Seconds elapsed since the first event of the same case
case_start = timestamps.groupby(dataframe_log['@@case_index']).transform('min')
elapsed_seconds = (timestamps - case_start).dt.total_seconds()

# Standardize the elapsed seconds (zero mean, unit variance over all events)
scaler = StandardScaler()
dataframe_log['standardized_elapsed_time'] = scaler.fit_transform(elapsed_seconds.to_frame('elapsed_time'))

# Only the standardized value is kept; the raw timestamp is no longer needed
dataframe_log = dataframe_log.drop(columns=['time:timestamp'])

# Insert Start & End markers

In [13]:
# Define a function to insert start and end markers
def add_markers(df):
    """Surround the events of every case with a 'Start' and an 'End' marker row.

    Parameters:
    - df: event DataFrame containing a '@@case_index' column.

    Returns:
    - a new DataFrame with a fresh RangeIndex and the same columns as `df`.
      Cases appear in order of first appearance in `df`, events keep their
      original order inside a case. In the marker rows every column except
      '@@case_index' holds the string 'Start' / 'End', so numeric columns
      come back with mixed content and have to be cleaned up by the caller.
    """
    case_column = '@@case_index'
    new_rows = []

    # groupby(sort=False) visits the cases in order of first appearance and
    # splits the frame in one pass, instead of re-filtering the whole frame
    # once per case.
    for case_index, case_events in df.groupby(case_column, sort=False):
        start_row = {col: 'Start' if col != case_column else case_index for col in df.columns}
        end_row = {col: 'End' if col != case_column else case_index for col in df.columns}

        new_rows.append(start_row)
        new_rows.extend(case_events.to_dict('records'))
        new_rows.append(end_row)

    # Passing the columns explicitly keeps the schema even when df has no rows
    return pd.DataFrame(new_rows, columns=df.columns)

# Wrap every case in 'Start'/'End' marker rows; add_markers returns a new DataFrame
modified_dataframe = add_markers(dataframe_log)

In [14]:
modified_dataframe

Unnamed: 0,id,org:resource,concept:name,org:role,@@case_index,standardized_elapsed_time
0,Start,Start,Start,Start,0,Start
1,st_step 148220_0,STAFF MEMBER,Request For Payment SUBMITTED by EMPLOYEE,EMPLOYEE,0,-0.411261
2,st_step 148221_0,STAFF MEMBER,Request For Payment FINAL_APPROVED by SUPERVISOR,SUPERVISOR,0,-0.411227
3,st_step 148222_0,STAFF MEMBER,Request For Payment REJECTED by MISSING,MISSING,0,-0.330864
4,st_step 148219_0,STAFF MEMBER,Request For Payment SUBMITTED by EMPLOYEE,EMPLOYEE,0,3.319916
...,...,...,...,...,...,...
50563,st_step 185003_0,STAFF MEMBER,Request For Payment APPROVED by BUDGET OWNER,BUDGET OWNER,6885,-0.068584
50564,st_step 185005_0,STAFF MEMBER,Request For Payment FINAL_APPROVED by SUPERVISOR,SUPERVISOR,6885,0.281954
50565,rp_request for payment 185000_15,SYSTEM,Request Payment,UNDEFINED,6885,0.283351
50566,rp_request for payment 185000_16,SYSTEM,Payment Handled,UNDEFINED,6885,0.447637


# Preprocess

In [15]:
# add_markers wrote the strings 'Start'/'End' into this column on the marker
# rows, which leaves it with object dtype. Map the markers to 0 and 1 and cast
# explicitly to float instead of relying on replace() to downcast the column.
marker_times = {'Start': 0.0, 'End': 1.0}
modified_dataframe['standardized_elapsed_time'] = (
    modified_dataframe['standardized_elapsed_time']
    .map(lambda value: marker_times.get(value, value))
    .astype(float)
)

In [16]:
# Integer-encode the categorical attributes. pd.factorize numbers the values in
# order of first appearance, so the 'Start' marker of the first row receives
# code 0 in every column.
category_uniques = {}
for column in ['id', 'org:resource', 'concept:name', 'org:role']:
    codes, uniques = pd.factorize(modified_dataframe[column])
    modified_dataframe[column] = codes
    # Keep the code -> label lookup; it is needed to decode predictions later
    category_uniques[column] = uniques

In [20]:
modified_dataframe

Unnamed: 0,id,org:resource,concept:name,org:role,@@case_index,standardized_elapsed_time
0,0,0,0,0,0,0.000000
1,1,1,1,1,0,-0.411261
2,2,1,2,2,0,-0.411227
3,3,1,3,3,0,-0.330864
4,4,1,1,1,0,3.319916
...,...,...,...,...,...,...
50563,36794,1,16,9,6885,-0.068584
50564,36795,1,2,2,6885,0.281954
50565,36796,2,5,5,6885,0.283351
50566,36797,2,6,5,6885,0.447637


### Padding for cases with fewer than 5 events

In [21]:
# Number of rows per case (events plus the Start/End marker rows)
frequency = modified_dataframe['@@case_index'].value_counts()

# Size of the shortest case; cases shorter than the window size need padding
min_occurrence = frequency.min()

min_occurrence

3

In [22]:
def add_rows(group, num_rows, case_index_value):
    """Append `num_rows` padding rows to the rows of one case.

    Parameters:
    - group: DataFrame holding the rows of a single case.
    - num_rows: number of padding rows to append.
    - case_index_value: value written into '@@case_index' of the padding rows.

    Returns:
    - a new DataFrame (fresh RangeIndex) with the padding rows at the end.
      Padding rows hold 0 in numeric columns and '' in all other columns.

    NOTE(review): once the attributes are factorized, 0 is also the code of the
    'Start' marker, so padding rows look like 'Start' events - confirm that
    this is intended.
    """
    fill_values = {
        column: 0 if pd.api.types.is_numeric_dtype(group[column]) else ''
        for column in group.columns
    }
    additional_rows = pd.DataFrame(fill_values, index=range(num_rows))

    # The padding rows belong to the same case as the rows they extend
    additional_rows['@@case_index'] = case_index_value

    return pd.concat([group, additional_rows], ignore_index=True)


def pad_cases(df, min_events=5):
    """Pad every case so that it has at least `min_events` rows.

    Parameters:
    - df: DataFrame with a '@@case_index' column.
    - min_events: minimum number of rows per case. Defaults to 5; it should
      match the window size used when generating sliding windows.

    Returns:
    - a new DataFrame with a fresh RangeIndex, cases ordered by '@@case_index'
      (groupby sorts the keys) and padding rows appended after the existing
      rows of each short case.
    """
    padded_groups = []

    for name, group in df.groupby('@@case_index'):
        events_to_add = min_events - len(group)

        if events_to_add > 0:
            group = add_rows(group, events_to_add, name)

        padded_groups.append(group)

    # pd.concat raises on an empty list, so hand back an empty copy instead
    if not padded_groups:
        return df.reset_index(drop=True)

    return pd.concat(padded_groups, ignore_index=True)

In [23]:
# Pad every case to at least 5 rows (the default of pad_cases) and display the result
modified_dataframe = pad_cases(modified_dataframe)
modified_dataframe

Unnamed: 0,id,org:resource,concept:name,org:role,@@case_index,standardized_elapsed_time
0,0,0,0,0,0,0.000000
1,1,1,1,1,0,-0.411261
2,2,1,2,2,0,-0.411227
3,3,1,3,3,0,-0.330864
4,4,1,1,1,0,3.319916
...,...,...,...,...,...,...
50715,36794,1,16,9,6885,-0.068584
50716,36795,1,2,2,6885,0.281954
50717,36796,2,5,5,6885,0.283351
50718,36797,2,6,5,6885,0.447637


# Generate sliding windows

In [24]:
# One two-column frame per attribute: the attribute itself plus the case index
# that the window generator groups by
df_id = modified_dataframe.loc[:, ['id', '@@case_index']]
df_resource = modified_dataframe.loc[:, ['org:resource', '@@case_index']]
df_activity = modified_dataframe.loc[:, ['concept:name', '@@case_index']]
df_role = modified_dataframe.loc[:, ['org:role', '@@case_index']]
df_timestamps = modified_dataframe.loc[:, ['standardized_elapsed_time', '@@case_index']]

In [25]:
def generate_sliding_windows(df, case_id_column='@@case_index', window_size=5):
    """Cut every case into overlapping (input window, next event) pairs.

    A window of `window_size` consecutive events is split into the first
    `window_size - 1` events (model input) and the last event (target).
    Windows never span two cases; cases with fewer than `window_size` rows
    produce no window.

    Parameters:
    - df: DataFrame with the feature column(s) and `case_id_column`.
    - case_id_column: name of the column identifying the case of each row.
    - window_size: number of events per window including the target event.

    Returns:
    - windows: array of shape (n_windows, window_size - 1, n_features)
    - targets: array of shape (n_windows, n_features)
    - case indices: array of shape (n_windows,) with the case of each window
    When no window can be built, all three arrays are empty with shape (0,).

    Raises:
    - ValueError: if `window_size` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1.")

    windows = []
    targets = []
    case_indices = []
    input_length = window_size - 1

    # groupby(sort=False) yields the cases in order of first appearance and
    # splits the frame once, instead of filtering the whole frame per case.
    for case_id, case_data in df.groupby(case_id_column, sort=False):
        case_data_array = case_data.drop(columns=[case_id_column]).to_numpy()

        # The range is empty for cases shorter than window_size
        for start in range(len(case_data_array) - window_size + 1):
            windows.append(case_data_array[start:start + input_length])
            # The target is the event immediately following the input window
            targets.append(case_data_array[start + input_length])
            case_indices.append(case_id)

    return np.array(windows), np.array(targets), np.array(case_indices)

In [26]:
# Build windows and targets per attribute. `case_indices` is overwritten by every
# call; all five frames share the same '@@case_index' column and window size, so
# each call yields the same case index per window.
windows_id, targets_id, case_indices = generate_sliding_windows(df_id)
windows_resource, targets_resource, case_indices = generate_sliding_windows(df_resource)
windows_activity, targets_activity, case_indices = generate_sliding_windows(df_activity)
windows_role, targets_role, case_indices = generate_sliding_windows(df_role)
windows_timestamps, targets_timestamps, case_indices = generate_sliding_windows(df_timestamps)

# LSTM

### Architecture

- The `id` attribute is not used as a model input or output because the LSTM learned poorly from it.

In [27]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Concatenate, Embedding, BatchNormalization

# Number of classes per categorical attribute. The columns hold the integer
# codes produced by pd.factorize, so the number of distinct codes is the
# vocabulary size.
num_resources = modified_dataframe['org:resource'].nunique()
num_activities = modified_dataframe['concept:name'].nunique()
num_roles = modified_dataframe['org:role'].nunique()

embedding_dim_resource = 50
embedding_dim_activity = 50
embedding_dim_role = 50

# Events per input window: sliding window size 5 minus the target event
time_steps = 4

# Input layers: one integer sequence per categorical attribute, one float sequence for time
input_resource = Input(shape=(time_steps,), name='input_resource')
input_activity = Input(shape=(time_steps,), name='input_activity')
input_role = Input(shape=(time_steps,), name='input_role')
input_timestamps = Input(shape=(time_steps, 1), name='input_timestamps')  # continuous values

# Embedding layers for the categorical attributes. The sequence length is
# already fixed by the Input layers, so the deprecated `input_length` argument
# is not passed.
embedding_resource = Embedding(input_dim=num_resources, output_dim=embedding_dim_resource)(input_resource)
embedding_activity = Embedding(input_dim=num_activities, output_dim=embedding_dim_activity)(input_activity)
embedding_role = Embedding(input_dim=num_roles, output_dim=embedding_dim_role)(input_role)

# One LSTM per attribute, each summarising its window into a 25-dimensional vector
lstm_resource = LSTM(25, return_sequences=False)(embedding_resource)
lstm_activity = LSTM(25, return_sequences=False)(embedding_activity)
lstm_role = LSTM(25, return_sequences=False)(embedding_role)

# Batch-normalize the numeric input before its LSTM
batch_norm_timestamps = BatchNormalization()(input_timestamps)
lstm_timestamps = LSTM(25, return_sequences=False)(batch_norm_timestamps)

# Joint representation of the window shared by all output heads
concatenated = Concatenate(axis=-1)([lstm_resource, lstm_activity, lstm_role, lstm_timestamps])

# Classification heads: probability distribution over the next resource / activity / role
output_resource = Dense(num_resources, activation='softmax', name='output_resource')(concatenated)
output_activity = Dense(num_activities, activation='softmax', name='output_activity')(concatenated)
output_role = Dense(num_roles, activation='softmax', name='output_role')(concatenated)

# Regression head: standardized elapsed time of the next event
output_timestamps = Dense(1, activation='linear', name='output_timestamps')(concatenated)

# Create and compile the model
model = Model(inputs=[input_resource, input_activity, input_role, input_timestamps],
              outputs=[output_resource, output_activity, output_role, output_timestamps])

model.compile(optimizer='adam',
              loss={'output_resource': 'categorical_crossentropy',
                    'output_activity': 'categorical_crossentropy',
                    'output_role': 'categorical_crossentropy',
                    'output_timestamps': 'mean_squared_error'},  # MSE for the regression head
              metrics={'output_resource': 'accuracy',
                       'output_activity': 'accuracy',
                       'output_role': 'accuracy',
                       'output_timestamps': 'mean_absolute_error'})  # MAE for the regression head
model.summary()

2024-08-19 22:02:45.908459: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_resource (InputLayer  [(None, 4)]                  0         []                            
 )                                                                                                
                                                                                                  
 input_activity (InputLayer  [(None, 4)]                  0         []                            
 )                                                                                                
                                                                                                  
 input_role (InputLayer)     [(None, 4)]                  0         []                            
                                                                                              

# Data Splitting

In [28]:
from sklearn.model_selection import train_test_split

# Split all window/target arrays in ONE call: every array is then shuffled with
# the same permutation by construction, and arrays of different length raise an
# error instead of silently ending up misaligned.
# NOTE(review): the split is per window, so windows of the same case can land in
# both the training and the test set - confirm that this is acceptable.
(train_resource, test_resource, train_targets_resource, test_targets_resource,
 train_activity, test_activity, train_targets_activity, test_targets_activity,
 train_role, test_role, train_targets_role, test_targets_role,
 train_timestamps, test_timestamps, train_targets_timestamps, test_targets_timestamps) = train_test_split(
    windows_resource, targets_resource,
    windows_activity, targets_activity,
    windows_role, targets_role,
    windows_timestamps, targets_timestamps,
    test_size=0.3, random_state=42)

# Training

In [29]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import LearningRateScheduler

# EarlyStopping callback: stop after 3 epochs without val_loss improvement and
# restore the best weights.
# NOTE(review): this callback is defined here but is not in the `callbacks`
# list passed to model.fit below, so it currently has no effect.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

def cyclic_lr(epoch, lr):
    """Triangular cyclic learning-rate schedule.

    The rate rises linearly from 0.001 to 0.01 over 10 epochs, falls back over
    the next 10 epochs and then repeats. The current rate `lr` handed in by the
    scheduler callback is ignored; only `epoch` determines the result.
    """
    base_lr, max_lr = 0.001, 0.01
    step_size = 10  # epochs per half cycle

    # Index (1-based) of the cycle this epoch falls into
    cycle = np.floor(1 + epoch / (2 * step_size))
    # Distance from the peak of the cycle: 1 at the cycle borders, 0 at the peak
    x = np.abs(epoch / step_size - 2 * cycle + 1)
    scale = np.maximum(0, (1 - x))
    return base_lr + (max_lr - base_lr) * scale

# Keras callback that sets the learning rate from cyclic_lr at every epoch
lr_scheduler = LearningRateScheduler(cyclic_lr)

In [30]:
import numpy as np
from tensorflow.keras.utils import to_categorical

# One-hot encode the categorical targets to match the softmax outputs
train_targets_resource_cat, test_targets_resource_cat = [
    to_categorical(labels, num_classes=num_resources)
    for labels in (train_targets_resource, test_targets_resource)
]
train_targets_activity_cat, test_targets_activity_cat = [
    to_categorical(labels, num_classes=num_activities)
    for labels in (train_targets_activity, test_targets_activity)
]
train_targets_role_cat, test_targets_role_cat = [
    to_categorical(labels, num_classes=num_roles)
    for labels in (train_targets_role, test_targets_role)
]

# The timestamp targets are used as they are (already standardized upstream)

# Inputs and targets in the order of the model's inputs / outputs
train_inputs = [train_resource, train_activity, train_role, train_timestamps]
train_outputs = [train_targets_resource_cat, train_targets_activity_cat,
                 train_targets_role_cat, train_targets_timestamps]
validation_inputs = [test_resource, test_activity, test_role, test_timestamps]
validation_outputs = [test_targets_resource_cat, test_targets_activity_cat,
                      test_targets_role_cat, test_targets_timestamps]

# Train with the cyclic learning-rate schedule; the test split doubles as validation data
history = model.fit(
    train_inputs,
    train_outputs,
    epochs=25,
    batch_size=64,
    validation_data=(validation_inputs, validation_outputs),
    verbose=1,
    callbacks=[lr_scheduler]
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [31]:
# Evaluate on the held-out windows. The result is a flat list of numbers
# (presumably total loss, per-output losses, then per-output metrics - see
# model.metrics_names to confirm the order).
test_inputs = [test_resource, test_activity, test_role, test_timestamps]
test_outputs = [test_targets_resource_cat, test_targets_activity_cat,
                test_targets_role_cat, test_targets_timestamps]
evaluation = model.evaluate(test_inputs, test_outputs)

print(evaluation)

[0.9720749855041504, 0.040459781885147095, 0.0872754380106926, 0.0609469972550869, 0.7833930253982544, 0.9784265756607056, 0.9623184204101562, 0.9687904715538025, 0.3026599586009979]


# Anomaly Score Computation

In [32]:
# Predict the next event for every window (training and test windows alike)
predictions = model.predict([windows_resource, windows_activity, windows_role, windows_timestamps])

# The model returns one array per output head, in the order the heads were declared:
# resource / activity / role class probabilities, then the timestamp regression
(predictions_resource,
 predictions_activity,
 predictions_role,
 predictions_timestamps) = predictions



In [33]:
# Step 2: Compute Anomaly Scores for categorical variables (Resources and Activities)

def compute_anomaly_scores(predictions, actuals):
    """Anomaly score of the observed class relative to the most likely class.

    score = (p_max - p_actual) / p_max, i.e. 0 when the observed class is the
    model's top prediction and close to 1 when the model gave it (almost) no
    probability.

    Parameters:
    - predictions: array of class probabilities, shape (..., n_classes).
    - actuals: integer class indices, shape (...) or (..., 1).

    Returns:
    - array of anomaly scores with shape predictions.shape[:-1].
    """
    predictions = np.asarray(predictions)
    actual_idx = np.asarray(actuals).astype(int)

    # Accept column-shaped targets such as (n, 1)
    if actual_idx.ndim > 1 and actual_idx.shape[-1] == 1:
        actual_idx = actual_idx.reshape(actual_idx.shape[:-1])

    max_predictions = np.max(predictions, axis=-1)

    # Pick the probability of the observed class directly; this avoids building
    # a full one-hot matrix and needs nothing beyond numpy.
    actual_predictions = np.take_along_axis(
        predictions, actual_idx[..., np.newaxis], axis=-1
    )[..., 0]

    anomaly_scores = (max_predictions - actual_predictions) / max_predictions

    return anomaly_scores

# Per-window anomaly scores for the three categorical attributes
anomaly_scores_resource = compute_anomaly_scores(predictions_resource, targets_resource)
anomaly_scores_activity = compute_anomaly_scores(predictions_activity, targets_activity)
anomaly_scores_role = compute_anomaly_scores(predictions_role, targets_role)

In [34]:
def compute_anomaly_scores_continuous(predictions, actuals, normalization_factor):
    """
    Compute anomaly scores for continuous attributes.

    The score is the absolute prediction error divided by `normalization_factor`.

    Parameters:
    - predictions: numpy array of predicted values.
    - actuals: numpy array of actual values. If it has the same number of
      elements as `predictions` but a different shape, e.g. (n,) vs (n, 1),
      it is reshaped to the shape of `predictions`.
    - normalization_factor: normalization factor (e.g., standard deviation of the attribute).

    Returns:
    - numpy array of anomaly scores with the shape of `predictions`.
    """
    predictions = np.asarray(predictions)
    actuals = np.asarray(actuals)

    # Without this, (n, 1) - (n,) would broadcast to an (n, n) matrix of all
    # pairwise differences instead of n element-wise differences.
    if predictions.shape != actuals.shape and predictions.size == actuals.size:
        actuals = actuals.reshape(predictions.shape)

    # Calculate absolute differences
    differences = np.abs(predictions - actuals)

    # Normalize the differences
    anomaly_scores = differences / normalization_factor

    return anomaly_scores

In [35]:
# Scale the absolute timestamp error by the standard deviation of the timestamp targets
normalization_factor = np.std(targets_timestamps)       # Example normalization factor (standard deviation)
anomaly_scores_timestamp = compute_anomaly_scores_continuous(predictions_timestamps, targets_timestamps, normalization_factor)

In [36]:
def classify_cases(anomaly_scores_resource, anomaly_scores_activity, anomaly_scores_role, anomaly_scores_timestamps, threshold=0.98):
    """Flag every position whose largest attribute score exceeds `threshold`.

    Each of the four score inputs is flattened to 1-D; all must then have the
    same length. Returns a boolean array of that length, True where the maximum
    of the four scores at that position is strictly greater than `threshold`.

    Raises ValueError if the flattened inputs differ in length.
    """
    flat_scores = [
        np.array(scores).flatten()
        for scores in (anomaly_scores_resource, anomaly_scores_activity,
                       anomaly_scores_role, anomaly_scores_timestamps)
    ]

    # All attributes must describe the same positions
    if len({len(scores) for scores in flat_scores}) != 1:
        raise ValueError("All input anomaly scores must have the same length.")

    # Element-wise maximum across the four attributes
    max_scores = flat_scores[0]
    for scores in flat_scores[1:]:
        max_scores = np.maximum(max_scores, scores)

    return max_scores > threshold

# Flag each window using the anomaly scores for resource, activity, role and timestamps
# (one boolean per window; the id attribute is not part of the model)
anomalous_cases = classify_cases(anomaly_scores_resource, anomaly_scores_activity, anomaly_scores_role, anomaly_scores_timestamp)

# Mapping

### True: anomaly, False: no anomaly

In [37]:
# One row per window: the case the window belongs to and whether it was flagged
mapping = pd.DataFrame({'case': case_indices, 'predicted': anomalous_cases})
mapping

Unnamed: 0,case,predicted
0,0,True
1,0,False
2,0,False
3,0,True
4,0,True
...,...,...
23171,6884,False
23172,6885,False
23173,6885,False
23174,6885,False


In [38]:
# A case is predicted anomalous if at least one of its windows was flagged
case_prediction = mapping.groupby('case')['predicted'].any()
case_prediction


case
0        True
1       False
2        True
3        True
4        True
        ...  
6881    False
6882    False
6883    False
6884    False
6885    False
Name: predicted, Length: 6886, dtype: bool

# Ground Truth

In [39]:
def generate_alignments_adjusted_tracecost_pkl(log, net, initial_marking, final_marking):
    """Align every trace of `log` against the Petri net with position-dependent costs.

    The cost lists passed to the A* alignment variant have 2 * (length of the
    longest trace) entries: the sync cost at position i is 0.1 * i, the trace
    cost at position i is STD_MODEL_LOG_MOVE_COST - 0.1 * i.

    Returns whatever pm4py's `alignments.apply_log` returns for the log.
    """
    from pm4py.algo.conformance.alignments.petri_net import algorithm as alignments
    from pm4py.algo.conformance.alignments.petri_net import variants
    from pm4py.objects.petri_net.utils import align_utils

    # Number of events in the longest trace (0 for an empty log)
    max_events = max((sum(1 for _ in trace) for trace in log), default=0)

    a_star_parameters = alignments.Variants.VERSION_STATE_EQUATION_A_STAR.value.Parameters
    positions = range(max_events * 2)
    parameters = {
        a_star_parameters.PARAM_SYNC_COST_FUNCTION: [.1 * i for i in positions],
        a_star_parameters.PARAM_TRACE_COST_FUNCTION: [align_utils.STD_MODEL_LOG_MOVE_COST - .1 * i for i in positions],
    }

    return alignments.apply_log(log, net, initial_marking, final_marking,
                                variant=variants.state_equation_a_star, parameters=parameters)

In [40]:
import pm4py
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.objects.bpmn.importer import importer as bpmn_importer
from pm4py.algo.conformance.alignments.petri_net import algorithm as alignments_petri

# Import the given BPMN reference model
bpmn_graph = bpmn_importer.apply("../../data/model/Model_RequestForPayment.bpmn")

# Convert the BPMN model to a Petri net with its initial and final marking
net, initial_marking, final_marking = pm4py.convert_to_petri_net(bpmn_graph)

# Align the traces of the event log against the Petri net
aligned_traces = generate_alignments_adjusted_tracecost_pkl(log, net, initial_marking, final_marking)

aligning log, completed variants ::   0%|          | 0/89 [00:00<?, ?it/s]

In [41]:
def extract_conformance_status_by_fitness(aligned_traces):
    """Translate alignment fitness into a 0/1 deviation label per trace.

    Returns a list with one entry per alignment, in input order: 0 when the
    alignment's 'fitness' equals 1.0 (conforming trace), 1 otherwise.
    """
    return [0 if alignment['fitness'] == 1.0 else 1 for alignment in aligned_traces]

# Ground-truth label per aligned trace: 0 = conforming (fitness 1.0), 1 = deviating
conformance = extract_conformance_status_by_fitness(aligned_traces)

In [42]:
# Ground-truth labels next to the per-case predictions. The assignment aligns
# case_prediction on the index, i.e. row i receives the prediction of case i.
ground_truth = pd.DataFrame({'conformity': conformance})
ground_truth['predicted'] = case_prediction

# Booleans become integers: False -> 0, True -> 1
ground_truth['predicted'] = ground_truth['predicted'].astype('int64')
ground_truth

Unnamed: 0,conformity,predicted
0,1,1
1,1,0
2,1,1
3,1,1
4,1,1
...,...,...
6881,0,0
6882,0,0
6883,0,0
6884,0,0


# Evaluation

In [43]:
# Confusion-matrix counts with "deviating" (1) as the positive class
is_deviating = ground_truth['conformity'] == 1
is_conforming = ground_truth['conformity'] == 0
flagged = ground_truth['predicted'] == 1
not_flagged = ground_truth['predicted'] == 0

TP = (is_deviating & flagged).sum()
TN = (is_conforming & not_flagged).sum()
FP = (is_conforming & flagged).sum()
FN = (is_deviating & not_flagged).sum()

In [44]:
# Precision for the deviating class: share of flagged cases that really deviate
precision_dev = TP / (TP + FP)
print(f"Precision Dev: {precision_dev:.2f}")

Precision Dev: 0.58


In [45]:
# Recall for the deviating class: share of deviating cases that were flagged
recall_dev = TP / (TP + FN)
print(f"Recall Dev: {recall_dev:.2f}")

Recall Dev: 0.18


In [46]:
# Precision for the conforming class: share of unflagged cases that really conform
precision_no_dev = TN / (TN + FN)
print(f"Precision No Dev: {precision_no_dev:.2f}")

Precision No Dev: 0.78


In [47]:
# Recall for the conforming class: share of conforming cases that were left unflagged
recall_no_dev = TN / (TN + FP)
print(f"Recall No Dev: {recall_no_dev:.2f}")

Recall No Dev: 0.96


In [48]:
from sklearn.metrics import roc_auc_score

# NOTE: 'predicted' holds hard 0/1 labels rather than continuous anomaly scores,
# so the ROC curve has a single operating point and this value equals the mean
# of the two recalls (balanced accuracy). Pass a per-case score to get a
# threshold-independent AUC.
auc_roc = roc_auc_score(ground_truth['conformity'], ground_truth['predicted'])
print(f"AUC-ROC: {auc_roc:.2f}")

AUC-ROC: 0.57
