In [23]:
import pm4py
from pm4py.objects.log.importer.xes import importer as xes_importer
import numpy as np
from itertools import chain
from sklearn.preprocessing import OneHotEncoder
import pickle as pkl
import sklearn.metrics as metrics

In [22]:
def add_dirichlet_noise(p, alpha=0.1, noise_level=0.1, rng=None):
    """
    Blend a probability vector with a random symmetric-Dirichlet sample.

    p: original probability vector (e.g., one-hot); any 1-D array-like.
    alpha: concentration parameter for the Dirichlet distribution.
           Lower alpha makes the Dirichlet sample more "peaky".
    noise_level: mixing coefficient for the noise; the result is
                 (1 - noise_level) * p + noise_level * noise, renormalized.
    rng: optional object exposing ``dirichlet`` (e.g. ``np.random.default_rng(seed)``
         or ``np.random.RandomState(seed)``) used to draw the noise.
         When None, the global ``np.random`` state is used, as before.

    Returns a new float array with the same length as p whose entries sum to 1.
    """
    # Coerce to a float array so plain lists and integer arrays are accepted.
    p = np.asarray(p, dtype=float)
    # Fall back to the module-level RNG when no explicit generator is supplied.
    sampler = np.random if rng is None else rng
    # Generate a Dirichlet noise vector.
    noise = sampler.dirichlet(np.ones_like(p) * alpha)
    # Mix the original distribution with the noise.
    perturbed = (1 - noise_level) * p + noise_level * noise
    # Ensure it sums to 1.
    perturbed /= perturbed.sum()
    return perturbed

In [11]:
# Parse the BPI Challenge 2012 XES log with pm4py's XES importer; the result
# is iterated below as traces, each of which yields events.
bpi12_factory = xes_importer.apply(r"../data/BPI_Challenge_2012.xes")

parsing log, completed traces ::   0%|          | 0/13087 [00:00<?, ?it/s]

In [46]:
# Distinct activity labels ('concept:name') occurring anywhere in the BPI 2012 log.
all_activities_bpi12 = {
    event['concept:name']
    for trace in bpi12_factory
    for event in trace
}

In [50]:
# Lookup from stringified index to activity label (index -> label, despite the
# variable name); indices follow the sorted order of the labels.
activity_to_int_bpi12 = dict(
    (str(position), label)
    for position, label in enumerate(sorted(all_activities_bpi12))
)

In [51]:
activity_to_int_bpi12

{'0': 'A_ACCEPTED',
 '1': 'A_ACTIVATED',
 '2': 'A_APPROVED',
 '3': 'A_CANCELLED',
 '4': 'A_DECLINED',
 '5': 'A_FINALIZED',
 '6': 'A_PARTLYSUBMITTED',
 '7': 'A_PREACCEPTED',
 '8': 'A_REGISTERED',
 '9': 'A_SUBMITTED',
 '10': 'O_ACCEPTED',
 '11': 'O_CANCELLED',
 '12': 'O_CREATED',
 '13': 'O_DECLINED',
 '14': 'O_SELECTED',
 '15': 'O_SENT',
 '16': 'O_SENT_BACK',
 '17': 'W_Afhandelen leads',
 '18': 'W_Beoordelen fraude',
 '19': 'W_Completeren aanvraag',
 '20': 'W_Nabellen incomplete dossiers',
 '21': 'W_Nabellen offertes',
 '22': 'W_Valideren aanvraag',
 '23': 'W_Wijzigen contractgegevens'}

In [16]:
# Label -> integer index, using the sorted label order (same convention as
# r_activity_to_int_bpi19).  The previous code looked labels up in
# `activity_to_int`, which is not defined anywhere in this notebook and so
# fails on a fresh kernel; `activity_to_int_bpi12` cannot be used instead
# because it maps string index -> label.
r_activity_to_int_bpi12 = {act: idx for idx, act in enumerate(sorted(all_activities_bpi12))}

# One integer array per trace, holding the index of each event's activity.
tokenized_traces = [
    np.array([r_activity_to_int_bpi12[event['concept:name']] for event in trace])
    for trace in bpi12_factory
]

In [18]:
# Fit a dense one-hot encoder on every token of every trace (as a single
# column), then encode each trace on its own.
encoder = OneHotEncoder(sparse_output=False).fit(
    np.array(
        [token for trace_tokens in tokenized_traces for token in trace_tokens]
    ).reshape(-1, 1)
)
one_hot_bpi12 = [
    encoder.transform(trace_tokens.reshape(-1, 1))
    for trace_tokens in tokenized_traces
]

In [71]:
# Seed the global NumPy RNG (the default source add_dirichlet_noise draws
# from) so the noisy dataset, the pickle written from it and the accuracies
# reported below can be regenerated exactly.
np.random.seed(42)

# Perturb every one-hot row of every trace with Dirichlet noise.
simple_noisy_dataset12 = [
    np.array([add_dirichlet_noise(x, alpha=0.04, noise_level=0.55) for x in t])
    for t in one_hot_bpi12
]

In [72]:
# Per trace: fraction of events whose argmax in the noisy matrix matches the
# argmax of the clean one-hot matrix; then average over all traces.
np.mean([
    metrics.accuracy_score(clean.argmax(axis=1), noisy.argmax(axis=1))
    for clean, noisy in zip(one_hot_bpi12, simple_noisy_dataset12)
])

0.781206770562066

In [43]:
# Persist the clean one-hot traces ('target') together with their noisy
# counterparts ('stochastic') in a single pickle file.
with open("../data/pickles/bpi12_unified.pkl", "wb") as f:
    pkl.dump({'target': one_hot_bpi12, 'stochastic': simple_noisy_dataset12}, f)

In [52]:
# Parse the BPI Challenge 2019 XES log with pm4py's XES importer; the result
# is iterated below as traces, each of which yields events.
bpi19_factory = xes_importer.apply(r"../data/BPI_Challenge_2019.xes")

parsing log, completed traces ::   0%|          | 0/251734 [00:00<?, ?it/s]

In [56]:
# Distinct activity labels ('concept:name') occurring anywhere in the BPI 2019 log.
all_activities_bpi19 = {
    event['concept:name']
    for trace in bpi19_factory
    for event in trace
}
# Stringified index -> activity label, indexed in sorted label order.
activity_to_int_bpi19 = dict(
    (str(position), label)
    for position, label in enumerate(sorted(all_activities_bpi19))
)
activity_to_int_bpi19

{'0': 'Block Purchase Order Item',
 '1': 'Cancel Goods Receipt',
 '2': 'Cancel Invoice Receipt',
 '3': 'Cancel Subsequent Invoice',
 '4': 'Change Approval for Purchase Order',
 '5': 'Change Currency',
 '6': 'Change Delivery Indicator',
 '7': 'Change Final Invoice Indicator',
 '8': 'Change Price',
 '9': 'Change Quantity',
 '10': 'Change Rejection Indicator',
 '11': 'Change Storage Location',
 '12': 'Change payment term',
 '13': 'Clear Invoice',
 '14': 'Create Purchase Order Item',
 '15': 'Create Purchase Requisition Item',
 '16': 'Delete Purchase Order Item',
 '17': 'Reactivate Purchase Order Item',
 '18': 'Receive Order Confirmation',
 '19': 'Record Goods Receipt',
 '20': 'Record Invoice Receipt',
 '21': 'Record Service Entry Sheet',
 '22': 'Record Subsequent Invoice',
 '23': 'Release Purchase Order',
 '24': 'Release Purchase Requisition',
 '25': 'Remove Payment Block',
 '26': 'SRM: Awaiting Approval',
 '27': 'SRM: Change was Transmitted',
 '28': 'SRM: Complete',
 '29': 'SRM: Created',

In [58]:
# Activity label -> integer index, numbered in sorted label order.
r_activity_to_int_bpi19 = dict(
    zip(sorted(all_activities_bpi19), range(len(all_activities_bpi19)))
)

In [60]:
# One integer array per BPI 2019 trace, holding each event's activity index.
tokenized_traces = [
    np.array([r_activity_to_int_bpi19[event['concept:name']] for event in trace])
    for trace in bpi19_factory
]
# Fit a dense one-hot encoder on every token of every trace (as a single
# column), then encode each trace on its own.
encoder = OneHotEncoder(sparse_output=False).fit(
    np.array(
        [token for trace_tokens in tokenized_traces for token in trace_tokens]
    ).reshape(-1, 1)
)
one_hot_bpi19 = [
    encoder.transform(trace_tokens.reshape(-1, 1))
    for trace_tokens in tokenized_traces
]

In [73]:
# Seed the global NumPy RNG (the default source add_dirichlet_noise draws
# from) so the noisy dataset, the pickle written from it and the accuracies
# reported below can be regenerated exactly.
np.random.seed(42)

# Perturb every one-hot row of every trace with Dirichlet noise.
simple_noisy_dataset19 = [
    np.array([add_dirichlet_noise(x, alpha=0.04, noise_level=0.6) for x in t])
    for t in one_hot_bpi19
]

In [74]:
# Per trace: fraction of events whose argmax in the noisy matrix matches the
# argmax of the clean one-hot matrix; then average over all traces.
np.mean([
    metrics.accuracy_score(clean.argmax(axis=1), noisy.argmax(axis=1))
    for clean, noisy in zip(one_hot_bpi19, simple_noisy_dataset19)
])

0.7919753616666503

In [66]:
# Persist the clean one-hot traces ('target') together with their noisy
# counterparts ('stochastic') in a single pickle file.
with open("../data/pickles/bpi19_unified.pkl", "wb") as f:
    pkl.dump({'target': one_hot_bpi19, 'stochastic': simple_noisy_dataset19}, f)

In [68]:
# Number of events (rows) in the longest BPI 2019 trace.
max(trace_matrix.shape[0] for trace_matrix in one_hot_bpi19)

990

In [69]:
from sktr_update.utils import group_cases_by_trace
from sktr_update.utils import prepare_df_from_dataset

# our refactored comparison function
from sktr_update.core import compare_stochastic_vs_argmax_random_indices

In [75]:
# Targets: the activity index of every event, recovered from the one-hot rows.
# Sources: the noisy matrices transposed, so activities run along the rows and
# events along the columns.
bpi12_target_sktr = [clean.argmax(axis=1) for clean in one_hot_bpi12]
bpi12_source_sktr = [np.transpose(noisy) for noisy in simple_noisy_dataset12]
bpi19_target_sktr = [clean.argmax(axis=1) for clean in one_hot_bpi19]
bpi19_source_sktr = [np.transpose(noisy) for noisy in simple_noisy_dataset19]

In [76]:
# Build the inputs for the sktr_update comparison, one (df, softmax list) pair
# per dataset, from the per-trace label sequences and the transposed noisy
# matrices.
# NOTE(review): the layout of the returned frame and list is defined by
# sktr_update.utils.prepare_df_from_dataset, which is not visible here — confirm there.
df12, softmax_lst12 = prepare_df_from_dataset(bpi12_target_sktr, bpi12_source_sktr)
df19, softmax_lst19 = prepare_df_from_dataset(bpi19_target_sktr, bpi19_source_sktr)

In [None]:
# Run the SKTR-vs-argmax comparison on the BPI 2012 data.
# NOTE(review): the meaning of the individual options is defined in
# sktr_update.core — confirm there before tuning them.
recovery_results_df, alignment_results_df, model = compare_stochastic_vs_argmax_random_indices(
    df=df12,
    softmax_lst=softmax_lst12,
    n_indices=60,
    activity_prob_threshold=0.01,
    cost_function='logarithmic',
    random_seed=42,
    train_cases=None,
    test_cases=None,
    n_train_traces=10,
    n_test_traces=200,
    allow_train_test_case_overlap=False,
    allow_duplicate_variants=True,
    sequential_sampling=True,
    round_precision=4,
    return_model=True,
    lambdas=[0.1, 0.3, 0.6],
    alpha=0.6,
    use_cond_probs=True,
    use_calibration=True,
    use_ngram_smoothing=False,
    temp_bounds=(1, 5),
)

# For every case, collect its rows as [sktr_pred, argmax_pred, ground_truth] lists.
case_lists = (
    recovery_results_df
    .groupby("case:concept:name")[["sktr_pred", "argmax_pred", "ground_truth"]]
    .apply(lambda g: g.values.tolist())
    .tolist()
)

# Rows whose sktr_pred is None are dropped from both lists, which keeps the
# predictions and the ground truth aligned position by position.
sktr_result12 = [
    [int(pred) for pred, _argmax, _truth in rows if pred is not None]
    for rows in case_lists
]
sktr_gt12 = [
    [int(truth) for pred, _argmax, truth in rows if pred is not None]
    for rows in case_lists
]

# Mean of the per-case accuracies of the SKTR predictions.
bpi12_accuracy = np.mean([
    metrics.accuracy_score(truth_seq, pred_seq)
    for truth_seq, pred_seq in zip(sktr_gt12, sktr_result12)
])

2025-05-16 18:50:21 - INFO - Starting comparison.
2025-05-16 18:50:21 - DEBUG - Validating input parameters.
2025-05-16 18:50:21 - DEBUG - Input parameters validation passed.
2025-05-16 18:50:21 - DEBUG - Processing cost function: logarithmic
2025-05-16 18:50:21 - DEBUG - Using logarithmic cost function.
2025-05-16 18:50:21 - DEBUG - Setting random seed to 42.
2025-05-16 18:50:21 - DEBUG - === Entering filter_indices block ===
2025-05-16 18:50:21 - DEBUG - Input softmax_lst: 13087 matrices
2025-05-16 18:50:21 - DEBUG - Preparing softmax list.
2025-05-16 18:50:21 - DEBUG - prepare_softmax returned list of 13087 arrays, shapes=[(24, 26), (24, 39), (24, 59), (24, 3), (24, 3), (24, 9), (24, 14), (24, 12), (24, 14), (24, 24), (24, 77), (24, 35), (24, 20), (24, 3), (24, 76), (24, 6), (24, 55), (24, 48), (24, 40), (24, 19), (24, 38), (24, 24), (24, 42), (24, 3), (24, 26), (24, 6), (24, 29), (24, 3), (24, 3), (24, 9), (24, 6), (24, 7), (24, 108), (24, 35), (24, 28), (24, 36), (24, 20), (24, 13

In [78]:
bpi12_accuracy

0.8168613344138087

In [86]:
# Run the SKTR-vs-argmax comparison on the BPI 2019 data.
# NOTE(review): the meaning of the individual options is defined in
# sktr_update.core — confirm there before tuning them.
recovery_results_df, alignment_results_df, model = compare_stochastic_vs_argmax_random_indices(
    df=df19,
    softmax_lst=softmax_lst19,
    n_indices=60,
    activity_prob_threshold=0.01,
    cost_function='logarithmic',
    random_seed=42,
    train_cases=None,
    test_cases=None,
    n_train_traces=100,
    n_test_traces=2000,
    allow_train_test_case_overlap=False,
    allow_duplicate_variants=True,
    sequential_sampling=True,
    round_precision=4,
    return_model=True,
    lambdas=[0.1, 0.3, 0.6],
    alpha=0.6,
    use_cond_probs=True,
    use_calibration=True,
    use_ngram_smoothing=False,
    temp_bounds=(1, 5),
)

# For every case, collect its rows as [sktr_pred, argmax_pred, ground_truth] lists.
case_lists = (
    recovery_results_df
    .groupby("case:concept:name")[["sktr_pred", "argmax_pred", "ground_truth"]]
    .apply(lambda g: g.values.tolist())
    .tolist()
)

# Rows whose sktr_pred is None are dropped from both lists, which keeps the
# predictions and the ground truth aligned position by position.
sktr_result19 = [
    [int(pred) for pred, _argmax, _truth in rows if pred is not None]
    for rows in case_lists
]
sktr_gt19 = [
    [int(truth) for pred, _argmax, truth in rows if pred is not None]
    for rows in case_lists
]

# Mean of the per-case accuracies of the SKTR predictions.
bpi19_accuracy = np.mean([
    metrics.accuracy_score(truth_seq, pred_seq)
    for truth_seq, pred_seq in zip(sktr_gt19, sktr_result19)
])

2025-05-16 15:05:27 - INFO - Starting comparison.
2025-05-16 15:05:27 - DEBUG - Validating input parameters.
2025-05-16 15:05:27 - DEBUG - Input parameters validation passed.
2025-05-16 15:05:27 - DEBUG - Processing cost function: logarithmic
2025-05-16 15:05:27 - DEBUG - Using logarithmic cost function.
2025-05-16 15:05:27 - DEBUG - Setting random seed to 42.
2025-05-16 15:05:27 - DEBUG - === Entering filter_indices block ===
2025-05-16 15:05:27 - DEBUG - Input softmax_lst: 251734 matrices
2025-05-16 15:05:27 - DEBUG - Preparing softmax list.
2025-05-16 15:05:27 - DEBUG - prepare_softmax returned list of 251734 arrays, shapes=[(42, 12), (42, 15), (42, 18), (42, 12), (42, 12), (42, 12), (42, 12), (42, 16), (42, 8), (42, 15), (42, 13), (42, 13), (42, 13), (42, 38), (42, 15), (42, 12), (42, 13), (42, 35), (42, 103), (42, 250), (42, 120), (42, 180), (42, 105), (42, 81), (42, 61), (42, 131), (42, 14), (42, 42), (42, 13), (42, 12), (42, 12), (42, 36), (42, 44), (42, 12), (42, 12), (42, 12),

In [87]:
bpi19_accuracy

0.9427067745889491

BPI 2019 (100 training traces, 2000 test traces): mean per-case SKTR accuracy ≈ 0.94