# 1. Import and Cleaning

In [3]:
# Make the project's local `code/` directory importable. The directory is resolved as
# <current working directory>/../code and inserted at the FRONT of sys.path, so modules
# found there take precedence over same-named modules elsewhere on the path.
# NOTE(review): presumably this is where `preprocess.py` (imported below) lives, and the
# kernel is expected to be started from a sibling folder of `code/` — confirm.
import sys
import os
code_dir = os.path.abspath(os.path.join(os.getcwd(), "..", "code"))
sys.path.insert(0, code_dir)

import os
import sys
import pandas as pd
import matplotlib.pyplot as plt

# PM4Py packages
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.objects.bpmn.importer import importer as bpmn_importer
from pm4py.objects.conversion.bpmn.variants import to_petri_net

from preprocess import extract_bpmn_decision_point_map

# Pandas settings: setting `mode.chained_assignment` to None switches off pandas'
# SettingWithCopyWarning for the whole session (assignments to slices no longer warn).
pd.options.mode.chained_assignment = None

## 1.1 Import Dataset and BPMN-File

In [4]:
# Event Attributes
    # Action: Type of change the event records (e.g., "Created", "statechange", "Deleted")
    # org:resource: The person or system executing the event
    # concept:name: The name of the activity that was executed
    # EventOrigin: Object type the event originates from (e.g., "Application", "Workflow", "Offer")
    # EventID: Unique ID for the specific event
    # lifecycle:transition: Lifecycle status of the activity (e.g., "schedule", "complete", "withdraw")
    # time:timestamp: Timestamp when the event occurred

# Case Attributes (prefixed with "case:")
    # case:LoanGoal: Purpose of the loan application (e.g., "Home improvement")
    # case:ApplicationType: Type of loan application (e.g., "New credit", "Limit raise")
    # case:concept:name: Unique identifier for each case (Loan Application ID)
    # case:RequestedAmount: Amount of money requested by the applicant

# Offer Attributes (event-level, no "case:" prefix; empty on most events, see the null summary in 1.2)
    # FirstWithdrawalAmount: Amount first withdrawn by the customer (if different from requested)
    # NumberOfTerms: Number of payment installments (loan terms)
    # Accepted: Whether the offer was accepted (boolean or flag)
    # MonthlyCost: Monthly payment amount for the loan
    # Selected: Whether a particular loan offer was selected
    # CreditScore: Credit score of the applicant
    # OfferedAmount: Amount offered by the bank or lender
    # OfferID: Unique identifier for a specific loan offer


# Read the raw BPI Challenge 2017 event log from its XES file
xes_path = '../data/raw/BPI Challenge 2017.xes'
log = xes_importer.apply(xes_path)

# Turn the pm4py log object into a pandas DataFrame
df_log = log_converter.apply(
    log,
    variant=log_converter.Variants.TO_DATA_FRAME,
)

# Show the freshly imported log
display(df_log)

  from .autonotebook import tqdm as notebook_tqdm
parsing log, completed traces :: 100%|██████████| 31509/31509 [00:24<00:00, 1298.88it/s]


Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,case:concept:name,case:RequestedAmount,FirstWithdrawalAmount,NumberOfTerms,Accepted,MonthlyCost,Selected,CreditScore,OfferedAmount,OfferID
0,Created,User_1,A_Create Application,Application,Application_652823628,complete,2016-01-01 09:51:15.304000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,,,,,,,,
1,statechange,User_1,A_Submitted,Application,ApplState_1582051990,complete,2016-01-01 09:51:15.352000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,,,,,,,,
2,Created,User_1,W_Handle leads,Workflow,Workitem_1298499574,schedule,2016-01-01 09:51:15.774000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,,,,,,,,
3,Deleted,User_1,W_Handle leads,Workflow,Workitem_1673366067,withdraw,2016-01-01 09:52:36.392000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,,,,,,,,
4,Created,User_1,W_Complete application,Workflow,Workitem_1493664571,schedule,2016-01-01 09:52:36.403000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1202262,Deleted,User_1,W_Call after offers,Workflow,Workitem_1817549786,ate_abort,2017-01-06 06:33:02.212000+00:00,Home improvement,New credit,Application_1350494635,20000.0,,,,,,,,
1202263,Created,User_1,W_Call after offers,Workflow,Workitem_363876066,schedule,2017-01-06 06:33:02.221000+00:00,Home improvement,New credit,Application_1350494635,20000.0,,,,,,,,
1202264,statechange,User_28,A_Cancelled,Application,ApplState_1869071797,complete,2017-01-16 09:51:21.114000+00:00,Home improvement,New credit,Application_1350494635,20000.0,,,,,,,,
1202265,statechange,User_28,O_Cancelled,Offer,OfferState_420066181,complete,2017-01-16 09:51:21.139000+00:00,Home improvement,New credit,Application_1350494635,20000.0,,,,,,,,Offer_1580299144


In [5]:
# Load the BPMN process model from the resources folder
bpmn_path = "../resources/loan_application.bpmn"
bpmn_model = bpmn_importer.apply(bpmn_path)

# Translate the BPMN model into a Petri net; the converter returns a 3-tuple
# (presumably net, initial marking, final marking — see the pm4py docs)
(net, im, fm) = to_petri_net.apply(bpmn_model)

## 1.2 General Data Cleaning

In [6]:
# Work on a private copy so the imported log (df_log) stays untouched
df_filtered = df_log.copy()

# Build the null overview in a single chain:
#   count nulls per column -> two-column table -> add percentage -> keep only affected columns
total_rows = len(df_filtered)
null_summary = (
    df_filtered.isna()
    .sum()
    .rename_axis("Column")
    .reset_index(name="Null Count")
    .assign(**{"% Nulls": lambda tbl: (tbl["Null Count"] / total_rows * 100).round(2)})
    .loc[lambda tbl: tbl["Null Count"] > 0]
    .reset_index(drop=True)
)

display(null_summary)

Unnamed: 0,Column,Null Count,% Nulls
0,FirstWithdrawalAmount,1159272,96.42
1,NumberOfTerms,1159272,96.42
2,Accepted,1159272,96.42
3,MonthlyCost,1159272,96.42
4,Selected,1159272,96.42
5,CreditScore,1159272,96.42
6,OfferedAmount,1159272,96.42
7,OfferID,1051413,87.45


In [7]:
# Columns that the downstream analysis actually uses; everything else is discarded
columns_to_keep = [
    "case:concept:name",
    "concept:name",
    "org:resource",
    "time:timestamp",
    "case:LoanGoal",
    "case:ApplicationType",
    "case:RequestedAmount",
    "lifecycle:transition"
]

# Parse the timestamp column, order events by case and then chronologically,
# renumber the rows and finally narrow the frame down to the relevant columns
df_filtered = (
    df_filtered
    .assign(**{"time:timestamp": pd.to_datetime(df_filtered["time:timestamp"])})
    .sort_values(by=["case:concept:name", "time:timestamp"])
    .reset_index(drop=True)
    .loc[:, columns_to_keep]
)

In [8]:
# Append one artificial "End" event per case so every trace has an explicit terminal activity.

# 0) Make the cell safe to re-run: discard "End" events appended by a previous execution.
#    Without this, every re-run would stack another "End" (one second after the previous one)
#    onto each case, because df_filtered is rebuilt from itself below.
#    NOTE(review): assumes the raw log contains no genuine activity called "End" — confirm.
df_events = df_filtered[df_filtered["concept:name"] != "End"]

# 1) Get the last event of each case and duplicate it
end_events = (
    df_events
    # mergesort is stable: events sharing a timestamp keep their current relative order,
    # so the row picked as "last" is deterministic (the default quicksort is not stable)
    .sort_values("time:timestamp", kind="mergesort")
    .groupby("case:concept:name")
    .tail(1)            # exactly one row per case
    .copy()
)

# 2) Modify attributes for the artificial end event
#    (all other columns, e.g. org:resource, are inherited from the duplicated last event)
end_events["time:timestamp"] += pd.Timedelta(seconds=1)   # add +1 second to the timestamp
end_events["concept:name"]    = "End"                     # change activity label to "End"

# 3) Insert into the log and sort chronologically within each case
df_filtered = (
    pd.concat([df_events, end_events], ignore_index=True)
      .sort_values(["case:concept:name", "time:timestamp"])
      .reset_index(drop=True)
)

print("Added end events:", len(end_events))

Added end events: 31509


In [9]:
# Freeze the cleaned log under its own name; this independent (deep) copy is the
# starting point for the training data generation below
df_cleaned = df_filtered.copy(deep=True)

## 1.3 Create Decision Point Dictionary

In [10]:
# Derive the decision point dictionary from the BPMN model
bpmn_decision_point_map = extract_bpmn_decision_point_map(bpmn_model)

# --- Print every decision point with its incoming and outgoing activities ---
print("\n--- Unified BPMN Decision Point Map ---")
for dp_label, dp_info in bpmn_decision_point_map.items():
    print(f"{dp_label}:")
    for heading, key in (("Incoming", "incoming"), ("Outgoing", "outgoing")):
        print(f"  {heading}: {dp_info[key]}")


--- Unified BPMN Decision Point Map ---
DP 1:
  Incoming: ['A_Create Application']
  Outgoing: ['A_Submitted', 'A_Concept', 'A_Accepted', 'W_Complete application', 'O_Create Offer']
DP 2:
  Incoming: ['A_Submitted', 'W_Handle leads']
  Outgoing: ['W_Handle leads']
DP 3:
  Incoming: ['W_Handle leads']
  Outgoing: ['W_Handle leads', 'A_Concept', 'A_Accepted', 'W_Complete application', 'O_Create Offer']
DP 4:
  Incoming: ['A_Create Application', 'W_Handle leads']
  Outgoing: ['A_Concept', 'A_Accepted', 'W_Complete application', 'O_Create Offer']
DP 5:
  Incoming: ['A_Create Application', 'W_Handle leads']
  Outgoing: ['A_Concept', 'A_Accepted', 'W_Complete application', 'O_Create Offer']
DP 6:
  Incoming: ['W_Complete application', 'A_Create Application', 'W_Handle leads']
  Outgoing: ['W_Complete application']
DP 7:
  Incoming: ['W_Complete application']
  Outgoing: ['A_Concept', 'A_Accepted', 'W_Complete application', 'O_Create Offer']
DP 8:
  Incoming: ['W_Complete application', 'A_Cr

## 1.4 Creation of Full Training Data

In [11]:
# Base table for training data generation: the cleaned log without the lifecycle
# column, with rows that became exact duplicates after that removal collapsed
df_full = (
    df_cleaned
    .drop(columns=["lifecycle:transition"])
    .drop_duplicates()
)

# 2. Generation of Training Data

In [12]:
# --- Standard Library ---
import copy

# --- Third-Party Libraries ---
import joblib
from sklearn.model_selection import train_test_split

# --- Local Modules ---
from preprocess import generate_enriched_training_sets_simple
from preprocess import enrich_with_temporal_features, enrich_with_loop_features

## 2.1 Simple Dataset (No Additional Feature Engineering)

In [13]:
# --- Parameters ---
max_history_len = 15
min_sequence_count = 20
random_state = 42

# --- Result container: {decision point name -> {"train" | "test" | "holdout" -> DataFrame}} ---
dp_split_datasets_simple = {}


def _stratify_labels(frame):
    """Return the "label" column for stratified splitting, or None if it holds a single class."""
    return frame["label"] if frame["label"].nunique() > 1 else None


for dp_name, dp_config in bpmn_decision_point_map.items():
    print(f"Processing {dp_name}...")

    try:
        # Generate the samples for this one decision point only
        df_dp = generate_enriched_training_sets_simple(
            df_full,
            {dp_name: dp_config},
            max_history_len=max_history_len,
            min_sequence_count=min_sequence_count,
        ).get(dp_name)

        if df_dp is None or df_dp.empty:
            print(f"{dp_name}: no valid samples.")
            continue

        # First cut: put 10% of all samples aside as holdout
        df_temp, df_holdout = train_test_split(
            df_dp,
            test_size=0.1,
            random_state=random_state,
            stratify=_stratify_labels(df_dp),
        )

        # Second cut: 2/9 of the remaining 90% is 20% of the full set -> 70% train / 20% test
        df_train, df_test = train_test_split(
            df_temp,
            test_size=2/9,
            random_state=random_state,
            stratify=_stratify_labels(df_temp),
        )

        # Store every split with a fresh 0..n-1 index
        dp_split_datasets_simple[dp_name] = {
            split_label: split_frame.reset_index(drop=True)
            for split_label, split_frame in (
                ("train", df_train),
                ("test", df_test),
                ("holdout", df_holdout),
            )
        }

        print(f"{dp_name}: train={len(df_train)}, test={len(df_test)}, holdout={len(df_holdout)}")

    except Exception as e:
        # Best effort: report the failure and carry on with the next decision point
        print(f"{dp_name}: error occurred -> {e}")

print("\nFinished splitting all decision points.")
print(f"Total decision points with splits: {len(dp_split_datasets_simple)}")

Processing DP 1...
DP 1: train=22056, test=6302, holdout=3151
Processing DP 2...
DP 2: no valid samples.
Processing DP 3...
DP 3: train=32955, test=9416, holdout=4708
Processing DP 4...
DP 4: train=22008, test=6288, holdout=3145
Processing DP 5...
DP 5: train=22008, test=6288, holdout=3145
Processing DP 6...
DP 6: no valid samples.
Processing DP 7...
DP 7: train=79778, test=22794, holdout=11397
Processing DP 8...
DP 8: train=101787, test=29082, holdout=14541
Processing DP 9...
DP 9: train=125517, test=35862, holdout=17932
Processing DP 10...
DP 10: train=125517, test=35862, holdout=17932
Processing DP 11...
DP 11: train=125478, test=35852, holdout=17926
Processing DP 12...
DP 12: train=125478, test=35852, holdout=17926
Processing DP 13...
DP 13: train=125276, test=35794, holdout=17897
Processing DP 14...
DP 14: train=25348, test=7243, holdout=3622
Processing DP 15...
DP 15: train=25348, test=7243, holdout=3622
Processing DP 16...
DP 16: train=23158, test=6617, holdout=3309
Processing D

In [14]:
# Take an independent snapshot for the advanced variant BEFORE the timestamp column
# is removed from the simple variant below.
# NOTE: the loop below strips "sequence_timestamps" from dp_split_datasets_simple in place,
# so re-running this cell would deep-copy frames that no longer carry that column.
dp_split_datasets_advanced = copy.deepcopy(dp_split_datasets_simple)

SPLITS = ("train", "test", "holdout")

for dp_key, split_frames in dp_split_datasets_simple.items():
    for split_name in SPLITS:
        frame = split_frames.get(split_name)
        if frame is None or "sequence_timestamps" not in frame.columns:
            continue
        # The simple variant does not keep the raw timestamp sequences
        split_frames[split_name] = frame.drop(columns=["sequence_timestamps"])

In [15]:
# Inspect the simple training split of one selected decision point
dp = "DP 48"
train_simple_preview = dp_split_datasets_simple[dp]["train"]
display(train_simple_preview)

Unnamed: 0,case_id,sequence,sequence_resources,sequence_durations,label,case:LoanGoal,case:ApplicationType,case:RequestedAmount
0,Application_1624036826,"[W_Complete application, W_Complete applicatio...","[User_2, User_12, User_12, User_12, User_12, U...","[0.0, 87320.408, 1138.098, 250.43, 1.355999999...",O_Create Offer,Home improvement,New credit,10000.0
1,Application_309551659,"[A_Create Application, A_Concept, W_Complete a...","[User_3, User_3, User_3, User_3, User_3, User_...","[0.0, 0.014, 0.01, 0.006, 83.354, 146.843, 1.2...",W_Complete application,Unknown,Limit raise,0.0
2,Application_2119963425,"[W_Call after offers, W_Validate application, ...","[User_123, User_123, User_123, User_123, User_...","[0.0, 0.01, 0.003, 0.884, 3.058, 1055.068, 0.0...",W_Call incomplete files,Home improvement,New credit,14000.0
3,Application_1075145802,"[W_Call after offers, W_Call after offers, W_C...","[User_3, User_3, User_117, User_117, User_117,...","[0.0, 21.91, 705539.49, 0.023, 0.005, 0.923, 8...",W_Call incomplete files,Home improvement,Limit raise,10000.0
4,Application_1361883922,"[W_Call incomplete files, A_Incomplete, W_Call...","[User_123, User_123, User_123, User_54, User_5...","[0.0, 0.003, 34.206, 4459.858, 59.283, 266340....",W_Call incomplete files,"Other, see explanation",New credit,6000.0
...,...,...,...,...,...,...,...,...
298597,Application_1100902083,"[A_Create Application, W_Complete application,...","[User_15, User_15, User_15, User_15, User_15, ...","[0.0, 0.02, 0.006, 0.01, 230.11, 447.27, 1.381...",O_Create Offer,"Other, see explanation",New credit,24000.0
298598,Application_1223096308,"[W_Validate application, W_Validate applicatio...","[User_95, User_95, User_95, User_75, User_75, ...","[0.0, 7402.454, 173.179, 59536.353, 0.009, 0.0...",W_Validate application,Home improvement,Limit raise,50000.0
298599,Application_1692109151,"[W_Complete application, W_Call after offers, ...","[User_60, User_60, User_60, User_60, User_60, ...","[0.0, 0.04, 0.012, 0.012, 641.746, 8614.82, 0....",W_Validate application,Home improvement,New credit,15000.0
298600,Application_1393455197,"[W_Call after offers, W_Call after offers, A_C...","[User_36, User_36, User_36, User_36, User_19, ...","[0.0, 0.058, 0.004, 541.319, 452965.95, 83.948...",W_Call incomplete files,Home improvement,New credit,27500.0


In [16]:
# Persist the simple datasets to ../data/processed (relative to the working directory),
# creating the folder first if it does not exist yet
output_path = os.path.join("..", "data", "processed", "dp_split_datasets_full_simple.joblib")
output_dir = os.path.dirname(output_path)
os.makedirs(output_dir, exist_ok=True)

joblib.dump(dp_split_datasets_simple, output_path)

print(f"Saved to {os.path.abspath(output_path)}")

Saved to /Users/marcohmayer/next_activity_prediction/data/processed/dp_split_datasets_full_simple.joblib


## 2.2 Advanced Dataset (Additional Feature Engineering)

In [None]:
SPLITS = ("train", "test", "holdout")

# Enrich every non-empty split of every decision point and store the result in place
for dp_key, split_frames in dp_split_datasets_advanced.items():
    for split_name in SPLITS:
        frame = split_frames.get(split_name)
        if frame is not None and not frame.empty:
            # Loop features come first
            enriched = enrich_with_loop_features(frame)

            # Temporal features need the timestamp sequences; once they are derived,
            # the raw "sequence_timestamps" column is no longer kept
            if "sequence_timestamps" in enriched.columns:
                enriched = enrich_with_temporal_features(enriched)
                enriched = enriched.drop(columns=["sequence_timestamps"])

            split_frames[split_name] = enriched

In [None]:
# Inspect the advanced (feature-enriched) training split of one selected decision point
dp = "DP 48"
train_advanced_preview = dp_split_datasets_advanced[dp]["train"]
display(train_advanced_preview)

Unnamed: 0,case_id,sequence,sequence_resources,sequence_durations,label,case:LoanGoal,case:ApplicationType,case:RequestedAmount,n_repeats_current_activity,n_unique_activities,...,longest_repeat_streak,position_in_trace,day_of_week,time_of_day,month,week_of_year,is_weekend,is_holiday_nl,time_since_case_start,time_since_prev_event
0,Application_1624036826,"[W_Complete application, W_Complete applicatio...","[User_2, User_12, User_12, User_12, User_12, U...","[0.0, 87320.408, 1138.098, 250.43, 1.355999999...",O_Create Offer,Home improvement,New credit,10000.0,7,7,...,5,15,2,10,1,2,0,0,420263.182,40.397
1,Application_309551659,"[A_Create Application, A_Concept, W_Complete a...","[User_3, User_3, User_3, User_3, User_3, User_...","[0.0, 0.014, 0.01, 0.006, 83.354, 146.843, 1.2...",W_Complete application,Unknown,Limit raise,0.0,1,7,...,2,8,0,15,7,27,0,0,244.451,12.948
2,Application_2119963425,"[W_Call after offers, W_Validate application, ...","[User_123, User_123, User_123, User_123, User_...","[0.0, 0.01, 0.003, 0.884, 3.058, 1055.068, 0.0...",W_Call incomplete files,Home improvement,New credit,14000.0,8,6,...,6,15,2,8,1,2,0,0,1969104.434,1206789.949
3,Application_1075145802,"[W_Call after offers, W_Call after offers, W_C...","[User_3, User_3, User_117, User_117, User_117,...","[0.0, 21.91, 705539.49, 0.023, 0.005, 0.923, 8...",W_Call incomplete files,Home improvement,Limit raise,10000.0,5,6,...,3,15,3,14,5,19,0,0,803115.617,232.151
4,Application_1361883922,"[W_Call incomplete files, A_Incomplete, W_Call...","[User_123, User_123, User_123, User_54, User_5...","[0.0, 0.003, 34.206, 4459.858, 59.283, 266340....",W_Call incomplete files,"Other, see explanation",New credit,6000.0,8,4,...,4,15,0,17,10,44,0,0,278491.870,37.202
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298597,Application_1100902083,"[A_Create Application, W_Complete application,...","[User_15, User_15, User_15, User_15, User_15, ...","[0.0, 0.02, 0.006, 0.01, 230.11, 447.27, 1.381...",O_Create Offer,"Other, see explanation",New credit,24000.0,3,9,...,2,13,3,8,3,9,0,0,1087.844,397.573
298598,Application_1223096308,"[W_Validate application, W_Validate applicatio...","[User_95, User_95, User_95, User_75, User_75, ...","[0.0, 7402.454, 173.179, 59536.353, 0.009, 0.0...",W_Validate application,Home improvement,Limit raise,50000.0,7,4,...,4,15,3,15,3,13,0,0,100298.836,264.108
298599,Application_1692109151,"[W_Complete application, W_Call after offers, ...","[User_60, User_60, User_60, User_60, User_60, ...","[0.0, 0.04, 0.012, 0.012, 641.746, 8614.82, 0....",W_Validate application,Home improvement,New credit,15000.0,7,6,...,5,15,2,6,5,20,0,0,421742.294,5.166
298600,Application_1393455197,"[W_Call after offers, W_Call after offers, A_C...","[User_36, User_36, User_36, User_36, User_19, ...","[0.0, 0.058, 0.004, 541.319, 452965.95, 83.948...",W_Call incomplete files,Home improvement,New credit,27500.0,1,7,...,4,15,1,13,8,35,0,0,1738131.837,98.926


In [None]:
# Persist the advanced datasets to ../data/processed (relative to the working directory),
# creating the folder first if it does not exist yet
output_path = os.path.join("..", "data", "processed", "dp_split_datasets_full_advanced.joblib")
output_dir = os.path.dirname(output_path)
os.makedirs(output_dir, exist_ok=True)

joblib.dump(dp_split_datasets_advanced, output_path)

print(f"Saved to {os.path.abspath(output_path)}")

Saved to /Users/marcohmayer/Desktop/BPSO/FINAL/data/processed/dp_split_datasets_full_advanced.joblib
