In [1]:
# ============================================================
# ðŸ“‚ Load Event Log (Setup)
# ============================================================

from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.objects.conversion.log import converter as log_converter
import pandas as pd
from pathlib import Path

# Define path to the .xes file (adjust if needed)
xes_path = Path("../data/raw/BPI_Challenge_2017.xes")

# Load XES log using PM4Py
log = xes_importer.apply(str(xes_path))

# Convert to Pandas DataFrame
df = log_converter.apply(log, variant=log_converter.Variants.TO_DATA_FRAME)

# Confirm successful load
print(f"âœ… Event log loaded successfully with {len(df)} events and {df['case:concept:name'].nunique()} cases.")




parsing log, completed traces ::   0%|          | 0/31509 [00:00<?, ?it/s]

âœ… Event log loaded successfully with 1202267 events and 31509 cases.


In [5]:
# ============================================================
# ðŸ“Š Basic Event Log Analysis (Metrics)
# ============================================================

import warnings
warnings.filterwarnings("ignore")  # Suppress warnings for cleaner output
import pandas as pd

# Number of cases
num_cases = df['case:concept:name'].nunique()
print(f"Number of cases: {num_cases}")

# Number of events
num_events = len(df)
print(f"Number of events: {num_events}")

# Number of process variants
num_variants = df.groupby('case:concept:name').apply(lambda x: ' -> '.join(x['concept:name'])).nunique()
print(f"Number of process variants: {num_variants}")

# Number of case and event labels
num_event_labels = df['concept:name'].nunique()
print(f"Number of event labels: {num_event_labels}")

# Mean and standard deviation of case length
case_lengths = df.groupby('case:concept:name').size()
mean_case_length = case_lengths.mean()
std_case_length = case_lengths.std()
print(f"Mean case length: {mean_case_length:.2f} events")
print(f"Standard deviation of case length: {std_case_length:.2f} events")

# Mean and standard deviation of case duration (in days)
case_durations = df.groupby('case:concept:name').apply(lambda x: (x['time:timestamp'].max() - x['time:timestamp'].min()).total_seconds())
mean_case_duration = case_durations.mean() / (24 * 3600)  # Convert seconds to days
std_case_duration = case_durations.std() / (24 * 3600)  # Convert seconds to days
print(f"Mean case duration: {mean_case_duration:.2f} days")
print(f"Standard deviation of case duration: {std_case_duration:.2f} days")

# Number of categorical event attributes
categorical_attributes = df.select_dtypes(include=['object']).columns
num_categorical_attributes = len(categorical_attributes)
print(f"Number of categorical event attributes: {num_categorical_attributes}")

# Mean Inter-Event Time
inter_event_times = df.groupby('case:concept:name')['time:timestamp'].diff().dropna().dt.total_seconds()
mean_inter_event_time = inter_event_times.mean()
hours, remainder = divmod(mean_inter_event_time, 3600)
minutes, seconds = divmod(remainder, 60)
print(f"Mean inter-event time: {int(hours)} hours,")


Number of cases: 31509
Number of events: 1202267
Number of process variants: 15930
Number of event labels: 26
Number of process variants: 15930
Number of event labels: 26
Mean case length: 38.16 events
Standard deviation of case length: 16.72 events
Mean case length: 38.16 events
Standard deviation of case length: 16.72 events
Mean case duration: 21.90 days
Standard deviation of case duration: 13.17 days
Mean case duration: 21.90 days
Standard deviation of case duration: 13.17 days
Number of categorical event attributes: 12
Mean inter-event time: 14 hours,
Number of categorical event attributes: 12
Mean inter-event time: 14 hours,


In [None]:
#Rework rate calculation
CASE, ACT, TS, LIFE = "case:concept:name", "concept:name", "time:timestamp", "lifecycle:transition"

dfc = df[df[LIFE].astype(str).str.lower().eq("complete")].copy()
dfc[TS] = pd.to_datetime(dfc[TS], errors="coerce")
dfc = dfc.sort_values([CASE, TS], kind="mergesort").drop_duplicates([CASE, ACT, TS])

counts = dfc.groupby([CASE, ACT]).size()
rework_events = (counts[counts > 1] - 1).sum()
rework_rate = rework_events / len(dfc)

print(f"Rework rate: {rework_rate:.4f} ({rework_rate:.2%})")



Rework rate: 0.1466 (14.66%)


In [6]:
# Convert timestamps to datetime (safety check)
df["time:timestamp"] = pd.to_datetime(df["time:timestamp"], errors="coerce")

# Calculate case durations in seconds
case_durations = df.groupby("case:concept:name")["time:timestamp"].apply(lambda x: (x.max() - x.min()).total_seconds())

# Convert seconds to days for interpretability
case_durations_days = case_durations / (24 * 3600)

# Compute median and mean throughput times
median_throughput_time = case_durations_days.median()
mean_throughput_time = case_durations_days.mean()

print(f"Median throughput time: {median_throughput_time:.2f} days")

Median throughput time: 19.09 days
