In [1]:
# ============================================================
# 📂 Load Event Log (Setup)
# ============================================================

from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.objects.conversion.log import converter as log_converter
import pandas as pd
from pathlib import Path

# Location of the .xes file, relative to this notebook (adjust if needed).
xes_path = Path("../data/raw/BPI_Challenge_2017.xes")

# Fail early with a readable message instead of an error raised from inside the XES parser.
if not xes_path.is_file():
    raise FileNotFoundError(f"XES event log not found: {xes_path.resolve()}")

# Parse the XES file into a PM4Py event log object.
log = xes_importer.apply(str(xes_path))

# Convert the log into a pandas DataFrame; the cells below all work on `df`.
df = log_converter.apply(log, variant=log_converter.Variants.TO_DATA_FRAME)

# Confirm successful load: rows are reported as events, distinct case ids as cases.
print(f"✅ Event log loaded successfully with {len(df)} events and {df['case:concept:name'].nunique()} cases.")




parsing log, completed traces ::   0%|          | 0/31509 [00:00<?, ?it/s]

✅ Event log loaded successfully with 1202267 events and 31509 cases.


In [12]:
# ============================================================
# 📊 Basic Event Log Analysis (Metrics)
# ============================================================

import warnings
# NOTE(review): this installs an "ignore" filter for every warning category and stays
# active for the rest of the kernel session, so later cells are silenced as well.
warnings.filterwarnings("ignore")  # Suppress warnings for cleaner output
import pandas as pd

# Number of cases (distinct case identifiers)
num_cases = df['case:concept:name'].nunique()
print(f"Number of cases: {num_cases}")

# Number of events (rows of the event DataFrame)
num_events = len(df)
print(f"Number of events: {num_events}")

# Number of process variants = distinct activity sequences over all cases.
# Only the activity column is aggregated per case; this avoids building a full
# sub-DataFrame per case and the deprecated DataFrameGroupBy.apply behaviour on
# grouping columns. Events are joined in the row order they have in `df`.
case_traces = df.groupby('case:concept:name')['concept:name'].agg(' -> '.join)
num_variants = case_traces.nunique()
print(f"Number of process variants: {num_variants}")

# Number of unique event labels (activities)
num_event_labels = df['concept:name'].nunique()
print(f"Number of event labels: {num_event_labels:,}")

# Case labels: columns whose name carries the "case:" prefix
case_labels = [col for col in df.columns if col.startswith("case:")]


print("Number of Case Labels:", len(case_labels))


# Case length = number of event rows recorded for each case
events_by_case = df.groupby('case:concept:name')
case_lengths = events_by_case.size()

# Take both summary statistics in a single aggregation pass
length_stats = case_lengths.agg(['mean', 'std'])
mean_case_length = length_stats['mean']
std_case_length = length_stats['std']

print(f"Mean case length: {mean_case_length:.2f} events")
print(f"Standard deviation of case length: {std_case_length:.2f} events")

def _split_duration(total_seconds):
    """Split a non-negative duration in seconds into whole time units.

    Args:
        total_seconds: Duration in seconds (int or float). Fractional seconds
            are truncated before the split.

    Returns:
        Tuple ``(days, hours, minutes, seconds)`` of ints.

    Raises:
        ValueError: If ``total_seconds`` is NaN (it cannot be converted to int).
    """
    days, rest = divmod(int(total_seconds), 86400)
    hours, rest = divmod(rest, 3600)
    minutes, seconds = divmod(rest, 60)
    return days, hours, minutes, seconds


# Case duration in seconds: last timestamp minus first timestamp of each case.
# Grouped max/min are vectorised, unlike a Python lambda called once per case.
case_timestamps = df.groupby('case:concept:name')['time:timestamp']
case_durations = (case_timestamps.max() - case_timestamps.min()).dt.total_seconds()

mean_case_duration = case_durations.mean()
std_case_duration = case_durations.std()

# Mean case duration -> d/h/m/s
mean_days, mean_hours, mean_minutes, mean_seconds = _split_duration(mean_case_duration)

print(f"Mean case duration: {mean_days} days, {mean_hours} hours, {mean_minutes} minutes, {mean_seconds} seconds")

# Standard deviation of case duration -> d/h/m/s
std_days, std_hours, std_minutes, std_seconds = _split_duration(std_case_duration)

print(f"Standard deviation of case duration: {std_days} days, {std_hours} hours, {std_minutes} minutes, {std_seconds} seconds")


# Mean inter-event time: average gap between consecutive events of the same case.
# diff() leaves a missing value on the first event of every case; those are dropped.
event_gaps = df.groupby('case:concept:name')['time:timestamp'].diff()
inter_event_times = event_gaps.dropna().dt.total_seconds()
mean_inter_event_time = inter_event_times.mean()

# The total is split into hours, minutes and seconds only (hours are not rolled up into days).
hours, remainder = divmod(mean_inter_event_time, 3600)
minutes, seconds = divmod(remainder, 60)
print(f"Mean inter-event time: {int(hours)} hours, {int(minutes)} minutes, {int(seconds)} seconds")



Number of cases: 31509
Number of events: 1202267
Number of process variants: 15930
Number of event labels: 26
Number of Case Labels: 4
Mean case length: 38.16 events
Standard deviation of case length: 16.72 events
Mean case duration: 21 days, 21 hours, 35 minutes, 25 seconds
Standard deviation of case duration: 13 days, 4 hours, 3 minutes, 41 seconds
Mean inter-event time: 14 hours, 8 minutes, 43 seconds


In [22]:
# Rework rate: repeated (case, activity) occurrences among "complete" events,
# divided by the number of "complete" events that remain after de-duplication.
CASE = "case:concept:name"
ACT = "concept:name"
TS = "time:timestamp"
LIFE = "lifecycle:transition"

# Keep only events whose lifecycle transition is "complete" (case-insensitive match).
is_complete = df[LIFE].astype(str).str.lower() == "complete"
dfc = df.loc[is_complete].copy()

# Unparseable timestamps become NaT rather than raising.
dfc[TS] = pd.to_datetime(dfc[TS], errors="coerce")

# Stable sort by case and time, then drop rows that repeat the same
# (case, activity, timestamp) triple.
dfc = dfc.sort_values(by=[CASE, TS], kind="mergesort")
dfc = dfc.drop_duplicates(subset=[CASE, ACT, TS])

# Every occurrence of an activity beyond the first within a case counts as rework.
counts = dfc.groupby([CASE, ACT]).size()
repeated = counts[counts > 1]
rework_events = (repeated - 1).sum()
rework_rate = rework_events / len(dfc)

print(f"Rework rate: {rework_rate:.4f} ({rework_rate:.2%})")



Rework rate: 0.1466 (14.66%)


In [None]:
# Convert timestamps to datetime (safety check).
# NOTE(review): this overwrites the column on the shared `df`; values that cannot
# be parsed become NaT instead of raising.
df["time:timestamp"] = pd.to_datetime(df["time:timestamp"], errors="coerce")

# Case duration (throughput time) in seconds: last event minus first event per case.
case_timestamps = df.groupby("case:concept:name")["time:timestamp"]
case_durations = (case_timestamps.max() - case_timestamps.min()).dt.total_seconds()

# Same durations expressed in days for interpretability
case_durations_days = case_durations / (24 * 3600)

# Median and mean throughput times, in days
median_throughput_time = case_durations_days.median()
mean_throughput_time = case_durations_days.mean()

# Break the median down into d/h/m/s from the seconds series directly.
# Converting to days and multiplying back (x / 86400 * 86400) is not exact in
# floating point and can shift the printed seconds by one.
median_seconds_total = case_durations.median()
m_days, rest = divmod(int(median_seconds_total), 86400)
m_hours, rest = divmod(rest, 3600)
m_minutes, m_seconds = divmod(rest, 60)

print(f"Median throughput time: {m_days} days, {m_hours} hours, {m_minutes} minutes, {m_seconds} seconds")


Median throughput time: 19 days, 2 hours, 6 minutes, 20 seconds


In [21]:
# Duration per case in seconds: last event minus first event of each case.
case_timestamps = df.groupby("case:concept:name")["time:timestamp"]
case_durations_seconds = (case_timestamps.max() - case_timestamps.min()).dt.total_seconds()

# Same durations expressed in days
case_durations_days = case_durations_seconds / (24 * 3600)

# Extremes in days
min_throughput_days = case_durations_days.min()
max_throughput_days = case_durations_days.max()

# The d/h/m/s breakdowns below start from the seconds series rather than from
# days * 86400: that round trip is not exact in floating point and can shift
# the printed seconds by one.

# ---- Shortest case ----
min_seconds_total = case_durations_seconds.min()
min_days, rest = divmod(int(min_seconds_total), 86400)
min_hours, rest = divmod(rest, 3600)
min_minutes, min_seconds = divmod(rest, 60)

# ---- Longest case ----
max_seconds_total = case_durations_seconds.max()
max_days, rest = divmod(int(max_seconds_total), 86400)
max_hours, rest = divmod(rest, 3600)
max_minutes, max_seconds = divmod(rest, 60)

print(f"Minimum throughput time: {min_days} days, {min_hours} hours, {min_minutes} minutes, {min_seconds} seconds")
print(f"Maximum throughput time: {max_days} days, {max_hours} hours, {max_minutes} minutes, {max_seconds} seconds")

Minimum throughput time: 0 days, 0 hours, 3 minutes, 21 seconds
Maximum throughput time: 286 days, 1 hours, 44 minutes, 18 seconds


In [6]:
# Distinct activity names in the log; missing values are not counted.
distinct_activities = df['concept:name'].dropna().unique()
num_event_labels = len(distinct_activities)

print(f"Number of distinct event labels: {num_event_labels}")

Number of distinct event labels: 26


In [10]:
# A column counts as a case label when it carries the "case:" prefix, or when no
# case ever shows more than one distinct non-null value for it.
# The per-case distinct counts are computed once for all columns, instead of
# building a new groupby for every column inside the comprehension.
# The grouping column is absent from the grouped result, so .get() returns None
# for it; it is already matched by the prefix test.
max_distinct_per_case = df.groupby('case:concept:name').nunique().max()

case_labels = [
    col for col in df.columns
    if col.startswith("case:") or max_distinct_per_case.get(col) == 1
]

num_case_labels = len(case_labels)
print(f"Number of case labels: {num_case_labels}")

Number of case labels: 4
