# Setup 

In [None]:
# --- Core libraries ---
import pandas as pd
from pathlib import Path
import pm4py
from pm4py.algo.evaluation.generalization import algorithm as generalization_evaluator
from pm4py.visualization.bpmn import visualizer as bpmn_visualizer
from pm4py.objects.bpmn.importer import importer as bpmn_importer
from pm4py.objects.conversion.bpmn import converter as bpmn_to_petri_converter

# --- Column aliases used throughout ---
CASE = "case:concept:name"
ACT  = "concept:name"
TS   = "time:timestamp"
LIFE = "lifecycle:transition"


Event Load

In [3]:
xes_path = Path("../data/raw/BPI_Challenge_2017.xes")

# EventLog loading
elog = pm4py.read_xes(str(xes_path))

# DataFrame 
df = pm4py.convert_to_dataframe(elog)
df[TS] = pd.to_datetime(df[TS], errors="coerce")
df = df.sort_values([CASE, TS]).reset_index(drop=True)



parsing log, completed traces ::   0%|          | 0/31509 [00:00<?, ?it/s]

# Bottleneck Analysis on Connection Level

In [None]:

# --------------------------------------------------------
# 1) Compute directly-following transitions (A → B)
# --------------------------------------------------------
df_conn = df.copy()
df_conn = df_conn.sort_values([CASE, TS])

# Next activity and timestamp within each case
df_conn["next_act"] = df_conn.groupby(CASE)[ACT].shift(-1)
df_conn["next_ts"]  = df_conn.groupby(CASE)[TS].shift(-1)

# Keep only valid transitions
df_conn = df_conn.dropna(subset=["next_act"]).copy()

# Throughput time between A and B in hours
df_conn["throughput_hours"] = (
    (df_conn["next_ts"] - df_conn[TS]).dt.total_seconds() / 3600
)

# --------------------------------------------------------
# 2) Aggregate per connection: count, mean, std
# --------------------------------------------------------
total_cases = df[CASE].nunique()

conn_stats = (
    df_conn
    .groupby([ACT, "next_act"])["throughput_hours"]
    .agg(["count", "mean", "std"])
    .reset_index()
)

# Case share: number of distinct cases per connection
cases_per_conn = (
    df_conn.groupby([ACT, "next_act"])[CASE]
    .nunique()
    .reset_index(name="case_count")
)

conn_stats = conn_stats.merge(cases_per_conn, on=[ACT, "next_act"], how="left")
conn_stats["case_percentage"] = (conn_stats["case_count"] / total_cases) * 100

# --------------------------------------------------------
# 3) Bottleneck relevance score
#    (normalized std × normalized case_percentage)
# --------------------------------------------------------
std_max = conn_stats["std"].max() or 1.0
cp_max = conn_stats["case_percentage"].max() or 1.0

conn_stats["std_norm"]  = conn_stats["std"] / std_max
conn_stats["cp_norm"]   = conn_stats["case_percentage"] / cp_max


# --- Linear bottleneck relevance  ---
conn_stats["bottleneck_relevance"] = (
    conn_stats["std_norm"] * conn_stats["cp_norm"]
)



# --------------------------------------------------------
# 4) Convert hours → "Xd Yh Zm"
# --------------------------------------------------------
def format_hours(hours: float) -> str:
    """Convert hours (float) into a human-readable 'Xd Yh Zm' format."""
    if pd.isna(hours):
        return "0d 0h 0m"
    days = int(hours // 24)
    remainder = hours % 24
    h = int(remainder)
    m = int(round((remainder - h) * 60))
    return f"{days}d {h}h {m}m"

conn_stats["mean_fmt"] = conn_stats["mean"].apply(format_hours)
conn_stats["std_fmt"]  = conn_stats["std"].apply(format_hours)

# --------------------------------------------------------
# 5) Final readable table
# --------------------------------------------------------
top_bottlenecks = (
    conn_stats.sort_values("bottleneck_relevance", ascending=False)
    .loc[:, [
        ACT,
        "next_act",
        "case_percentage",
        "mean_fmt",
        "std_fmt",
        "bottleneck_relevance"
    ]]
    .rename(columns={
        ACT: "prev_activity",
        "next_act": "next_activity",
        "mean_fmt": "mean_throughput_time",
        "std_fmt": "std_throughput_time",
    })
)

# Round values for cleaner display
top_bottlenecks["case_percentage"] = top_bottlenecks["case_percentage"].round(2)
top_bottlenecks["bottleneck_relevance"] = top_bottlenecks["bottleneck_relevance"].round(4)

# Show top 10 bottleneck connections
top_bottlenecks.head(10)


Unnamed: 0,prev_activity,next_activity,case_percentage,mean_throughput_time,std_throughput_time,bottleneck_relevance
123,W_Call after offers,W_Call after offers,99.53,1d 21h 49m,3d 2h 32m,0.0775
114,W_Call after offers,A_Cancelled,27.1,23d 3h 6m,8d 6h 15m,0.0561
140,W_Complete application,A_Accepted,70.61,0d 9h 2m,1d 8h 41m,0.0241
137,W_Call incomplete files,W_Call incomplete files,47.61,0d 12h 33m,1d 23h 30m,0.0236
129,W_Call incomplete files,O_Accepted,15.18,5d 8h 27m,5d 23h 38m,0.0228
150,W_Complete application,W_Complete application,78.23,0d 6h 49m,1d 3h 15m,0.0223
177,W_Validate application,W_Validate application,69.41,0d 10h 37m,1d 4h 47m,0.0209
119,W_Call after offers,O_Create Offer,11.92,5d 15h 13m,6d 18h 35m,0.0203
17,A_Concept,W_Complete application,70.66,0d 19h 38m,0d 22h 55m,0.0169
168,W_Validate application,O_Accepted,37.25,0d 17h 43m,1d 11h 31m,0.0138


In [24]:
# --- Compute max values needed for bottleneck relevance ---
max_std = conn_stats["std"].max()
max_case_percentage = conn_stats["case_percentage"].max()

max_std_fmt = format_hours(max_std)

print("Max throughput time standard deviation (raw hours):", max_std)
print("Max throughput time standard deviation (D:H:M):", max_std_fmt)
print("Max case percentage (%):", max_case_percentage)


Max throughput time standard deviation (raw hours): 956.9267093705715
Max throughput time standard deviation (D:H:M): 39d 20h 56m
Max case percentage (%): 100.0


# Root Cause Analysis of Bottlenecks

Feature engineering

In [15]:
# --- Rework count per case ---
CASE, ACT = "case:concept:name", "concept:name"

# Count repeat events within cases
dfc = df[df["lifecycle:transition"].astype(str).str.lower().eq("complete")].copy()
dfc = dfc.sort_values([CASE, "time:timestamp"])
counts = dfc.groupby([CASE, ACT]).size()

rework_per_case = (counts[counts > 1] - 1).groupby(CASE).sum().fillna(0)
rework_per_case.name = "rework_count"



In [16]:
df["is_manual"] = df["org:resource"].apply(lambda x: 0 if x == "User_1" else 1)
df["is_automatic"] = df["org:resource"].apply(lambda x: 1 if x == "User_1" else 0)


In [17]:
resource_features = df.groupby(CASE).agg({
    "is_manual": "mean",        # Anteil manueller Events
    "is_automatic": "mean",     # Anteil automatischer Events
    "org:resource": "nunique",  # diversität der Ressourcen pro Fall
}).rename(columns={
    "is_manual": "pct_manual",
    "is_automatic": "pct_automatic",
    "org:resource": "num_unique_resources"
})


In [18]:
pct_user1 = df.assign(user1=df["org:resource"].eq("User_1").astype(int)) \
              .groupby(CASE)["user1"].mean() \
              .rename("pct_user1")


In [19]:
rca_features = rca_features.merge(rework_per_case, on=CASE, how="left")
rca_features = rca_features.merge(resource_features, on=CASE, how="left")
rca_features = rca_features.merge(pct_user1, on=CASE, how="left")


Correlation analysis

In [20]:
correlation = rca_features.corr()["throughput_days"].sort_values(ascending=False)
correlation


throughput_days               1.000000
mean_wait_hours               0.746297
A_Cancelled                   0.426716
O_Cancelled                   0.424573
rework_count                  0.335082
pct_automatic                 0.322069
pct_user1                     0.322069
W_Personal Loan collection    0.151904
A_Submitted                   0.144395
W_Handle leads                0.144395
num_unique_resources          0.132503
case_length                   0.116974
O_Sent (mail and online)      0.103700
A_Complete                    0.075926
W_Call after offers           0.075926
O_Sent (online only)          0.060557
W_Assess potential fraud      0.028002
W_Shortened completion        0.020158
A_Incomplete                 -0.082702
W_Call incomplete files      -0.082702
O_Refused                    -0.139529
A_Denied                     -0.142270
A_Pending                    -0.314043
O_Accepted                   -0.314043
pct_manual                   -0.322069
O_Returned               