# Setup and Imports



In [1]:
# --- Core libraries ---
import pandas as pd
from pathlib import Path

# --- PM4Py modules (checked for 2.7.18) ---
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.objects.conversion.log import converter as log_converter

from pm4py.algo.discovery.inductive import algorithm as im
from pm4py.algo.discovery.heuristics import algorithm as hm

from pm4py.algo.evaluation.replay_fitness import algorithm as fitness_evaluator
from pm4py.algo.evaluation.precision import algorithm as precision_evaluator
from pm4py.algo.evaluation.generalization import algorithm as generalization_evaluator

# ✅ correct exporter path for pm4py 2.7.18
from pm4py.objects.petri_net.exporter import exporter as pnml_exporter

from pm4py.objects.conversion.process_tree import converter as pt_converter
from pm4py.objects.bpmn.exporter import exporter as bpmn_exporter

# --- Project structure ---
Path("models").mkdir(exist_ok=True)
Path("figs").mkdir(exist_ok=True)
Path("tables").mkdir(exist_ok=True)

# --- Column aliases used throughout ---
CASE = "case:concept:name"
ACT  = "concept:name"
TS   = "time:timestamp"
LIFE = "lifecycle:transition"


# Eventlog loading and pre-processing

In [2]:
# Load XES log and convert to DataFrame
xes_path = Path("../data/raw/BPI_Challenge_2017.xes")
log = xes_importer.apply(str(xes_path))
df = log_converter.apply(log, variant=log_converter.Variants.TO_DATA_FRAME)

print(f"✅ Event log loaded with {len(df):,} events and {df[CASE].nunique():,} cases.")




parsing log, completed traces ::   0%|          | 0/31509 [00:00<?, ?it/s]

✅ Event log loaded with 1,202,267 events and 31,509 cases.


# Clean Log View for Discovery

In [3]:
# I work on a clean view for discovery:
# - only 'complete' lifecycle events (to avoid duplicates)
# - ensure chronological ordering
# - optional filter to keep only business activities (A_* and O_*)

df_view = df[[CASE, ACT, TS] + ([LIFE] if LIFE in df.columns else [])].copy()
df_view[TS] = pd.to_datetime(df_view[TS], errors="coerce")
if LIFE in df_view.columns:
    df_view = df_view[df_view[LIFE].astype(str).str.lower() == "complete"]

# Optional: focus on business steps only (uncomment if desired)
# df_view = df_view[df_view[ACT].str.startswith(("A_", "O_"))]

df_view = df_view.sort_values([CASE, TS], kind="mergesort").dropna(subset=[TS])

print(f"Events in view: {len(df_view):,} | Cases: {df_view[CASE].nunique():,}")


Events in view: 475,306 | Cases: 31,509


# Transformation to Event-Log

In [4]:
# Convert DataFrame to EventLog for discovery
elog = log_converter.apply(df_view, variant=log_converter.Variants.TO_EVENT_LOG)
print("EventLog created.")


EventLog created.


# Inductive Miner (IMf) Discovery

In [5]:
# Inductive Miner discovery (returns a Process Tree)
pt_im = im.apply(elog)

# Convert Process Tree to Petri Net
from pm4py.objects.conversion.process_tree import converter as tree_converter
net_im, im_im, fm_im = tree_converter.apply(pt_im, variant=tree_converter.Variants.TO_PETRI_NET)

print("Inductive Miner model discovered and converted to Petri net.")


Inductive Miner model discovered and converted to Petri net.


# Heuristics Miner Discovery

In [6]:
net_hm, im_hm, fm_hm = hm.apply(elog)
print("Heuristics Miner model discovered.")


Heuristics Miner model discovered.


# Evaluation (Replay Fitness)

In [None]:




# --- Fast version: Token-based Replay Fitness ---
fit_im = fitness_evaluator.apply(
    elog, net_im, im_im, fm_im,
    parameters={"variant": fitness_evaluator.Variants.TOKEN_BASED}
)["log_fitness"]

fit_hm = fitness_evaluator.apply(
    elog, net_hm, im_hm, fm_hm,
    parameters={"variant": fitness_evaluator.Variants.TOKEN_BASED}
)["log_fitness"]

print(f" Fitness IM: {fit_im:.3f} | Fitness HM: {fit_hm:.3f}")


aligning log, completed variants ::   0%|          | 0/5623 [00:00<?, ?it/s]

aligning log, completed variants ::   0%|          | 0/5623 [00:00<?, ?it/s]

KeyboardInterrupt: 

# Evaluation (Precision)

In [None]:
prec_im = precision_evaluator.apply(elog, net_im, im_im, fm_im)
prec_hm = precision_evaluator.apply(elog, net_hm, im_hm, fm_hm)
print(f"Precision IM: {prec_im:.3f} | Precision HM: {prec_hm:.3f}")


# Evaluation (Generalization)

In [None]:
gen_im = generalization_evaluator.apply(elog, net_im, im_im, fm_im)
gen_hm = generalization_evaluator.apply(elog, net_hm, im_hm, fm_hm)
print(f"Generalization IM: {gen_im:.3f} | Generalization HM: {gen_hm:.3f}")

# Simplicity-Metrics

In [None]:
def simplicity_size(net):
    return len(net.places) + len(net.transitions)

def simplicity_density(net):
    arcs = sum(len(x.out_arcs) for x in list(net.places) + list(net.transitions))
    denom = max(1, len(net.places) * len(net.transitions))
    return arcs / denom

simp_size_im, simp_dens_im = simplicity_size(net_im), simplicity_density(net_im)
simp_size_hm, simp_dens_hm = simplicity_size(net_hm), simplicity_density(net_hm)

print(f"IM  -> size:{simp_size_im}  density:{simp_dens_im:.3f}")
print(f"HM  -> size:{simp_size_hm}  density:{simp_dens_hm:.3f}")

# Comparison Table

In [None]:
metrics_df = pd.DataFrame(
    [
        ["Inductive Miner",  fit_im, prec_im, gen_im, simp_size_im, simp_dens_im],
        ["Heuristics Miner", fit_hm, prec_hm, gen_hm, simp_size_hm, simp_dens_hm],
    ],
    columns=["Model", "Fitness", "Precision", "Generalization", "Simplicity(Size)", "Simplicity(Density)"]
)
metrics_df.to_csv("tables/model_quality_comparison.csv", index=False)
metrics_df

# Final Model Export (PNML + BPMN)

In [11]:
# Export final Inductive Miner model
pnml_exporter.apply(net_im, im_im, fm_im, "models/final_model.pnml")

# Export BPMN version via process tree
pt = im.apply_tree(elog)
bpmn_graph = pt_converter.apply(pt, variant=pt_converter.Variants.TO_BPMN)
bpmn_exporter.apply(bpmn_graph, "models/final_model.bpmn")

print("Final model exported: models/final_model.pnml and models/final_model.bpmn")

AttributeError: 'str' object has no attribute 'name'

# Final Metrics Snapshot

In [None]:
final_row = metrics_df.loc[metrics_df["Model"]=="Inductive Miner"].iloc[0].to_dict()
pd.DataFrame([final_row]).to_csv("tables/final_model_metrics.csv", index=False)
print("Saved: tables/final_model_metrics.csv")