# Build an Object-Centric Event Log (OCEL) from a post-processed XES (PM4Py)

This notebook:
1. Loads your **case-centric** XES log
2. Converts it to a dataframe
3. Creates an **object-centric** log (OCEL) by assigning objects (Application, Student, ParentA, ParentB)
4. Exports the OCEL to disk

> Adjust the file paths in **Step 0** and run top-to-bottom.


In [3]:
# Step 0 — Paths 

import sys
from pathlib import Path

# if your notebook is in /notebooks
PROJECT_ROOT = Path.cwd().parent
sys.path.insert(0, str(PROJECT_ROOT))

from config.config import get_asset, XES_DIR, JSON_DIR

bpmn_asset = get_asset("group10_extended")
bpmn_path = bpmn_asset.bpmn_path
prosimos_cfg = bpmn_asset.prosimos_config_path


XES_FILE_PATH = XES_DIR / "simulated_log_final.xes"  
OUT_OCEL_JSON = JSON_DIR / "ocel_from_xes.jsonocel"                 # JSON-OCEL output
OUT_OCEL_SQLITE = JSON_DIR /"ocel_from_xes.sqlite"                 # optional SQLite output


In [2]:
# Step 1 — Imports
import pandas as pd
import pm4py


In [4]:
# Step 2 — Load XES (robust across PM4Py versions)
def load_xes(path: str):
    # Newer PM4Py
    if hasattr(pm4py, "read_xes"):
        return pm4py.read_xes(path)
    # Older PM4Py fallback
    try:
        from pm4py.objects.log.importer.xes import importer as xes_importer
        return xes_importer.apply(path)
    except Exception as e:
        raise RuntimeError(
            "Could not load XES. If pm4py.read_xes is missing, ensure pm4py is installed correctly."
        ) from e

log = load_xes(str(XES_FILE_PATH))
len(log), type(log)


  from .autonotebook import tqdm as notebook_tqdm
parsing log, completed traces :: 100%|██████████| 1000/1000 [00:00<00:00, 5296.79it/s]


(14971, pandas.core.frame.DataFrame)

In [5]:
# Step 3 — Convert to a dataframe (case-centric)
df = pm4py.convert_to_dataframe(log)

df.head()


Unnamed: 0,concept:name,time:timestamp,event_id,case:concept:name
0,Check parent status,2024-10-04 09:00:23+00:00,e_0,0
1,Generateparent Bmail__from__Check parent status,2024-10-04 09:11:23+00:00,e_1,0
2,Send mail to parent B__from__Generateparent B...,2024-10-06 09:11:23+00:00,e_2,0
3,Generateparent Amail__from__Send mail to paren...,2024-10-06 09:28:23+00:00,e_3,0
4,Parent B data received__from__Generateparent A...,2024-10-22 09:28:23+00:00,e_4,0


In [6]:
# Step 3.1 — Ensure standard columns exist
required = ["case:concept:name", "concept:name", "time:timestamp"]
missing = [c for c in required if c not in df.columns]
if missing:
    raise ValueError(f"Missing required columns: {missing}. Columns found: {list(df.columns)}")

df["case:concept:name"] = df["case:concept:name"].astype(str)
df["concept:name"] = df["concept:name"].astype(str)
df["time:timestamp"] = pd.to_datetime(df["time:timestamp"], utc=True, errors="coerce")

if df["time:timestamp"].isna().any():
    bad = df[df["time:timestamp"].isna()].head(10)
    raise ValueError("Some timestamps could not be parsed. Example rows:\n" + bad.to_string(index=False))

df = df.sort_values(["case:concept:name", "time:timestamp"]).reset_index(drop=True)
df.head()


Unnamed: 0,concept:name,time:timestamp,event_id,case:concept:name
0,Check parent status,2024-10-04 09:00:23+00:00,e_0,0
1,Generateparent Bmail__from__Check parent status,2024-10-04 09:11:23+00:00,e_1,0
2,Send mail to parent B__from__Generateparent B...,2024-10-06 09:11:23+00:00,e_2,0
3,Generateparent Amail__from__Send mail to paren...,2024-10-06 09:28:23+00:00,e_3,0
4,Parent B data received__from__Generateparent A...,2024-10-22 09:28:23+00:00,e_4,0


## Object model we will create

For each **case** (trace), we create:

- `Application`: `APP_<case>`
- `Student`: `STU_<case>`
- `ParentA`: `PA_<case>` (only if events mention Parent A)
- `ParentB`: `PB_<case>` (only if events mention Parent B)

Each event is linked to:
- always: the `Application` object
- usually: the `Student` object
- additionally: `ParentA` or `ParentB` depending on the activity name


In [7]:
# Step 4 — Create OCEL tables (events + relations)
def infer_object_links(activity: str):
    a = activity.lower()
    links = set()
    if "parent a" in a:
        links.add("ParentA")
    if "parent b" in a:
        links.add("ParentB")
    return links

events_rows = []
rels_rows = []

for i, row in df.iterrows():
    case_id = row["case:concept:name"]
    act = row["concept:name"]
    ts = row["time:timestamp"]

    ev_id = f"E_{case_id}_{i}"
    events_rows.append({
        "ocel:eid": ev_id,
        "ocel:activity": act,
        "ocel:timestamp": ts
    })

    app_id = f"APP_{case_id}"
    stu_id = f"STU_{case_id}"

    rels_rows.append({"ocel:eid": ev_id, "ocel:oid": app_id, "ocel:type": "Application"})
    rels_rows.append({"ocel:eid": ev_id, "ocel:oid": stu_id, "ocel:type": "Student"})

    extra = infer_object_links(act)
    if "ParentA" in extra:
        rels_rows.append({"ocel:eid": ev_id, "ocel:oid": f"PA_{case_id}", "ocel:type": "Parent"})
    if "ParentB" in extra:
        rels_rows.append({"ocel:eid": ev_id, "ocel:oid": f"PB_{case_id}", "ocel:type": "Parent"})

events_df = pd.DataFrame(events_rows)
relations_df = pd.DataFrame(rels_rows)
obj_df = relations_df[["ocel:oid", "ocel:type"]].drop_duplicates().reset_index(drop=True)

events_df.head(), relations_df.head(), obj_df.head()


(  ocel:eid                                      ocel:activity  \
 0    E_0_0                                Check parent status   
 1    E_0_1    Generateparent Bmail__from__Check parent status   
 2    E_0_2  Send mail to parent  B__from__Generateparent B...   
 3    E_0_3  Generateparent Amail__from__Send mail to paren...   
 4    E_0_4  Parent B data received__from__Generateparent A...   
 
              ocel:timestamp  
 0 2024-10-04 09:00:23+00:00  
 1 2024-10-04 09:11:23+00:00  
 2 2024-10-06 09:11:23+00:00  
 3 2024-10-06 09:28:23+00:00  
 4 2024-10-22 09:28:23+00:00  ,
   ocel:eid ocel:oid    ocel:type
 0    E_0_0    APP_0  Application
 1    E_0_0    STU_0      Student
 2    E_0_1    APP_0  Application
 3    E_0_1    STU_0      Student
 4    E_0_1     PB_0       Parent,
   ocel:oid    ocel:type
 0    APP_0  Application
 1    STU_0      Student
 2     PB_0       Parent
 3     PA_0       Parent
 4    APP_1  Application)

In [8]:
# Step 5 — Create PM4Py OCEL object (robust across PM4Py versions)
def make_ocel(events_df, relations_df, objects_df=None):
    try:
        from pm4py.objects.ocel.obj import OCEL
        if objects_df is None:
            return OCEL(events=events_df, relations=relations_df)
        return OCEL(events=events_df, relations=relations_df, objects=objects_df)
    except Exception:
        pass

    # fallback utility (some versions)
    try:
        from pm4py.objects.ocel.util import dataframe_utils
        return dataframe_utils.create_ocel_from_df(events_df, relations_df, objects_df)
    except Exception as e:
        raise RuntimeError(
            "Could not create OCEL object with your PM4Py version. "
            "Please print(pm4py.__version__) and share it if this fails."
        ) from e

ocel = make_ocel(events_df, relations_df, obj_df)
type(ocel)


pm4py.objects.ocel.obj.OCEL

In [9]:
# Step 6 — Quick sanity checks
print("Events:", len(events_df))
print("Relations:", len(relations_df))
print("Objects:", len(obj_df))
print(obj_df["ocel:type"].value_counts())


Events: 14971
Relations: 35804
Objects: 3227
ocel:type
Parent         1227
Application    1000
Student        1000
Name: count, dtype: int64


In [10]:
# Step 7 — Export OCEL

def export_ocel_json(ocel_obj, path: str):
    try:
        from pm4py.objects.ocel.exporter.jsonocel import exporter as json_exporter
        json_exporter.apply(ocel_obj, path)
        return
    except Exception:
        pass

    if hasattr(pm4py, "write_ocel_json"):
        pm4py.write_ocel_json(ocel_obj, path)
        return

    raise RuntimeError("No JSON-OCEL exporter found in your PM4Py installation.")

export_ocel_json(ocel, OUT_OCEL_JSON)
print("Wrote:", OUT_OCEL_JSON)

def export_ocel_sqlite(ocel_obj, path: str):
    try:
        from pm4py.objects.ocel.exporter.sqlite import exporter as sqlite_exporter
        sqlite_exporter.apply(ocel_obj, path)
        return True
    except Exception:
        return False

if export_ocel_sqlite(ocel, OUT_OCEL_SQLITE):
    print("Wrote:", OUT_OCEL_SQLITE)
else:
    print("SQLite exporter not available in this PM4Py version (JSON-OCEL is fine).")


Wrote: C:\Users\abodu\Desktop\Clutter Desktop\اوراق الجامعة\Semesters\WinterSemester 25&26\Buisness Process Management\pm4py\Mining-tests\data\outputs\event_logs\json\ocel_from_xes.jsonocel
Wrote: C:\Users\abodu\Desktop\Clutter Desktop\اوراق الجامعة\Semesters\WinterSemester 25&26\Buisness Process Management\pm4py\Mining-tests\data\outputs\event_logs\json\ocel_from_xes.sqlite


In [8]:
import json

with open(OUT_OCEL_JSON, "r", encoding="utf-8") as f:
    j = json.load(f)

print(list(j.keys())[:30])

['ocel:global-event', 'ocel:global-object', 'ocel:global-log', 'ocel:events', 'ocel:objects']


In [10]:
from pm4py.objects.ocel.importer.jsonocel import importer as jsonocel_importer

ocel = jsonocel_importer.apply(str(OUT_OCEL_JSON), variant=jsonocel_importer.Variants.CLASSIC)

In [11]:
import json
from pm4py.objects.ocel.importer.jsonocel import importer as jsonocel_importer

def load_any_jsonocel(path: str):
    with open(path, "r", encoding="utf-8") as f:
        j = json.load(f)

    # OCEL 2.0 standard usually has "events" as a list
    if "events" in j and isinstance(j["events"], list):
        return jsonocel_importer.apply(path, variant=jsonocel_importer.Variants.OCEL20_STANDARD)

    # OCEL 1.0 classic often has "ocel:events"
    if "ocel:events" in j:
        return jsonocel_importer.apply(path, variant=jsonocel_importer.Variants.CLASSIC)

    raise ValueError(f"Unknown JSON-OCEL structure. Top-level keys: {list(j.keys())}")

ocel = load_any_jsonocel(str(OUT_OCEL_JSON))


In [12]:
print(ocel.events.columns)
print(ocel.objects.columns)
print(ocel.relations.columns)


Index(['ocel:eid', 'ocel:timestamp', 'ocel:activity'], dtype='object')
Index(['ocel:oid', 'ocel:type'], dtype='object')
Index(['ocel:eid', 'ocel:activity', 'ocel:timestamp', 'ocel:oid', 'ocel:type',
       'ocel:qualifier'],
      dtype='object')


In [9]:
obj_types = sorted(ocel.objects["ocel:type"].unique())
print(obj_types)

NameError: name 'ocel' is not defined