In [1]:
import pandas as pd
from datetime import datetime
import pm4py


In [2]:
def build_objects_df(applications, students, parents, documents, resubmissions, notifications, clerks):
    # Each input is a pandas DataFrame containing the UML attributes.

    objs = []

    # Application
    a = applications.copy()
    a["ocel:oid"] = "APP_" + a["applicationId"].astype(str)
    a["ocel:type"] = "Application"
    objs.append(a[["ocel:oid","ocel:type","status","amount","submittedAt","handledAt"]])

    # Student
    s = students.copy()
    s["ocel:oid"] = "STU_" + s["studentId"].astype(str)
    s["ocel:type"] = "Student"
    objs.append(s[["ocel:oid","ocel:type","name"]])

    # Parent
    p = parents.copy()
    p["ocel:oid"] = "PAR_" + p["parentId"].astype(str)
    p["ocel:type"] = "Parent"
    objs.append(p[["ocel:oid","ocel:type","role"]])

    # Document
    d = documents.copy()
    d["ocel:oid"] = "DOC_" + d["documentId"].astype(str)
    d["ocel:type"] = "Document"
    objs.append(d[["ocel:oid","ocel:type","type","isComplete","uploadedAt"]])

    # Resubmission
    r = resubmissions.copy()
    r["ocel:oid"] = "RES_" + r["resubmissionId"].astype(str)
    r["ocel:type"] = "Resubmission"
    objs.append(r[["ocel:oid","ocel:type","type","status","requestedAt","receivedAt"]])

    # Notification
    n = notifications.copy()
    n["ocel:oid"] = "NOT_" + n["notificationId"].astype(str)
    n["ocel:type"] = "Notification"
    objs.append(n[["ocel:oid","ocel:type","type","sentAt"]])

    # Clerk
    c = clerks.copy()
    c["ocel:oid"] = "CLK_" + c["clerkId"].astype(str)
    c["ocel:type"] = "BafoegClerk"
    objs.append(c[["ocel:oid","ocel:type","name","office"]])

    objects_df = pd.concat(objs, ignore_index=True)

    # Ensure timestamps are datetimes when present
    for col in ["submittedAt","handledAt","uploadedAt","requestedAt","receivedAt","sentAt"]:
        if col in objects_df.columns:
            objects_df[col] = pd.to_datetime(objects_df[col], errors="coerce")

    return objects_df


In [3]:
def build_static_object_relations(
    applications,
    application_parents,      # columns: applicationId, parentId
    documents,                # columns: documentId, applicationId
    resubmissions,            # columns: resubmissionId, applicationId, requestedFromType, requestedFromId
    resubmission_documents,   # columns: resubmissionId, documentId
    notifications             # columns: notificationId, applicationId, resubmissionId (nullable)
):
    rel = []

    # Application submittedBy Student
    tmp = applications[["applicationId","studentId"]].dropna()
    tmp["from"] = "APP_" + tmp["applicationId"].astype(str)
    tmp["to"]   = "STU_" + tmp["studentId"].astype(str)
    tmp["qualifier"] = "submittedBy"
    rel.append(tmp[["from","to","qualifier"]])

    # Application handledBy Clerk
    if "clerkId" in applications.columns:
        tmp = applications[["applicationId","clerkId"]].dropna()
        tmp["from"] = "APP_" + tmp["applicationId"].astype(str)
        tmp["to"]   = "CLK_" + tmp["clerkId"].astype(str)
        tmp["qualifier"] = "handledBy"
        rel.append(tmp[["from","to","qualifier"]])

    # Application involves Parent (0..2)
    if application_parents is not None and len(application_parents) > 0:
        tmp = application_parents[["applicationId","parentId"]].dropna()
        tmp["from"] = "APP_" + tmp["applicationId"].astype(str)
        tmp["to"]   = "PAR_" + tmp["parentId"].astype(str)
        tmp["qualifier"] = "involves"
        rel.append(tmp[["from","to","qualifier"]])

    # Application initialSubmission Document
    tmp = documents[["documentId","applicationId"]].dropna()
    tmp["from"] = "APP_" + tmp["applicationId"].astype(str)
    tmp["to"]   = "DOC_" + tmp["documentId"].astype(str)
    tmp["qualifier"] = "initialSubmission"
    rel.append(tmp[["from","to","qualifier"]])

    # Resubmission belongsTo Application
    tmp = resubmissions[["resubmissionId","applicationId"]].dropna()
    tmp["from"] = "RES_" + tmp["resubmissionId"].astype(str)
    tmp["to"]   = "APP_" + tmp["applicationId"].astype(str)
    tmp["qualifier"] = "belongsTo"
    rel.append(tmp[["from","to","qualifier"]])

    # Resubmission fulfilledBy Document (1..*)
    tmp = resubmission_documents[["resubmissionId","documentId"]].dropna()
    tmp["from"] = "RES_" + tmp["resubmissionId"].astype(str)
    tmp["to"]   = "DOC_" + tmp["documentId"].astype(str)
    tmp["qualifier"] = "fulfilledBy"
    rel.append(tmp[["from","to","qualifier"]])

    # Resubmission requestedFrom (Student or Parent) (0..1)
    if "requestedFromType" in resubmissions.columns and "requestedFromId" in resubmissions.columns:
        tmp = resubmissions[["resubmissionId","requestedFromType","requestedFromId"]].dropna()
        tmp["from"] = "RES_" + tmp["resubmissionId"].astype(str)

        def mk_to(row):
            if row["requestedFromType"] == "Student":
                return "STU_" + str(row["requestedFromId"])
            if row["requestedFromType"] == "Parent":
                return "PAR_" + str(row["requestedFromId"])
            return None

        tmp["to"] = tmp.apply(mk_to, axis=1)
        tmp = tmp.dropna(subset=["to"])
        tmp["qualifier"] = "requestedFrom"
        rel.append(tmp[["from","to","qualifier"]])

    # Notification concerns Application (always)
    tmp = notifications[["notificationId","applicationId"]].dropna()
    tmp["from"] = "NOT_" + tmp["notificationId"].astype(str)
    tmp["to"]   = "APP_" + tmp["applicationId"].astype(str)
    tmp["qualifier"] = "concerns"
    rel.append(tmp[["from","to","qualifier"]])

    # Notification concerns Resubmission (optional)
    if "resubmissionId" in notifications.columns:
        tmp = notifications[["notificationId","resubmissionId"]].dropna()
        tmp["from"] = "NOT_" + tmp["notificationId"].astype(str)
        tmp["to"]   = "RES_" + tmp["resubmissionId"].astype(str)
        tmp["qualifier"] = "concerns"
        rel.append(tmp[["from","to","qualifier"]])

    return pd.concat(rel, ignore_index=True)


In [4]:
def build_events_df(events_raw: pd.DataFrame):
    # Required columns in events_raw:
    # eventId, activity, timestamp
    e = events_raw.copy()
    e["ocel:eid"] = e["eventId"].astype(str)
    e["ocel:activity"] = e["activity"].astype(str)
    e["ocel:timestamp"] = pd.to_datetime(e["timestamp"])
    # Keep other columns as event attributes
    keep = ["ocel:eid","ocel:activity","ocel:timestamp"] + [c for c in e.columns if c not in ["eventId","activity","timestamp"]]
    return e[keep]


In [5]:
def build_event_object_relations(events_raw: pd.DataFrame):
    rel = []

    def add_rel(col, prefix, qualifier):
        tmp = events_raw[["eventId", col]].dropna()
        tmp["ocel:eid"] = tmp["eventId"].astype(str)
        tmp["ocel:oid"] = prefix + tmp[col].astype(str)
        tmp["ocel:qualifier"] = qualifier
        rel.append(tmp[["ocel:eid","ocel:oid","ocel:qualifier"]])

    if "applicationId" in events_raw.columns:
        add_rel("applicationId", "APP_", "concerns")
    if "studentId" in events_raw.columns:
        add_rel("studentId", "STU_", "involves")
    if "parentId" in events_raw.columns:
        add_rel("parentId", "PAR_", "involves")
    if "documentId" in events_raw.columns:
        add_rel("documentId", "DOC_", "concerns")
    if "resubmissionId" in events_raw.columns:
        add_rel("resubmissionId", "RES_", "concerns")
    if "notificationId" in events_raw.columns:
        add_rel("notificationId", "NOT_", "emits")
    if "clerkId" in events_raw.columns:
        add_rel("clerkId", "CLK_", "performedBy")

    relations_df = pd.concat(rel, ignore_index=True) if rel else pd.DataFrame(columns=["ocel:eid","ocel:oid","ocel:qualifier"])
    return relations_df


In [6]:
def build_ocel(events_df, objects_df, relations_df):
    # PM4Py expects the three dataframes in OCEL naming scheme
    ocel = pm4py.ocel.importer.apply_from_dataframes(
        events_df=events_df,
        objects_df=objects_df,
        relations_df=relations_df
    )
    return ocel

def export_ocel(ocel, path="bafoeg_ocel.jsonocel"):
    pm4py.write_ocel(ocel, path)
    return path


In [7]:
def sanity_checks(events_df, objects_df, relations_df):
    assert events_df["ocel:eid"].is_unique, "Event IDs are not unique"
    assert objects_df["ocel:oid"].is_unique, "Object IDs are not unique"

    # orphan events = events with no related objects
    linked_eids = set(relations_df["ocel:eid"].unique())
    orphan_events = events_df[~events_df["ocel:eid"].isin(linked_eids)]
    print("Orphan events:", len(orphan_events))

    # orphan objects = objects never referenced by any event
    linked_oids = set(relations_df["ocel:oid"].unique())
    orphan_objects = objects_df[~objects_df["ocel:oid"].isin(linked_oids)]
    print("Orphan objects:", len(orphan_objects))

    # quick counts
    print("\nEvents:", len(events_df))
    print("Objects:", len(objects_df))
    print("Relations:", len(relations_df))
    print("\nObjects by type:\n", objects_df["ocel:type"].value_counts())


In [8]:
# --- Load instance data (UML objects) ---
applications = pd.read_csv("applications.csv")      # applicationId, status, amount, submittedAt, handledAt, studentId, clerkId
students     = pd.read_csv("students.csv")          # studentId, name
parents      = pd.read_csv("parents.csv")           # parentId, role
documents    = pd.read_csv("documents.csv")         # documentId, type, isComplete, uploadedAt, applicationId
resubmissions = pd.read_csv("resubmissions.csv")    # resubmissionId, type, status, requestedAt, receivedAt, applicationId, requestedFromType, requestedFromId
notifications = pd.read_csv("notifications.csv")    # notificationId, type, sentAt, applicationId, resubmissionId(optional)
clerks       = pd.read_csv("clerks.csv")            # clerkId, name, office

application_parents = pd.read_csv("application_parents.csv")             # applicationId, parentId
resubmission_documents = pd.read_csv("resubmission_documents.csv")       # resubmissionId, documentId

# --- Build OCEL objects ---
objects_df = build_objects_df(
    applications, students, parents, documents, resubmissions, notifications, clerks
)

# --- Load events (you must have at least: eventId, activity, timestamp) ---
events_raw = pd.read_csv("events.csv")  # eventId, activity, timestamp + optional columns: applicationId/studentId/...
events_df = build_events_df(events_raw)

# --- Build event-object relations from events foreign keys ---
relations_df = build_event_object_relations(events_raw)

# --- Sanity checks ---
sanity_checks(events_df, objects_df, relations_df)

# --- Build OCEL ---
ocel = build_ocel(events_df, objects_df, relations_df)

# --- Export ---
path = export_ocel(ocel, "bafoeg_from_uml.jsonocel")
print("Exported:", path)


FileNotFoundError: [Errno 2] No such file or directory: 'applications.csv'