# Model Development

This Notebook contains all information pertaining to the development of the models in the paper "Leveraging Temporal Graphs for Enhancing Transformer-based Predictive Process Monitoring" and was developed by Marc C. Hennig (mhennig@hm.edu). Requires the preprocessed event logs in the first file.

# Environment

Initialization of the environment, runnable in Google Colab and containing all dependency installations and global variables used in the paper.

## Dependency installation

### A. PIP Dependencies

In [None]:
# `import packaging` alone does not guarantee that the `packaging.version`
# submodule is loaded, so the submodule is imported explicitly.
import packaging.version
import torch

# Parsed version of the installed PyTorch build; its components are
# interpolated into the wheel index URLs of the install cell.
torch_ver = packaging.version.Version(torch.__version__)

In [None]:
# Install DGL and the PyTorch Geometric companion wheels from index pages that
# are selected via the components of `torch_ver`.
# NOTE(review): `torch_ver.local` is None when the torch version string has no
# local segment (e.g. no "+cu121" suffix) — confirm the URLs still resolve then.
!pip install dgl -f https://data.dgl.ai/wheels/torch-{torch_ver.major}.{torch_ver.minor}/{torch_ver.local}/repo.html
!pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-{torch_ver.major}.{torch_ver.minor}.{torch_ver.micro}+{torch_ver.local}.html
!pip install torch-sparse -f https://pytorch-geometric.com/whl/torch-{torch_ver.major}.{torch_ver.minor}.{torch_ver.micro}+{torch_ver.local}.html
!pip install torch-cluster -f https://pytorch-geometric.com/whl/torch-{torch_ver.major}.{torch_ver.minor}.{torch_ver.micro}+{torch_ver.local}.html
!pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-{torch_ver.major}.{torch_ver.minor}.{torch_ver.micro}+{torch_ver.local}.html
!pip install torch-geometric
# Remaining dependencies: debugger, process mining, word embeddings, HDF5 storage.
!pip install ipdb pm4py gensim h5py
# Record the resolved package versions of this runtime.
!pip freeze > requirements.txt

## Dependency Imports

In [None]:
# Python dependencies
import os
import sys
import re
import math
import datetime
import random
import copy
import json
import time
import shutil
import pickle
import warnings
import functools
from pathlib import Path
from typing import List, Tuple, Union, Optional, Literal, Callable, Dict
from collections import namedtuple
from enum import Enum

# Debugging
import ipdb
from tqdm.auto import tqdm, trange
import importlib

# Colab dependencies
from google.colab import files, drive

# Basic dependencies
import numpy as np
import pandas as pd

# Plotting dependencies
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

# Machine learning dependencies
import sklearn as sl
import sklearn.metrics

import torch
import torchdata
import torch_geometric as pyg

import dgl
import h5py
import hyperopt

# Process Mining dependencies
import pm4py

# NLP dependencies
import gensim

# Graph dependencies
import networkx as nx

## Variables & Global Settings

In [None]:
# Assign a random seed for reproducibility
RANDOM_STATE = 1337

# NOTE: PYTHONHASHSEED is read at interpreter start-up, so setting it here only
# affects subprocesses spawned from this kernel.
os.environ["PYTHONHASHSEED"] = str(RANDOM_STATE)
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)
# PyTorch has its own generator (weight initialisation, dropout, loader
# shuffling); it must be seeded as well for repeatable training runs.
torch.manual_seed(RANDOM_STATE)

# Keras backend
os.environ["KERAS_BACKEND"] = "tensorflow"

# Show all Pandas columns
pd.set_option("display.max_columns", None)

# Set Matplotlib and Seaborn color scheme
plt.rcParams["image.cmap"] = "Blues"
sns.set_palette("Blues")

In [None]:
# Google Drive folders
GDRIVE_INPUT_DIR = "/content/drive/My Drive/Colab Notebooks/TGN-AST/Eventlogs"
GDRIVE_OUTPUT_DIR = "/content/drive/My Drive/Colab Notebooks/TGN-AST/Results"

# Local Colab folders
UTIL_DIR = os.path.join(".", "Util")
DATA_DIR = os.path.join(".", "Data")
INPUT_DATA_DIR = os.path.join(DATA_DIR, "Input")
INPUT_DATA_BPIC2013_DIR = os.path.join(INPUT_DATA_DIR, "BPIC 2013")
INPUT_DATA_BPIC2014_DIR = os.path.join(INPUT_DATA_DIR, "BPIC 2014")
INTERIM_DATA_DIR = os.path.join(DATA_DIR, "Interim")
OUTPUT_DATA_DIR = os.path.join(DATA_DIR, "Output")
OUTPUT_LOG_DATA_DIR = os.path.join(OUTPUT_DATA_DIR, "Logs")

GRAPHIC_DIR = os.path.join(".", "Graphics")
MODEL_DIR = os.path.join(".", "Models")
MODEL_CHECKPOINT_DIR = os.path.join(MODEL_DIR, "Checkpoints")
MODEL_BACKUP_DIR = os.path.join(MODEL_DIR, "Backups")

# Create the working directories (parents are listed before their children).
# The input and util directories are deliberately not created here.
for _local_dir in (
    DATA_DIR,
    INTERIM_DATA_DIR,
    OUTPUT_DATA_DIR,
    OUTPUT_LOG_DATA_DIR,
    GRAPHIC_DIR,
    MODEL_DIR,
    MODEL_BACKUP_DIR,
    MODEL_CHECKPOINT_DIR,
):
    Path(_local_dir).mkdir(exist_ok=True)
del _local_dir

In [None]:
# Event log column names (used as default column arguments further below)
EVENTLOG_CASE = "case:concept:name"
EVENTLOG_ACTIVITY = "concept:name"
EVENTLOG_TIMESTAMP = "time:timestamp"
EVENTLOG_GROUP = "org:group"
EVENTLOG_RESOURCE = "org:resource"
# Column name prefixes
EVENTLOG_CASE_PREFIX = "case:"
EVENTLOG_LABEL_PREFIX = "label:"

# Special tokens; TOKEN_NA replaces missing attribute values.
# NOTE(review): TOKEN_PAD_NUM is presumably the padding value for numeric features — confirm.
TOKEN_PAD = "[PAD]"
TOKEN_PAD_NUM = -1.0
TOKEN_OOV = "[OOV]"
TOKEN_NA = "[NA]"
TOKEN_EOC = "[EOC]"

# Default optimisation settings
DEFAULT_LEARNING_RATE = 0.00003
DEFAULT_MIN_LEARNING_RATE = 1e-6

DEFAULT_WARMUP_EPOCHS = 10
DEFAULT_EPOCHS = 20
DEFAULT_BATCH_SIZE = 64

# Prediction target identifiers
TARGET_NEXT_ACTIVITY = "next_activity"
TARGET_NEXT_TIME = "next_time"
TARGET_REMAINING_TIME = "remaining_time"

# Default output names (same string values as the target identifiers above)
DEFAULT_REMAINING_TIME_OUTPUT = "remaining_time"
DEFAULT_NEXT_ACTIVITY_OUTPUT = "next_activity"
DEFAULT_NEXT_TIME_OUTPUT = "next_time"

## Data Import

### A: Import from Google Drive

In [None]:
# Mount Google Drive so its contents are reachable from the Colab runtime.
drive.mount("/content/drive")

# Recursively copy the Drive event log folder to the local input location.
!cp -r "$GDRIVE_INPUT_DIR" "$INPUT_DATA_DIR"

# Flush pending writes and detach Drive again.
drive.flush_and_unmount()

### B: Upload from Local Machine

In [None]:
#uploaded = files.upload()

#for filename in uploaded.keys():
#  target = os.path.join(INPUT_DATA_DIR, filename)
#  !mv "$filename" "$target"

#del uploaded

## Common Functions

### Model Comparison

In [None]:
def evaluate_regression(y_true: np.ndarray, y_pred: np.ndarray) -> Dict[str, float]:
  """Compute regression error metrics between targets and predictions.

  Both inputs are flattened to 1-D before scoring, so column vectors of shape
  ``(n, 1)`` are handled like plain vectors. (Taking ``argmax`` over the last
  axis, as done for class scores, would turn every ``(n, 1)`` input into zeros.)

  Args:
    y_true: Ground-truth values.
    y_pred: Predicted values with the same number of elements as ``y_true``.

  Returns:
    Dict with the keys ``mae``, ``mse``, ``rmse``, ``mape``, ``medae``,
    ``logcosh`` and ``max_error``.
  """
  y_true = np.asarray(y_true).reshape(-1)
  y_pred = np.asarray(y_pred).reshape(-1)

  def logcosh_error(y_true, y_pred):
    error = np.subtract(y_pred, y_true)
    # log(cosh(e)) = log((exp(e) + exp(-e)) / 2) = logaddexp(e, -e) - log(2).
    # logaddexp avoids the overflow of exp(e) for large absolute errors.
    return np.mean(np.logaddexp(error, -error) - np.log(2))

  return {
    'mae': sl.metrics.mean_absolute_error(y_true, y_pred),
    'mse': sl.metrics.mean_squared_error(y_true, y_pred),
    'rmse': sl.metrics.root_mean_squared_error(y_true, y_pred),
    'mape': sl.metrics.mean_absolute_percentage_error(y_true, y_pred),
    'medae': sl.metrics.median_absolute_error(y_true, y_pred),
    'logcosh': logcosh_error(y_true, y_pred),
    'max_error': sl.metrics.max_error(y_true, y_pred),
  }

def evaluate_classification(y_true: np.ndarray, y_pred: np.ndarray) -> Dict[str, float]:
  """Compute classification metrics between targets and predictions.

  Inputs with more than one dimension (one-hot labels, class scores) are
  reduced to class indices with ``argmax`` over axis 1.

  Returns:
    Dict with accuracy, balanced accuracy (plain and chance-adjusted) and the
    micro/macro/weighted variants of F1, precision and recall.
  """
  y_true = np.argmax(y_true, axis=1) if y_true.ndim > 1 else y_true
  y_pred = np.argmax(y_pred, axis=1) if y_pred.ndim > 1 else y_pred

  metrics = {
    'accuracy': sl.metrics.accuracy_score(y_true, y_pred),
    'accuracy_balanced': sl.metrics.balanced_accuracy_score(y_true, y_pred),
    'accuracy_balanced_adjusted': sl.metrics.balanced_accuracy_score(y_true, y_pred, adjusted=True),
  }

  # Same scorer family, three averaging modes each.
  averaged_scorers = (
    ('f1', sl.metrics.f1_score),
    ('precision', sl.metrics.precision_score),
    ('recall', sl.metrics.recall_score),
  )
  for prefix, scorer in averaged_scorers:
    for average in ('micro', 'macro', 'weighted'):
      metrics[f'{prefix}_{average}'] = scorer(y_true, y_pred, average=average)

  return metrics

### PROPHET Model

The following sections contain the implementation of the PROPHET model, mostly adapted from: https://github.com/vinspdb/PROPHET

In [None]:
class TextDataset(dgl.data.DGLDataset):
  """In-memory dataset pairing graphs with their labels by position.

  The base-class initializer is not invoked; only the two sequences are stored.
  """

  def __init__(self, X, y):
    self.X_act = X  # indexable collection of graphs
    self.Y = y      # labels aligned with X_act

  def __len__(self):
    return len(self.X_act)

  def __plotgraph__(self, data):
    """Draw ``data`` as a directed networkx graph and clear the figure afterwards."""
    nx_graph = pyg.utils.to_networkx(data, to_undirected=False)
    nx.draw(nx_graph, with_labels=True)
    plt.show()
    plt.clf()

  def __getitem__(self, idx):
    """Return the ``(graph, label)`` pair stored at position ``idx``."""
    return self.X_act[idx], self.Y[idx]

class TextDatasetOnDisk(dgl.data.DGLDataset):
  """Dataset that reads pickled ``{'graph', 'label'}`` records from an open HDF5 file.

  Items are addressed by the position of their key in the file's key listing
  (taken once at construction) and memoized in an LRU cache.

  Args:
    file: Open ``h5py.File`` holding one pickled record per key.
    cache_size: Maximum number of items kept in the LRU cache.
    auto_prefetch: If True, accessing an index that is a multiple of
      ``cache_size`` loads the following ``cache_size`` items into the cache.
  """

  def __init__(self, file: h5py.File, cache_size: int = 1024, auto_prefetch: bool = False):
    self.file = file
    self._keys = list(self.file.keys())
    self._cache_size = cache_size
    self._auto_prefetch = auto_prefetch

    self._max_idx = 0

    # Wrap the disk reader in an LRU cache bounded by the requested size.
    cache_decorator = functools.lru_cache(maxsize=self._cache_size)
    self._cached_get_item = cache_decorator(self._get_item_from_disk)

  def key_at(self, idx: int) -> str:
    """Return the HDF5 key stored at position ``idx``."""
    return self._keys[idx]

  def prefetch(self, indices):
    """Prefetch a list of indices into the cache"""
    for position in indices:
      self._cached_get_item(position)

  def clear_cache(self):
    """Clear the cache"""
    self._cached_get_item.cache_clear()

  def __len__(self):
    return len(self.file.keys())

  def __plotgraph__(self, data):
    """Draw ``data`` as a directed networkx graph and clear the figure afterwards."""
    nx_graph = pyg.utils.to_networkx(data, to_undirected=False)
    nx.draw(nx_graph, with_labels=True)
    plt.show()
    plt.clf()

  def __getitem__(self, idx: int):
    at_window_start = idx % self._cache_size == 0
    if self._auto_prefetch and at_window_start:
      window_end = min(idx + self._cache_size, len(self))
      self.prefetch(range(idx, window_end))
    return self._cached_get_item(idx)

  def _get_item_from_disk(self, idx: int):
    """Unpickle the record at position ``idx`` and return ``(graph, label)``."""
    # NOTE: pickle executes arbitrary code on load; only open trusted files.
    record = pickle.loads(self.file[self.key_at(idx)][()])
    return record['graph'], record['label']

In [None]:
class RGCN(torch.nn.Module):
  """Heterogeneous graph encoder built from per-relation GATv2 convolutions.

  Args:
    in_feats: Mapping from relation to the input feature size(s) passed to
      ``GATv2Conv(in_feats=...)``.
    params: Dict providing ``hidden_dim``, ``n_heads``, ``dropout`` and ``n_layers``.
    rel_names: Relations for which a convolution is created.
  """

  def __init__(self, in_feats, params, rel_names):
    super(RGCN, self).__init__()

    def hetero_gat(in_dim_of, num_heads):
      # One GATv2 layer per relation; relation outputs are summed per node type.
      return dgl.nn.pytorch.HeteroGraphConv(
        {
          rel: dgl.nn.pytorch.GATv2Conv(
            in_feats=in_dim_of(rel),
            out_feats=params['hidden_dim'],
            num_heads=num_heads,
            feat_drop=params['dropout'],
            share_weights=True,
            residual=True,
          )
          for rel in rel_names
        },
        aggregate='sum',
      )

    # Construction order (convs, conv_out, conv_out2) is kept so parameter
    # initialisation consumes the random stream in the same order.
    self.convs = torch.nn.ModuleList([
      hetero_gat(lambda rel: in_feats[rel], params['n_heads'])
      for i in range(params['n_layers'])
    ])
    # Projects the concatenated multi-head output down to hidden_dim.
    self.conv_out = hetero_gat(lambda rel: params['hidden_dim'] * params['n_heads'], 1)
    # Single-head layer applied directly to the inputs when there are no multi-head layers.
    self.conv_out2 = hetero_gat(lambda rel: in_feats[rel], 1)

  @staticmethod
  def _activate(h):
    """Apply ReLU and flatten the head dimension of every node-type tensor."""
    return {k: torch.reshape(torch.nn.functional.relu(v), (v.shape[0], -1)) for k, v in h.items()}

  def forward(self, graph, inputs):
    """Encode ``graph`` and return a dict of node-type embeddings.

    With ``n_layers == 0`` only ``conv_out2`` is applied. This check used to sit
    inside the layer loop, where it could never run, so a zero-layer model
    failed with ``UnboundLocalError``.
    """
    if len(self.convs) == 0:
      return self._activate(self.conv_out2(graph, inputs))

    # NOTE(review): every layer is fed the raw ``inputs`` and followed by
    # ``conv_out``; only the result of the last layer is returned. Kept as-is
    # to preserve existing results — confirm whether stacking was intended.
    for f in self.convs:
      h = self._activate(f(graph, inputs))
      h = self._activate(self.conv_out(graph, h))

    return h

In [None]:
class HeteroClassifier(torch.nn.Module):
  """RGCN encoder followed by a linear layer producing ``n_classes`` logits per graph."""

  def __init__(self, in_dim, params, n_classes, rel_names):
    super().__init__()

    self.rgcn = RGCN(in_dim, params, rel_names)
    self.classify = torch.nn.Linear(params['hidden_dim'], n_classes)

  def forward(self, graph, feat, edge_weight=None):
    """Return class logits for each graph in the batch (``edge_weight`` is unused)."""
    node_repr = self.rgcn(graph, feat)

    with graph.local_scope():
      graph.ndata['h'] = node_repr
      # Graph readout: sum node embeddings within each node type, then across types.
      pooled = sum(dgl.sum_nodes(graph, 'h', ntype=ntype) for ntype in graph.ntypes)
      return self.classify(pooled)

class HeteroRegressor(torch.nn.Module):
  """RGCN encoder followed by a linear layer producing one value per graph."""

  def __init__(self, in_dim, params, rel_names):
    super().__init__()

    self.rgcn = RGCN(in_dim, params, rel_names)
    # Attribute name kept as ``classify`` although the head has a single output.
    self.classify = torch.nn.Linear(params['hidden_dim'], 1)

  def forward(self, graph, feat, edge_weight=None):
    """Return one prediction per graph in the batch (``edge_weight`` is unused)."""
    node_repr = self.rgcn(graph, feat)

    with graph.local_scope():
      graph.ndata['h'] = node_repr
      # Graph readout: sum node embeddings within each node type, then across types.
      pooled = sum(dgl.sum_nodes(graph, 'h', ntype=ntype) for ntype in graph.ntypes)
      return self.classify(pooled)

### PROPHET Functions

In [None]:
class GenerateTrace:
    """Helpers that turn an event log into per-case traces and prefixes."""

    def __init__(self, eventlog):
        self._eventlog = eventlog
        # Last result of generate_prefix_trace; initialised so that get_act()
        # no longer raises AttributeError before any trace was generated.
        self.__act = None

    def generate_prefix_trace(self, log, view):
        """Group ``log`` by case and collect the values of column ``view`` as lists.

        Returns a DataFrame indexed by 'case:concept:name' (first-appearance
        order) with one column ``view`` holding a list per case. The result is
        also remembered and can be retrieved through :meth:`get_act`.
        """
        act = log.groupby('case:concept:name', sort=False).agg({view: lambda x: list(x)})
        self.__act = act
        return act

    def get_act(self):
        """Return the most recent result of generate_prefix_trace (None if never called)."""
        return self.__act

    def get_sequence(self, sequence):
        """Expand each trace in column 0 of ``sequence`` into prefixes and next elements.

        For a trace ``[a, b, c]`` the prefixes ``[a]`` and ``[a, b]`` are emitted
        with the labels ``b`` and ``c``. Traces shorter than two elements
        contribute nothing.

        Returns:
            Tuple ``(list_seq, list_label)`` of equal length.
        """
        list_seq = []
        list_label = []
        for i in range(len(sequence)):
            trace = sequence.iat[i, 0]
            prefix = []
            for j in range(len(trace) - 1):
                prefix.append(trace[j])
                list_seq.append(prefix.copy())
                list_label.append(trace[j + 1])
        return list_seq, list_label

    def get_sequence_num(self, sequence):
        """Return only the prefixes produced by :meth:`get_sequence`."""
        list_seq, _ = self.get_sequence(sequence)
        return list_seq

    @staticmethod
    def dataset_summary(log):
        """Print basic statistics of ``log`` (columns 'case' and 'activity').

        Returns:
            Tuple ``(max trace length, rounded mean trace length,
            number of cases, number of unique activities)``.
        """
        print("Activity Distribution\n", log['activity'].value_counts())
        n_caseid = log['case'].nunique()
        n_activity = log['activity'].nunique()
        print("Number of CaseID", n_caseid)
        print("Number of Unique Activities", n_activity)
        print("Number of Activities", log['activity'].count())
        # Events per case, i.e. the trace lengths.
        cont_trace = log['case'].value_counts(dropna=False)
        max_trace = max(cont_trace)
        mean = np.mean(cont_trace)
        print("Max lenght trace", max_trace)
        print("Mean lenght trace", mean)
        print("Min lenght trace", min(cont_trace))
        return max_trace,  int(round(mean)), n_caseid, n_activity

In [None]:
def prophet_create_triangular_matrix(columns: List[str]) -> List[Tuple[str, str, str]]:
    """Enumerate the upper-triangular column pairs as typed relations.

    Every pair ``(columns[i], columns[j])`` with ``i <= j`` yields a triple
    ``(source, relation, target)``; the relation is 'follow' when both names
    are equal and 'has' otherwise. Pairs are emitted row by row.
    """
    names = list(columns)
    return [
        (left, 'follow' if left == right else 'has', right)
        for offset, left in enumerate(names)
        for right in names[offset:]
    ]

def apply_w2v(list_act, enc_act, mean):
    """Look up an embedding for every token and stack the results.

    Args:
        list_act: Sequence of token lists. The final reshape to
            ``(len(list_act), mean)`` only works when each inner list
            contributes ``mean`` values in total.
        enc_act: Mapping from token to embedding vector.
        mean: Embedding size; unknown tokens map to a zero vector of this length.

    Returns:
        Array of shape ``(len(list_act), mean)``.
    """
    embedded = [
        [
            vector if (vector := enc_act.get(token)) is not None else np.zeros(shape=(mean,))
            for token in tokens
        ]
        for tokens in list_act
    ]
    stacked = np.array(embedded)
    return stacked.reshape(stacked.shape[0], mean)


def gen_flow(id):
    """Prefix a value sequence with 'START' and derive node data from it.

    The input list is left untouched. Inserting 'START' in place would leak the
    marker into the caller's data and accumulate on repeated calls.

    Args:
        id: List of attribute values of one prefix.

    Returns:
        Tuple ``(remove_dup, enc)``: ``remove_dup`` holds the distinct values in
        first-appearance order, each wrapped in a one-element list; ``enc`` is
        the label-encoded sequence (one integer per element, including 'START').
    """
    sequence = ['START', *id]
    remove_dup = [[a] for a in dict.fromkeys(sequence)]
    node_encoder = sl.preprocessing.LabelEncoder()
    # NOTE(review): LabelEncoder numbers the classes in sorted order, whereas
    # remove_dup is in first-appearance order — confirm callers expect this.
    enc = node_encoder.fit_transform(np.array(sequence))

    return remove_dup, enc

def clear_list(prefix_list):
    """Clean every element with ``replace_char`` and re-tokenize each prefix.

    Each prefix is joined with single spaces after cleaning and then split on
    single spaces again, yielding one token list per prefix.
    """
    return [
        ' '.join([replace_char(str(elem)) for elem in prefix]).split(' ')
        for prefix in prefix_list
    ]

def unique_edge(list1, list2):
    """Return the distinct ``(src, dst)`` pairs of the two lists in first-seen order."""
    # dict keys are unique and keep insertion order.
    return list(dict.fromkeys(zip(list1, list2)))

def gen_edge_weigts(list1, list2):
    """Count how often each ``(src, dst)`` pair occurs.

    Returns the counts as a list ordered by the first appearance of each pair.
    """
    occurrences = {}
    for edge in zip(list1, list2):
        occurrences[edge] = occurrences.get(edge, 0) + 1
    return list(occurrences.values())

def replace_char(ele):
  """Remove spaces and the characters ``- + _ . : ( )`` from ``ele``.

  The pattern is a character class. Without the brackets it describes one
  specific character sequence, which real values never contain, so nothing
  was being removed.
  """
  return re.sub(r"[ \-+_.:()]", "", ele)

def build_list_graphs(dict_view, dict_y, dict_enc, mean, c, event_attributes, case_attributes, relation, case_ids, start: int = 0, end = -1, append: bool = True):
    """Build one heterogeneous DGL graph per prefix and store it in an HDF5 file.

    Each graph is pickled together with its label as ``{'graph', 'label'}`` and
    written under the key ``"<case id>-<prefix length, 4 digits>"``. Keys already
    present in the file are skipped.

    Graphs are written straight to disk and not kept in memory. The previous
    in-memory list was never returned, and the debugger breakpoint guarding
    keys ending in "-0" could never trigger with a 4-digit suffix.

    Args:
        dict_view: Attribute name -> list of per-prefix value lists.
        dict_y: Labels indexed like the prefixes.
        dict_enc: Attribute name -> {token: embedding vector}.
        mean: Embedding size; unknown tokens become zero vectors of this length.
        c: File name of the HDF5 file inside ``OUTPUT_DATA_DIR``.
        event_attributes: Attributes that become node types with one node per distinct value.
        case_attributes: Attributes concatenated into the single 'trace_att' node.
        relation: Canonical edge types ``(src type, relation, dst type)`` with
            relation 'follow', 'has_ta' or anything else (treated as 'has').
        case_ids: Case id per prefix; must have one entry per prefix.
        start: First prefix index to process.
        end: Exclusive end index; -1 means "until the last prefix".
        append: Open the file in append mode ('a') instead of overwriting ('w').

    Raises:
        ValueError: If ``case_ids`` and the activity view differ in length.
    """
    if len(case_ids) != len(dict_view[EVENTLOG_ACTIVITY]):
      raise ValueError(f"Length {len(case_ids)} must be equal to {len(dict_view[EVENTLOG_ACTIVITY])}")

    end = len(case_ids) if end == -1 else min(len(case_ids), end)

    mode = 'a' if append else 'w'
    with h5py.File(os.path.join(OUTPUT_DATA_DIR, c), mode) as f:
        for k in (pbar := tqdm(range(start, end))):
            key = f"{case_ids[k]}-{len(dict_view[EVENTLOG_ACTIVITY][k]):04d}"
            pbar.set_description(key)
            if key in f:
                continue

            list_node = {}
            list_node_comp = {}
            list_node_feature = {}
            dgl_canonical_edge = {}
            weight_node_follow_node = {}
            for v in event_attributes:
                # Distinct values become nodes; the encoded sequence drives the edges.
                list_node[v], list_node_comp[v] = gen_flow(dict_view[v][k])
                list_node_feature[v] = apply_w2v(list_node[v], dict_enc[v], mean) #W2W

            list_att_trace = []
            for v in case_attributes:
                embed_vector = dict_enc[v].get(dict_view[v][k][0])
                res = embed_vector if embed_vector is not None else np.zeros(mean)
                list_att_trace.append(res)

            if list_att_trace:
                # All case attributes share one node whose feature is the concatenation.
                list_node_comp['trace_att'] = [0]
                list_node_feature['trace_att'] = np.array([np.concatenate(list_att_trace)])

            for rel in relation:
                if rel[1] == 'follow':
                    # Consecutive elements of the encoded sequence.
                    edge_res = np.array([[list_node_comp[rel[0]][i], list_node_comp[rel[0]][i + 1]] for i in range(len(list_node_comp[rel[0]]) - 1)])
                elif rel[1] == 'has_ta':
                    # Every distinct source node points to the single trace-attribute node 0.
                    list_node_comp[rel[2]] = [0]*len(np.unique(list_node_comp[rel[0]]))
                    edge_res = list(map(lambda X: [X[0], X[1]], list(zip(np.unique(list_node_comp[rel[0]]), list_node_comp[rel[2]]))))
                else:
                    # Position-wise pairing of two encoded sequences.
                    edge_res = list(map(lambda X: [X[0], X[1]], list(zip(list_node_comp[rel[0]], list_node_comp[rel[2]]))))

                src = [item[0] for item in edge_res]
                dst = [item[1] for item in edge_res]

                dgl_canonical_edge[rel] = unique_edge(src, dst)
                # Edge weight = number of occurrences of the pair.
                weight_node_follow_node[rel] = gen_edge_weigts(src, dst)

            hetero_graph = dgl.heterograph(dgl_canonical_edge)

            # Node features are stored under a key equal to the node type name.
            for nn in list_node_feature:
                hetero_graph.nodes[nn].data[nn] = torch.tensor(list_node_feature[nn], dtype=torch.float)

            for rel in weight_node_follow_node:
                hetero_graph.edata['h'] = {rel:torch.tensor(weight_node_follow_node[rel])}
            new_g = dgl.AddReverse(copy_edata=True)(hetero_graph)
            pickled_graph = pickle.dumps({'graph':new_g, 'label':dict_y[k]})

            f.create_dataset(key, data=np.void(pickled_graph))

In [None]:
def prophet_generate_dgl_graph(
    df_train: pd.DataFrame,
    df_train_dyn: pd.DataFrame,
    df_test: pd.DataFrame,
    df_test_dyn: pd.DataFrame,
    name: str,
    dynamic_attrs: List[str] = [],
    static_attrs: List[str] = [],
    word2vec_dim: int = 100,
    word2vec_epochs: int = 50,
    case_col: str = EVENTLOG_CASE,
    activity_col: str = EVENTLOG_ACTIVITY,
    resource_col: str = EVENTLOG_RESOURCE,
    label_col: str = "label:time:timestamp:last",
    elapsed_since_start_col: str = "time:timestamp:elapsedcycle:seconds",
    elapsed_since_last_col: str = "time:timestamp:elapsedprev:seconds",
    random_state: int = RANDOM_STATE,
  ):
  """Encode the event log attributes and write the PROPHET graph datasets.

  For every attribute, numeric columns are quantile-binned (the discretizer is
  fitted on train and test combined), values are converted to string tokens,
  and a Word2Vec model is trained on the training views. The resulting
  dictionaries and labels are pickled to ``OUTPUT_DATA_DIR`` as ``<name>_*.pkl``
  and the graphs are written to ``<name>_test.db`` / ``<name>_train.db``.

  Args:
    df_train, df_test: Event-level frames providing static attributes, labels and case ids.
    df_train_dyn, df_test_dyn: Frames whose dynamic attribute cells hold the
      per-prefix value sequences.
    name: Prefix of all generated file names.
    dynamic_attrs: Event attributes (node types with 'follow'/'has' relations).
    static_attrs: Case attributes (merged into the 'trace_att' node).
    word2vec_dim: Embedding size.
    word2vec_epochs: Number of Word2Vec training epochs.
    case_col, activity_col, resource_col, label_col: Column names.
    elapsed_since_start_col, elapsed_since_last_col: Currently unused.
    random_state: Seed for the discretizers and Word2Vec.
  """
  df_train = df_train.copy()
  df_test = df_test.copy()
  df_train_dyn = df_train_dyn.copy()
  df_test_dyn = df_test_dyn.copy()
  df_complete = pd.concat([df_train, df_test]).reset_index(drop=True)

  def _dump(obj, suffix: str) -> None:
    # Context manager so the handle is flushed and closed deterministically.
    with open(os.path.join(OUTPUT_DATA_DIR, f"{name}_{suffix}.pkl"), "wb") as fh:
      pickle.dump(obj, fh)

  dynamic_attrs_mat = prophet_create_triangular_matrix(dynamic_attrs)

  static_attrs_mat = []
  if len(static_attrs) > 0:
    static_attrs_mat = [(x, 'has_ta', 'trace_att') for x in dynamic_attrs]

  dict_card = {}
  dict_view_train = {}
  dict_view_test = {}
  dict_enc = {}

  relation = dynamic_attrs_mat + static_attrs_mat

  # Number of quantile bins for numeric attributes.
  if resource_col in dynamic_attrs:
    n_bin = (df_complete[activity_col].nunique() + df_complete[resource_col].nunique()) // 2
  else:
    n_bin = df_complete[activity_col].nunique()

  for attr in tqdm(static_attrs):
    print(attr)
    if pd.api.types.is_numeric_dtype(df_complete[attr]):
      discretizer = sl.preprocessing.KBinsDiscretizer(n_bins=n_bin, encode='ordinal', strategy='quantile', random_state=random_state)
      discretizer.fit(df_complete[attr].dropna().unique().reshape(-1, 1))
      df_train[attr] = df_train[attr].map(lambda x: np.squeeze(discretizer.transform([[x]])), na_action='ignore')
      df_test[attr] = df_test[attr].map(lambda x: np.squeeze(discretizer.transform([[x]])), na_action='ignore')

    df_train[attr] = df_train[attr].astype('string').fillna(TOKEN_NA)
    df_test[attr] = df_test[attr].astype('string').fillna(TOKEN_NA)
    df_complete = pd.concat([df_train, df_test]).reset_index(drop=True)

    dict_card[attr] = list(df_complete[attr].unique())

    # Lower-triangular expansion: row i keeps the first i + 1 values of the case.
    dict_view_train[attr] = []
    for l in df_train.groupby(case_col).agg({attr: lambda x: list(x)}).to_numpy().tolist():
      prefix = np.tril(np.repeat(l, len(l[0]), axis=0)).tolist()
      prefix = [list(filter(lambda x: not pd.isna(x) and x != "", l)) for l in prefix]
      dict_view_train[attr].extend(prefix)

    dict_view_test[attr] = []
    for l in df_test.groupby(case_col).agg({attr: lambda x: list(x)}).to_numpy().tolist():
      prefix = np.tril(np.repeat(l, len(l[0]), axis=0)).tolist()
      prefix = [list(filter(lambda x: not pd.isna(x) and x != "", l)) for l in prefix]
      dict_view_test[attr].extend(prefix)

    word2vec = gensim.models.Word2Vec(vector_size=word2vec_dim, min_count=1, sg=0, workers=1, seed=random_state)
    word2vec.build_vocab(dict_view_train[attr], min_count=1)
    # Honour the word2vec_epochs argument (its default equals the former literal 50).
    word2vec.train(dict_view_train[attr], total_examples=word2vec.corpus_count, epochs=word2vec_epochs)

    dict_enc[attr] = {}
    for word in word2vec.wv.index_to_key:
      dict_enc[attr][word] = word2vec.wv.get_vector(word).tolist()

  for attr in tqdm(dynamic_attrs):
    print(attr)
    if pd.api.types.is_numeric_dtype(df_complete[attr]):
      discretizer = sl.preprocessing.KBinsDiscretizer(n_bins=n_bin, encode='ordinal', strategy='quantile', random_state=random_state)
      discretizer.fit(df_complete[attr].dropna().unique().reshape(-1, 1))

      df_train[attr] = df_train[attr].map(lambda x: np.squeeze(discretizer.transform([[x]])), na_action='ignore')
      df_test[attr] = df_test[attr].map(lambda x: np.squeeze(discretizer.transform([[x]])), na_action='ignore')
      df_complete = pd.concat([df_train, df_test]).reset_index(drop=True)

      df_train_dyn[attr] = df_train_dyn[attr].map(lambda x: np.squeeze(discretizer.transform([[x]])), na_action='ignore')
      df_test_dyn[attr] = df_test_dyn[attr].map(lambda x: np.squeeze(discretizer.transform([[x]])), na_action='ignore')

    df_train[attr] = df_train[attr].astype('string').fillna(TOKEN_NA)
    df_test[attr] = df_test[attr].astype('string').fillna(TOKEN_NA)

    dict_card[attr] = list(df_complete[attr].unique())
    dict_card[attr].insert(0, 'START')

    dict_view_train[attr] = df_train_dyn[attr].to_numpy().tolist()
    dict_view_train[attr] = [list(filter(lambda x: x is not None and not pd.isna(x), l)) for l in dict_view_train[attr]]

    dict_view_test[attr] = df_test_dyn[attr].to_numpy().tolist()
    dict_view_test[attr] = [list(filter(lambda x: x is not None and not pd.isna(x), l)) for l in dict_view_test[attr]]

    word2vec = gensim.models.Word2Vec(vector_size=word2vec_dim, min_count=1, sg=0, workers=1, seed=random_state)
    word2vec.build_vocab(dict_view_train[attr], min_count=1)
    word2vec.train(dict_view_train[attr], total_examples=word2vec.corpus_count, epochs=word2vec_epochs)

    dict_enc[attr] = {}
    for word in word2vec.wv.index_to_key:
      dict_enc[attr][word] = word2vec.wv.get_vector(word).tolist()

  y_train = df_train[label_col].to_numpy()
  y_test = df_test[label_col].to_numpy()

  _dump(dict_card, "card")
  _dump(dict_enc, "enc")
  _dump(dict_view_train, "view_train")
  _dump(dict_view_test, "view_test")
  _dump(y_train, "train_y")
  _dump(y_test, "test_y")

  build_list_graphs(dict_view_test, y_test, dict_enc, word2vec_dim, f'{name}_test.db', dynamic_attrs, static_attrs, relation, df_test[case_col].astype('string').values)
  build_list_graphs(dict_view_train, y_train, dict_enc, word2vec_dim, f'{name}_train.db', dynamic_attrs, static_attrs, relation, df_train[case_col].astype('string').values)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def train_fn(model, X_train_batch, y_train_batch, optimizer, criterion, pred_type: Literal['regression', 'classification'] = 'regression'):
    """Run one optimisation step on a single batch and return its loss.

    Args:
        model: Module called as ``model(graph, features, edge_weight)``.
        X_train_batch: Batched graph exposing ``to``, ``ntypes`` and ``ndata``.
        y_train_batch: Target tensor of the batch.
        optimizer: Optimizer holding the model parameters.
        criterion: Loss called as ``criterion(prediction, target)``.
        pred_type: Predictions and targets are flattened when this contains 'regression'.

    Returns:
        The batch loss as a Python float.
    """
    model.train()
    graph_batch = X_train_batch.to(device)
    targets = y_train_batch.to(device)

    optimizer.zero_grad()
    # Node features are stored under a key equal to the node type name.
    node_features = {ntype: graph_batch.ndata[ntype][ntype] for ntype in graph_batch.ntypes}
    predictions = model(graph_batch, node_features, None)

    if 'regression' in pred_type:
        predictions = torch.flatten(predictions)
        targets = torch.flatten(targets)

    loss = criterion(predictions, targets)
    loss.backward()
    optimizer.step()
    return loss.item()

def evaluate_fn(model, data_loader, criterion, device, pred_type: Literal['regression', 'classification'] = 'regression'):
    """Evaluate ``model`` on ``data_loader`` and return ``(mean batch loss, metrics)``.

    Side effects: the collected targets and predictions are published as the
    module-level globals ``y_true`` / ``y_pred`` and saved as ``y_true.npy`` /
    ``y_pred.npy`` in ``OUTPUT_DATA_DIR``. For classification the stored
    predictions are softmax probabilities; the loss is computed on the raw output.
    """
    global y_pred
    global y_true

    model.eval()
    total_loss = 0
    y_pred = []
    y_true = []
    is_regression = 'regression' == pred_type
    is_classification = 'classification' == pred_type

    with torch.no_grad():
        for graph_batch, targets in tqdm(data_loader):
            graph_batch = graph_batch.to(device)
            targets = targets.to(device)
            # Node features are stored under a key equal to the node type name.
            node_features = {ntype: graph_batch.ndata[ntype][ntype] for ntype in graph_batch.ntypes}

            outputs = model(graph_batch, node_features)

            if is_regression:
                outputs = torch.flatten(outputs)
                targets = torch.flatten(targets)

            batch_loss = criterion(outputs, targets)

            if is_classification:
                outputs = torch.nn.functional.softmax(outputs, dim=-1)

            y_pred.append(outputs.detach().cpu().numpy())
            y_true.append(targets.detach().cpu().numpy())

            total_loss += batch_loss.item()

    y_true = np.concatenate(y_true)
    y_pred = np.concatenate(y_pred)

    if is_regression:
        metrics = evaluate_regression(y_true, y_pred)
    elif is_classification:
        metrics = evaluate_classification(y_true, y_pred)
    else:
        metrics = {}

    np.save(os.path.join(OUTPUT_DATA_DIR, "y_true.npy"), y_true, allow_pickle=False)
    np.save(os.path.join(OUTPUT_DATA_DIR, "y_pred.npy"), y_pred, allow_pickle=False)

    return total_loss / len(data_loader), metrics

def train_gnn(model, train_data_loader, valid_data_loader, optimizer, epochs = 200, pred_type: Literal['regression', 'classification'] = 'regression'):
    """Train ``model`` with early stopping and return ``(best validation loss, best model)``.

    The loss is L1 for regression and cross-entropy for classification. The
    learning rate is reduced on validation-loss plateaus, and training stops
    after 10 consecutive epochs without improvement. The returned model is a
    deep copy taken at the best epoch.

    Raises:
        ValueError: If ``pred_type`` names neither regression nor classification.
    """
    max_stale_epochs = 10
    stale_epochs = 0
    best_valid_loss = float("inf")
    best_model = None
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')

    if 'regression' in pred_type:
        criterion = torch.nn.L1Loss()
    elif 'classification' in pred_type:
        criterion = torch.nn.CrossEntropyLoss()
    else:
        raise ValueError(f"Invalid prediction type {pred_type}")

    for epoch in range(epochs):
        running_loss = 0
        progress = tqdm(train_data_loader)
        for x_batch, y_batch in progress:
            step_loss = train_fn(model.to(device), x_batch, y_batch, optimizer, criterion, pred_type)
            running_loss += step_loss
            progress.set_description(f"Epoch Loss {epoch}: {step_loss:.4f}")

        avg_train_loss = running_loss / len(train_data_loader)
        valid_loss, metrics = evaluate_fn(model.to(device), valid_data_loader, criterion, device, pred_type)
        scheduler.step(valid_loss)

        improved = valid_loss < best_valid_loss
        if improved:
            # New best epoch: snapshot the model and restart the patience window.
            best_valid_loss = valid_loss
            stale_epochs = 0
            best_model = copy.deepcopy(model)
        else:
            stale_epochs += 1

        print(f"Epoch {epoch + 1}/{epochs} - Train Loss: {avg_train_loss:.4f} - Val Loss: {valid_loss:.4f}", end=' ')
        print(metrics)
        if stale_epochs >= max_stale_epochs:
            print("Validation loss hasn't improved for", max_stale_epochs, "epochs. Early stopping...")
            break

    return best_valid_loss, best_model


def get_model(params, train_loader, pred_type: Literal['regression', 'classification'] = 'regression', n_classes=0):
    """Instantiate the regressor or classifier sized from one batch of ``train_loader``.

    The feature widths of the source and destination node types of every
    canonical edge type are read from the first batch and passed to the model
    together with the edge types.

    Raises:
        ValueError: If ``pred_type`` is neither 'regression' nor 'classification'.
    """
    sample_graph, _ = next(iter(train_loader))

    def feature_width(node_type):
        # Features are stored under a key equal to the node type name.
        return sample_graph.nodes[node_type].data[node_type].shape[1]

    f_map = {
        etype: (feature_width(etype[0]), feature_width(etype[2]))
        for etype in sample_graph.canonical_etypes
    }

    if 'regression' == pred_type:
        return HeteroRegressor(f_map, params, sample_graph.canonical_etypes).to(device)
    if 'classification' == pred_type:
        return HeteroClassifier(f_map, params, n_classes, sample_graph.canonical_etypes).to(device)
    raise ValueError(f"Invalid prediction type {pred_type}")

def load_graphs_from_hdf5(filename):
    """Load every pickled ``{'graph', 'label'}`` record of an HDF5 file into memory.

    Returns:
        Tuple ``(graphs, labels)`` in the file's key iteration order.
    """
    graphs, label = [], []
    with h5py.File(filename, 'r') as f:
        for key in tqdm(f.keys()):
            # NOTE: pickle executes arbitrary code on load; only open trusted files.
            record = pickle.loads(f[key][()])
            graphs.append(record['graph'])
            label.append(record['label'])
    return graphs, label

def _prophet_fit(name: str, pred_type: str, n_classes: int = 0):
    """Shared training routine behind prophet_train and prophet_train_classification.

    Loads ``<name>_train.db`` and ``<name>_test.db`` from ``OUTPUT_DATA_DIR``
    fully into memory, builds the model for ``pred_type`` and trains it for
    ``DEFAULT_EPOCHS`` epochs.

    NOTE(review): the test split is passed to train_gnn as the validation
    loader, so early stopping and model selection use test data — confirm this
    is intended.
    """
    # Load graphs
    print("Loading training split...")
    X_train, y_train = load_graphs_from_hdf5(os.path.join(OUTPUT_DATA_DIR, f"{name}_train.db"))
    df_train = TextDataset(X_train, y_train)
    print("Loading test split...")
    X_val, y_val = load_graphs_from_hdf5(os.path.join(OUTPUT_DATA_DIR, f"{name}_test.db"))
    df_test = TextDataset(X_val, y_val)

    train_loader = dgl.dataloading.GraphDataLoader(df_train, batch_size=64, drop_last=False, shuffle=True)
    test_loader = dgl.dataloading.GraphDataLoader(df_test, batch_size=128, drop_last=False, shuffle=False)

    params = {
      'hidden_dim': 64,
      'dropout': 0.1,
      'n_heads': 2,
      'n_layers': 4,
      'learning_rate': 0.001,
    }

    model = get_model(params, train_loader, pred_type, n_classes)
    optimizer = torch.optim.Adam(model.parameters(), lr=params['learning_rate'])

    score, best_model = train_gnn(model, train_loader, test_loader, optimizer, DEFAULT_EPOCHS, pred_type)

    return score, best_model

def prophet_train(name: str):
    """Train the PROPHET regressor on the graph files of ``name``.

    Returns:
        Tuple ``(best validation loss, best model)``.
    """
    return _prophet_fit(name, 'regression')

def prophet_train_classification(name: str, n_classes: int):
    """Train the PROPHET classifier with ``n_classes`` outputs on the graph files of ``name``.

    Returns:
        Tuple ``(best validation loss, best model)``.
    """
    return _prophet_fit(name, 'classification', n_classes)

def prophet_predict(model, graphs, label, pred_type: Literal['regression', 'classification'] = 'regression'):
  """Run a trained model over a set of graphs and collect its predictions.

  Args:
    model: Trained network, called as ``model(batched_graph, features)``.
    graphs: Graphs to score.
    label: Ground-truth targets belonging to ``graphs``.
    pred_type: ``'classification'`` applies a softmax over the last axis of
      every batch output; ``'regression'`` flattens the stacked predictions.

  Returns:
    ``(y_true, y_pred)`` as NumPy arrays; ``y_true`` is always flattened.
  """
  is_classification = 'classification' == pred_type
  batches = dgl.dataloading.GraphDataLoader(
      TextDataset(graphs, label), batch_size=128, drop_last=False, shuffle=False)

  pred_chunks, true_chunks = [], []

  model.eval()
  with torch.no_grad():
    for batch_graph, batch_y in tqdm(batches):
      batch_graph = batch_graph.to(device)
      batch_y = batch_y.to(device)
      # The input feature of a node type is looked up under that type's own name.
      node_feats = {ntype: batch_graph.ndata[ntype][ntype] for ntype in batch_graph.ntypes}

      out = model(batch_graph, node_feats)
      if is_classification:
        out = torch.nn.functional.softmax(out, dim=-1)

      pred_chunks.append(out.detach().cpu().numpy())
      true_chunks.append(batch_y.detach().cpu().numpy())

  y_pred = np.concatenate(pred_chunks)
  if 'regression' == pred_type:
    y_pred = y_pred.flatten()
  y_true = np.concatenate(true_chunks).flatten()

  return y_true, y_pred

In [None]:
def prophet_replace_target(
    file_path: str,
    targets: list,
    name: str = "",
  ):
  """Overwrite the stored label of every graph record in an HDF5 file, in place.

  Every dataset in the file is unpickled, its ``'label'`` entry is replaced by
  the matching target, and the record is pickled and stored again under the
  same key.

  Args:
    file_path: HDF5 file to modify (opened in ``'r+'`` mode).
    targets: New labels, one per dataset, paired with the file's keys by position.
    name: Unused; kept so existing callers stay valid.

  Raises:
    ValueError: If the number of targets differs from the number of records.
      The check runs before anything is written, so a mismatch never leaves
      the file partially updated.
  """
  targets = list(targets)
  with h5py.File(file_path, 'r+') as f:
    keys = list(f.keys())
    if len(keys) != len(targets):
      raise ValueError(
        f"{file_path} holds {len(keys)} graphs but {len(targets)} targets were given"
      )

    # NOTE(review): pairing is purely positional over f.keys(). Confirm that this
    # iteration order matches the row order of `targets` (h5py does not iterate
    # in insertion order unless track_order is enabled).
    for key, target in tqdm(zip(keys, targets, strict=True), total=len(keys)):
      record = pickle.loads(f[key][()])
      record['label'] = target
      payload = pickle.dumps(record)

      # Replace the dataset wholesale: drop the old entry, then store the
      # re-pickled record under the same key.
      del f[key]
      f.create_dataset(key, data=np.void(payload))

# Dataset: Incident Management Process Enriched Event Log

## Input Preparation

In [None]:
df_servicenow = pd.read_feather(os.path.join(INPUT_DATA_DIR, "incident_event_log_labeled.feather"))
df_servicenow

In [None]:
df_servicenow_train = pd.read_feather(os.path.join(INPUT_DATA_DIR, "CAiSE 2025", "incident_event_log_train.feather"))
df_servicenow_train

In [None]:
df_servicenow_train_dyn = pd.read_feather(os.path.join(INPUT_DATA_DIR, "CAiSE 2025", "incident_event_log_train_dyn.feather"))
df_servicenow_train_dyn

In [None]:
df_servicenow_test = pd.read_feather(os.path.join(INPUT_DATA_DIR, "CAiSE 2025", "incident_event_log_test.feather"))
df_servicenow_test

In [None]:
df_servicenow_test_dyn = pd.read_feather(os.path.join(INPUT_DATA_DIR, "CAiSE 2025", "incident_event_log_test_dyn.feather"))
df_servicenow_test_dyn

In [None]:
PROJECT_NAME = "servicenow"
ACTIVITY_VOCAB = df_servicenow[EVENTLOG_ACTIVITY].unique().tolist() + [TOKEN_EOC]

## Prepare PROPHET Graph

### Remaining Time Prediction

In [None]:
prophet_generate_dgl_graph(
    df_servicenow_train,
    df_servicenow_train_dyn,
    df_servicenow_test,
    df_servicenow_test_dyn,
    dynamic_attrs=[EVENTLOG_ACTIVITY, EVENTLOG_RESOURCE, EVENTLOG_GROUP, "category", "subcategory", "location", "u_symptom", "caller_id", "sys_updated_by", "contact_type", "sys_mod_count", "u_priority_confirmation", "knowledge", "priority", "reopen_count", "reassignment_count", "time:timestamp:elapsedprev:seconds", "time:timestamp:elapsedcycle:seconds"],
    static_attrs=["case:notify", "case:opened_by", "case:sys_created_by"],
    name=PROJECT_NAME
)

In [None]:
!gzip -c ./Data/Output/servicenow_test.db > ./Data/Output/servicenow_test.db.gz

In [None]:
!gzip -c ./Data/Output/servicenow_train.db > ./servicenow_train.db.gz

### Next Time Prediction

In [None]:
new_train_file_path = os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_next_time_train.db")
new_test_file_path = os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_next_time_test.db")

In [None]:
!gunzip ./Data/Input/italy_train.db.gz -c > "$new_train_file_path"
!gunzip ./Data/Input/italy_test.db.gz -c > "$new_test_file_path"

In [None]:
prophet_replace_target(new_train_file_path, df_servicenow_train['label:time:timestamp:next'].to_numpy())
!gzip -c "$new_train_file_path" > "$new_train_file_path".gz

In [None]:
prophet_replace_target(new_test_file_path, df_servicenow_test['label:time:timestamp:next'].to_numpy())
!gzip -c "$new_test_file_path" > "$new_test_file_path".gz

### Next Activity Prediction

In [None]:
new_train_file_path = os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_next_activity_train.db")
new_test_file_path = os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_next_activity_test.db")

In [None]:
!gunzip ./Data/Input/italy_train.db.gz -c > "$new_train_file_path"
!gunzip ./Data/Input/italy_test.db.gz -c > "$new_test_file_path"

In [None]:
prophet_replace_target(new_train_file_path, df_servicenow_train['label:concept:name:next'].to_numpy(dtype=int))
!gzip -c "$new_train_file_path" > "$new_train_file_path".gz

In [None]:
prophet_replace_target(new_test_file_path, df_servicenow_test['label:concept:name:next'].to_numpy(dtype=int))
!gzip -c "$new_test_file_path" > "$new_test_file_path".gz

## Train PROPHET Model

### Remaining Time Prediction

In [None]:
%pdb off
best_loss, best_model = prophet_train(PROJECT_NAME)

In [None]:
graphs, labels = load_graphs_from_hdf5(os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_test.db"))
y_true, y_pred = prophet_predict(best_model, graphs, labels)

In [None]:
np.save(f"{PROJECT_NAME}_prophet_pred.npy", y_pred, allow_pickle=False)
np.save(f"{PROJECT_NAME}_prophet_true.npy", y_true, allow_pickle=False)

### Next Activity Prediction

In [None]:
!gunzip ./Data/Input/servicenow_next_activity_train.db.gz -c > ./Data/Output/servicenow_train.db
!gunzip ./Data/Input/servicenow_next_activity_test.db.gz -c > ./Data/Output/servicenow_test.db

In [None]:
%pdb off
best_loss, best_model = prophet_train_classification(PROJECT_NAME, n_classes=len(ACTIVITY_VOCAB))

In [None]:
graphs, labels = load_graphs_from_hdf5(os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_test.db"))
# The model trained above is a classifier: request softmax class scores
# (one row per graph) instead of the flattened regression output.
y_true, y_pred = prophet_predict(best_model, graphs, labels, 'classification')

In [None]:
np.save(f"{PROJECT_NAME}_prophet_pred.npy", y_pred, allow_pickle=False)
np.save(f"{PROJECT_NAME}_prophet_true.npy", y_true, allow_pickle=False)

In [None]:
files.download(f"{PROJECT_NAME}_prophet_pred.npy")
files.download(f"{PROJECT_NAME}_prophet_true.npy")

### Next Time Prediction

In [None]:
!gunzip ./Data/Input/servicenow_next_time_train.db.gz -c > ./Data/Output/servicenow_train.db
!gunzip ./Data/Input/servicenow_next_time_test.db.gz -c > ./Data/Output/servicenow_test.db

In [None]:
%pdb off
best_loss, best_model = prophet_train(PROJECT_NAME)

In [None]:
graphs, labels = load_graphs_from_hdf5(os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_test.db"))
y_true, y_pred = prophet_predict(best_model, graphs, labels)

In [None]:
np.save(f"{PROJECT_NAME}_prophet_pred.npy", y_pred, allow_pickle=False)
np.save(f"{PROJECT_NAME}_prophet_true.npy", y_true, allow_pickle=False)

In [None]:
files.download(f"{PROJECT_NAME}_prophet_pred.npy")
files.download(f"{PROJECT_NAME}_prophet_true.npy")

# Dataset: Help Desk Log of an Italian Company

## Input Preparation

In [None]:
df_italy = pd.read_feather(os.path.join(INPUT_DATA_DIR, "finale_labeled.feather"))
df_italy

In [None]:
df_italy_train = pd.read_feather(os.path.join(INPUT_DATA_DIR, "CAiSE 2025", "Splits", "Italy", "finale_train.feather"))
df_italy_train

In [None]:
df_italy_test = pd.read_feather(os.path.join(INPUT_DATA_DIR, "CAiSE 2025", "Splits", "Italy", "finale_test.feather"))
df_italy_test

In [None]:
df_italy_train_dyn = pd.read_feather(os.path.join(INPUT_DATA_DIR, "CAiSE 2025", "finale_train_dyn.feather"))
df_italy_train_dyn

In [None]:
df_italy_test_dyn = pd.read_feather(os.path.join(INPUT_DATA_DIR, "CAiSE 2025", "finale_test_dyn.feather"))
df_italy_test_dyn

In [None]:
PROJECT_NAME = "italy"
ACTIVITY_VOCAB = df_italy[EVENTLOG_ACTIVITY].unique().tolist() + [TOKEN_EOC]

## Prepare PROPHET Graph

### Remaining Time Prediction

In [None]:
prophet_generate_dgl_graph(
    df_italy_train,
    df_italy_train_dyn,
    df_italy_test,
    df_italy_test_dyn,
    dynamic_attrs=[EVENTLOG_ACTIVITY, EVENTLOG_RESOURCE, EVENTLOG_GROUP, "customer", "product", "service_type", "seriousness_2", "service_level", "time:timestamp:elapsedprev:seconds", "time:timestamp:elapsedcycle:seconds"],
    static_attrs=["case:responsible_section", "case:support_section"],
    name=PROJECT_NAME
)

In [None]:
!gzip -c ./Data/Output/italy_test.db > ./Data/Output/italy_test.db.gz

In [None]:
!gzip -c ./Data/Output/italy_train.db > ./Data/Output/italy_train.db.gz

### Next Time Prediction

In [None]:
new_train_file_path = os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_next_time_train.db")
new_test_file_path = os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_next_time_test.db")

In [None]:
!gunzip ./Data/Input/italy_train.db.gz -c > "$new_train_file_path"
!gunzip ./Data/Input/italy_test.db.gz -c > "$new_test_file_path"

In [None]:
prophet_replace_target(new_train_file_path, df_italy_train['label:time:timestamp:next'].to_numpy())
!gzip -c "$new_train_file_path" > "$new_train_file_path".gz

In [None]:
prophet_replace_target(new_test_file_path, df_italy_test['label:time:timestamp:next'].to_numpy())
!gzip -c "$new_test_file_path" > "$new_test_file_path".gz

### Next Activity Prediction

In [None]:
new_train_file_path = os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_next_activity_train.db")
new_test_file_path = os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_next_activity_test.db")

In [None]:
!gunzip ./Data/Input/italy_train.db.gz -c > "$new_train_file_path"
!gunzip ./Data/Input/italy_test.db.gz -c > "$new_test_file_path"

In [None]:
prophet_replace_target(new_train_file_path, df_italy_train['label:concept:name:next'].to_numpy(dtype=int))
!gzip -c "$new_train_file_path" > "$new_train_file_path".gz

In [None]:
prophet_replace_target(new_test_file_path, df_italy_test['label:concept:name:next'].to_numpy(dtype=int))
!gzip -c "$new_test_file_path" > "$new_test_file_path".gz

## Train PROPHET Model

### Remaining Time Prediction

In [None]:
%pdb off
best_loss, best_model = prophet_train(PROJECT_NAME)

In [None]:
graphs, labels = load_graphs_from_hdf5(os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_test.db"))
y_true, y_pred = prophet_predict(best_model, graphs, labels)

In [None]:
np.save(f"{PROJECT_NAME}_prophet_pred.npy", y_pred, allow_pickle=False)
np.save(f"{PROJECT_NAME}_prophet_true.npy", y_true, allow_pickle=False)

### Next Activity Prediction

In [None]:
!gunzip ./Data/Input/italy_next_activity_train.db.gz -c > ./Data/Output/italy_train.db
!gunzip ./Data/Input/italy_next_activity_test.db.gz -c > ./Data/Output/italy_test.db

In [None]:
%pdb off
best_loss, best_model = prophet_train_classification(PROJECT_NAME, n_classes=len(ACTIVITY_VOCAB))

In [None]:
graphs, labels = load_graphs_from_hdf5(os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_test.db"))
# The model trained above is a classifier: request softmax class scores
# (one row per graph) instead of the flattened regression output.
y_true, y_pred = prophet_predict(best_model, graphs, labels, 'classification')

In [None]:
np.save(f"{PROJECT_NAME}_prophet_pred.npy", y_pred, allow_pickle=False)
np.save(f"{PROJECT_NAME}_prophet_true.npy", y_true, allow_pickle=False)

In [None]:
files.download(f"{PROJECT_NAME}_prophet_pred.npy")
files.download(f"{PROJECT_NAME}_prophet_true.npy")

### Next Time Prediction

In [None]:
!gunzip ./Data/Input/italy_next_time_train.db.gz -c > ./Data/Output/italy_train.db
!gunzip ./Data/Input/italy_next_time_test.db.gz -c > ./Data/Output/italy_test.db

In [None]:
%pdb off
best_loss, best_model = prophet_train(PROJECT_NAME)

In [None]:
graphs, labels = load_graphs_from_hdf5(os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_test.db"))
y_true, y_pred = prophet_predict(best_model, graphs, labels)

In [None]:
np.save(f"{PROJECT_NAME}_prophet_pred.npy", y_pred, allow_pickle=False)
np.save(f"{PROJECT_NAME}_prophet_true.npy", y_true, allow_pickle=False)

In [None]:
files.download(f"{PROJECT_NAME}_prophet_pred.npy")
files.download(f"{PROJECT_NAME}_prophet_true.npy")

# Dataset: BPIC 2014

## Input Preparation

In [None]:
df_bpic14 = pd.read_feather(os.path.join(INPUT_DATA_DIR, "Detail_Incident_Activity_labeled.feather"))
df_bpic14

In [None]:
df_bpic14_train = pd.read_feather(os.path.join(INPUT_DATA_DIR, "CAiSE 2025", "Detail_Incident_Activity_train.feather"))
df_bpic14_train

In [None]:
df_bpic14_train_dyn = pd.read_feather(os.path.join(INPUT_DATA_DIR, "CAiSE 2025", "Detail_Incident_Activity_train_dyn.feather"))
df_bpic14_train_dyn

In [None]:
df_bpic14_test = pd.read_feather(os.path.join(INPUT_DATA_DIR, "CAiSE 2025", "Detail_Incident_Activity_test.feather"))
df_bpic14_test

In [None]:
df_bpic14_test_dyn = pd.read_feather(os.path.join(INPUT_DATA_DIR, "CAiSE 2025", "Detail_Incident_Activity_test_dyn.feather"))
df_bpic14_test_dyn

In [None]:
PROJECT_NAME = "bpic2014"

ACTIVITY_VOCAB = df_bpic14[EVENTLOG_ACTIVITY].unique().tolist() + [TOKEN_EOC]

## Prepare PROPHET Graph

### Remaining Time Prediction

In [None]:
prophet_generate_dgl_graph(
    df_bpic14_train,
    df_bpic14_train_dyn,
    df_bpic14_test,
    df_bpic14_test_dyn,
    dynamic_attrs=[EVENTLOG_ACTIVITY, EVENTLOG_GROUP, "time:timestamp:elapsedprev:seconds", "time:timestamp:elapsedcycle:seconds"],
    static_attrs=["case:KM number", "case:incident_Category", "case:incident_CI Type (aff)", "case:incident_CI Subtype (aff)", "case:incident_Service Component WBS (aff)", "case:incident_CI Name (CBy)", "case:incident_CI Type (CBy)", "case:incident_Priority", "case:interaction_Priority"],
    name=PROJECT_NAME,
)

### Next Time Prediction

In [None]:
!gunzip ./Data/Input/bpic2014_train.db.gz -c > ./Data/Output/bpic2014_train.db
!gunzip ./Data/Input/bpic2014_test.db.gz -c > ./Data/Output/bpic2014_test.db

In [None]:
new_train_file_path = os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_next_time_train.db")
new_test_file_path = os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_next_time_test.db")

In [None]:
shutil.copyfile(os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_train.db"), new_train_file_path)
shutil.copyfile(os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_test.db"), new_test_file_path)

In [None]:
prophet_replace_target(new_train_file_path, df_bpic14_train['label:time:timestamp:next'].to_numpy())
!gzip -c "$new_train_file_path" > "$new_train_file_path".gz

In [None]:
prophet_replace_target(new_test_file_path, df_bpic14_test['label:time:timestamp:next'].to_numpy())
!gzip -c "$new_test_file_path" > "$new_test_file_path".gz

### Next Activity Prediction

In [None]:
!gunzip ./Data/Input/bpic2014_train.db.gz -c > ./Data/Output/bpic2014_train.db
!gunzip ./Data/Input/bpic2014_test.db.gz -c > ./Data/Output/bpic2014_test.db

In [None]:
new_train_file_path = os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_next_activity_train.db")
new_test_file_path = os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_next_activity_test.db")

In [None]:
shutil.copyfile(os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_train.db"), new_train_file_path)
shutil.copyfile(os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_test.db"), new_test_file_path)

In [None]:
prophet_replace_target(new_train_file_path, df_bpic14_train['label:concept:name:next'].to_numpy(dtype=int))
!gzip -c "$new_train_file_path" > "$new_train_file_path".gz

In [None]:
prophet_replace_target(new_test_file_path, df_bpic14_test['label:concept:name:next'].to_numpy(dtype=int))
!gzip -c "$new_test_file_path" > "$new_test_file_path".gz

## Train PROPHET Model

### Remaining Time Prediction

In [None]:
!gunzip ./Data/Input/bpic2014_train.db.gz -c > ./Data/Output/bpic2014_train.db
!gunzip ./Data/Input/bpic2014_test.db.gz -c > ./Data/Output/bpic2014_test.db

In [None]:
%pdb off
best_loss, best_model = prophet_train(PROJECT_NAME)

In [None]:
graphs, labels = load_graphs_from_hdf5(os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_test.db"))
y_true, y_pred = prophet_predict(best_model, graphs, labels)

In [None]:
np.save(f"{PROJECT_NAME}_prophet_pred.npy", y_pred, allow_pickle=False)
np.save(f"{PROJECT_NAME}_prophet_true.npy", y_true, allow_pickle=False)

In [None]:
files.download(f"{PROJECT_NAME}_prophet_pred.npy")
files.download(f"{PROJECT_NAME}_prophet_true.npy")

### Next Activity Prediction

In [None]:
!gunzip ./Data/Input/bpic2014_next_activity_train.db.gz -c > ./Data/Output/bpic2014_train.db
!gunzip ./Data/Input/bpic2014_next_activity_test.db.gz -c > ./Data/Output/bpic2014_test.db

In [None]:
%pdb off
best_loss, best_model = prophet_train_classification(PROJECT_NAME, n_classes=len(ACTIVITY_VOCAB))

In [None]:
graphs, labels = load_graphs_from_hdf5(os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_test.db"))
# The model trained above is a classifier: request softmax class scores
# (one row per graph) instead of the flattened regression output.
y_true, y_pred = prophet_predict(best_model, graphs, labels, 'classification')

In [None]:
np.save(f"{PROJECT_NAME}_prophet_pred.npy", y_pred, allow_pickle=False)
np.save(f"{PROJECT_NAME}_prophet_true.npy", y_true, allow_pickle=False)

In [None]:
files.download(f"{PROJECT_NAME}_prophet_pred.npy")
files.download(f"{PROJECT_NAME}_prophet_true.npy")

### Next Time Prediction

In [None]:
!gunzip ./Data/Input/bpic2014_next_time_train.db.gz -c > ./Data/Output/bpic2014_train.db
!gunzip ./Data/Input/bpic2014_next_time_test.db.gz -c > ./Data/Output/bpic2014_test.db

In [None]:
%pdb off
best_loss, best_model = prophet_train(PROJECT_NAME)

In [None]:
graphs, labels = load_graphs_from_hdf5(os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_test.db"))
y_true, y_pred = prophet_predict(best_model, graphs, labels)

In [None]:
np.save(f"{PROJECT_NAME}_prophet_pred.npy", y_pred, allow_pickle=False)
np.save(f"{PROJECT_NAME}_prophet_true.npy", y_true, allow_pickle=False)

In [None]:
files.download(f"{PROJECT_NAME}_prophet_pred.npy")
files.download(f"{PROJECT_NAME}_prophet_true.npy")

# Dataset: Helpdesk

## Input Preparation

In [None]:
df_helpdesk = pd.read_feather(os.path.join(INPUT_DATA_DIR, "helpdesk_labeled.feather"))
df_helpdesk

In [None]:
df_helpdesk_train = pd.read_feather(os.path.join(INPUT_DATA_DIR, "CAiSE 2025", "helpdesk_train.feather"))
df_helpdesk_train

In [None]:
df_helpdesk_test = pd.read_feather(os.path.join(INPUT_DATA_DIR, "CAiSE 2025", "Splits", "Helpdesk", "helpdesk_test.feather"))
df_helpdesk_test

In [None]:
df_helpdesk_train_dyn = pd.read_feather(os.path.join(INPUT_DATA_DIR, "CAiSE 2025", "helpdesk_train_dyn.feather"))
df_helpdesk_train_dyn

In [None]:
df_helpdesk_test_dyn = pd.read_feather(os.path.join(INPUT_DATA_DIR, "CAiSE 2025", "helpdesk_test_dyn.feather"))
df_helpdesk_test_dyn

In [None]:
PROJECT_NAME = "helpdesk"

ACTIVITY_VOCAB = df_helpdesk[EVENTLOG_ACTIVITY].unique().tolist() + [TOKEN_EOC]

## Prepare PROPHET Graph

### Remaining Time Prediction

In [None]:
prophet_generate_dgl_graph(
    df_helpdesk_train,
    df_helpdesk_train_dyn,
    df_helpdesk_test,
    df_helpdesk_test_dyn,
    dynamic_attrs=[EVENTLOG_ACTIVITY, "time:timestamp:elapsedprev:seconds", "time:timestamp:elapsedcycle:seconds"],
    static_attrs=[],
    name=PROJECT_NAME
)

### Next Time Prediction

In [None]:
!gunzip ./Data/Input/helpdesk_train.db.gz -c > ./Data/Output/helpdesk_train.db
!gunzip ./Data/Input/helpdesk_test.db.gz -c > ./Data/Output/helpdesk_test.db

In [None]:
new_train_file_path = os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_next_time_train.db")
new_test_file_path = os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_next_time_test.db")

In [None]:
shutil.copyfile(os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_train.db"), new_train_file_path)
shutil.copyfile(os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_test.db"), new_test_file_path)

In [None]:
prophet_replace_target(new_train_file_path, df_helpdesk_train['label:time:timestamp:next'].to_numpy())
!gzip -c "$new_train_file_path" > "$new_train_file_path".gz

In [None]:
prophet_replace_target(new_test_file_path, df_helpdesk_test['label:time:timestamp:next'].to_numpy())
!gzip -c "$new_test_file_path" > "$new_test_file_path".gz

### Next Activity Prediction

In [None]:
new_train_file_path = os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_next_activity_train.db")
new_test_file_path = os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_next_activity_test.db")

In [None]:
shutil.copyfile(os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_train.db"), new_train_file_path)
shutil.copyfile(os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_test.db"), new_test_file_path)

In [None]:
prophet_replace_target(new_train_file_path, df_helpdesk_train['label:concept:name:next'].to_numpy(dtype=int))

!gzip -c "$new_train_file_path" > "$new_train_file_path".gz

In [None]:
prophet_replace_target(new_test_file_path, df_helpdesk_test['label:concept:name:next'].to_numpy(dtype=int))

!gzip -c "$new_test_file_path" > "$new_test_file_path".gz

## Train PROPHET Model

### Remaining Time Prediction

In [None]:
%pdb off
best_loss, best_model = prophet_train(PROJECT_NAME)

In [None]:
graphs, labels = load_graphs_from_hdf5(os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_test.db"))
y_true, y_pred = prophet_predict(best_model, graphs, labels)

In [None]:
np.save(f"{PROJECT_NAME}_prophet_pred.npy", y_pred, allow_pickle=False)
np.save(f"{PROJECT_NAME}_prophet_true.npy", y_true, allow_pickle=False)

### Next Time Prediction

In [None]:
!gunzip ./Data/Output/helpdesk_next_time_train.db.gz -c > ./Data/Output/helpdesk_train.db
!gunzip ./Data/Output/helpdesk_next_time_test.db.gz -c > ./Data/Output/helpdesk_test.db

In [None]:
%pdb off
best_loss, best_model = prophet_train(PROJECT_NAME)

In [None]:
graphs, labels = load_graphs_from_hdf5(os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_test.db"))
y_true, y_pred = prophet_predict(best_model, graphs, labels)

In [None]:
np.save(f"{PROJECT_NAME}_prophet_pred.npy", y_pred, allow_pickle=False)
np.save(f"{PROJECT_NAME}_prophet_true.npy", y_true, allow_pickle=False)

### Next Activity Prediction

In [None]:
!gunzip ./Data/Output/helpdesk_next_activity_train.db.gz -c > ./Data/Output/helpdesk_train.db
!gunzip ./Data/Output/helpdesk_next_activity_test.db.gz -c > ./Data/Output/helpdesk_test.db

In [None]:
%pdb off
best_loss, best_model = prophet_train_classification(PROJECT_NAME, n_classes=len(ACTIVITY_VOCAB))

In [None]:
graphs, labels = load_graphs_from_hdf5(os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_test.db"))
y_true, y_pred = prophet_predict(best_model, graphs, labels, 'classification')

In [None]:
np.save(f"{PROJECT_NAME}_prophet_pred.npy", y_pred, allow_pickle=False)
np.save(f"{PROJECT_NAME}_prophet_true.npy", y_true, allow_pickle=False)

# Dataset: BPIC 2013

## Input Preparation

In [None]:
df_bpic13 = pd.read_feather(os.path.join(INPUT_DATA_DIR, "BPI_Challenge_2013_incidents_labeled.feather"))
df_bpic13

In [None]:
df_bpic13_train = pd.read_feather(os.path.join(INPUT_DATA_DIR, "CAiSE 2025", "BPI_Challenge_2013_incidents_train.feather"))
df_bpic13_train

In [None]:
df_bpic13_train_dyn = pd.read_feather(os.path.join(INPUT_DATA_DIR, "CAiSE 2025", "BPI_Challenge_2013_incidents_train_dyn.feather"))
df_bpic13_train_dyn

In [None]:
df_bpic13_test = pd.read_feather(os.path.join(INPUT_DATA_DIR, "CAiSE 2025", "BPI_Challenge_2013_incidents_test.feather"))
df_bpic13_test

In [None]:
df_bpic13_test_dyn = pd.read_feather(os.path.join(INPUT_DATA_DIR, "CAiSE 2025", "BPI_Challenge_2013_incidents_test_dyn.feather"))
df_bpic13_test_dyn

In [None]:
PROJECT_NAME = "bpic2013"
ACTIVITY_VOCAB = df_bpic13[EVENTLOG_ACTIVITY].unique().tolist() + [TOKEN_EOC]

## Prepare PROPHET Graph

### Remaining Time Prediction

In [None]:
prophet_generate_dgl_graph(
    df_bpic13_train,
    df_bpic13_train_dyn,
    df_bpic13_test,
    df_bpic13_test_dyn,
    dynamic_attrs=[EVENTLOG_ACTIVITY, EVENTLOG_RESOURCE, EVENTLOG_GROUP, "org:role", "Involved Org line 3", "Owner Country", "Status", "Sub Status", "time:timestamp:elapsedprev", "time:timestamp:elapsedcycle"],
    static_attrs=["case:Product", "case:Country", "case:SR Latest Impact"],
    name=PROJECT_NAME
)

### Next Time Prediction

In [None]:
new_train_file_path = os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_next_time_train.db")
new_test_file_path = os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_next_time_test.db")

In [None]:
!gunzip ./Data/Input/bpic2013_train.db.gz -c > "$new_train_file_path"
!gunzip ./Data/Input/bpic2013_test.db.gz -c > "$new_test_file_path"

In [None]:
prophet_replace_target(new_train_file_path, df_bpic13_train['label:time:timestamp:next'].to_numpy())
!gzip -c "$new_train_file_path" > "$new_train_file_path".gz

In [None]:
prophet_replace_target(new_test_file_path, df_bpic13_test['label:time:timestamp:next'].to_numpy())
!gzip -c "$new_test_file_path" > "$new_test_file_path".gz

### Next Activity Prediction

---



In [None]:
new_train_file_path = os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_next_activity_train.db")
new_test_file_path = os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_next_activity_test.db")

In [None]:
!gunzip ./Data/Input/bpic2013_train.db.gz -c > "$new_train_file_path"
!gunzip ./Data/Input/bpic2013_test.db.gz -c > "$new_test_file_path"

In [None]:
prophet_replace_target(new_train_file_path, df_bpic13_train['label:concept:name:next'].to_numpy(dtype=int))
!gzip -c "$new_train_file_path" > "$new_train_file_path".gz

In [None]:
prophet_replace_target(new_test_file_path, df_bpic13_test['label:concept:name:next'].to_numpy(dtype=int))
!gzip -c "$new_test_file_path" > "$new_test_file_path".gz

## Train PROPHET Model

### Remaining Time Prediction

In [None]:
!gunzip ./Data/Input/bpic13_train.db.gz
!mv ./Data/Input/bpic13_train.db ./Data/Output/bpic13_train.db

In [None]:
!gunzip ./Data/Input/bpic13_test.db.gz
!mv ./Data/Input/bpic13_test.db ./Data/Output/bpic13_test.db

In [None]:
%pdb off
best_loss, best_model = prophet_train(PROJECT_NAME)

In [None]:
graphs, labels = load_graphs_from_hdf5(os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_test.db"))
y_true, y_pred = prophet_predict(best_model, graphs, labels)

In [None]:
np.save(f"{PROJECT_NAME}_prophet_pred.npy", y_pred, allow_pickle=False)
np.save(f"{PROJECT_NAME}_prophet_true.npy", y_true, allow_pickle=False)

### Next Time Prediction

In [None]:
!gunzip ./Data/Input/bpic2013_next_time_train.db.gz -c > ./Data/Output/bpic2013_train.db
!gunzip ./Data/Input/bpic2013_next_time_test.db.gz -c > ./Data/Output/bpic2013_test.db

In [None]:
%pdb off
best_loss, best_model = prophet_train(PROJECT_NAME)

In [None]:
graphs, labels = load_graphs_from_hdf5(os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_test.db"))
y_true, y_pred = prophet_predict(best_model, graphs, labels)

In [None]:
np.save(f"{PROJECT_NAME}_prophet_pred.npy", y_pred, allow_pickle=False)
np.save(f"{PROJECT_NAME}_prophet_true.npy", y_true, allow_pickle=False)

### Next Activity Prediction

In [None]:
!gunzip ./Data/Input/bpic2013_next_activity_train.db.gz -c > ./Data/Output/bpic2013_train.db
!gunzip ./Data/Input/bpic2013_next_activity_test.db.gz -c > ./Data/Output/bpic2013_test.db

In [None]:
%pdb off
best_loss, best_model = prophet_train_classification(PROJECT_NAME, n_classes=len(ACTIVITY_VOCAB))

In [None]:
graphs, labels = load_graphs_from_hdf5(os.path.join(OUTPUT_DATA_DIR, f"{PROJECT_NAME}_test.db"))
y_true, y_pred = prophet_predict(best_model, graphs, labels, 'classification')

In [None]:
np.save(f"{PROJECT_NAME}_prophet_pred.npy", y_pred, allow_pickle=False)
np.save(f"{PROJECT_NAME}_prophet_true.npy", y_true, allow_pickle=False)

# Data Export

In [None]:
output_file = f"results_{datetime.datetime.now().strftime('%Y-%m-%d_%H.%M.%S%z')}.zip"

!zip -r "$output_file" "$DATA_DIR" "$GRAPHIC_DIR" "$MODEL_DIR"

## A: Export to Google Drive

In [None]:
drive.mount("/content/drive")

Path(GDRIVE_OUTPUT_DIR).mkdir(exist_ok=True)

!cp "$output_file" "$GDRIVE_OUTPUT_DIR"

drive.flush_and_unmount()

## B: Download to Local Machine

In [None]:
files.download(output_file)