# Preprocessing & Analysis

This file contains all analysis and preprocessing steps performed on the ITSM datasets and was developed by Marc C. Hennig (mhennig@hm.edu).

# Environment

## Dependency installation

### A. PIP Dependencies

In [None]:
!pip install ipdb pm4py category_encoders

In [None]:
# Workaround for installing the torch dependencies on Colab without long wait times (uses the wheel index matching the installed torch version)
import torch
!pip install torch-scatter -f https://data.pyg.org/whl/torch-{torch.__version__}.html torch-sparse -f https://data.pyg.org/whl/torch-{torch.__version__}.html torch-cluster -f https://data.pyg.org/whl/torch-{torch.__version__}.html git+https://github.com/pyg-team/pytorch_geometric.git git+https://github.com/benedekrozemberczki/pytorch_geometric_temporal
!pip freeze > requirements.txt

### B. Git Dependencies

In [None]:
!git clone https://github.com/MaxVidgof/process-complexity

## Dependency Imports

In [None]:
# Python dependencies
import os
import re
import sys
from pathlib import Path
import shutil
import subprocess

import math
import numbers
import operator
import statistics

import random
import collections

import functools
import itertools

from typing import List, Tuple, Union, Optional, Literal, Callable

import time
import calendar
import dateutil
from dateutil import rrule
from dateutil.relativedelta import relativedelta
import datetime

import json

# Debugging
import ipdb

# Colab dependencies
from google.colab import files, drive, output

# Basic dependencies
import pandas as pd
import numpy as np
import scipy as sp

# Plotting dependencies
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

#import pyvis

# Process mining dependencies
import pm4py

# Graph dependencies
#import raphtory as rp
import networkx as nx

# Machine learning dependencies
import sklearn as sl
import sklearn.metrics

import category_encoders as ce
from category_encoders import OneHotEncoder

import tensorflow as tf
import keras

import torch
import torch_geometric as pyg

## Variables & Global Settings

In [None]:
# Assign a fixed random seed for reproducibility
RANDOM_STATE = 1337

# Seed the hash seed environment variable, Python's `random` module and NumPy's global RNG.
# NOTE(review): PYTHONHASHSEED is normally evaluated at interpreter start-up, so setting it here presumably only affects child processes — confirm.
# NOTE(review): torch / tensorflow are not seeded in this cell.
os.environ["PYTHONHASHSEED"] = str(RANDOM_STATE)
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

# Show all Pandas columns
pd.set_option("display.max_columns", None)

# Set Matplotlib and Seaborn color scheme
plt.rcParams["image.cmap"] = "Blues"
sns.set_palette("Blues")

# Set pytorch device (GPU when CUDA is available, CPU otherwise)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Colab settings
output.enable_custom_widget_manager()

In [None]:
# Google Drive folders (event log inputs and result outputs)
GDRIVE_INPUT_DIR = "/content/drive/My Drive/Colab Notebooks/TGN-AST/Eventlogs"
GDRIVE_OUTPUT_DIR = "/content/drive/My Drive/Colab Notebooks/TGN-AST/Results"

# Local Colab folders, all relative to the current working directory
UTIL_DIR = os.path.join(".", "Util")
GRAPHIC_DIR = os.path.join(".", "Graphics")
MODEL_DIR = os.path.join(".", "Models")

DATA_DIR = os.path.join(".", "Data")
INTERIM_DATA_DIR = os.path.join(DATA_DIR, "Interim")
OUTPUT_DATA_DIR = os.path.join(DATA_DIR, "Output")

INPUT_DATA_DIR = os.path.join(DATA_DIR, "Input")
INPUT_DATA_BPIC2013_DIR = os.path.join(INPUT_DATA_DIR, "BPIC 2013")
INPUT_DATA_BPIC2014_DIR = os.path.join(INPUT_DATA_DIR, "BPIC 2014")
INPUT_DATA_BPIC2015_DIR = os.path.join(INPUT_DATA_DIR, "BPIC 2015")

# Create the working folders. DATA_DIR comes first because the interim and
# output folders live inside it; the input and util folders are not created here.
for _local_dir in (DATA_DIR, INTERIM_DATA_DIR, OUTPUT_DATA_DIR, GRAPHIC_DIR, MODEL_DIR):
  Path(_local_dir).mkdir(exist_ok=True)
del _local_dir

In [None]:
# Embedding dimensionality.
# NOTE(review): the consumers of this constant are outside this section — presumably the downstream models; confirm.
EMBEDDING_DIM = 64

## Common Functions

### Cleaning & Formatting Functions

In [None]:
# Column names used for the core event log attributes
EVENTLOG_CASE = "case:concept:name"
EVENTLOG_ACTIVITY = "concept:name"
EVENTLOG_TIMESTAMP = "time:timestamp"
EVENTLOG_GROUP = "org:group"
EVENTLOG_RESOURCE = "org:resource"
EVENTLOG_ROLE = "org:role"

# Column name prefixes marking case-level attributes and prediction labels
EVENTLOG_CASE_PREFIX = "case:"
EVENTLOG_LABEL_PREFIX = "label:"

# Label columns (remaining time, next activity, time until next event)
EVENTLOG_LABEL_REM_TIME = EVENTLOG_LABEL_PREFIX + "time:timestamp:last"
EVENTLOG_LABEL_NEXT_ACT = EVENTLOG_LABEL_PREFIX + "concept:name:next"
EVENTLOG_LABEL_NEXT_TIME = EVENTLOG_LABEL_PREFIX + "time:timestamp:next"

# Suffixes appended to a timestamp column name for derived time features
EVENTLOG_FEAT_TIME_OF_YEAR_SUFFIX = ":timeofyear"
EVENTLOG_FEAT_TIME_OF_MONTH_SUFFIX = ":timeofmonth"
EVENTLOG_FEAT_TIME_OF_WEEK_SUFFIX = ":timeofweek"
EVENTLOG_FEAT_TIME_OF_DAY_SUFFIX = ":timeofday"
EVENTLOG_FEAT_TIME_ELAPSED_CYCLE_SUFFIX = ":elapsedcycle"
EVENTLOG_FEAT_TIME_ELAPSED_PREV_SUFFIX = ":elapsedprev"

# Special tokens: padding (string and numeric form), missing value, end of case
TOKEN_PADDING = "[PAD]"
TOKEN_PADDING_NUM = 0
TOKEN_NA = "[NA]"
TOKEN_EOC = "[EOC]"

def df_find_case_attributes(df: pd.DataFrame, case_col: str = EVENTLOG_CASE, label_prefix: str = EVENTLOG_LABEL_PREFIX, exclude_labels: bool = False) -> List[str]:
  """
  Identifies the columns whose value is constant within every case ('case attributes'). A column qualifies when no case holds more than one distinct value in it; missing values count as a distinct value.

  Parameters:
    df (pd.DataFrame): A pandas DataFrame containing the event log data.
    case_col (str, optional): The name of the column in df that represents the case identifier. Defaults to `EVENTLOG_CASE`.
    label_prefix (str, optional): The prefix used to identify label columns within df. Defaults to `EVENTLOG_LABEL_PREFIX`.
    exclude_labels (bool, optional): If True, columns identified as labels by `df_find_labels` are removed from the result. Defaults to False.

  Returns:
    List[str]: The case attribute column names in column order. The case identifier column itself is never part of the result because it is the grouping key.
  """
  # Per column: the largest number of distinct values found in any single case
  max_unique = df.groupby(case_col).nunique(dropna=False).max()
  attrs = max_unique[max_unique == 1].index.to_list()
  if exclude_labels:
    # Determine the label columns once instead of once per attribute
    labels = set(df_find_labels(df, label_prefix))
    attrs = [attr for attr in attrs if attr not in labels]
  return attrs

def df_find_event_attributes(df: pd.DataFrame, case_col: str = EVENTLOG_CASE, label_prefix: str = EVENTLOG_LABEL_PREFIX, exclude_labels: bool = False) -> List[str]:
  """
  Identifies the columns whose value varies within at least one case ('event attributes'). A column qualifies as soon as a single case holds more than one distinct value in it; missing values count as a distinct value.

  Parameters:
    df (pd.DataFrame): A pandas DataFrame containing the event log data.
    case_col (str, optional): The name of the column in df that represents the case identifier. Defaults to `EVENTLOG_CASE`.
    label_prefix (str, optional): The prefix used to identify label columns within df. Defaults to `EVENTLOG_LABEL_PREFIX`.
    exclude_labels (bool, optional): If True, columns identified as labels by `df_find_labels` are removed from the result. Defaults to False.

  Returns:
    List[str]: The event attribute column names in column order. The case identifier column itself is never part of the result because it is the grouping key.
  """
  # Per column: the largest number of distinct values found in any single case
  max_unique = df.groupby(case_col).nunique(dropna=False).max()
  attrs = max_unique[max_unique > 1].index.to_list()
  if exclude_labels:
    # Determine the label columns once instead of once per attribute
    labels = set(df_find_labels(df, label_prefix))
    attrs = [attr for attr in attrs if attr not in labels]
  return attrs



def df_find_labels(df: pd.DataFrame, label_prefix: str = EVENTLOG_LABEL_PREFIX) -> List[str]:
  """
  Returns the names of all columns that are considered labels, i.e. whose name starts with `label_prefix`.

  Parameters:
    df (pd.DataFrame): A pandas DataFrame containing the event log data or similar structured data.
    label_prefix (str, optional): The prefix used to identify label columns within df. Defaults to `EVENTLOG_LABEL_PREFIX`.

  Returns:
    List[str]: The label column names in column order. Column labels that are not strings (e.g. integers or tuples) can never carry the prefix and are skipped instead of raising.
  """
  return [col for col in df.columns if isinstance(col, str) and col.startswith(label_prefix)]

def df_separate_categoricals(df: pd.DataFrame) -> Tuple[List[str], List[str]]:
  """
  Splits the categorical columns of a DataFrame into ordered and unordered ones.

  Parameters:
    df (pd.DataFrame): The DataFrame whose columns of dtype 'category' are inspected.

  Returns:
    Tuple[List[str], List[str]]: The names of the ordered categorical columns followed by the names of the unordered categorical columns, each in column order.
  """
  categorical = df.select_dtypes(include='category').columns
  ordered_cols = [name for name in categorical if df[name].cat.ordered]
  unordered_cols = [name for name in categorical if not df[name].cat.ordered]
  return ordered_cols, unordered_cols

def df_convert_datetimes(df: pd.DataFrame, cols: List[str] = [], dayfirst: bool = False, yearfirst: bool = False, tz: Optional[Union[str, datetime.tzinfo]] = None) -> pd.DataFrame:
  """
  Converts the specified columns of a DataFrame to datetime in place and applies the requested timezone. Parsing is first attempted as is; if that raises a ValueError or does not yield a datetime column (e.g. mixed UTC offsets), it is retried with `utc=True`.

  Timezone handling after parsing:
    - timezone-naive values are localized to `tz` (left naive when `tz` is None),
    - timezone-aware values are converted to `tz` when `tz` is given,
    - timezone-aware values lose their timezone information when `tz` is None.

  Parameters:
    df (pd.DataFrame): The DataFrame containing columns to be converted to datetime. It is modified in place.
    cols (List[str], optional): The list of column names to convert to datetime. Defaults to an empty list.
    dayfirst (bool, optional): Boolean indicating if the day is the first number in the date string. Defaults to False.
    yearfirst (bool, optional): Boolean indicating if the year is the first number in the date string. Defaults to False.
    tz (Optional[Union[str, datetime.tzinfo]], optional): Target timezone of the converted columns. Defaults to None.

  Returns:
    pd.DataFrame: The same DataFrame with the specified columns converted to datetime format.

  Raises:
    ValueError: If parsing the dates fails even after assuming UTC.
  """
  for col in cols:
    try:
      parsed = pd.to_datetime(df[col], dayfirst=dayfirst, yearfirst=yearfirst)
    except ValueError:
      parsed = pd.to_datetime(df[col], dayfirst=dayfirst, yearfirst=yearfirst, utc=True)

    # Some pandas versions return an object column for mixed offsets instead of raising
    if not pd.api.types.is_datetime64_any_dtype(parsed):
      parsed = pd.to_datetime(df[col], dayfirst=dayfirst, yearfirst=yearfirst, utc=True)

    if tz is not None and parsed.dt.tz is not None:
      # tz_localize raises on timezone-aware data, so aware values have to be converted
      parsed = parsed.dt.tz_convert(tz)
    else:
      parsed = parsed.dt.tz_localize(tz)

    df[col] = parsed
  return df

def df_convert_timedeltas(df: pd.DataFrame, cols: List[str] = [], unit: str = 'nanoseconds') -> pd.DataFrame:
  """
  Turns the given columns into timedelta columns, reading their values as multiples of `unit`. The DataFrame is modified in place.

  Parameters:
    df (pd.DataFrame): The DataFrame holding the columns to convert.
    cols (List[str], optional): Names of the columns to convert. Defaults to an empty list.
    unit (str, optional): Time unit passed to `pd.to_timedelta` for interpreting the values. Defaults to 'nanoseconds'.

  Returns:
    pd.DataFrame: The same DataFrame with the given columns converted to timedelta.
  """
  for name in cols:
    df[name] = df[name].pipe(pd.to_timedelta, unit=unit)
  return df

def df_convert_bools(df: pd.DataFrame, cols: List[str] = [], true_vals: Union[str, List[str]] = [], false_vals: Union[str, List[str]] = []) -> pd.DataFrame:
  """
  Converts specified columns of a DataFrame to the nullable 'boolean' dtype in place, based on the provided true and false values.

  Values from `true_vals` are mapped to `True`, values from `false_vals` are mapped to `False`. Any other non-missing value is handled as follows:
    - only `true_vals` given: mapped to `False`,
    - only `false_vals` given: mapped to `True`,
    - both or neither given: mapped to `<NA>`.
  Missing values in the input always stay `<NA>`.

  Parameters:
    df (pd.DataFrame): The DataFrame containing columns to be converted to boolean. It is modified in place.
    cols (List[str], optional): The list of column names to convert to boolean. Defaults to an empty list.
    true_vals (Union[str, List[str]], optional): Values to be mapped to `True`. Can be a single string or a list of strings. Defaults to an empty list.
    false_vals (Union[str, List[str]], optional): Values to be mapped to `False`. Can be a single string or a list of strings. Defaults to an empty list.

  Returns:
    pd.DataFrame: The same DataFrame with the specified columns converted to boolean format.
  """
  if isinstance(true_vals, str):
    true_vals = [true_vals]
  if isinstance(false_vals, str):
    false_vals = [false_vals]

  value_map = {true_val: True for true_val in true_vals}
  value_map.update({false_val: False for false_val in false_vals})

  # Value assigned to entries that are listed in neither true_vals nor false_vals
  if len(false_vals) > 0 and len(true_vals) == 0:
    default = True
  elif len(true_vals) > 0 and len(false_vals) == 0:
    default = False
  else:
    default = None

  for col in cols:
    source = df[col]
    mapped = source.map(value_map)
    # A plain dict gives Series.map no fallback, so unknown values come back as NaN
    # and have to be filled explicitly; originally missing values are left untouched
    unknown = source.notna() & mapped.isna()
    converted = mapped.astype('boolean')
    if default is not None:
      converted = converted.mask(unknown, default)
    df[col] = converted

  return df

def df_convert_bool_to_int(df: pd.DataFrame, cols: Optional[Union[str, List[str]]] = None) -> pd.DataFrame:
  """
  Returns a copy of the DataFrame in which boolean columns are cast to the nullable 'Int8' dtype.

  Parameters:
    df (pd.DataFrame): The DataFrame holding the boolean columns. It is not modified.
    cols (Optional[Union[str, List[str]]], optional): A column name or list of column names to cast. If None, the columns returned by `select_dtypes('boolean')` are cast. Defaults to None.

  Returns:
    pd.DataFrame: A new DataFrame with the selected columns cast to 'Int8'.
  """
  if cols is None:
    targets = df.select_dtypes('boolean').columns
  elif isinstance(cols, str):
    targets = [cols]
  else:
    targets = cols

  return df.astype({name: 'Int8' for name in targets})

def df_convert_ordered_cat_to_int(df: pd.DataFrame, cols: Optional[Union[str, List[str]]] = None, relative: bool = False) -> pd.DataFrame:
  """
  Returns a copy of the DataFrame in which categorical columns are replaced by their integer category codes (nullable 'Int16'). Missing values stay missing.

  Parameters:
    df (pd.DataFrame): The DataFrame holding the categorical columns. It is not modified.
    cols (Optional[Union[str, List[str]]], optional): A column name or list of column names to convert. If None, all ordered categorical columns found by `df_separate_categoricals` are converted. Defaults to None.
    relative (bool, optional): If True, the codes are divided by the largest code present in the column, yielding values between 0 and 1. If the largest code is 0 or the column holds no values, the codes are only cast to 'Float64'. Defaults to False.

  Returns:
    pd.DataFrame: A new DataFrame with the selected columns converted to category codes.
  """
  df = df.copy()
  if cols is None:
    cols, _ = df_separate_categoricals(df)
  elif isinstance(cols, str):
    cols = [cols]

  for col in cols:
    codes = df[col].cat.codes.astype('Int16')
    # pandas encodes missing categorical values as code -1; masking positionally
    # (instead of via index labels) stays correct for non-unique indexes
    codes = codes.mask(codes == -1)

    if relative:
      max_code = codes.max()
      if pd.notna(max_code) and max_code > 0:
        codes = codes / max_code
      else:
        # Nothing to scale by; avoid dividing by zero but keep the float dtype
        codes = codes.astype('Float64')

    df[col] = codes

  return df

def df_fillna_str(df: pd.DataFrame, val: str, cols: Optional[Union[str, List[str]]] = None) -> pd.DataFrame:
  """
  Returns a copy of the DataFrame in which the selected columns are cast to the 'string' dtype and their missing values are replaced by `val`.

  Parameters:
    df (pd.DataFrame): The DataFrame to fill. It is not modified.
    val (str): The replacement for missing values.
    cols (Optional[Union[str, List[str]]], optional): A column name or list of column names to fill. If None, all columns of dtype 'object' are filled. Defaults to None.

  Returns:
    pd.DataFrame: A new DataFrame with the selected columns filled.
  """
  filled = df.copy()
  if isinstance(cols, str):
    targets = [cols]
  elif cols is None:
    targets = filled.select_dtypes('object').columns
  else:
    targets = cols

  for name in targets:
    as_text = filled[name].astype('string')
    filled[name] = as_text.fillna(val)

  return filled

def df_fillna_num(df: pd.DataFrame, val: Union[int, float], cols: Optional[Union[str, List[str]]] = None) -> pd.DataFrame:
  """
  Returns a copy of the DataFrame in which missing values of the selected columns are replaced by `val`.

  Parameters:
    df (pd.DataFrame): The DataFrame to fill. It is not modified.
    val (Union[int, float]): The replacement for missing values.
    cols (Optional[Union[str, List[str]]], optional): A column name or list of column names to fill. If None, all numeric columns are filled. Defaults to None.

  Returns:
    pd.DataFrame: A new DataFrame with the selected columns filled.
  """
  filled = df.copy()
  if isinstance(cols, str):
    targets = [cols]
  elif cols is None:
    targets = filled.select_dtypes('number').columns
  else:
    targets = cols

  for name in targets:
    column = filled[name]
    filled[name] = column.fillna(val)

  return filled

def df_fillna_cat(df: pd.DataFrame, val: str, cols: Optional[Union[str, List[str]]] = None) -> pd.DataFrame:
  """
  Returns a copy of the DataFrame in which missing values of the selected categorical columns are replaced by `val`. If `val` is not yet a category of a column, it is added as a new category first.

  Parameters:
    df (pd.DataFrame): The DataFrame to fill. It is not modified.
    val (str): The replacement category for missing values.
    cols (Optional[Union[str, List[str]]], optional): A column name or list of column names to fill. If None, all columns of dtype 'category' are filled. Defaults to None.

  Returns:
    pd.DataFrame: A new DataFrame with the selected columns filled.
  """
  filled = df.copy()
  if isinstance(cols, str):
    targets = [cols]
  elif cols is None:
    targets = filled.select_dtypes('category').columns
  else:
    targets = cols

  for name in targets:
    column = filled[name]
    known = val in column.cat.categories.array
    if not known:
      # A categorical can only be filled with a value that is one of its categories
      column = column.cat.add_categories(val)
      filled[name] = column
    filled[name] = filled[name].fillna(val)

  return filled

def df_rename_cat_values(df: pd.DataFrame, cols: Union[str, List[str]], from_cats: Union[str, List[str]], to_cat: str) -> pd.DataFrame:
  """
  Changes one or more categorical values to another specified value within the provided columns, and removes any categories that are no longer used. The DataFrame is modified in place.

  Only rows currently holding one of `from_cats` are changed; missing values stay missing. `to_cat` is added as a category when the column does not have it yet, and categories from `from_cats` that do not occur in a column are ignored.

  Parameters:
    df (pd.DataFrame): The DataFrame containing categorical columns where values will be renamed.
    cols (Union[str, List[str]]): A column name or list of column names to be modified.
    from_cats (Union[str, List[str]]): The category or list of categories to be changed.
    to_cat (str): The new category value that replaces the `from_cats`.

  Returns:
    pd.DataFrame: The same DataFrame with renamed categorical values and cleaned categories.

  Raises:
    AttributeError: If a specified column is not of categorical dtype.
  """
  if isinstance(cols, str):
    cols = [cols]
  if pd.api.types.is_list_like(from_cats):
    from_cats = list(from_cats)
  else:
    from_cats = [from_cats]

  for col in cols:
    series = df[col]
    # Remember the affected rows first so that existing missing values are not touched
    rename_rows = series.isin(from_cats)
    if to_cat not in series.cat.categories:
      series = series.cat.add_categories(to_cat)
    series = series.mask(rename_rows, to_cat)
    # The replaced categories are unused now and get dropped with any other unused ones
    df[col] = series.cat.remove_unused_categories()

  return df

def df_drop_duplicate_rows(df: pd.DataFrame, inplace: bool = True, ignore_index: bool = True, keep: str = 'first') -> Union[pd.DataFrame, None]:
  """
  Drops rows that are exact duplicates of another row. All arguments are forwarded to `DataFrame.drop_duplicates`.

  Parameters:
    df (pd.DataFrame): The DataFrame to deduplicate.
    inplace (bool, optional): If True, `df` itself is modified and None is returned. If False, a new DataFrame is returned. Defaults to True.
    ignore_index (bool, optional): If True, the remaining rows are relabelled 0, 1, ..., n - 1. Defaults to True.
    keep (str, optional): Which occurrence of a duplicate to keep.
        - 'first': Keep the first occurrence.
        - 'last': Keep the last occurrence.
        - False: Drop all occurrences.
        Defaults to 'first'.

  Returns:
    Union[pd.DataFrame, None]: The deduplicated DataFrame if `inplace` is False, otherwise None.
  """
  return df.drop_duplicates(keep=keep, inplace=inplace, ignore_index=ignore_index)

def df_drop_duplicate_cols(df: pd.DataFrame, keep: str = 'first') -> pd.DataFrame:
  """
  Drops columns whose values duplicate those of another column and restores the original dtypes of the remaining columns. Duplicates are found by transposing the frame and deduplicating its rows.

  Parameters:
    df (pd.DataFrame): The DataFrame to deduplicate. It is not modified.
    keep (str, optional): Which occurrence of a duplicate to keep.
        - 'first': Keep the first occurrence.
        - 'last': Keep the last occurrence.
        - False: Drop all occurrences.
        Defaults to 'first'.

  Returns:
    pd.DataFrame: A new DataFrame without duplicate columns and with the original dtypes.
  """
  original_dtypes = df.dtypes
  transposed = df.T
  deduplicated = transposed.drop_duplicates(keep=keep).T
  # The round trip through the transpose may change dtypes, so restore those of the survivors
  surviving = original_dtypes.index.isin(deduplicated.columns)
  return deduplicated.astype(original_dtypes[surviving])

# Drop rows and columns that hold no data at all
def df_drop_na_rows_and_cols(df: pd.DataFrame, inplace: bool = True) -> Union[pd.DataFrame, None]:
  """
  Drops every row and then every column that consists of missing values only.

  Parameters:
    df (pd.DataFrame): The DataFrame to clean.
    inplace (bool, optional): If True, `df` itself is modified and None is returned.
                              If False, `df` is left untouched and a cleaned DataFrame is returned.
                              Defaults to True.

  Returns:
    Union[pd.DataFrame, None]: None if inplace is True; otherwise the cleaned DataFrame.
  """
  if not inplace:
    without_empty_rows = df.dropna(how="all", axis='index')
    return without_empty_rows.dropna(how="all", axis='columns')

  df.dropna(how="all", axis='index', inplace=True)
  df.dropna(how="all", axis='columns', inplace=True)
  return None

def df_drop_single_val_cols(df: pd.DataFrame, inplace: bool = True) -> Union[pd.DataFrame, None]:
  """
  Drops every column that holds exactly one distinct non-missing value.

  Parameters:
    df (pd.DataFrame): The DataFrame to clean.
    inplace (bool, optional): If True, `df` itself is modified and None is returned.
                              If False, a new DataFrame without the constant columns is returned.
                              Defaults to True.

  Returns:
    Union[pd.DataFrame, None]: None if inplace is True; otherwise the cleaned DataFrame.
  """
  distinct_counts = df.nunique(dropna=True)
  constant_cols = df.columns[(distinct_counts == 1).to_numpy()]
  return df.drop(columns=constant_cols, inplace=inplace)

def df_drop_threshold_cols(df: pd.DataFrame, gte: float = sys.float_info.min, lt: float = sys.float_info.max, cols: List[str] = [], absolute: bool = False) -> Union[pd.DataFrame, None]:
  """
  Keeps only those checked columns whose values all lie within the range [`gte`, `lt`); every checked column containing at least one value outside that range is dropped. Missing values are ignored and columns that are not checked are always kept.

  A bound left at its default value is treated as 'no bound' on that side. `sys.float_info.min` is the smallest positive float, so using it as an actual lower bound would wrongly reject zero and all negative values.

  Parameters:
    df (pd.DataFrame): The DataFrame from which columns will be dropped. It is not modified.
    gte (float, optional): Inclusive lower bound. The default `sys.float_info.min` means no lower bound.
    lt (float, optional): Exclusive upper bound. The default `sys.float_info.max` means no upper bound.
    cols (List[str], optional): The list of columns to check. If empty, all numeric columns are checked. Defaults to an empty list.
    absolute (bool, optional): If True, the absolute values of the data and of the bounds are compared. Defaults to False.

  Returns:
    Union[pd.DataFrame, None]: A new DataFrame without the out-of-range columns (never None).
  """
  if cols is None or len(cols) == 0:
    cols = df.select_dtypes('number').columns.to_list()

  values = df[cols].abs() if absolute else df[cols]
  present = values.notna()

  in_range = present
  if gte != sys.float_info.min:
    in_range = in_range & (values >= (abs(gte) if absolute else gte))
  if lt != sys.float_info.max:
    in_range = in_range & (values < (abs(lt) if absolute else lt))

  # A column is dropped as soon as one of its present values violates a bound
  violating = (present & ~in_range).any(axis='index').astype(bool)
  drop_cols = violating[violating].index.to_list()

  return df.drop(columns=drop_cols)

def df_drop_threshold_na_cols(df: pd.DataFrame, threshold: Union[float, int], inplace: bool = True) -> pd.DataFrame:
  """
  Drops every column whose number of missing values reaches or exceeds the threshold.

  Parameters:
    df (pd.DataFrame): The DataFrame to clean.
    threshold (Union[float, int]): The number of missing values from which on a column is dropped. A float is
                                   interpreted as a fraction of the number of rows, an int as an absolute count.
    inplace (bool, optional): If True, `df` itself is modified and None is returned.
                              If False, a new DataFrame without the affected columns is returned.
                              Defaults to True.

  Returns:
    pd.DataFrame: The cleaned DataFrame if `inplace` is False; None if `inplace` is True.

  Raises:
    ValueError: If the (absolute) threshold is greater than the number of rows.
  """
  row_count = len(df)
  limit = threshold * row_count if isinstance(threshold, float) else threshold

  if limit > row_count:
    raise ValueError(f"Threshold {limit} must be lower than or equal to the number of rows in the DataFrame {row_count}")

  na_counts = df.isna().sum()
  sparse_cols = na_counts[na_counts >= limit].index.array

  return df.drop(columns=sparse_cols, inplace=inplace)

def df_drop_threshold_rows(df: pd.DataFrame, gte: float = sys.float_info.min, lt: float = sys.float_info.max, cols: List[str] = [], absolute: bool = False) -> Union[pd.DataFrame, None]:
  """
  Keeps only those rows whose values in the checked columns all lie within the range [`gte`, `lt`); every row containing at least one checked value outside that range is dropped. Missing values are ignored.

  A bound left at its default value is treated as 'no bound' on that side. `sys.float_info.min` is the smallest positive float, so using it as an actual lower bound would wrongly reject zero and all negative values.

  Parameters:
    df (pd.DataFrame): The DataFrame from which rows will be dropped. It is not modified.
    gte (float, optional): Inclusive lower bound. The default `sys.float_info.min` means no lower bound.
    lt (float, optional): Exclusive upper bound. The default `sys.float_info.max` means no upper bound.
    cols (List[str], optional): The list of column names to check. If empty, all numeric columns are checked. Defaults to an empty list.
    absolute (bool, optional): If True, the absolute values of the data and of the bounds are compared. Defaults to False.

  Returns:
    pd.DataFrame: A new DataFrame without the out-of-range rows (never None).
  """
  if cols is None or len(cols) == 0:
    cols = df.select_dtypes('number').columns.to_list()

  values = df[cols].abs() if absolute else df[cols]
  present = values.notna()

  in_range = present
  if gte != sys.float_info.min:
    in_range = in_range & (values >= (abs(gte) if absolute else gte))
  if lt != sys.float_info.max:
    in_range = in_range & (values < (abs(lt) if absolute else lt))

  # A row is dropped as soon as one of its present values violates a bound.
  # Filtering by position keeps rows that merely share an index label with a dropped row.
  violating = (present & ~in_range).any(axis='columns').astype(bool)

  return df.loc[~violating.to_numpy()].copy()

def df_format_as_eventlog(df: pd.DataFrame, case_col: str = EVENTLOG_CASE, activity_col: str = EVENTLOG_ACTIVITY, time_col: str = EVENTLOG_TIMESTAMP, resource_col: Optional[str] = None, group_col: Optional[str] = None, role_col: Optional[str] = None, inplace: Optional[bool] = True, sort: Union[bool, str] = True):
  """
  Renames the columns of a DataFrame to the event log naming scheme and optionally sorts the events.

  Renaming rules:
    - The explicitly named case, activity, timestamp, resource, group and role columns are renamed to the corresponding `EVENTLOG_*` names. These mappings take precedence over the derived ones below, so e.g. a resource column that happens to be constant within every case still becomes `EVENTLOG_RESOURCE`.
    - Every other case attribute (constant within each case) gets `EVENTLOG_CASE_PREFIX` prepended unless it already carries it.
    - Every other event attribute (varying within a case) loses a leading `EVENTLOG_CASE_PREFIX`.

  Parameters:
    df (pd.DataFrame): The DataFrame to format.
    case_col (str, optional): Name of the case identifier column. Defaults to `EVENTLOG_CASE`.
    activity_col (str, optional): Name of the activity column. Defaults to `EVENTLOG_ACTIVITY`.
    time_col (str, optional): Name of the timestamp column. Defaults to `EVENTLOG_TIMESTAMP`.
    resource_col (Optional[str], optional): Name of the resource column, if any. Defaults to None.
    group_col (Optional[str], optional): Name of the group column, if any. Defaults to None.
    role_col (Optional[str], optional): Name of the role column, if any. Defaults to None.
    inplace (Optional[bool], optional): If True, `df` itself is sorted and renamed and None is returned. Defaults to True.
    sort (Union[bool, str], optional): If True, the rows are sorted by case, timestamp and activity. If a column name is given, that column is used as an additional sort key right after the case. If False, the row order is kept. Defaults to True.

  Returns:
    The formatted DataFrame if `inplace` is False, otherwise None.
  """
  explicit_map = {
    case_col: EVENTLOG_CASE,
    activity_col: EVENTLOG_ACTIVITY,
    time_col: EVENTLOG_TIMESTAMP
  }
  if resource_col is not None:
    explicit_map[resource_col] = EVENTLOG_RESOURCE
  if group_col is not None:
    explicit_map[group_col] = EVENTLOG_GROUP
  if role_col is not None:
    explicit_map[role_col] = EVENTLOG_ROLE

  if sort and isinstance(sort, str):
    sort_cols = [case_col, sort, time_col, activity_col]
  else:
    sort_cols = [case_col, time_col, activity_col]

  case_attrs = df_find_case_attributes(df, case_col)
  event_attrs = df_find_event_attributes(df, case_col)

  derived_map = {attr: f"{EVENTLOG_CASE_PREFIX}{attr}" for attr in case_attrs if not attr.startswith(EVENTLOG_CASE_PREFIX)}
  derived_map |= {attr: f"{attr.replace(EVENTLOG_CASE_PREFIX, '', 1)}" for attr in event_attrs if attr.startswith(EVENTLOG_CASE_PREFIX)}

  # The right-hand operand wins on key clashes: explicit mappings override derived ones
  col_map = derived_map | explicit_map

  # Sorting uses the original column names, so it has to happen before the rename
  if sort and inplace:
    df.sort_values(by=sort_cols, inplace=inplace, ignore_index=True)
  elif sort and not inplace:
    df = df.sort_values(by=sort_cols, inplace=inplace, ignore_index=True)

  return df.rename(columns=col_map, inplace=inplace)

def df_write_files(df: pd.DataFrame, filename: str, index: bool = False, skip_xes: bool = True) -> None:
  """
  Writes the DataFrame to `<filename>.csv`, `<filename>.pkl.gz` and, on a best-effort basis, `<filename>.feather`. Optionally it is also exported to `<filename>.xes` via pm4py.

  Parameters:
    df (pd.DataFrame): The DataFrame to write.
    filename (str): The target path without file extension; the extensions are appended per format.
    index (bool, optional): Whether the index is written to the CSV file. Only the CSV export uses this flag. Defaults to False.
    skip_xes (bool, optional): If True, no XES file is written. Defaults to True.

  Returns:
    None
  """
  df.to_csv(f"{filename}.csv", index=index)
  df.to_pickle(f"{filename}.pkl.gz")
  try:
    if isinstance(df.columns, pd.MultiIndex):
      # Flatten the column MultiIndex on a copy so that the caller's DataFrame stays unchanged
      df = df.copy()
      df.columns = df.columns.to_flat_index()
    # The index is moved into regular columns before the feather export
    df.reset_index().to_feather(f"{filename}.feather")
  except Exception as e:
    # The feather export is best effort: any failure is reported and skipped
    print(f"Skipping feather: {e}")
  if not skip_xes:
    # NOTE: if the columns were flattened above, the flattened copy is what gets exported here
    pm4py.write_xes(df, f"{filename}.xes")

def df_datetime_to_numeric(df: pd.DataFrame, cols: Optional[Union[List[str], str]] = None, convert_datetime: Optional[bool] = True, convert_timedelta: Optional[bool] = True) -> pd.DataFrame:
  """
  Converts datetime and timedelta columns in a DataFrame to a numeric representation. Datetime columns (timezone-naive and timezone-aware) are converted to UNIX timestamps in seconds, and timedelta columns are converted to total seconds. Missing values become NaN.

  Timezone-naive datetimes are interpreted as UTC.

  Parameters:
    df (pd.DataFrame): The DataFrame with columns to be converted. It is not modified.
    cols (Optional[Union[List[str], str]], optional): Columns to be converted. If None, all columns are considered. It can be a single column name or a list of column names. Columns of other dtypes are left unchanged. Defaults to None.
    convert_datetime (Optional[bool], optional): Flag indicating whether to convert datetime columns. Defaults to True.
    convert_timedelta (Optional[bool], optional): Flag indicating whether to convert timedelta columns. Defaults to True.

  Returns:
    pd.DataFrame: A new DataFrame with the specified datetime and timedelta columns converted to numeric values.
  """
  df = df.copy()
  if cols is None:
    cols = df.columns.to_list()
  elif isinstance(cols, str):
    cols = [cols]

  if convert_datetime:
    epoch = pd.Timestamp(0)
    # 'datetime' only matches timezone-naive columns; 'datetimetz' adds the timezone-aware ones
    for col in df.select_dtypes(include=['datetime', 'datetimetz']).columns.to_list():
      if col in cols:
        stamps = df[col]
        if stamps.dt.tz is not None:
          # Express aware timestamps as naive UTC wall time so the epoch can be subtracted
          stamps = stamps.dt.tz_convert('UTC').dt.tz_localize(None)
        df[col] = (stamps - epoch).dt.total_seconds()

  if convert_timedelta:
    for col in df.select_dtypes(include='timedelta').columns.to_list():
      if col in cols:
        df[col] = df[col].dt.total_seconds()

  return df

def df_timedelta_to_unit(df: pd.DataFrame, timedelta_col: str, unit: Optional[Literal['days', "day", "d", "hours", "hour", "hr", "h", "m", "minute", "min", "minutes", "t", "s", "seconds", "sec", "second"]], floor: bool = False, na_token: Optional[pd.Timedelta] = pd.NA) -> pd.DataFrame:
  """
  Converts a timedelta column to a number expressed in the given time unit. The DataFrame is modified in place.

  Parameters:
    df (pd.DataFrame): The DataFrame holding the timedelta column.
    timedelta_col (str): Name of the timedelta column to convert.
    unit (str): Target unit: days ('days', 'day', 'd'), hours ('hours', 'hour', 'hr', 'h'), minutes ('m', 'minute', 'min', 'minutes', 't') or seconds ('s', 'seconds', 'sec', 'second'). Matching is case-insensitive.
    floor (bool, optional): If True, the result is rounded down to whole units and stored as nullable 'Int64'. Defaults to False.
    na_token (Optional[pd.Timedelta], optional): Replacement for missing durations, applied before the conversion. Missing values are kept if it is NA. Defaults to pd.NA.

  Returns:
    pd.DataFrame: The same DataFrame with the converted column.

  Raises:
    ValueError: If `unit` is not one of the supported units. The DataFrame is left untouched in that case.
  """
  seconds_per_unit = {
    **dict.fromkeys(("days", "day", "d"), 60 * 60 * 24),
    **dict.fromkeys(("hours", "hour", "hr", "h"), 60 * 60),
    **dict.fromkeys(("m", "minute", "min", "minutes", "t"), 60),
    **dict.fromkeys(("s", "seconds", "sec", "second"), 1),
  }
  unit_key = unit.lower() if isinstance(unit, str) else unit
  if unit_key not in seconds_per_unit:
    raise ValueError(f"Invalid timedelta unit {unit}")

  durations = df[timedelta_col]
  if not pd.isna(na_token):
    # Assign the result back: an in-place fillna on a column selection does not
    # reliably write through to the DataFrame (copy-on-write)
    durations = durations.fillna(pd.Timedelta(na_token))

  values = durations.dt.total_seconds() / seconds_per_unit[unit_key]

  if floor:
    # Round down explicitly; casting fractional floats straight to Int64 raises
    values = np.floor(values).astype('Int64')

  df[timedelta_col] = values
  return df

def df_to_multiindex(df: pd.DataFrame, case_col: str = EVENTLOG_CASE) -> pd.DataFrame:
  """
  Re-indexes an event log by case and by the position of each event within its case, and adds a second column level.

  Parameters:
    df (pd.DataFrame): The event log. It is not modified.
    case_col (str, optional): Name of the case identifier column used for grouping. Defaults to `EVENTLOG_CASE`.

  Returns:
    pd.DataFrame: A new DataFrame whose rows result from grouping by case and renumbering the events of each case from 0, and whose columns form a two-level MultiIndex with 0 as the second level of every column.
  """
  def _renumber_events(case_events):
    return case_events.reset_index(drop=True)

  per_case = df.copy().groupby(case_col).apply(_renumber_events)
  second_level = np.zeros(len(per_case.columns), dtype=int)
  per_case.columns = pd.MultiIndex.from_arrays([per_case.columns, second_level])
  return per_case

def df_to_flatindex(df: pd.DataFrame):
  """
  Returns a copy of the DataFrame with single-level column names and the index moved into regular columns.

  Parameters:
    df (pd.DataFrame): The DataFrame to flatten. It is not modified.

  Returns:
    pd.DataFrame: A new DataFrame. Multi-level column labels are joined with '_' (each level converted to a string); single-level columns are kept as they are. The index is reset in both cases.
  """
  flat = df.copy()
  if flat.columns.nlevels > 1:
    joined = []
    for label in flat.columns.to_flat_index():
      joined.append("_".join(str(part) for part in label))
    flat.columns = joined
  return flat.reset_index()

def np_filter_na(arr: np.ndarray, unsqueeze: bool = False) -> np.ndarray:
  """
  Removes the missing entries from every element of an array of arrays.

  Parameters:
    arr (np.ndarray): An iterable of arrays; each element is filtered on its own.
    unsqueeze (bool, optional): If True, a trailing axis of length 1 is appended to every filtered element. Defaults to False.

  Returns:
    np.ndarray: An array of dtype 'object' built from the filtered elements.
  """
  def _present(element):
    kept = element[~pd.isna(element)]
    if unsqueeze:
      kept = np.expand_dims(kept, -1)
    return kept

  filtered = [_present(element) for element in arr]
  return np.array(filtered, dtype='object')


### Labeling Functions

In [None]:
def df_label_next(df: pd.DataFrame, case_col: str, next_col: str, label_col: str, eoc_token: Optional[str] = None) -> pd.DataFrame:
  """
  Returns a copy of the DataFrame with a label column holding, for every event, the value that `next_col` takes in the following event of the same case. The rows are expected to be in event order within each case.

  Parameters:
    df (pd.DataFrame): The event log. It is not modified.
    case_col (str): Name of the case identifier column.
    next_col (str): Name of the column whose next value is used as the label.
    label_col (str): Name of the label column to create.
    eoc_token (Optional[str], optional): If given, the label column is cast to the 'string' dtype and missing labels (the last event of each case, and missing next values) are replaced by this token. Defaults to None.

  Returns:
    pd.DataFrame: A new DataFrame with the added label column.
  """
  df = df.copy()
  # Vectorised per-case shift; avoids calling a Python function once per case
  df[label_col] = df.groupby(case_col)[next_col].shift(-1)
  if eoc_token is not None:
    df[label_col] = df[label_col].astype("string").fillna(eoc_token)
  return df

def df_label_next_activity(df: pd.DataFrame, case_col: str = EVENTLOG_CASE, activity_col: str = EVENTLOG_ACTIVITY, label_col: str = f"{EVENTLOG_LABEL_PREFIX}concept:name:next", eoc_token: Optional[str] = None) -> pd.DataFrame:
  """
  Labels every event with the activity of the following event in the same case. Thin wrapper around `df_label_next`.

  Parameters:
    df (pd.DataFrame): The event log.
    case_col (str, optional): Name of the case identifier column. Defaults to `EVENTLOG_CASE`.
    activity_col (str, optional): Name of the activity column. Defaults to `EVENTLOG_ACTIVITY`.
    label_col (str, optional): Name of the label column to create.
    eoc_token (Optional[str], optional): Token passed on to `df_label_next` for events without a successor. Defaults to None.

  Returns:
    pd.DataFrame: The result of `df_label_next`.
  """
  labelled = df_label_next(df, case_col=case_col, next_col=activity_col, label_col=label_col, eoc_token=eoc_token)
  return labelled

def df_label_next_resource(df: pd.DataFrame, case_col: str = EVENTLOG_CASE, resource_col: str = EVENTLOG_RESOURCE, label_col: str = f"{EVENTLOG_LABEL_PREFIX}org:resource:next", eoc_token: Optional[str] = None) -> pd.DataFrame:
  """
  Labels every event with the resource of the following event in the same case. Thin wrapper around `df_label_next`.

  Parameters:
    df (pd.DataFrame): The event log.
    case_col (str, optional): Name of the case identifier column. Defaults to `EVENTLOG_CASE`.
    resource_col (str, optional): Name of the resource column. Defaults to `EVENTLOG_RESOURCE`.
    label_col (str, optional): Name of the label column to create.
    eoc_token (Optional[str], optional): Token passed on to `df_label_next` for events without a successor. Defaults to None.

  Returns:
    pd.DataFrame: The result of `df_label_next`.
  """
  labelled = df_label_next(df, case_col=case_col, next_col=resource_col, label_col=label_col, eoc_token=eoc_token)
  return labelled

def df_label_next_group(df: pd.DataFrame, case_col: str = EVENTLOG_CASE, group_col: str = EVENTLOG_GROUP, label_col: str = f"{EVENTLOG_LABEL_PREFIX}org:group:next", eoc_token: Optional[str] = None) -> pd.DataFrame:
  """
  Labels every event with the group of the following event in the same case. Thin wrapper around `df_label_next`.

  Parameters:
    df (pd.DataFrame): The event log.
    case_col (str, optional): Name of the case identifier column. Defaults to `EVENTLOG_CASE`.
    group_col (str, optional): Name of the group column. Defaults to `EVENTLOG_GROUP`.
    label_col (str, optional): Name of the label column to create.
    eoc_token (Optional[str], optional): Token passed on to `df_label_next` for events without a successor. Defaults to None.

  Returns:
    pd.DataFrame: The result of `df_label_next`.
  """
  labelled = df_label_next(df, case_col=case_col, next_col=group_col, label_col=label_col, eoc_token=eoc_token)
  return labelled

def df_label_activity_duration(df: pd.DataFrame, case_col: str = EVENTLOG_CASE, time_col: str = EVENTLOG_TIMESTAMP, label_col: str = f"{EVENTLOG_LABEL_PREFIX}time:timestamp:next", unit: Optional[Literal["days", "day", "d", "hours", "hour", "hr", "h", "m", "minute", "min", "minutes", "t", "s", "seconds", "sec", "second"]] = None, eoc_token: Optional[pd.Timedelta] = None) -> pd.DataFrame:
  """
  Returns a copy of the DataFrame with a label column holding the time from each event to the following event of the same case. The rows are expected to be in event order within each case.

  Parameters:
    df (pd.DataFrame): The event log. It is not modified.
    case_col (str, optional): Name of the case identifier column. Defaults to `EVENTLOG_CASE`.
    time_col (str, optional): Name of the timestamp column. Defaults to `EVENTLOG_TIMESTAMP`.
    label_col (str, optional): Name of the label column to create.
    unit (str, optional): If given, the label is converted from a timedelta to a number in this unit via `df_timedelta_to_unit`. The unit is lower-cased first, as in `df_label_remaining_cycle_time`. Defaults to None.
    eoc_token (Optional[pd.Timedelta], optional): If given, replaces the missing label of the last event of each case. Defaults to None.

  Returns:
    pd.DataFrame: A new DataFrame with the added label column.
  """
  df = df.copy()
  if isinstance(unit, str):
    unit = unit.lower()

  # Vectorised per-case shift; avoids calling a Python function once per case
  next_time = df.groupby(case_col)[time_col].shift(-1)
  df[label_col] = next_time - df[time_col]

  if eoc_token is not None:
    df[label_col] = df[label_col].fillna(eoc_token)

  if unit is not None:
    df = df_timedelta_to_unit(df, label_col, unit)

  return df

def df_label_remaining_cycle_time(df: pd.DataFrame, case_col: str = EVENTLOG_CASE, time_col: str = EVENTLOG_TIMESTAMP, label_col: str = f"{EVENTLOG_LABEL_PREFIX}time:timestamp:last", unit: Optional[Literal["days", "day", "d", "hours", "hour", "hr", "h", "m", "minute", "min", "minutes", "t", "s", "seconds", "sec", "second"]] = None) -> pd.DataFrame:
  """
  Labels each event with the time remaining until the last timestamp of its case.

  Parameters:
    df (pd.DataFrame): The event log; it is copied, not modified.
    case_col (str, optional): Column identifying the case.
    time_col (str, optional): Column holding the event timestamps.
    label_col (str, optional): Name of the label column that is written.
    unit (optional): If given, it is lower-cased and the label column is passed through `df_timedelta_to_unit` with it.

  Returns:
    pd.DataFrame: A copy of `df` with `label_col` added or overwritten.
  """
  labeled = df.copy()
  unit_name = unit.lower() if isinstance(unit, str) else unit

  # 'last' takes the last value in row order per case, so the log is presumably sorted by time within each case.
  case_end = labeled.groupby(case_col)[time_col].transform('last')
  labeled[label_col] = case_end - labeled[time_col]

  if unit_name is None:
    return labeled

  return df_timedelta_to_unit(labeled, label_col, unit_name)



### Feature Extraction Functions

In [None]:
def df_extract_activity_duration(df: pd.DataFrame, feat_col: str, previous_event_count: int = 1, unit: Optional[str] = None, absolute: bool = True, time_col: str = EVENTLOG_TIMESTAMP, case_col: str = EVENTLOG_CASE, na_token: Optional[Union[pd.Timedelta, int]] = pd.NA) -> pd.DataFrame:
  """
  Extracts the time elapsed between each event and an earlier event of the same case.

  Parameters:
    df (pd.DataFrame): The event log; it is copied, not modified.
    feat_col (str): Name of the feature column that is written.
    previous_event_count (int, optional): How many events to look back within the case (default is 1, the directly preceding event).
    unit (Optional[str], optional): If given, the feature column is passed through `df_timedelta_to_unit` with this unit.
    absolute (bool, optional): If True, the absolute value of the duration is stored.
    time_col (str, optional): Column holding the event timestamps.
    case_col (str, optional): Column identifying the case.
    na_token (optional): Replacement for missing durations (events without a predecessor). It is converted with `pd.Timedelta`; a missing value (the default) leaves the gaps untouched.

  Returns:
    pd.DataFrame: A copy of `df` with `feat_col` added or overwritten.
  """
  df = df.copy()

  df[feat_col] = df[time_col] - df.groupby(case_col)[time_col].transform(lambda x: x.shift(previous_event_count))

  if not pd.isna(na_token):
    na_token = pd.Timedelta(na_token)
    # Assign the result back: an in-place fillna on `df[feat_col]` is a chained
    # operation that is not written to `df` when Copy-on-Write is active.
    df[feat_col] = df[feat_col].fillna(na_token)

  if absolute:
    df[feat_col] = df[feat_col].abs()

  if unit is not None:
    df = df_timedelta_to_unit(df, feat_col, unit, na_token=na_token)

  return df

def df_extract_elapsed_cycle_time(df: pd.DataFrame, feat_col: str, unit: str = None, time_col: str = EVENTLOG_TIMESTAMP, case_col: str = EVENTLOG_CASE) -> pd.DataFrame:
  """
  Extracts the time elapsed between the first timestamp of a case and each of its events.

  Parameters:
    df (pd.DataFrame): The event log; it is copied, not modified.
    feat_col (str): Name of the feature column that is written.
    unit (str, optional): If given, the feature column is passed through `df_timedelta_to_unit` with this unit.
    time_col (str, optional): Column holding the event timestamps.
    case_col (str, optional): Column identifying the case.

  Returns:
    pd.DataFrame: A copy of `df` with `feat_col` added or overwritten.
  """
  result = df.copy()
  # 'first' takes the first value in row order per case, so the log is presumably sorted by time within each case.
  case_start = result.groupby(case_col)[time_col].transform('first')
  result[feat_col] = result[time_col] - case_start

  if unit is None:
    return result

  return df_timedelta_to_unit(result, feat_col, unit)

def df_extract_time_feats_tax(df: pd.DataFrame, time_col: str = EVENTLOG_TIMESTAMP, case_col: str = EVENTLOG_CASE, drop: bool = True) -> pd.DataFrame:
  """
  Extracts four time features per event: time since case start, time since the previous event, time of day and day of week.
  (The `_tax` suffix presumably refers to the feature set of Tax et al. - verify.)

  Parameters:
    df (pd.DataFrame): The event log; it is copied, not modified.
    time_col (str, optional): Column holding the event timestamps.
    case_col (str, optional): Column identifying the case; used for both case-relative features.
    drop (bool, optional): If True, `time_col` is removed from the result.

  Returns:
    pd.DataFrame: A copy of `df` with the four feature columns, each named `time_col` plus the respective `EVENTLOG_FEAT_TIME_*` suffix.
  """
  elapsed_cycle_col = f"{time_col}{EVENTLOG_FEAT_TIME_ELAPSED_CYCLE_SUFFIX}"
  elapsed_prev_col = f"{time_col}{EVENTLOG_FEAT_TIME_ELAPSED_PREV_SUFFIX}"
  time_of_day_col = f"{time_col}{EVENTLOG_FEAT_TIME_OF_DAY_SUFFIX}"
  time_of_week_col = f"{time_col}{EVENTLOG_FEAT_TIME_OF_WEEK_SUFFIX}"

  df = df.copy()
  df = df_extract_elapsed_cycle_time(df, feat_col=elapsed_cycle_col, time_col=time_col, case_col=case_col, unit='s')
  # case_col has to be forwarded here as well; otherwise a custom case column
  # is silently replaced by the helper's default.
  df = df_extract_activity_duration(df, feat_col=elapsed_prev_col, time_col=time_col, case_col=case_col, unit='s')
  # The first event of a case has no predecessor; treat its duration as 0.
  df[elapsed_prev_col] = df[elapsed_prev_col].fillna(0)

  df = df_extract_time_of_day(df, feat_col=time_of_day_col, time_col=time_col, unit='s')
  df = df_extract_day_of_week(df, feat_col=time_of_week_col, time_col=time_col)

  # Scale time of day by the number of seconds per day and the weekday number by 7.
  df[time_of_day_col] = df[time_of_day_col] / (24 * 60 * 60)
  df[time_of_week_col] = df[time_of_week_col] / 7

  if drop:
    df.drop(columns=time_col, inplace=True)

  return df

def df_extract_time_feats(df: pd.DataFrame, time_col: str = EVENTLOG_TIMESTAMP, drop: bool = True):
  """
  Extracts relative calendar features (hour of day, day of week, day of month, month of year) from `time_col`.

  Parameters:
    df (pd.DataFrame): The event log; it is copied, not modified.
    time_col (str, optional): Column holding the event timestamps.
    drop (bool, optional): If True, `time_col` is removed from the result.

  Returns:
    pd.DataFrame: A copy of `df` with one feature column per extractor, each named `time_col` plus the respective `EVENTLOG_FEAT_TIME_OF_*` suffix.
  """
  # Applied in this order; every extractor is called with relative=True.
  extractors = (
    (df_extract_hour_of_day, EVENTLOG_FEAT_TIME_OF_DAY_SUFFIX),
    (df_extract_day_of_week, EVENTLOG_FEAT_TIME_OF_WEEK_SUFFIX),
    (df_extract_day_of_month, EVENTLOG_FEAT_TIME_OF_MONTH_SUFFIX),
    (df_extract_month_of_year, EVENTLOG_FEAT_TIME_OF_YEAR_SUFFIX),
  )

  result = df.copy()
  for extract, suffix in extractors:
    result = extract(result, feat_col=f"{time_col}{suffix}", time_col=time_col, relative=True)

  if drop:
    result = result.drop(columns=time_col)

  return result

def df_extract_time_of_day(df: pd.DataFrame, feat_col: str, unit: str = None, time_col: str = EVENTLOG_TIMESTAMP) -> pd.DataFrame:
  """
  Extracts the time elapsed since midnight of the same day for each timestamp.

  Parameters:
    df (pd.DataFrame): The event log; it is copied, not modified.
    feat_col (str): Name of the feature column that is written.
    unit (str, optional): If given, the feature column is passed through `df_timedelta_to_unit` with this unit.
    time_col (str, optional): Column holding the event timestamps.

  Returns:
    pd.DataFrame: A copy of `df` with `feat_col` added or overwritten.
  """
  result = df.copy()
  timestamps = result[time_col]
  midnight = timestamps.dt.normalize()
  result[feat_col] = timestamps - midnight

  if unit is None:
    return result

  return df_timedelta_to_unit(result, feat_col, unit)

def df_extract_time_of_week(df: pd.DataFrame, feat_col: str, unit: str = None, time_col: str = EVENTLOG_TIMESTAMP) -> pd.DataFrame:
  """
  Extracts the time elapsed since the start of the week (Monday 00:00) for each timestamp.

  Parameters:
    df (pd.DataFrame): The event log; it is copied, not modified.
    feat_col (str): Name of the feature column that is written.
    unit (str, optional): If given, the feature column is passed through `df_timedelta_to_unit` with this unit.
    time_col (str, optional): Column holding the event timestamps.

  Returns:
    pd.DataFrame: A copy of `df` with `feat_col` added or overwritten; the column holds timedeltas unless `unit` is given.
  """
  df = df.copy()
  timestamps = df[time_col]

  # `weekday` is 0 for Monday, so midnight minus `weekday` days is the week start.
  # Subtracting only the day offset from the timestamp (without normalizing)
  # would yield a datetime instead of the elapsed timedelta.
  week_start = timestamps.dt.normalize() - (timestamps.dt.weekday * np.timedelta64(1, 'D'))
  df[feat_col] = timestamps - week_start

  if unit is not None:
    df = df_timedelta_to_unit(df, feat_col, unit)

  return df

def df_extract_time_of_month(df: pd.DataFrame, feat_col: str, unit: str = None, time_col: str = EVENTLOG_TIMESTAMP) -> pd.DataFrame:
  """
  Extracts the time elapsed since the start of the month (first day, 00:00) for each timestamp.

  Parameters:
    df (pd.DataFrame): The event log; it is copied, not modified.
    feat_col (str): Name of the feature column that is written.
    unit (str, optional): If given, the feature column is passed through `df_timedelta_to_unit` with this unit.
    time_col (str, optional): Column holding the event timestamps.

  Returns:
    pd.DataFrame: A copy of `df` with `feat_col` added or overwritten; the column holds timedeltas unless `unit` is given.
  """
  df = df.copy()
  timestamps = df[time_col]

  # `day` is 1-based, so the month starts `day - 1` days before today's midnight.
  # Subtracting only the day offset from the timestamp (without normalizing)
  # would yield a datetime instead of the elapsed timedelta.
  month_start = timestamps.dt.normalize() - ((timestamps.dt.day - 1) * np.timedelta64(1, 'D'))
  df[feat_col] = timestamps - month_start

  if unit is not None:
    df = df_timedelta_to_unit(df, feat_col, unit)

  return df

def df_extract_time_of_year(df: pd.DataFrame, feat_col: str, unit: str = None, time_col: str = EVENTLOG_TIMESTAMP) -> pd.DataFrame:
  """
  Extracts the time elapsed since the start of the year (January 1st, 00:00) for each timestamp.

  Parameters:
    df (pd.DataFrame): The event log; it is copied, not modified.
    feat_col (str): Name of the feature column that is written.
    unit (str, optional): If given, the feature column is passed through `df_timedelta_to_unit` with this unit.
    time_col (str, optional): Column holding the event timestamps.

  Returns:
    pd.DataFrame: A copy of `df` with `feat_col` added or overwritten; the column holds timedeltas unless `unit` is given.
  """
  df = df.copy()
  timestamps = df[time_col]

  # `dayofyear` is 1-based, so the year starts `dayofyear - 1` days before today's midnight.
  # Subtracting only the day offset from the timestamp (without normalizing)
  # would yield a datetime instead of the elapsed timedelta.
  year_start = timestamps.dt.normalize() - ((timestamps.dt.dayofyear - 1) * np.timedelta64(1, 'D'))
  df[feat_col] = timestamps - year_start

  if unit is not None:
    df = df_timedelta_to_unit(df, feat_col, unit)

  return df

def df_extract_hour_of_day(df: pd.DataFrame, feat_col: str, time_col: str = EVENTLOG_TIMESTAMP, relative: bool = False) -> pd.DataFrame:
  """
  Extracts the 1-based hour of the day (1-24) from `time_col`.

  Parameters:
    df (pd.DataFrame): The event log; it is copied, not modified.
    feat_col (str): Name of the feature column that is written.
    time_col (str, optional): Column holding the event timestamps.
    relative (bool, optional): If True, the hour number is divided by 24.

  Returns:
    pd.DataFrame: A copy of `df` with `feat_col` added or overwritten.
  """
  result = df.copy()
  hour_number = result[time_col].dt.hour + 1
  result[feat_col] = hour_number / 24 if relative else hour_number
  return result

def df_extract_day_of_week(df: pd.DataFrame, feat_col: str, time_col: str = EVENTLOG_TIMESTAMP, relative: bool = False) -> pd.DataFrame:
  """
  Extracts the 1-based day of the week (Monday = 1, Sunday = 7) from `time_col`.

  Parameters:
    df (pd.DataFrame): The event log; it is copied, not modified.
    feat_col (str): Name of the feature column that is written.
    time_col (str, optional): Column holding the event timestamps.
    relative (bool, optional): If True, the day number is divided by 7.

  Returns:
    pd.DataFrame: A copy of `df` with `feat_col` added or overwritten.
  """
  result = df.copy()
  weekday_number = result[time_col].dt.day_of_week + 1
  result[feat_col] = weekday_number / 7 if relative else weekday_number
  return result

def df_extract_day_of_month(df: pd.DataFrame, feat_col: str, time_col: str = EVENTLOG_TIMESTAMP, relative: bool = False) -> pd.DataFrame:
  """
  Extracts the day of the month from `time_col`.

  Parameters:
    df (pd.DataFrame): The event log; it is copied, not modified.
    feat_col (str): Name of the feature column that is written.
    time_col (str, optional): Column holding the event timestamps.
    relative (bool, optional): If True, the day is divided by the number of days of the respective month.

  Returns:
    pd.DataFrame: A copy of `df` with `feat_col` added or overwritten.
  """
  result = df.copy()
  result[feat_col] = result[time_col].dt.day

  if not relative:
    return result

  result[feat_col] = result[feat_col] / result[time_col].dt.days_in_month
  return result

def df_extract_day_of_year(df: pd.DataFrame, feat_col: str, time_col: str = EVENTLOG_TIMESTAMP, relative: bool = False) -> pd.DataFrame:
  """
  Extracts the day of the year from `time_col`.

  Parameters:
    df (pd.DataFrame): The event log; it is copied, not modified.
    feat_col (str): Name of the feature column that is written.
    time_col (str, optional): Column holding the event timestamps.
    relative (bool, optional): If True, the day is divided by the length of the respective year (366 for leap years, 365 otherwise).

  Returns:
    pd.DataFrame: A copy of `df` with `feat_col` added or overwritten.
  """
  df = df.copy()
  timestamps = df[time_col]
  day_of_year = timestamps.dt.day_of_year

  if relative:
    # Divide in one vectorised step; writing the float quotients into the
    # integer column through masked `.loc` assignments is an incompatible-dtype
    # setitem that pandas has deprecated.
    days_in_year = np.where(timestamps.dt.is_leap_year, 366, 365)
    day_of_year = day_of_year / days_in_year

  df[feat_col] = day_of_year
  return df

def df_extract_month_of_year(df: pd.DataFrame, feat_col: str, time_col: str = EVENTLOG_TIMESTAMP, relative: bool = False) -> pd.DataFrame:
  """
  Extracts the month number (1-12) from `time_col`.

  Parameters:
    df (pd.DataFrame): The event log; it is copied, not modified.
    feat_col (str): Name of the feature column that is written.
    time_col (str, optional): Column holding the event timestamps.
    relative (bool, optional): If True, the month number is divided by 12.

  Returns:
    pd.DataFrame: A copy of `df` with `feat_col` added or overwritten.
  """
  result = df.copy()
  month_number = result[time_col].dt.month
  result[feat_col] = month_number / 12 if relative else month_number
  return result

def df_extract_has_value(df: pd.DataFrame, feat_col: str, replace: bool = True) -> pd.DataFrame:
  """
  Derives a nullable-boolean indicator telling whether `feat_col` holds a value.

  Parameters:
    df (pd.DataFrame): The input frame; it is copied, not modified.
    feat_col (str): Column that is checked for missing values.
    replace (bool, optional): If True, `feat_col` itself is overwritten by the indicator; otherwise the indicator is stored in a new column named `has_<feat_col>`.

  Returns:
    pd.DataFrame: A copy of `df` containing the indicator column.
  """
  result = df.copy()
  target_col = feat_col if replace else f"has_{feat_col}"
  result[target_col] = result[feat_col].notna().astype('boolean')
  return result


In [None]:
class MeanScaler(sl.base.BaseEstimator, sl.base.TransformerMixin):
  """
  Scales data by dividing it by the mean of all non-missing values seen while fitting.

  A single global mean is computed over every cell of the input (not one mean per column).
  The mean can be updated incrementally with `partial_fit`.

  Attributes (set by fitting):
    mean_: Mean of all non-missing values seen so far.
    seen_: Number of non-missing values that contributed to `mean_`.
    is_fitted_: Whether the scaler has been fitted.
  """
  def __init__(self):
    self.is_fitted_ = False

  def reset(self):
    """Discards the fitted state, if any, and returns the scaler itself."""
    if hasattr(self, "mean_"):
      del self.mean_
      del self.seen_
      self.is_fitted_ = False

    return self

  def fit(self, X, y=None):
    """Fits the scaler on `X` from scratch; `y` is ignored and only accepted for scikit-learn API compatibility."""
    self.reset()
    self.partial_fit(X)
    return self

  def partial_fit(self, X, y=None):
    """Updates the running mean with the non-missing values of `X`; `y` is ignored."""
    # Count only the values np.nanmean actually averages. Counting every cell
    # (including NaN) would give batches containing NaN too much weight.
    sample_seen = int(np.count_nonzero(~np.asarray(pd.isna(X))))
    sample_mean = np.nanmean(X) if sample_seen > 0 else np.nan

    if hasattr(self, "mean_") and self.seen_ > 0:
      # A batch without any valid value carries no information; keep the state.
      if sample_seen > 0:
        total_seen = self.seen_ + sample_seen
        self.mean_ = ((self.mean_ * self.seen_) + (sample_mean * sample_seen)) / total_seen
        self.seen_ = total_seen
    else:
      self.mean_ = sample_mean
      self.seen_ = sample_seen

    self.is_fitted_ = True
    return self

  def transform(self, X):
    """Returns `X` divided by the fitted mean. Raises ValueError if the scaler is not fitted."""
    if not self.is_fitted_:
      raise ValueError("Transformer must be fitted.")

    return np.divide(X, self.mean_)

  def fit_transform(self, X, y=None):
    """Fits the scaler on `X` and returns the scaled `X`; `y` is ignored."""
    self.fit(X)
    return self.transform(X)

  def inverse_transform(self, X):
    """Returns `X` multiplied by the fitted mean. Raises ValueError if the scaler is not fitted."""
    if not self.is_fitted_:
      raise ValueError("Transformer must be fitted.")

    return np.multiply(X, self.mean_)

def df_transform_log(df: pd.DataFrame, cols: Optional[Union[str, List[str]]] = None, base: Union[Literal['e', '2', '10', '1p'], int] = 'e', inf_token: Optional[float] = None, na_token: float = pd.NA) -> pd.DataFrame:
  """
  Applies a logarithm to the given columns.

  Parameters:
    df (pd.DataFrame): The input frame; it is copied, not modified.
    cols (Optional[Union[str, List[str]]], optional): Column(s) to transform; all columns if None.
    base (Union[Literal['e', '2', '10', '1p'], int], optional): 'e', '2' and '10' select the respective logarithm, '1p' selects `np.log1p`, and a positive integer selects a logarithm of that base.
    inf_token (Optional[float], optional): If given, replaces positive and negative infinity in the result.
    na_token (float, optional): Fill value for missing results.

  Returns:
    pd.DataFrame: A copy of `df` with the selected columns transformed.

  Raises:
    ValueError: If `base` is none of the supported values.
  """
  df = df.copy()
  if cols is None:
    cols = df.columns.to_list()
  elif isinstance(cols, str):
    cols = [cols]

  named_logs = {'e': np.log, '2': np.log2, '10': np.log10, '1p': np.log1p}
  if isinstance(base, str) and base in named_logs:
    func = named_logs[base]
  elif isinstance(base, int) and base > 0:
    # Change of base: log_b(x) = ln(x) / ln(b)
    func = lambda s: np.log(s) / np.log(base)
  else:
    raise ValueError(f"Invalid log base {base}")

  for col in cols:
    transformed = df[col].transform(func).fillna(na_token)
    if inf_token is not None:
      # Assign the result instead of replacing in place on `df[col]`; the chained
      # in-place call is not written to `df` when Copy-on-Write is active.
      transformed = transformed.replace([np.inf, -np.inf], inf_token)
    df[col] = transformed

  return df

def df_transform_minmax(df_train: pd.DataFrame, df_test: Optional[pd.DataFrame] = None, cols: Optional[Union[str, List[str]]] = None, feature_range: Tuple[float, float] = (0,1), fix_feature_range: bool = False, na_token: float = pd.NA) -> Union[pd.DataFrame, Tuple[pd.DataFrame, pd.DataFrame]]:
  """
  Min-max scales the given columns; the scaler is fitted on the training data only.

  Note: `df_train` and `df_test` are modified in place and also returned.

  Parameters:
    df_train (pd.DataFrame): Training data the scaler is fitted on.
    df_test (Optional[pd.DataFrame], optional): Test data that is scaled with the scaler fitted on `df_train`.
    cols (Optional[Union[str, List[str]]], optional): Column(s) to scale; all columns of `df_train` if None.
    feature_range (Tuple[float, float], optional): Target range passed to `MinMaxScaler`.
    fix_feature_range (bool, optional): Passed as `clip` to `MinMaxScaler`.
    na_token (float, optional): Fill value for missing values after scaling; a missing value (the default) leaves them untouched.

  Returns:
    Union[pd.DataFrame, Tuple[pd.DataFrame, pd.DataFrame]]: `df_train`, or `(df_train, df_test)` if `df_test` was given.
  """
  if cols is None:
    cols = df_train.columns.to_list()
  elif isinstance(cols, str):
    cols = [cols]

  scaler = sl.preprocessing.MinMaxScaler(
      feature_range=feature_range,
      clip=fix_feature_range
  )
  fill_missing = not pd.isna(na_token)

  df_train[cols] = scaler.fit_transform(df_train[cols])
  if fill_missing:
    # `df[cols]` is a temporary copy, so an in-place fillna on it never reaches
    # the frame; the filled columns have to be assigned back.
    df_train[cols] = df_train[cols].fillna(na_token)

  if df_test is not None:
    df_test[cols] = scaler.transform(df_test[cols])
    if fill_missing:
      df_test[cols] = df_test[cols].fillna(na_token)
    return df_train, df_test

  return df_train

def df_encode_label(df_train: pd.DataFrame, df_test: Optional[pd.DataFrame] = None, cols: Optional[Union[str, List[str]]] = None) -> Union[pd.DataFrame, Tuple[pd.DataFrame, pd.DataFrame]]:
  """
  Label-encodes the given columns; the encoder is fitted per column on the training data only.

  Note: `df_train` and `df_test` are modified in place and also returned.

  Parameters:
    df_train (pd.DataFrame): Training data the encoder is fitted on.
    df_test (Optional[pd.DataFrame], optional): Test data that is encoded with the encoder fitted on `df_train`.
    cols (Optional[Union[str, List[str]]], optional): Column(s) to encode; all columns of `df_train` if None.

  Returns:
    Union[pd.DataFrame, Tuple[pd.DataFrame, pd.DataFrame]]: `df_train`, or `(df_train, df_test)` if `df_test` was given.
  """
  if cols is None:
    target_cols = df_train.columns.to_list()
  elif isinstance(cols, str):
    target_cols = [cols]
  else:
    target_cols = cols

  # One encoder instance is refitted for every column; the test column is
  # transformed right after the fit, while that fit is still current.
  encoder = sl.preprocessing.LabelEncoder()
  has_test = df_test is not None

  for name in target_cols:
    df_train[name] = encoder.fit_transform(df_train[name])
    if has_test:
      df_test[name] = encoder.transform(df_test[name])

  return (df_train, df_test) if has_test else df_train

def df_encode_onehot(df: pd.DataFrame, cols: Optional[Union[str, List[str]]] = None, drop_first: bool = False, dummy_na: bool = True, prefix_sep: str = ":", remove_unused_cats: bool = True) -> pd.DataFrame:
  """
  One-hot encodes the given columns into sparse dummy columns.

  Note: with `remove_unused_cats`, categorical columns of the passed `df` are reassigned without their unused categories.

  Parameters:
    df (pd.DataFrame): The frame to encode.
    cols (Optional[Union[str, List[str]]], optional): Column(s) to encode; if None, the second list returned by `df_separate_categoricals` is used.
    drop_first (bool, optional): Passed to `pd.get_dummies`.
    dummy_na (bool, optional): Passed to `pd.get_dummies`.
    prefix_sep (str, optional): Separator between column name and category in the dummy column names.
    remove_unused_cats (bool, optional): If True, unused categories are removed from categorical columns before encoding.

  Returns:
    pd.DataFrame: The encoded frame.
  """
  if cols is None:
    _, target_cols = df_separate_categoricals(df)
  elif isinstance(cols, str):
    target_cols = [cols]
  else:
    target_cols = cols

  if remove_unused_cats:
    for name in target_cols:
      column = df[name]
      # A duplicated column label selects a DataFrame, which has no single dtype.
      if isinstance(column, pd.DataFrame):
        continue
      if isinstance(column.dtype, pd.CategoricalDtype):
        df[name] = column.cat.remove_unused_categories()

  return pd.get_dummies(
      df,
      columns=target_cols,
      prefix_sep=prefix_sep,
      dummy_na=dummy_na,
      drop_first=drop_first,
      sparse=True
  )

def df_encode_ordinal(df: pd.DataFrame, cols: Optional[Union[str, List[str]]] = None) -> pd.DataFrame:
  """
  Replaces categorical columns by their integer category codes, stored as a nullable integer dtype.

  Parameters:
    df (pd.DataFrame): The input frame; it is copied, not modified.
    cols (Optional[Union[str, List[str]]], optional): Categorical column(s) to encode; if None, the first list returned by `df_separate_categoricals` is used.

  Returns:
    pd.DataFrame: A copy of `df` with the selected columns replaced by their codes. Missing values keep the code -1 that pandas assigns to them.
  """
  df = df.copy()
  if cols is None:
    cols, _ = df_separate_categoricals(df)
  elif isinstance(cols, str):
    cols = [cols]

  for col in cols:
    codes = df[col].cat.codes
    # pandas sizes the code dtype by the number of categories (int8, int16, ...).
    # Use the matching nullable dtype ("Int8", "Int16", ...) rather than a fixed
    # Int8, which cannot represent codes above 127.
    df[col] = codes.astype(str(codes.dtype).capitalize())

  return df

### Statistic & Visualization Functions

In [None]:
def df_naive_regression_metrics(df_train: pd.DataFrame, df_test: pd.DataFrame, label_col: str, method: Literal['median', 'mean', 'mode'] = 'median'):
  """
  Computes regression metrics of a naive baseline that predicts one constant for every test sample.

  Parameters:
    df_train (pd.DataFrame): Training data the constant is derived from.
    df_test (pd.DataFrame): Test data the baseline is evaluated on.
    label_col (str): Name of the label column.
    method (Literal['median', 'mean', 'mode'], optional): Statistic of the training labels used as the prediction (default is 'median'). For 'mode', the first mode is used if there are several.

  Returns:
    dict: The metrics 'mae', 'mse', 'rmse', 'msle' and 'logcosh'.

  Raises:
    ValueError: If `method` is unknown.
  """
  if 'median' == method:
    y_pred = df_train[label_col].median()
  elif 'mean' == method:
    y_pred = df_train[label_col].mean()
  elif 'mode' == method:
    # mode() returns a Series (there can be several modes); reduce it to a scalar
    # so that it can be used as the fill value below.
    y_pred = df_train[label_col].mode().iloc[0]
  else:
    raise ValueError(f"Unknown method {method}")

  y_pred = np.full(df_test[label_col].size, y_pred)
  y_true = df_test[label_col].to_numpy()

  return {
      'mae': sl.metrics.mean_absolute_error(y_true, y_pred),
      'mse': sl.metrics.mean_squared_error(y_true, y_pred),
      'rmse': sl.metrics.root_mean_squared_error(y_true, y_pred),
      'msle': sl.metrics.mean_squared_log_error(y_true, y_pred),
      'logcosh': keras.losses.log_cosh(y_true, y_pred).numpy(),
  }

def df_naive_classification_metrics(df_train: pd.DataFrame, df_test: pd.DataFrame, label_col: str):
  """
  Computes classification metrics of a naive baseline that predicts the most frequent training label for every test sample.

  Parameters:
    df_train (pd.DataFrame): Training data the majority label is derived from.
    df_test (pd.DataFrame): Test data the baseline is evaluated on.
    label_col (str): Name of the label column.

  Returns:
    dict: Accuracy as well as macro- and micro-averaged F1, precision and recall.
  """
  # Use the requested label column; reading a fixed column here would evaluate
  # the wrong majority class for every label other than the next activity.
  majority_label = df_train[label_col].mode().iloc[0]

  # Fit the encoder on the labels of both sets so that every test label is known.
  label_enc = sl.preprocessing.LabelEncoder()
  label_enc.fit(np.concatenate((df_train[label_col], df_test[label_col]), axis=None))

  y_pred = label_enc.transform(np.full(df_test[label_col].size, majority_label))
  y_true = label_enc.transform(df_test[label_col])

  return {
      'accuracy': sl.metrics.accuracy_score(y_true, y_pred),
      'f1_macro': sl.metrics.f1_score(y_true, y_pred, average='macro'),
      'precision_macro': sl.metrics.precision_score(y_true, y_pred, average='macro'),
      'recall_macro': sl.metrics.recall_score(y_true, y_pred, average='macro'),
      'f1_micro': sl.metrics.f1_score(y_true, y_pred, average='micro'),
      'precision_micro': sl.metrics.precision_score(y_true, y_pred, average='micro'),
      'recall_micro': sl.metrics.recall_score(y_true, y_pred, average='micro'),
  }


def df_predictive_power_scores(df: pd.DataFrame, label_col: str, variable_cols: Optional[Union[str, List[str]]] = None, datetime_is_numeric: Optional [bool] = False, exclude_labels: Optional[bool] = True, threshold: float = 0.0) -> Tuple[pd.DataFrame, plt.Axes]:
  """
  Computes the predictive power score of each variable for `label_col` and plots the scores as a bar chart.

  NOTE(review): `pps` (presumably the ppscore package) is not among the imports visible at the top of the file - confirm it is imported elsewhere.

  Parameters:
    df (pd.DataFrame): The input frame; it is copied, not modified.
    label_col (str): Column to predict.
    variable_cols (Optional[Union[str, List[str]]], optional): Predictor column(s); `label_col` is added if it is missing. If None, all columns are used, subject to `exclude_labels`.
    datetime_is_numeric (Optional[bool], optional): If True, the frame is passed through `df_datetime_to_numeric` first.
    exclude_labels (Optional[bool], optional): If True and `variable_cols` is None, columns starting with `EVENTLOG_LABEL_PREFIX` other than `label_col` are excluded.
    threshold (float, optional): Minimum score in [0, 1] for a predictor to be plotted.

  Returns:
    Tuple[pd.DataFrame, plt.Axes]: The full, unfiltered score table and the axes of the bar chart.

  Raises:
    ValueError: If `threshold` is outside of [0, 1].
  """
  if threshold is None or threshold < 0 or threshold > 1:
    raise ValueError(f"threshold must be in the interval [0, 1] but was {threshold}")

  df = df.copy()
  if variable_cols is None:
    if exclude_labels:
      variable_cols = df.columns[~df.columns.str.startswith(EVENTLOG_LABEL_PREFIX) | (df.columns == label_col)].to_list()
    else:
      variable_cols = df.columns.to_list()
  elif isinstance(variable_cols, str):
    variable_cols = [variable_cols, label_col]
  elif label_col not in variable_cols:
    # The target has to be part of the frame handed to pps.
    variable_cols = [*variable_cols, label_col]

  df = df[variable_cols]

  if datetime_is_numeric:
    df = df_datetime_to_numeric(df)

  df_predictors = pps.predictors(df, label_col, output="df", sorted=True, catch_errors=False, random_seed=RANDOM_STATE, invalid_score=np.nan, cross_validation=2)

  # `&` binds tighter than `>=`, so the comparison needs its own parentheses.
  is_shown = df_predictors["is_valid_score"] & (df_predictors["ppscore"] >= threshold)

  fig, ax = plt.subplots(figsize=(10,10))
  sns.barplot(data=df_predictors[is_shown], x="ppscore", y="x", orient="h", ax=ax)
  ax.set_title(f"Predictive power score {label_col}")
  # Return the axes as annotated; `set_title` itself returns a Text object.
  return df_predictors, ax

def df_predictive_power_matrix(df: pd.DataFrame, datetime_is_numeric: Optional[bool] = False, threshold: float = 0.0) -> Tuple[pd.DataFrame, plt.Axes]:
  """
  Computes the predictive power score between all pairs of columns and plots them as a heatmap.

  NOTE(review): `pps` (presumably the ppscore package) is not among the imports visible at the top of the file - confirm it is imported elsewhere.

  Parameters:
    df (pd.DataFrame): The input frame; it is copied, not modified.
    datetime_is_numeric (Optional[bool], optional): If True, the frame is passed through `df_datetime_to_numeric` first.
    threshold (float, optional): Minimum score in [0, 1] for a pair to be plotted.

  Returns:
    Tuple[pd.DataFrame, plt.Axes]: The full, unfiltered score table and the axes of the heatmap.

  Raises:
    ValueError: If `threshold` is outside of [0, 1].
  """
  if threshold is None or threshold < 0 or threshold > 1:
    raise ValueError(f"threshold must be in the interval [0, 1] but was {threshold}")

  df = df.copy()
  if datetime_is_numeric:
    df = df_datetime_to_numeric(df)

  df_matrix = pps.matrix(df, output="df", sorted=True, catch_errors=False, random_seed=RANDOM_STATE, invalid_score=np.nan, cross_validation=2)

  # `&` binds tighter than `>=`, so the comparison needs its own parentheses.
  is_shown = df_matrix["is_valid_score"] & (df_matrix["ppscore"] >= threshold)
  df_heatmap = df_matrix[is_shown][['x', 'y', 'ppscore']].pivot(columns='y', index='x', values='ppscore')

  fig, ax = plt.subplots(figsize=(30,30))
  sns.heatmap(df_heatmap, vmin=0, vmax=1, linewidths=0.5, annot=True, ax=ax)
  ax.set_title("Predictive power matrix")
  # Return the axes as annotated; `set_title` itself returns a Text object.
  return df_matrix, ax

def df_correlation_matrix(df: pd.DataFrame, datetime_is_numeric: Optional[bool] = False, method: str = "pearson", threshold: float = 0.0) -> Tuple[pd.DataFrame, plt.Axes]:
  """
  Computes the pairwise correlation of all numeric columns and plots it as a heatmap.

  The two column lists returned by `df_separate_categoricals` are treated differently:
  the first is replaced by category codes, the second by sparse dummy columns.

  Parameters:
    df (pd.DataFrame): The input frame; it is copied, not modified.
    datetime_is_numeric (Optional[bool], optional): If True, the frame is passed through `df_datetime_to_numeric` first.
    method (str, optional): Correlation method passed to `DataFrame.corr` (default is "pearson").
    threshold (float, optional): Minimum absolute correlation in [0, 1] for a cell to be shown in the heatmap; 0.0 shows every cell.

  Returns:
    Tuple[pd.DataFrame, plt.Axes]: The full, unfiltered correlation matrix and the axes of the heatmap.

  Raises:
    ValueError: If `threshold` is outside of [0, 1].
  """
  if threshold is None or threshold < 0 or threshold > 1:
    raise ValueError(f"threshold must be in the interval [0, 1] but was {threshold}")

  df = df.copy()
  if datetime_is_numeric:
    df = df_datetime_to_numeric(df)

  ordered_cols, unordered_cols = df_separate_categoricals(df)

  for ordinal_col in ordered_cols:
    df[ordinal_col] = df[ordinal_col].cat.codes

  if len(unordered_cols) > 0:
    df = df.join(pd.get_dummies(df[unordered_cols], sparse=True)).drop(columns=unordered_cols)

  df_corr = df.corr(method=method, numeric_only=True)

  # Only the plot is filtered by the threshold; the returned matrix stays
  # complete, as in df_predictive_power_matrix.
  df_heatmap = df_corr.where(df_corr.abs() >= threshold) if threshold > 0 else df_corr

  fig, ax = plt.subplots(figsize=(50,50))
  sns.heatmap(df_heatmap, vmin=-1, vmax=1, annot=True, ax=ax)
  ax.set_title(f"{method} correlation")

  # Return the axes as annotated; `set_title` itself returns a Text object.
  return df_corr, ax

def df_case_length_stats(df: pd.DataFrame, case_col: str = EVENTLOG_CASE, result_col: str = "Case Length", percentiles: List[float] = np.arange(.05, 1, .05)) -> pd.DataFrame:
  """
  Summarizes the number of events per case with descriptive statistics.

  Parameters:
    df (pd.DataFrame): The event log.
    case_col (str, optional): Column identifying the case.
    result_col (str, optional): Name of the single column of the result.
    percentiles (List[float], optional): Percentiles passed to `describe`.

  Returns:
    pd.DataFrame: One column named `result_col` holding the `describe` output of the case sizes.
  """
  case_sizes = df.groupby(case_col).size()
  summary = case_sizes.describe(percentiles=percentiles)
  return pd.DataFrame(data={result_col: summary})

def df_case_duration_stats(df: pd.DataFrame, case_col: str = EVENTLOG_CASE, time_col: str = EVENTLOG_TIMESTAMP, result_col: str = "Case Duration", percentiles: List[float] = np.arange(.05, 1, .05)) -> pd.DataFrame:
  """
  Summarizes the duration of the cases (latest minus earliest timestamp) with descriptive statistics.

  Parameters:
    df (pd.DataFrame): The event log.
    case_col (str, optional): Column identifying the case.
    time_col (str, optional): Column holding the event timestamps.
    result_col (str, optional): Name of the single column of the result.
    percentiles (List[float], optional): Percentiles passed to `describe`.

  Returns:
    pd.DataFrame: One column named `result_col` holding the `describe` output of the case durations.
  """
  case_times = df.groupby(case_col)[time_col]
  case_durations = case_times.max() - case_times.min()
  summary = case_durations.describe(percentiles=percentiles)
  return pd.DataFrame(data={result_col: summary})

def df_correlation_scores(df: pd.DataFrame, label_col: str, variable_cols: Optional[Union[str, List[str]]] = None, datetime_is_numeric: Optional [bool] = False, exclude_labels: Optional[bool] = True, method: str = "pearson", threshold: float = 0.0) -> Tuple[pd.DataFrame, plt.Axes]:
  """
  Computes the correlation of each variable with `label_col` and plots the scores as a bar chart.

  The two column lists returned by `df_separate_categoricals` are treated differently:
  the first is replaced by category codes, the second by sparse dummy columns.

  Parameters:
    df (pd.DataFrame): The input frame; it is copied, not modified.
    label_col (str): Column the variables are correlated with.
    variable_cols (Optional[Union[str, List[str]]], optional): Variable column(s); `label_col` is added if it is missing. If None or empty, all numeric and categorical columns are used.
    datetime_is_numeric (Optional[bool], optional): If True, the frame is passed through `df_datetime_to_numeric` first.
    exclude_labels (Optional[bool], optional): Accepted for signature parity with `df_predictive_power_scores`; currently not used by this function.
    method (str, optional): Correlation method passed to `Series.corr` (default is "pearson").
    threshold (float, optional): Minimum absolute correlation in [0, 1] for a variable to be plotted; 0.0 plots every variable.

  Returns:
    Tuple[pd.DataFrame, plt.Axes]: The full, unfiltered score table (sorted by score) and the axes of the bar chart.

  Raises:
    ValueError: If `threshold` is outside of [0, 1].
  """
  if threshold is None or threshold < 0 or threshold > 1:
    raise ValueError(f"threshold must be in the interval [0, 1] but was {threshold}")

  df = df.copy()
  if datetime_is_numeric:
    df = df_datetime_to_numeric(df)

  if variable_cols is None or len(variable_cols) == 0:
    variable_cols = df.select_dtypes(include=["number", "category"]).columns.to_list()
  elif isinstance(variable_cols, str):
    variable_cols = [variable_cols]
  if label_col not in variable_cols:
    # The label has to stay in the frame to correlate against it.
    variable_cols = [*variable_cols, label_col]

  # Explicit copy so that the column assignments below do not act on a slice.
  df = df[variable_cols].copy()
  ordered_cols, unordered_cols = df_separate_categoricals(df)

  for ordinal_col in ordered_cols:
    df[ordinal_col] = df[ordinal_col].cat.codes

  if len(unordered_cols) > 0:
    df = df.join(pd.get_dummies(df[unordered_cols], sparse=True)).drop(columns=unordered_cols)
  variable_cols = df.columns.to_list()

  score_col = f"{method} correlation"
  df_corr = pd.DataFrame(index=variable_cols, columns=[score_col], dtype="float")
  label_values = df[label_col]

  for col in variable_cols:
    if col == label_col:
      continue
    # Write with a single .loc call (chained indexing may write to a temporary)
    # and forward `method`, which Series.corr otherwise defaults to pearson.
    df_corr.loc[col, score_col] = df[col].dropna().astype("float").corr(label_values, method=method)

  df_corr.sort_values(by=[score_col], inplace=True)

  # Only the plot is filtered by the threshold; the returned table stays complete.
  df_plot = df_corr[df_corr[score_col].abs() >= threshold] if threshold > 0 else df_corr

  fig, ax = plt.subplots(figsize=(100,100))
  sns.barplot(data=df_plot, x=score_col, y=df_plot.index, orient="h", ax=ax)
  ax.set_title(label_col)

  # Return the axes as annotated; `set_title` itself returns a Text object.
  return df_corr, ax

def df_visualize_strict_temporal_splitting(df_train: pd.DataFrame, df_test: pd.DataFrame, time_col: str = EVENTLOG_TIMESTAMP) -> plt.Axes:
  """
  Plots the number of events per month as stacked bars, separated into three groups:
  training events, test events with at least one missing label, and test events with all labels present.

  Parameters:
    df_train (pd.DataFrame): The training set.
    df_test (pd.DataFrame): The test set; its label columns are determined with `df_find_labels`.
    time_col (str, optional): Column holding the event timestamps.

  Returns:
    plt.Axes: The axes of the stacked bar chart.
  """
  def monthly_counts(frame: pd.DataFrame) -> pd.Series:
    # Number of events per calendar month
    return frame[time_col].dt.to_period('M').value_counts()

  label_missing = df_test[df_find_labels(df_test)].isna().any(axis=1)

  counts = pd.concat([
    monthly_counts(df_train).rename("Training Set Correct"),
    monthly_counts(df_test[label_missing]).rename("Training Set Wrong"),
    monthly_counts(df_test[~label_missing]).rename("Test Set")
  ], axis=1)

  return counts.sort_index().fillna(0).plot(kind='bar', stacked=True, color=['green', 'red', 'grey'])

### Eventlog Functions

In [None]:
def df_filter_date_range(df: pd.DataFrame, min: Union[pd.Timestamp, str, float] = 0.0, max: Union[pd.Timestamp, str, float] = 1.0, case_col: str = EVENTLOG_CASE, time_col: str = EVENTLOG_TIMESTAMP, mode: Literal['events', 'traces_contained', 'traces_intersecting'] = 'traces_contained') -> pd.DataFrame:
  """
  Filters a DataFrame for events or traces within a specified date range.

  Parameters:
    df (pd.DataFrame): The DataFrame to filter; it is copied, not modified.
    min (Union[pd.Timestamp, str, float], optional): The minimum timestamp (inclusive) for filtering, which can be an actual timestamp, a string representation of a date, or a quantile between 0.0 and 1.0 (default is 0.0). Only `float` values are treated as quantiles.
    max (Union[pd.Timestamp, str, float], optional): The maximum timestamp (inclusive) for filtering, which can be an actual timestamp, a string representation of a date, or a quantile between 0.0 and 1.0 (default is 1.0). Only `float` values are treated as quantiles.
    case_col (str, optional): The column name in the DataFrame that represents the case ID.
    time_col (str, optional): The column name in the DataFrame that represents the timestamp.
    mode (Literal['events', 'traces_contained', 'traces_intersecting'], optional): The mode determining how filtering is applied (default is 'traces_contained').
      - 'events': keep events between min and max
      - 'traces_contained': keep traces where all events are between min and max
      - 'traces_intersecting': keep traces where any event is between min and max

  Returns:
    pd.DataFrame: A DataFrame filtered according to the specified date range and mode

  Raises:
    ValueError: If `mode` is unknown, or if the `min` value is not smaller than the `max` value.
  """
  if mode not in ('events', 'traces_contained', 'traces_intersecting'):
    # Fail loudly instead of silently returning the unfiltered log.
    raise ValueError(f"Unknown mode {mode}")

  df = df.copy()

  if isinstance(min, float):
    min = df[time_col].min() if 0.0 == min else df[time_col].quantile(min)
  elif not isinstance(min, pd.Timestamp):
    min = pd.to_datetime(min)
  if isinstance(max, float):
    max = df[time_col].max() if 1.0 == max else df[time_col].quantile(max)
  elif not isinstance(max, pd.Timestamp):
    max = pd.to_datetime(max)

  if min >= max:
    raise ValueError(f"min {min} must be smaller than max {max}")

  if 'events' == mode:
    return df[(df[time_col] >= min) & (df[time_col] <= max)]

  def in_range(df_group: pd.DataFrame) -> pd.Series:
    # Per-event flag telling whether the timestamp lies within [min, max]
    return (df_group[time_col] >= min) & (df_group[time_col] <= max)

  if 'traces_contained' == mode:
    return df.groupby(case_col).filter(lambda df_group: in_range(df_group).all())

  # 'traces_intersecting'
  return df.groupby(case_col).filter(lambda df_group: in_range(df_group).any())

def df_filter_case_size_range(df: pd.DataFrame, min: Union[int, float] = 0.0, max: Union[int, float] = 1.0, case_col: str = EVENTLOG_CASE) -> pd.DataFrame:
  """
  Filters a DataFrame based on the number of events within each case falling between a specified range.

  Parameters:
    df (pd.DataFrame): The DataFrame to filter; it is not modified.
    min (Union[int, float], optional): The minimum number of events a case must contain. If a float representing a quantile is provided, the minimum number is computed based on the distribution of case sizes (default is 0.0 which represents the smallest case size).
    max (Union[int, float], optional): The maximum number of events a case can contain. If a float representing a quantile is provided, the maximum number is computed based on the distribution of case sizes (default is 1.0 which represents the largest case size).
    case_col (str, optional): The column name in the DataFrame that represents case IDs.

  Returns:
    pd.DataFrame: A DataFrame filtered by cases with a number of events within the specified range (both bounds inclusive).

  Raises:
    ValueError: If the `min` value is not smaller than the `max` value.
  """
  # Compute the case sizes once; they serve both the quantile bounds and the filter.
  case_sizes = df.groupby(case_col).size()

  if isinstance(min, float):
    min = case_sizes.min() if 0.0 == min else case_sizes.quantile(min)
  if isinstance(max, float):
    max = case_sizes.max() if 1.0 == max else case_sizes.quantile(max)

  if min >= max:
    raise ValueError(f"min {min} must be smaller than max {max}")

  # Select the rows of the matching cases with one vectorised membership test
  # instead of evaluating a Python lambda for every case.
  kept_cases = case_sizes.index[(case_sizes >= min) & (case_sizes <= max)]

  return df[df[case_col].isin(kept_cases)].copy()

def df_filter_case_duration_range(df: pd.DataFrame, min: Union[pd.Timedelta, str, float] = 0.0, max: Union[pd.Timedelta, str, float] = 1.0, case_col: str = EVENTLOG_CASE, time_col: str = EVENTLOG_TIMESTAMP) -> pd.DataFrame:
  """
  Keeps only the cases whose duration (latest minus earliest timestamp) lies within a given range.

  Parameters:
    df (pd.DataFrame): The event log to filter; it is copied, not modified.
    min (Union[pd.Timedelta, str, float], optional): Lower bound (inclusive). A float is read as a quantile of the case durations (0.0, the default, is the shortest duration); anything else that is not a Timedelta is converted with `pd.Timedelta`.
    max (Union[pd.Timedelta, str, float], optional): Upper bound (inclusive). A float is read as a quantile of the case durations (1.0, the default, is the longest duration); anything else that is not a Timedelta is converted with `pd.Timedelta`.
    case_col (str, optional): Column identifying the case.
    time_col (str, optional): Column holding the event timestamps.

  Returns:
    pd.DataFrame: The events of all cases within the duration range, with a fresh index.

  Raises:
    ValueError: If the `min` value is not smaller than the `max` value.
  """
  frame = df.copy()
  case_times = frame.groupby(case_col)[time_col]
  durations = case_times.max() - case_times.min()

  # Resolve the lower bound
  if isinstance(min, float):
    min = durations.min() if 0.0 == min else durations.quantile(min)
  elif not isinstance(min, pd.Timedelta):
    min = pd.Timedelta(min)

  # Resolve the upper bound
  if isinstance(max, float):
    max = durations.max() if 1.0 == max else durations.quantile(max)
  elif not isinstance(max, pd.Timedelta):
    max = pd.Timedelta(max)

  if min >= max:
    raise ValueError(f"min {min} must be smaller than max {max}")

  within_range = (durations >= min) & (durations <= max)
  kept_cases = durations[within_range].index

  return frame[frame[case_col].isin(kept_cases)].reset_index(drop=True)

def df_prefix_pad_attributes(df: pd.DataFrame, attr_cols: Union[str, List[str]], case_col: str = EVENTLOG_CASE) -> pd.DataFrame:
  """
  Performs prefix padding of attributes for events within each case by forward filling missing data. Attributes of events earlier in the case are propagated to later events, ensuring that each event within the case has the full history of attribute values up to that point.

  Parameters:
    df (pd.DataFrame): The DataFrame containing event log data; it is not modified.
    attr_cols (Union[str, List[str]]): A list of attribute column names or a single attribute column name that should be prefix padded within each case; a passed list is not modified.
    case_col (str, optional): The column name in the DataFrame that identifies cases.

  Returns:
    pd.DataFrame: A DataFrame that has been pivot transformed to have attribute columns for each event in the case with forward filled values to reflect the prefix padding.

  Note:
    If `attr_cols` includes the case ID column, it will be removed from the attributes to avoid errors in the process.

  """

  if isinstance(attr_cols, str):
    attr_cols = [attr_cols]

  # Build a new list instead of calling `remove` on the caller's list.
  attr_cols = [col for col in attr_cols if col != case_col]

  # Explicit copy so that the helper columns below are not assigned on a slice of `df`.
  df = df[[case_col] + attr_cols].copy()

  # Position of each event within its case; it is used both as an index level
  # and as the pivoted column level.
  df["Event in Case"] = df.groupby(case_col).cumcount()
  df["Attribute in Case"] = df["Event in Case"]

  df_pivot = df.pivot(index=[case_col, "Event in Case"], columns=["Attribute in Case"], values=attr_cols)
  df_pivot = df_pivot.groupby(case_col).ffill()

  return df_pivot

def df_encode_categories(df: pd.DataFrame, threshold: Optional[int] = None):
  """
  Encodes categorical columns: the first column list returned by `df_separate_categoricals` is replaced by category codes, the second is one-hot encoded. Nullable boolean columns are converted to 'Int8'.

  Parameters:
    df (pd.DataFrame): The input frame; it is copied, not modified.
    threshold (Optional[int], optional): If positive, only columns with at most this many categories are one-hot encoded; the remaining ones are left as they are.

  Returns:
    pd.DataFrame: The encoded copy of `df`.
  """
  encoded = df.copy()
  ordinal_cols, onehot_cols = df_separate_categoricals(encoded)

  for name in ordinal_cols:
    encoded[name] = encoded[name].cat.codes

  if threshold is not None and threshold > 0:
    onehot_cols = [name for name in onehot_cols if len(encoded[name].cat.categories) <= threshold]

  encoded = pd.get_dummies(encoded, columns=onehot_cols)

  # Iterating the selected sub-frame yields its column labels.
  for name in encoded.select_dtypes('boolean'):
    encoded[name] = encoded[name].astype('Int8')

  return encoded

def df_random_train_test_split(df: pd.DataFrame, test_len: float = 0.2, case_col: str = EVENTLOG_CASE, random_state: int = RANDOM_STATE) -> Tuple[pd.DataFrame, pd.DataFrame]:
  """
  Splits an event log randomly into train and test sets on case level, so that all events of a case end up in the same set.

  Parameters:
    df (pd.DataFrame): The event log.
    test_len (float, optional): Passed as `test_size` to `train_test_split` (default is 0.2).
    case_col (str, optional): Column identifying the case.
    random_state (int, optional): Seed for the shuffled split.

  Returns:
    Tuple[pd.DataFrame, pd.DataFrame]: The train and test DataFrames, each with a fresh index.
  """
  train_ids, test_ids = sl.model_selection.train_test_split(
    df[case_col].unique(),
    test_size=test_len,
    random_state=random_state,
    shuffle=True
  )

  in_train = df[case_col].isin(train_ids)
  in_test = df[case_col].isin(test_ids)

  return df[in_train].reset_index(drop=True), df[in_test].reset_index(drop=True)

def df_temporal_train_test_split(df: pd.DataFrame, test_len: float = 0.2, split_mode: Literal['event', 'case_start'] = 'event', filter_mode: Literal['events', 'traces'] = 'traces', case_col: str = EVENTLOG_CASE, time_col: str = EVENTLOG_TIMESTAMP) -> Tuple[pd.DataFrame, pd.DataFrame]:
  """
  Splits a DataFrame into train and test sets based on a temporal split.

  Parameters:
    df (pd.DataFrame): A pandas DataFrame containing the event log data.
    test_len (float, optional): The proportion of the data to include in the test split. Defaults to 0.2.
    split_mode (Literal['event', 'case_start'], optional): The mode to use for splitting the data. 'event' splits based on the timestamp of individual events, while 'case_start' splits based on the timestamp of the first event in each case. Defaults to 'event'.
    filter_mode (Literal['events', 'traces'], optional): The mode to use for filtering the data after splitting. 'events' assigns events from the split time onwards to the test set and strictly earlier events to the train set, while 'traces' assigns cases lying entirely at or after the split time to the test set and all other cases to the train set. Defaults to 'traces'.
    case_col (str, optional): The name of the column in df that represents the case identifier. Defaults to `EVENTLOG_CASE`.
    time_col (str, optional): The name of the column in df that represents the timestamp. Defaults to `EVENTLOG_TIMESTAMP`.

  Returns:
    Tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the train and test DataFrames, which do not share any event.

  Raises:
    ValueError: If `split_mode` or `filter_mode` is unknown.
  """
  if 'event' == split_mode:
    test_start = df[time_col].quantile(q=(1 - test_len), interpolation='higher')
  elif 'case_start' == split_mode:
    test_start = df.groupby(case_col)[time_col].min().quantile(q=(1 - test_len), interpolation='higher')
  else:
    raise ValueError(f"Unknown split mode {split_mode}")

  if 'events' == filter_mode:
    df_test = df_filter_date_range(df, min=test_start, case_col=case_col, time_col=time_col, mode='events')
    # The test range includes `test_start`, so the train range must exclude it;
    # otherwise events at exactly the split time would be part of both sets.
    df_train = df[df[time_col] < test_start]
  elif 'traces' == filter_mode:
    df_test = df_filter_date_range(df, min=test_start, case_col=case_col, time_col=time_col, mode='traces_contained')
    df_train = df[~df[case_col].isin(df_test[case_col])]
  else:
    raise ValueError(f"Unknown filter mode {filter_mode}")

  return df_train.reset_index(drop=True), df_test.reset_index(drop=True)

def df_strict_temporal_train_test_split(df: pd.DataFrame, test_len: float = 0.2, label_cols: Union[List[str], str] = [], latest_start: Optional[Union[pd.Timestamp, str]] = None, case_col: str = EVENTLOG_CASE, time_col: str = EVENTLOG_TIMESTAMP, debias_start: bool = True, debias_end: bool = True) -> Tuple[pd.DataFrame, pd.DataFrame]:
  """
  Splits a DataFrame into train and test sets based on a strict temporal split. The code is based on the works of Weytjens & De Weerdt (2022) (see https://github.com/hansweytjens/predictive-process-monitoring-benchmarks/blob/main/create_benchmarks.py)

  The separation time is the start time of the case at position `int(n_cases * (1 - test_len))` in the
  ordering of cases by start time. Cases ending before the separation time form the training set; cases
  ending at or after it form the test set.

  Parameters:
    df (pd.DataFrame): The event log; it is not modified.
    test_len (float, optional): Proportion of cases (by start time) after the separation time. Defaults to 0.2.
    label_cols (Union[List[str], str], optional): Label column(s) that are set to missing for test events before the separation time; the passed list is not modified.
    latest_start (Optional[Union[pd.Timestamp, str]], optional): Test events after this time are dropped if `debias_end` is set. If None, the latest timestamp of the log minus the longest case duration is used.
    case_col (str, optional): Column identifying the case.
    time_col (str, optional): Column holding the event timestamps.
    debias_start (bool, optional): If True, the labels of test events before the separation time are set to `pd.NA`.
    debias_end (bool, optional): If True, test events after `latest_start` are dropped.

  Returns:
    Tuple[pd.DataFrame, pd.DataFrame]: The train and test DataFrames.
  """
  label_cols = [label_cols] if isinstance(label_cols, str) else list(label_cols)

  case_times = df.groupby(case_col)[time_col]
  case_starts = case_times.min()
  case_stops = case_times.max()

  if latest_start is None:
    latest_start = df[time_col].max() - (case_stops - case_starts).max()
  elif not isinstance(latest_start, pd.Timestamp):
    latest_start = pd.Timestamp(latest_start)

  ### TEST SET ###
  first_test_case_nr = int(len(case_starts) * (1 - test_len))
  first_test_start_time = np.sort(case_starts.values)[first_test_case_nr]
  # retain cases that end after first_test_start_time
  test_case_nrs = case_stops[case_stops.values >= first_test_start_time].index.array
  df_test = df[df[case_col].isin(test_case_nrs)].reset_index(drop=True)

  # drop prefixes in test set that are past latest_start
  if debias_end:
    # Copy the filtered rows so that the label masking below does not assign on a slice.
    df_test = df_test[df_test[time_col] <= latest_start].copy()

  if debias_start and label_cols:
    # set the targets to missing for those prefixes that end before the separation time (beginning of test set)
    df_test.loc[df_test[time_col].values < first_test_start_time, label_cols] = pd.NA

  #### TRAINING SET ###
  train_case_nrs = case_stops[case_stops.values < first_test_start_time].index.array
  df_train = df[df[case_col].isin(train_case_nrs)].reset_index(drop=True)

  return df_train, df_test

### Dataset Functions

In [None]:
def ds_write_files(ds: tf.data.Dataset, filename: str, compression: Optional[str] = 'zip') -> None:
  """
  Saves a dataset to the directory `filename` and optionally archives that directory.

  :param ds: Dataset to save.
  :param filename: Target directory of the saved dataset; also the base name of the archive.
  :param compression: Archive format handed to shutil.make_archive. If None, the dataset is saved uncompressed and no archive is created.
  """
  if compression is None:
    ds.save(filename, compression=None)
    return

  # With any archive format requested, the dataset files themselves are written GZIP-compressed
  ds.save(filename, compression='GZIP')
  shutil.make_archive(filename, format=compression, root_dir=filename)

def ds_find_static_attrs(ds: tf.data.Dataset):
  """
  Returns the names of all input features whose element spec holds exactly one element.

  Expects ds.element_spec to be an (inputs, outputs) pair with inputs being a dict of specs.
  """
  feature_specs, _ = ds.element_spec
  return [name for name, spec in feature_specs.items() if 1 == spec.shape.num_elements()]

def ds_find_dynamic_attrs(ds: tf.data.Dataset):
  """
  Returns the names of all input features whose element spec holds more than one element.

  Expects ds.element_spec to be an (inputs, outputs) pair with inputs being a dict of specs.
  """
  feature_specs, _ = ds.element_spec
  return [name for name, spec in feature_specs.items() if 1 < spec.shape.num_elements()]

def ds_find_max_seq_len(ds: tf.data.Dataset):
  """
  Returns the common number of elements of all dynamic input features of the dataset.

  :raises ValueError: If the dynamic features do not all have the same number of elements.
  """
  feature_specs, _ = ds.element_spec

  distinct_lens = {
    feature_specs[name].shape.num_elements()
    for name in ds_find_dynamic_attrs(ds)
  }
  if len(distinct_lens) > 1:
    raise ValueError("Different sequence lengths for different attributes.")

  # Exactly one length is left at this point
  return distinct_lens.pop()

def ds_sequentialize_static_attrs(ds: tf.data.Dataset, activity_attr: str = "activity", pad_token: str = TOKEN_PADDING, pad_token_num = int(TOKEN_PADDING_NUM)) -> tf.data.Dataset:
  """
  Turns every static (single element) input feature into a padded sequence that is aligned with the dynamic features.

  :param ds: Dataset with (inputs, outputs) elements, inputs being a dict of features.
  :param activity_attr: Name of the dynamic feature that is searched for the padding token to determine the sequence length.
  :param pad_token: Padding token of non-numeric features.
  :param pad_token_num: Padding value of numeric features.
  :return: Mapped dataset in which the static features are repeated and padded to the maximum sequence length.
  """
  max_seq_len = ds_find_max_seq_len(ds)
  stat_attrs = ds_find_static_attrs(ds)

  def ds_sequentialize_map_helper(inputs, outputs):
    # Find the current sequence length by looking for the padding token in the activity attribute
    # (previously the key 'activity' was hard-coded and the activity_attr parameter had no effect)
    # NOTE(review): for a sequence without any padding token the mask is all False, so argmax presumably yields 0 instead of the full length - verify
    seq_len = tf.argmax(tf.equal(inputs[activity_attr], pad_token), axis=-1)

    for attr in stat_attrs:
      # Repeat static attribute for the sequence length and pad to the max sequence length
      attr_val_repeated = tf.repeat(inputs[attr], repeats=seq_len, axis=0)
      paddings = [[0, max_seq_len - seq_len]]
      pad_const = pad_token_num if attr_val_repeated.dtype.is_numeric else pad_token
      attr_val_padded = tf.pad(attr_val_repeated, paddings, 'CONSTANT', constant_values=pad_const)
      inputs[attr] = attr_val_padded

    return inputs, outputs

  return ds.map(ds_sequentialize_map_helper)

def prepare_dataset(
    df: pd.DataFrame,
    df_train: pd.DataFrame,
    df_test: Optional[pd.DataFrame] = None,
    df_val: Optional[pd.DataFrame] = None,
    out_dir: str = OUTPUT_DATA_DIR,
    event_cols: Optional[Union[List[str], str]] = None,
    case_cols: Optional[Union[List[str], str]] = None,
    label_cols: Optional[Union[List[str], str]] = None,
    cat_enc_factory: Optional[Callable[[], sl.base.TransformerMixin]] = lambda: sl.preprocessing.OneHotEncoder(sparse_output=False),
    #cat_enc_factory: Optional[Callable[[], sl.base.TransformerMixin]] = lambda: ce.OneHotEncoder(return_df=False),
    fit_complete: bool = True,
    time_col: str = EVENTLOG_TIMESTAMP,
    case_col: str = EVENTLOG_CASE,
    label_prefix: str = EVENTLOG_LABEL_PREFIX,
):
  """
  Encodes the given splits of an event log and writes them as .npy files to out_dir.

  Written files (one per available split "train", "test", "val"):
    - static_{split}.npy: horizontally stacked case attributes
    - timeseries_{column}_{split}.npy: one file per event attribute
    - y_{split}_{label}.npy: one file per label column

  :param df: Complete event log; used to detect column types, to find default columns and (if fit_complete) to fit the encoders.
  :param df_train: Training split (mandatory).
  :param df_test: Optional test split.
  :param df_val: Optional validation split.
  :param out_dir: Output directory (created if missing).
  :param event_cols: Event attribute column(s); detected via df_find_event_attributes if None.
  :param case_cols: Case attribute column(s); detected via df_find_case_attributes if None.
  :param label_cols: Label column(s); detected via df_find_labels if None.
  :param cat_enc_factory: Factory creating a fresh encoder per categorical column; None disables the encoding.
  :param fit_complete: Fit the encoders on df (True) or on df_train only (False).
  :param time_col: Name of the timestamp column (excluded from the detected event attributes).
  :param case_col: Name of the case identifier column.
  :param label_prefix: Prefix that marks label columns.
  """
  Path(out_dir).mkdir(parents=True, exist_ok=True)

  if event_cols is None:
    event_cols = df_find_event_attributes(df.drop(columns=time_col), exclude_labels=True, case_col=case_col, label_prefix=label_prefix)
  elif isinstance(event_cols, str):
    event_cols = [event_cols]
  if case_cols is None:
    case_cols = df_find_case_attributes(df, exclude_labels=True, case_col=case_col, label_prefix=label_prefix)
  elif isinstance(case_cols, str):
    case_cols = [case_cols]
  if label_cols is None:
    label_cols = df_find_labels(df, label_prefix=label_prefix)
  elif isinstance(label_cols, str):
    label_cols = [label_cols]

  # Only the splits that were actually passed are processed; the insertion order (train, test, val) is the processing order
  splits = {"train": df_train}
  if df_test is not None:
    splits["test"] = df_test
  if df_val is not None:
    splits["val"] = df_val

  # Loop invariants: columns that are neither numeric nor temporal get encoded; encoders are fitted on df or df_train
  categorical_cols = df.select_dtypes(exclude=['number', 'timedelta', 'datetime', 'datetimetz']).columns
  df_fit = df if fit_complete else df_train

  def _fit_encoder(col):
    # Returns a fitted encoder for a categorical column, or None if the column is written unencoded
    if cat_enc_factory is None or col not in categorical_cols:
      return None
    enc = cat_enc_factory()
    enc.fit(np.reshape(df_fit[col].to_numpy(), (-1, 1)))
    return enc

  # Static (case) attributes: one 2D block per column, stacked horizontally per split
  static_parts = {split: [] for split in splits}
  for static_col in case_cols:
    enc = _fit_encoder(static_col)
    for split, df_split in splits.items():
      values = df_split[static_col].to_numpy()
      if enc is None:
        static_parts[split].append(np.expand_dims(values, axis=-1))
      else:
        static_parts[split].append(enc.transform(np.reshape(values, (-1, 1))))

  for split, parts in static_parts.items():
    np.save(os.path.join(out_dir, f"static_{split}.npy"), np.hstack(parts))

  # Dynamic (event) attributes: one file per column and split
  dyn_frames = {split: df_prefix_pad_attributes(df_split, event_cols) for split, df_split in splits.items()}
  for event_col in event_cols:
    enc = _fit_encoder(event_col)
    for split, df_dyn in dyn_frames.items():
      if enc is None:
        seqs = np_filter_na(df_dyn[event_col].to_numpy())
      else:
        seqs = np_filter_na(df_dyn[event_col].to_numpy(), unsqueeze=True)
        # Encoded sequences are stored as an object array (one encoded array per entry)
        seqs = np.array([enc.transform(event) for event in seqs], dtype='object')
      np.save(os.path.join(out_dir, f"timeseries_{get_valid_filename(event_col)}_{split}.npy"), seqs)

  # Labels: one file per label column and split
  for label_col in label_cols:
    for split, df_split in splits.items():
      np.save(os.path.join(out_dir, f"y_{split}_{get_valid_filename(label_col)}.npy"), df_split[label_col].to_numpy())


### File Functions

In [None]:
def get_valid_filename(name):
  """
  Convert the given value into a string that is safe to use as a file name.

  Leading and trailing whitespace is stripped, remaining spaces become underscores and every character that is not alphanumeric, a dash, an underscore or a dot is removed.
  """
  underscored = str(name).strip().replace(" ", "_")
  invalid_chars = re.compile(r"(?u)[^-\w.]")
  return invalid_chars.sub("", underscored)

### Graph Functions

In [None]:
class GraphAttentionEmbedding(torch.nn.Module):
    """
    TGN graph attention embedding, oriented on https://github.com/pyg-team/pytorch_geometric/blob/master/examples/tgn.py

    Embeds nodes with a TransformerConv (2 heads of width out_channels // 2) whose edge features are the encoded relative time concatenated with the raw message.
    """

    def __init__(self, in_channels: int, out_channels: int, msg_dim: int, time_enc: torch.nn.Module):
        super().__init__()
        self.time_enc = time_enc
        # Edge features consist of the message plus the output of the time encoder
        self.conv = pyg.nn.TransformerConv(
            in_channels,
            out_channels // 2,
            heads=2,
            dropout=0.1,
            edge_dim=msg_dim + time_enc.out_channels,
        )

    def forward(self, x, last_update, edge_index, t, msg):
        # Time between the last update of each edge's first endpoint and the edge timestamp
        source_nodes = edge_index[0]
        elapsed = (last_update[source_nodes] - t).to(x.dtype)
        edge_features = torch.cat([self.time_enc(elapsed), msg], dim=-1)
        return self.conv(x, edge_index, edge_features)

class LinkPredictor(torch.nn.Module):
    """
    TGN output module, oriented on https://github.com/pyg-team/pytorch_geometric/blob/master/examples/tgn.py

    Scores a (source, destination) embedding pair with a single output value per pair.
    """

    def __init__(self, in_channels: int):
        super().__init__()
        # Separate projections for both endpoints, followed by the scoring layer
        self.lin_src = pyg.nn.Linear(in_channels, in_channels)
        self.lin_dst = pyg.nn.Linear(in_channels, in_channels)
        self.lin_final = pyg.nn.Linear(in_channels, 1)

    def forward(self, z_src, z_dst):
        projected = self.lin_src(z_src) + self.lin_dst(z_dst)
        return self.lin_final(torch.relu(projected))

# The TGN memory, the time encoding and the node embeddings all share the width EMBEDDING_DIM
memory_dim = time_dim = embedding_dim = EMBEDDING_DIM

In [None]:
def df_to_pyg_temporal_data(
    df_train: pd.DataFrame,
    node_id_col: str,
    dest_node_id_col: Optional[str] = None,
    target_col: Optional[str] = None,
    feat_cols: Optional[List[str]] = None,
    case_col: str = EVENTLOG_CASE,
    time_col: str = EVENTLOG_TIMESTAMP,
) -> Tuple[dict, dict, pyg.data.TemporalData]:
  """
  Converts an event log into a PyG TemporalData object of timestamped edges between nodes.

  :param df_train: Event log; node_id_col (and dest_node_id_col / feat_cols if given) must be categorical columns, as their category codes are used.
  :param node_id_col: Column holding the node of each event.
  :param dest_node_id_col: Column holding the destination node. If None, edges lead from the node of the previous event of the same case to the node of the current event.
  :param target_col: Column used as edge target y; 0 for all edges if None.
  :param feat_cols: Additional categorical columns that are added to the edge message. None is treated as no additional columns.
  :param case_col: Name of the case identifier column.
  :param time_col: Name of the event timestamp column.
  :return: Tuple (code -> category mapping, category -> code mapping, temporal data).
  """
  feat_cols = [] if feat_cols is None else list(feat_cols)
  df_train = df_train.copy()

  id_mapping = dict(enumerate(df_train[node_id_col].cat.categories))
  inverse_id_mapping = {v: k for k, v in id_mapping.items()}

  # NOTE(review): this is the largest category value, not the largest category code - confirm that this is the intended normalizer
  feat_mapping = {feat_col: df_train[feat_col].max() for feat_col in feat_cols}

  def df_to_pyg_temporal_data_helper(df: Optional[pd.DataFrame]) -> pyg.data.TemporalData:
    if df is None:
      return None

    # NOTE(review): a chained assignment `df[df[node_id_col] == TOKEN_NA][node_id_col] = pd.NA` used to be here.
    # It only wrote to a temporary copy and therefore never changed df, so it was removed without changing the result.
    # Really dropping the TOKEN_NA nodes can reduce the number of nodes below the size of the id mapping - check the
    # embedding lookups that use the mapping before enabling it.

    if dest_node_id_col is None:
      # Edge from the node of the previous event in the same case to the node of the current event
      df["_src_node_id"] = df.groupby(case_col)[node_id_col].shift(1)
      df["_dst_node_id"] = df[node_id_col]
    else:
      df["_src_node_id"] = df[node_id_col]
      df["_dst_node_id"] = df[dest_node_id_col]

    # NOTE(review): Series.view is deprecated in recent pandas releases - consider astype once NaT handling is confirmed
    df["_timestamp"] = df[time_col].view(int)

    # The first event of each case has no predecessor and is dropped here
    df = df.dropna(subset=["_src_node_id", "_dst_node_id"], axis='index', how='any')
    df = df.drop_duplicates(subset=["_src_node_id", "_dst_node_id", "_timestamp"], keep='last')

    df["_dst_node_id"] = df["_dst_node_id"].cat.codes
    df["_src_node_id"] = df["_src_node_id"].cat.codes

    df["_target"] = 0 if target_col is None else df[target_col]

    # Edge weight: running count of the same (source, destination) pair over time, scaled to (0, 1]
    df["_feat_weight"] = df.sort_values(["_src_node_id", "_dst_node_id", time_col]).groupby(by=["_src_node_id", "_dst_node_id"])["_src_node_id"].cumcount() + 1
    df["_feat_weight"] = df["_feat_weight"] / df["_feat_weight"].max()

    for feat_col in feat_cols:
      df[f"_feat_{feat_col}"] = df[feat_col].cat.codes
      df[f"_feat_{feat_col}"] = df[f"_feat_{feat_col}"] / feat_mapping[feat_col]

    src = torch.from_numpy(df["_src_node_id"].values).to(torch.long)
    dst = torch.from_numpy(df["_dst_node_id"].values).to(torch.long)

    t = torch.from_numpy(df["_timestamp"].values).to(torch.long)
    y = torch.from_numpy(df["_target"].values).to(torch.long)
    # The message consists of all "_feat_" columns (weight plus optional features)
    msg = torch.from_numpy(df[[col for col in df if col.startswith("_feat_")]].values).to(torch.float)

    return pyg.data.TemporalData(src=src, dst=dst, t=t, msg=msg, y=y)

  return id_mapping, inverse_id_mapping, df_to_pyg_temporal_data_helper(df_train)

In [None]:
def train():
    """
    Runs one training epoch over train_loader and returns the average loss per event.

    Works on the notebook-level globals memory, gnn, link_pred, neighbor_loader, train_loader, optimizer, criterion, assoc, data, device and train_data.
    """
    # Implementation of TGN training loop https://github.com/pyg-team/pytorch_geometric/blob/master/examples/tgn.py
    memory.train()
    gnn.train()
    link_pred.train()

    memory.reset_state()  # Start with a fresh memory.
    neighbor_loader.reset_state()  # Start with an empty graph.

    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        batch = batch.to(device)

        n_id, edge_index, e_id = neighbor_loader(batch.n_id)
        # Map the global node ids of this batch to positions in z
        assoc[n_id] = torch.arange(n_id.size(0), device=device)

        # Get updated memory of all nodes involved in the computation.
        z, last_update = memory(n_id)
        z = gnn(z, last_update, edge_index, data.t[e_id].to(device), data.msg[e_id].to(device))
        pos_out = link_pred(z[assoc[batch.src]], z[assoc[batch.dst]])
        neg_out = link_pred(z[assoc[batch.src]], z[assoc[batch.neg_dst]])

        # Observed edges are labeled 1, sampled negative edges 0
        loss = criterion(pos_out, torch.ones_like(pos_out))
        loss += criterion(neg_out, torch.zeros_like(neg_out))

        # Update memory and neighbor loader with ground-truth state.
        memory.update_state(batch.src, batch.dst, batch.t, batch.msg)
        neighbor_loader.insert(batch.src, batch.dst)

        loss.backward()
        optimizer.step()
        memory.detach()
        # Weight the batch loss by its number of events for the per-event average below
        total_loss += float(loss) * batch.num_events

    return total_loss / train_data.num_events

@torch.no_grad()
def test(loader):
    """
    Evaluates link prediction on the batches of the given loader.

    Works on the notebook-level globals memory, gnn, link_pred, neighbor_loader, assoc, data and device.
    The memory and the neighbor loader are updated with every evaluated batch.

    :param loader: Loader yielding the batches to evaluate.
    :return: Tuple (mean average precision, mean ROC AUC) over all batches.
    """
    # Implementation of TGN test loop oriented on https://github.com/pyg-team/pytorch_geometric/blob/master/examples/tgn.py
    memory.eval()
    gnn.eval()
    link_pred.eval()

    # Fixed seed - presumably to make the sampled negative edges reproducible between evaluations
    torch.manual_seed(RANDOM_STATE)

    aps, aucs = [], []
    for batch in loader:
        batch = batch.to(device)

        n_id, edge_index, e_id = neighbor_loader(batch.n_id)
        # Map the global node ids of this batch to positions in z
        assoc[n_id] = torch.arange(n_id.size(0), device=device)

        z, last_update = memory(n_id)
        z = gnn(z, last_update, edge_index, data.t[e_id].to(device), data.msg[e_id].to(device))
        pos_out = link_pred(z[assoc[batch.src]], z[assoc[batch.dst]])
        neg_out = link_pred(z[assoc[batch.src]], z[assoc[batch.neg_dst]])

        # Positive scores first, then negative scores; y_true follows the same order
        y_pred = torch.cat([pos_out, neg_out], dim=0).sigmoid().cpu()
        y_true = torch.cat([torch.ones(pos_out.size(0)), torch.zeros(neg_out.size(0))], dim=0)

        aps.append(sl.metrics.average_precision_score(y_true, y_pred))
        aucs.append(sl.metrics.roc_auc_score(y_true, y_pred))

        memory.update_state(batch.src, batch.dst, batch.t, batch.msg)
        neighbor_loader.insert(batch.src, batch.dst)
    return float(torch.tensor(aps).mean()), float(torch.tensor(aucs).mean())

@torch.no_grad()
def embed():
  """
  Computes the embedding of every node (0 .. data.num_nodes - 1) with the current memory, gnn and neighbor loader state.

  Works on the notebook-level globals memory, gnn, link_pred, neighbor_loader and data. Gradient tracking is disabled,
  as this is pure inference (consistent with test()).

  :return: Array with one embedding row per node.
  """
  memory.eval()
  gnn.eval()
  link_pred.eval()

  # Helper vector to map global node indices to local ones (local to this function)
  assoc = torch.empty(data.num_nodes, dtype=torch.long)

  embeddings = []
  for i in range(data.num_nodes):
    node = torch.tensor([i])
    n_id, edge_index, e_id = neighbor_loader(node)
    assoc[n_id] = torch.arange(n_id.size(0))

    z, last_update = memory(n_id)
    z = gnn(z, last_update, edge_index, data.t[e_id], data.msg[e_id])
    # Keep only the row of the queried node, not those of its neighbors
    embeddings.append(z[assoc[node]].numpy(force=True))

  return np.concatenate(embeddings)

In [None]:
def pyg_visualize_temporal_graph(temporal_data: pyg.data.TemporalData) -> nx.DiGraph:
  """
  Draws the edges of a temporal graph (timestamps and messages are ignored) and returns the drawn networkx graph.
  """
  graph = pyg.utils.to_networkx(pyg.data.Data(edge_index=temporal_data.edge_index))

  nx.draw(graph, with_labels=True)
  plt.show()

  return graph

## Data Import

### A: Import from Google Drive

In [None]:
# Mount Google Drive, copy the input directory into the local input directory and unmount again
drive.mount("/content/drive")

!cp -r "$GDRIVE_INPUT_DIR" "$INPUT_DATA_DIR"

drive.flush_and_unmount()

### B: Upload from Local Machine

In [None]:
#uploaded = files.upload()

#for filename in uploaded.keys():
#  target = os.path.join(INPUT_DATA_DIR, filename)
#  !mv "$filename" "$target"

# del uploaded

# Data Preprocessing

## Dataset: Incident management process enriched event log

This event log is of an incident management process extracted from an instance of the ServiceNow platform used by an IT company. See also https://doi.org/10.24432/C57S4H and http://processmining.each.webhostusp.sti.usp.br/index.php/event-logs/.

- **Control Attributes**:
    - *number*: incident identifier; the number of distinct identifiers equals the total number of cases;
    
    - *incident state*: attribute with eight levels controlling incident management process transitions from opening until closing the case;
    
    - *active*: boolean attribute indicating if record is active or closed/canceled;
    
    - *reassignment_count*: number of times incident has changed group or support analysts;
    
    - *reopen_count*: number of times incident resolution was rejected by caller;
    
    - *sys_mod_count*: number of incident updates until that moment;
    
    - *made_sla*: boolean attribute indicating whether the incident exceeded the target SLA or not;

- **Identification and Classification Attributes**:
    - *caller_id*: user identifier affected;
    
    - *opened_by*: user identifier that reported the incident;
    
    - *opened_at*: incident opening date and time;
    
    - *sys_created_by*: user identifier that registered the incident;
    
    - *sys_created_at*: incident creation date and time;
    
    - *sys_updated_by*: user identifier that made update and generated current log record;
    
    - *sys_updated_at*: log update date and time;
    
    - *contact_type*: categorical field with values indicating how incident was reported;
    
    - *location*: location identifier of place being affected;
    
    - *category*: description of the first level of service being affected;
    
    - *subcategory*: description of the second level of service being affected related to first level;
    
    - *u_symptom*: description about user perception of service availability;
    
    - *cmdb_ci*: (configuration item) identifier (not mandatory) referencing homonyms relation and used to report item being affected;
    
    - *impact*: description of the impact caused by incident. Values are: 1-High; 2-Medium; 3-Low;
    
    - *urgency*: description to the urgency asked by user for incident resolution. Values are same as impact;
    
    - *priority*: priority calculated by system based on Impact and urgency;

- **Support, Diagnosis and Other Attributes**:
    - *assignment_group*: identifier referencing the relation Group (database relational model in ServiceNowTM) describing support group in charge of incident;
    
    - *assigned_to*: user identifier in charge of incident;
    
    - *knowledge*: boolean attribute indicating whether a knowledge base document was used to resolve incident;
    
    - *u_priority_confirmation*: boolean attribute indicating whether priority field was double checked;
    
    - *notify*: categorical attribute indicating whether notifications were generated for this incident;
    
    - *problem_id*: identifier referencing homonyms relation describing problem identifier associated with this incident;
    
    - *rfc*: (change request) identifier referencing homonyms relation describing change request identifier associated with incident;
    
    - *vendor*: identifier referencing homonyms relation describing vendor in charge of incident;
    
    - *caused_by*: relation with RFC code responsible for the incident;
    
    - *closed_code*: resolution code of the incident;
    
    - *resolved_by*: user identifier who resolved the incident;
    
    - *resolved_at*: incident resolution date and time;
    
    - *closed_at*: incident close date and time;

### Read Incident management process enriched event log V1

In [None]:
# Explicit column dtypes for reading the CSV; impact, urgency and priority are ordered categoricals
dtypes_servicenow = {
    "number": "string",
    "incident_state": "category",
    "active": "boolean",
    "reassignment_count": "Int16",
    "reopen_count": "Int16",
    "sys_mod_count": "Int16",
    "made_sla": "boolean",
    "caller_id": "category",
    "opened_by": "category",
    "sys_created_by": "category",
    "sys_updated_by": "category",
    "contact_type": "category",
    "location": "category",
    "category": "category",
    "subcategory": "category",
    "u_symptom": "category",
    "cmdb_ci": "category",
    "impact": pd.CategoricalDtype(['1 - High', '2 - Medium', '3 - Low'], ordered=True),
    "urgency": pd.CategoricalDtype(['1 - High', '2 - Medium', '3 - Low'], ordered=True),
    "priority": pd.CategoricalDtype(['1 - Critical', '2 - High', '3 - Moderate', '4 - Low'], ordered=True),
    "assignment_group": "category",
    "assigned_to": "category",
    "knowledge": "boolean",
    "u_priority_confirmation": "boolean",
    "problem_id": "string",
    "rfc": "string",
    "vendor": "category",
    "caused_by": "category",
    "closed_code": "category",
    "resolved_by": "category"
}

# "?" marks missing values in the raw file
df_servicenow = pd.read_csv(
    os.path.join(INPUT_DATA_DIR, "incident_event_log.csv"),
    header=0,
    na_values=["?"],
    dtype=dtypes_servicenow
)

# Convert timestamps
df_servicenow = df_convert_datetimes(
    df_servicenow,
    ["opened_at", "sys_created_at", "sys_updated_at", "resolved_at", "closed_at"],
    dayfirst=True
)

# Convert booleans
df_servicenow = df_convert_bools(
    df_servicenow,
    ["notify"],
    "Send Email",
    "Do Not Notify"
)

# NOTE(review): the return values of the following helpers are discarded - presumably they modify the frame in place; verify
df_drop_duplicate_rows(df_servicenow)
df_drop_na_rows_and_cols(df_servicenow)
df_drop_single_val_cols(df_servicenow)

# Sort and normalize column names
df_format_as_eventlog(df_servicenow, case_col="number", activity_col="incident_state", time_col="sys_updated_at", resource_col="assigned_to", group_col="assignment_group")

# Write the processed event log to the interim directory (XES export not skipped)
df_write_files(df_servicenow, os.path.join(INTERIM_DATA_DIR, "incident_event_log_processed"), skip_xes=False)

print(df_servicenow.dtypes)
df_servicenow

### Label Incident management process enriched event log V1

In [None]:
# Add the prediction labels: next activity, activity duration and remaining cycle time (both durations in days)
df_servicenow = df_label_next_activity(df_servicenow, eoc_token=TOKEN_EOC)
df_servicenow = df_label_activity_duration(df_servicenow, unit='d', eoc_token=pd.Timedelta(0))
df_servicenow = df_label_remaining_cycle_time(df_servicenow, unit='d')

# Persist the labeled event log to the interim directory
df_write_files(df_servicenow, os.path.join(INTERIM_DATA_DIR, "incident_event_log_labeled"))
df_servicenow

### Descriptive Statistics for Incident management process enriched event log V1

In [None]:
# Descriptive statistics over all columns (numeric and non-numeric), written including the index
df_stat = df_servicenow.describe(include='all')
df_write_files(df_stat, os.path.join(OUTPUT_DATA_DIR, "incident_event_log_describe"), index=True)
df_stat

In [None]:
# Plot histograms of the event log columns and store the figure as SVG
df_servicenow.hist(xrot=90, figsize=(10, 10))
plt.tight_layout()
plt.savefig(os.path.join(GRAPHIC_DIR, "incident_event_log_hist.svg"), bbox_inches='tight')
plt.show()

In [None]:
# Case length statistics of the labeled event log
df_case_length_stats(df_servicenow)

In [None]:
# Case duration statistics of the labeled event log
df_case_duration_stats(df_servicenow)

### Clean Incident management process enriched event log V1

In [None]:
# Drop columns that are only known after the fact, are encoded in the sequence anyway or correlate with other columns
df_servicenow_clean = df_servicenow.drop(columns=[
    "active", # A-posteriori
    "made_sla", # A-posteriori
    "case:opened_at", # In sequence encoded
    "case:sys_created_at", # In sequence encoded
    "impact", # Correlates with priority and urgency
    "urgency", # Correlates with priority and urgency
    "case:closed_code", # A-posteriori
    "case:resolved_by", # A-posteriori
    "case:resolved_at", # A-posteriori
    "case:closed_at", # A-posteriori
])

# Remove mostly empty columns
# NOTE(review): the return value is discarded - presumably the helper modifies the frame in place; verify
df_drop_threshold_na_cols(df_servicenow_clean, 0.95)

# Filter event log
df_servicenow_clean = df_filter_case_duration_range(df_servicenow_clean, max=0.95)
df_servicenow_clean = df_filter_date_range(df_servicenow_clean, max="2016-07-01 00:00:00", mode='traces_included')

# Create time features
df_servicenow_clean = df_extract_elapsed_cycle_time(df_servicenow_clean, f"{EVENTLOG_TIMESTAMP}{EVENTLOG_FEAT_TIME_ELAPSED_CYCLE_SUFFIX}:seconds", unit='s')
df_servicenow_clean = df_extract_activity_duration(df_servicenow_clean, f"{EVENTLOG_TIMESTAMP}{EVENTLOG_FEAT_TIME_ELAPSED_PREV_SUFFIX}:seconds", unit='s', na_token=0)

# Calendar features in the relative variant (relative=True)
df_servicenow_clean = df_extract_month_of_year(df_servicenow_clean, f"{EVENTLOG_TIMESTAMP}:month", relative=True)
df_servicenow_clean = df_extract_day_of_year(df_servicenow_clean, f"{EVENTLOG_TIMESTAMP}:dayofyear", relative=True)
df_servicenow_clean = df_extract_day_of_month(df_servicenow_clean, f"{EVENTLOG_TIMESTAMP}:day", relative=True)
df_servicenow_clean = df_extract_day_of_week(df_servicenow_clean, f"{EVENTLOG_TIMESTAMP}:weekday", relative=True)
df_servicenow_clean = df_extract_hour_of_day(df_servicenow_clean, f"{EVENTLOG_TIMESTAMP}:hour", relative=True)

# The same calendar features with the helpers' default setting, stored in ":raw" columns
df_servicenow_clean = df_extract_month_of_year(df_servicenow_clean, f"{EVENTLOG_TIMESTAMP}:month:raw")
df_servicenow_clean = df_extract_day_of_year(df_servicenow_clean, f"{EVENTLOG_TIMESTAMP}:dayofyear:raw")
df_servicenow_clean = df_extract_day_of_month(df_servicenow_clean, f"{EVENTLOG_TIMESTAMP}:day:raw")
df_servicenow_clean = df_extract_day_of_week(df_servicenow_clean, f"{EVENTLOG_TIMESTAMP}:weekday:raw")
df_servicenow_clean = df_extract_hour_of_day(df_servicenow_clean, f"{EVENTLOG_TIMESTAMP}:hour:raw")

# Encode categorical labels
df_servicenow_clean = df_encode_label(df_servicenow_clean, cols=EVENTLOG_LABEL_NEXT_ACT)

# Transform bools to int
df_servicenow_clean = df_convert_bool_to_int(df_servicenow_clean)

# Transform ordered cats to int
df_servicenow_clean = df_convert_ordered_cat_to_int(df_servicenow_clean, relative=True)

# Fill empty str values
df_servicenow_clean = df_fillna_str(df_servicenow_clean, TOKEN_NA)

# Fill empty cat values
df_servicenow_clean = df_fillna_cat(df_servicenow_clean, TOKEN_NA)

# Persist the cleaned event log to the output directory
df_write_files(df_servicenow_clean, os.path.join(OUTPUT_DATA_DIR, "incident_event_log_cleaned"))

df_servicenow_clean

In [None]:
# Strict temporal split with 20 % test length; debias_end=False keeps the test events after the latest start
df_servicenow_train, df_servicenow_test = df_strict_temporal_train_test_split(
    df_servicenow_clean,
    0.2,
    df_find_labels(df_servicenow_clean),
    debias_end=False
)

# Visualize the split before the overlapping test events are removed
axes = df_visualize_strict_temporal_splitting(df_servicenow_train, df_servicenow_test)
plt.tight_layout()
plt.savefig(os.path.join(GRAPHIC_DIR, "incident_event_log_train_test.svg"), bbox_inches='tight')
plt.show()

# Remove overlaps: drops all test rows with a missing label, which includes the rows whose labels were blanked by the split
df_servicenow_test.dropna(axis='index', how='any', subset=df_find_labels(df_servicenow_test), inplace=True)

df_write_files(df_servicenow_train, os.path.join(OUTPUT_DATA_DIR, "incident_event_log_train"))
df_write_files(df_servicenow_test, os.path.join(OUTPUT_DATA_DIR, "incident_event_log_test"))

In [None]:
# Prefix-pad the event attributes of train and test together, so both splits share one padded frame
df_servicenow_dyn = df_prefix_pad_attributes(
    pd.concat([df_servicenow_train, df_servicenow_test]),
    df_find_event_attributes(df_servicenow_clean, exclude_labels=True)
)

df_servicenow_dyn

In [None]:
# Select the padded rows of the training cases (assumes df_servicenow_dyn is indexed by the case identifier - TODO confirm)
df_servicenow_train_dyn = df_servicenow_dyn.loc[df_servicenow_train[EVENTLOG_CASE].unique()]
df_servicenow_train_dyn

In [None]:
# Select the padded rows of the test cases (assumes df_servicenow_dyn is indexed by the case identifier - TODO confirm)
df_servicenow_test_dyn = df_servicenow_dyn.loc[df_servicenow_test[EVENTLOG_CASE].unique()]
df_servicenow_test_dyn

In [None]:
# Naive regression baseline for the next event time label
df_naive_regression_metrics(
    df_servicenow_train,
    df_servicenow_test,
    EVENTLOG_LABEL_NEXT_TIME
)

In [None]:
# Naive regression baseline for the remaining time label
df_naive_regression_metrics(
    df_servicenow_train,
    df_servicenow_test,
    EVENTLOG_LABEL_REM_TIME
)

In [None]:
# Naive classification baseline for the next activity label
df_naive_classification_metrics(
    df_servicenow_train,
    df_servicenow_test,
    EVENTLOG_LABEL_NEXT_ACT
)

### Create Graphs for Incident management process enriched event log V1

#### Create Resource Graphs for Incident management process enriched event log V1

In [None]:
# Build the temporal resource graph from train and test events together; mapping: code -> resource, reverse_mapping: resource -> code
mapping, reverse_mapping, data = df_to_pyg_temporal_data(pd.concat([df_servicenow_train, df_servicenow_test], ignore_index=True), EVENTLOG_RESOURCE)
data

In [None]:
# Split the temporal graph into train / validation / test parts (15 % validation, 15 % test)
train_data, val_data, test_data = data.train_val_test_split(val_ratio=0.15, test_ratio=0.15)

# One loader per part; neg_sampling_ratio=1.0 provides the neg_dst nodes used by train() and test()
train_loader = pyg.loader.TemporalDataLoader(
    train_data,
    batch_size=32,
    neg_sampling_ratio=1.0,
)
val_loader = pyg.loader.TemporalDataLoader(
    val_data,
    batch_size=32,
    neg_sampling_ratio=1.0,
)
test_loader = pyg.loader.TemporalDataLoader(
    test_data,
    batch_size=32,
    neg_sampling_ratio=1.0,
)

# size=8 - presumably the number of most recent neighbors kept per node; no device is passed here
neighbor_loader = pyg.nn.models.tgn.LastNeighborLoader(data.num_nodes, size=8)

In [None]:
# TGN memory for the resource graph
# NOTE(review): unlike gnn and link_pred, memory is not moved to device - fine on CPU, confirm for GPU runs
memory = pyg.nn.models.TGNMemory(
    data.num_nodes,
    data.msg.size(-1),
    memory_dim,
    time_dim,
    message_module=pyg.nn.models.tgn.IdentityMessage(data.msg.size(-1), memory_dim, time_dim),
    aggregator_module=pyg.nn.models.tgn.LastAggregator(),
)

# The embedding module shares the time encoder of the memory
gnn = GraphAttentionEmbedding(
    in_channels=memory_dim,
    out_channels=embedding_dim,
    msg_dim=data.msg.size(-1),
    time_enc=memory.time_enc,
).to(device)

link_pred = LinkPredictor(in_channels=embedding_dim).to(device)

# One optimizer over the parameters of all three modules; the set union removes shared parameters (e.g. the time encoder)
optimizer = torch.optim.Adam(set(memory.parameters()) | set(gnn.parameters()) | set(link_pred.parameters()), lr=0.00001, weight_decay=1e-5)
criterion = torch.nn.BCEWithLogitsLoss()

# Helper vector to map global node indices to local ones.
assoc = torch.empty(data.num_nodes, dtype=torch.long, device=device)

In [None]:
# Train the resource graph model and report validation / test metrics after every epoch
# NOTE(review): range(1, 50) runs 49 epochs - confirm that 50 were not intended
for epoch in range(1, 50):

    loss = train()
    print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}')
    val_ap, val_auc = test(val_loader)
    test_ap, test_auc = test(test_loader)
    print(f'Val AP: {val_ap:.4f}, Val AUC: {val_auc:.4f}')
    print(f'Test AP: {test_ap:.4f}, Test AUC: {test_auc:.4f}')

# Persist both id mappings (JSON turns the integer keys of mapping into strings)
with open(os.path.join(OUTPUT_DATA_DIR, "servicenow_resource_mapping.json"), "w") as f:
  json.dump(mapping, f)
with open(os.path.join(OUTPUT_DATA_DIR, "servicenow_resource_inverse_mapping.json"), "w") as f:
  json.dump(reverse_mapping, f)

# Compute and persist one embedding per resource node
embeddings = embed()
np.save(os.path.join(OUTPUT_DATA_DIR, "servicenow_resource_embeddings.npy"), embeddings, allow_pickle=False)

In [None]:
# Sentinel embedding (all -99.0) for missing / padded resource entries
arr_nan = np.full(EMBEDDING_DIM, -99.0, dtype='float32')

# Replace every resource entry of the padded training frame by its embedding vector (sentinel for None / NA entries)
embeddings_df_train = df_servicenow_train_dyn[EVENTLOG_RESOURCE].map(lambda x: embeddings[reverse_mapping[x]] if x is not None and not pd.isna(x) else arr_nan, na_action=None).to_numpy()
# Stack the per-entry vectors into one dense float32 array with EMBEDDING_DIM as last axis
resource_graph_embeddings_train = np.empty((embeddings_df_train.shape[0], embeddings_df_train.shape[1], EMBEDDING_DIM), dtype='float32')
for i in range(len(embeddings_df_train)):
  resource_graph_embeddings_train[i] = np.stack(embeddings_df_train[i])

resource_graph_embeddings_train.shape

In [None]:
# Same embedding lookup for the padded test frame (uses arr_nan, embeddings and reverse_mapping of the resource graph)
embeddings_df_test = df_servicenow_test_dyn[EVENTLOG_RESOURCE].map(lambda x: embeddings[reverse_mapping[x]] if x is not None and not pd.isna(x) else arr_nan, na_action=None).to_numpy()
# Stack the per-entry vectors into one dense float32 array with EMBEDDING_DIM as last axis
resource_graph_embeddings_test = np.empty((embeddings_df_test.shape[0], embeddings_df_test.shape[1], EMBEDDING_DIM), dtype='float32')
for i in range(len(embeddings_df_test)):
  resource_graph_embeddings_test[i] = np.stack(embeddings_df_test[i])

resource_graph_embeddings_test.shape

#### Create Group Graphs for Incident management process enriched event log V1

In [None]:
# Build the temporal group graph from train and test events together; this overwrites mapping, reverse_mapping and data of the resource graph
mapping, reverse_mapping, data = df_to_pyg_temporal_data(pd.concat([df_servicenow_train, df_servicenow_test], ignore_index=True), EVENTLOG_GROUP)
data

In [None]:
# Split the temporal graph into train / validation / test parts (15 % validation, 15 % test)
train_data, val_data, test_data = data.train_val_test_split(val_ratio=0.15, test_ratio=0.15)

# One loader per part; neg_sampling_ratio=1.0 provides the neg_dst nodes used by train() and test()
train_loader = pyg.loader.TemporalDataLoader(
    train_data,
    batch_size=32,
    neg_sampling_ratio=1.0,
)
val_loader = pyg.loader.TemporalDataLoader(
    val_data,
    batch_size=32,
    neg_sampling_ratio=1.0,
)
test_loader = pyg.loader.TemporalDataLoader(
    test_data,
    batch_size=32,
    neg_sampling_ratio=1.0,
)

# size=8 - presumably the number of most recent neighbors kept per node; no device is passed here
neighbor_loader = pyg.nn.models.tgn.LastNeighborLoader(data.num_nodes, size=8)

In [None]:
# TGN memory for the group graph
# NOTE(review): unlike gnn and link_pred, memory is not moved to device - fine on CPU, confirm for GPU runs
memory = pyg.nn.models.TGNMemory(
    data.num_nodes,
    data.msg.size(-1),
    memory_dim,
    time_dim,
    message_module=pyg.nn.models.tgn.IdentityMessage(data.msg.size(-1), memory_dim, time_dim),
    aggregator_module=pyg.nn.models.tgn.LastAggregator(),
)

# The embedding module shares the time encoder of the memory
gnn = GraphAttentionEmbedding(
    in_channels=memory_dim,
    out_channels=embedding_dim,
    msg_dim=data.msg.size(-1),
    time_enc=memory.time_enc,
).to(device)

link_pred = LinkPredictor(in_channels=embedding_dim).to(device)

# One optimizer over the parameters of all three modules; the set union removes shared parameters (e.g. the time encoder)
optimizer = torch.optim.Adam(set(memory.parameters()) | set(gnn.parameters()) | set(link_pred.parameters()), lr=0.00001, weight_decay=1e-5)
criterion = torch.nn.BCEWithLogitsLoss()

# Helper vector to map global node indices to local ones.
assoc = torch.empty(data.num_nodes, dtype=torch.long, device=device)

In [None]:
# Train the group graph model and report validation / test metrics after every epoch
# NOTE(review): range(1, 50) runs 49 epochs - confirm that 50 were not intended
for epoch in range(1, 50):
    loss = train()
    print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}')
    val_ap, val_auc = test(val_loader)
    test_ap, test_auc = test(test_loader)
    print(f'Val AP: {val_ap:.4f}, Val AUC: {val_auc:.4f}')
    print(f'Test AP: {test_ap:.4f}, Test AUC: {test_auc:.4f}')

# Persist both id mappings (JSON turns the integer keys of mapping into strings)
with open(os.path.join(OUTPUT_DATA_DIR, "servicenow_group_mapping.json"), "w") as f:
  json.dump(mapping, f)
with open(os.path.join(OUTPUT_DATA_DIR, "servicenow_group_inverse_mapping.json"), "w") as f:
  json.dump(reverse_mapping, f)

# Compute and persist one embedding per group node
embeddings = embed()
np.save(os.path.join(OUTPUT_DATA_DIR, "servicenow_group_embeddings.npy"), embeddings, allow_pickle=False)

In [None]:
# Placeholder vector used wherever the group value is None/NA.
arr_nan = np.full(EMBEDDING_DIM, -99.0, dtype='float32')

# Replace every group value by its embedding vector; the result is an object
# array whose cells each hold one vector.
embeddings_df_train = (
    df_servicenow_train_dyn[EVENTLOG_GROUP]
    .map(
        lambda x: arr_nan if x is None or pd.isna(x) else embeddings[reverse_mapping[x]],
        na_action=None,
    )
    .to_numpy()
)

# Densify the object array into one float32 tensor, row by row.
group_graph_embeddings_train = np.empty(
    (embeddings_df_train.shape[0], embeddings_df_train.shape[1], EMBEDDING_DIM),
    dtype='float32',
)
for i in range(embeddings_df_train.shape[0]):
  group_graph_embeddings_train[i] = np.stack(embeddings_df_train[i])

group_graph_embeddings_train.shape

In [None]:
# Replace every group value of the test prefixes by its embedding vector;
# None/NA cells receive the `arr_nan` placeholder defined in the previous cell.
embeddings_df_test = (
    df_servicenow_test_dyn[EVENTLOG_GROUP]
    .map(
        lambda x: arr_nan if x is None or pd.isna(x) else embeddings[reverse_mapping[x]],
        na_action=None,
    )
    .to_numpy()
)

# Densify the object array into one float32 tensor, row by row.
group_graph_embeddings_test = np.empty(
    (embeddings_df_test.shape[0], embeddings_df_test.shape[1], EMBEDDING_DIM),
    dtype='float32',
)
for i in range(embeddings_df_test.shape[0]):
  group_graph_embeddings_test[i] = np.stack(embeddings_df_test[i])

group_graph_embeddings_test.shape

### Build Dataset for Incident management process enriched event log V1

In [None]:
def _servicenow_dataset(df_dyn, df_flat, resource_graph, group_graph):
  """Assemble the (features, labels) dataset for one ServiceNow split.

  Args:
    df_dyn: Prefix-padded event attributes (the ``*_dyn`` frame).
    df_flat: Frame supplying the ``case:*`` attributes and ``label:*`` columns.
    resource_graph: Resource graph embedding array for the split.
    group_graph: Group graph embedding array for the split.

  Returns:
    The ``tf.data.Dataset`` produced by slicing the feature and label dicts.
  """
  def tokens(col):
    # Token-valued attribute; missing entries become TOKEN_PADDING.
    return df_dyn[col].to_numpy(na_value=TOKEN_PADDING)

  def floats(col):
    # Numeric attribute as float32; missing entries become -1.
    return df_dyn[col].to_numpy(dtype='float32', na_value=-1)

  def flags(col):
    # Attribute routed through nullable Int8 to int8; missing entries become -1.
    return df_dyn[col].astype('Int8').to_numpy(dtype='int8', na_value=-1)

  def case_tokens(col):
    # Case-level string attribute with a trailing axis; missing -> TOKEN_NA.
    return np.expand_dims(df_flat[col].astype('string').to_numpy(na_value=TOKEN_NA), axis=-1)

  def column_vector(col, dtype):
    # Column of df_flat in the given dtype with a trailing axis.
    return np.expand_dims(df_flat[col].to_numpy(dtype=dtype), axis=-1)

  features = {
    "activity": tokens(EVENTLOG_ACTIVITY),
    "reassignment_count": floats("reassignment_count"),
    "reopen_count": floats("reopen_count"),
    "sys_mod_count": floats("sys_mod_count"),
    "caller_id": tokens("caller_id"),
    "sys_updated_by": tokens("sys_updated_by"),
    "contact_type": tokens("contact_type"),
    "location": tokens("location"),
    "category": tokens("category"),
    "subcategory": tokens("subcategory"),
    "u_symptom": tokens("u_symptom"),
    "priority": floats("priority"),
    "org_group": tokens("org:group"),
    "org_resource": tokens("org:resource"),
    "org_resource_graph": resource_graph,
    "org_group_graph": group_graph,
    "knowledge": flags("knowledge"),
    "u_priority_confirmation": flags("u_priority_confirmation"),
    "time_timestamp_elapsedcycle": floats("time:timestamp:elapsedcycle:seconds"),
    "time_timestamp_elapsedprev": floats("time:timestamp:elapsedprev:seconds"),
    "time_timestamp_month": floats("time:timestamp:month"),
    "time_timestamp_dayofyear": floats("time:timestamp:dayofyear"),
    "time_timestamp_day": floats("time:timestamp:day"),
    "time_timestamp_weekday": floats("time:timestamp:weekday"),
    "time_timestamp_hour": floats("time:timestamp:hour"),
    "time_timestamp_month_raw": floats("time:timestamp:month:raw"),
    "time_timestamp_dayofyear_raw": floats("time:timestamp:dayofyear:raw"),
    "time_timestamp_day_raw": floats("time:timestamp:day:raw"),
    "time_timestamp_weekday_raw": floats("time:timestamp:weekday:raw"),
    "time_timestamp_hour_raw": floats("time:timestamp:hour:raw"),
    "case_notify": column_vector("case:notify", 'int8'),
    "case_opened_by": case_tokens("case:opened_by"),
    "case_sys_created_by": case_tokens("case:sys_created_by"),
  }
  labels = {
    "next_activity": column_vector("label:concept:name:next", 'int16'),
    "next_time": column_vector("label:time:timestamp:next", 'float32'),
    "remaining_time": column_vector("label:time:timestamp:last", 'float32'),
  }
  return tf.data.Dataset.from_tensor_slices((features, labels))


ds_servicenow_train = _servicenow_dataset(
  df_servicenow_train_dyn,
  df_servicenow_train,
  resource_graph_embeddings_train,
  group_graph_embeddings_train,
)

ds_write_files(ds_servicenow_train, os.path.join(OUTPUT_DATA_DIR, 'incident_event_log_train_dataset'))

ds_servicenow_train

In [None]:
def _servicenow_dataset(df_dyn, df_flat, resource_graph, group_graph):
  """Assemble the (features, labels) dataset for one ServiceNow split.

  Args:
    df_dyn: Prefix-padded event attributes (the ``*_dyn`` frame).
    df_flat: Frame supplying the ``case:*`` attributes and ``label:*`` columns.
    resource_graph: Resource graph embedding array for the split.
    group_graph: Group graph embedding array for the split.

  Returns:
    The ``tf.data.Dataset`` produced by slicing the feature and label dicts.
  """
  def tokens(col):
    # Token-valued attribute; missing entries become TOKEN_PADDING.
    return df_dyn[col].to_numpy(na_value=TOKEN_PADDING)

  def floats(col):
    # Numeric attribute as float32; missing entries become -1.
    return df_dyn[col].to_numpy(dtype='float32', na_value=-1)

  def flags(col):
    # Attribute routed through nullable Int8 to int8; missing entries become -1.
    return df_dyn[col].astype('Int8').to_numpy(dtype='int8', na_value=-1)

  def case_tokens(col):
    # Case-level string attribute with a trailing axis; missing -> TOKEN_NA.
    return np.expand_dims(df_flat[col].astype('string').to_numpy(na_value=TOKEN_NA), axis=-1)

  def column_vector(col, dtype):
    # Column of df_flat in the given dtype with a trailing axis.
    return np.expand_dims(df_flat[col].to_numpy(dtype=dtype), axis=-1)

  features = {
    "activity": tokens(EVENTLOG_ACTIVITY),
    "reassignment_count": floats("reassignment_count"),
    "reopen_count": floats("reopen_count"),
    "sys_mod_count": floats("sys_mod_count"),
    "caller_id": tokens("caller_id"),
    "sys_updated_by": tokens("sys_updated_by"),
    "contact_type": tokens("contact_type"),
    "location": tokens("location"),
    "category": tokens("category"),
    "subcategory": tokens("subcategory"),
    "u_symptom": tokens("u_symptom"),
    "priority": floats("priority"),
    "org_group": tokens("org:group"),
    "org_resource": tokens("org:resource"),
    "org_resource_graph": resource_graph,
    "org_group_graph": group_graph,
    "knowledge": flags("knowledge"),
    "u_priority_confirmation": flags("u_priority_confirmation"),
    "time_timestamp_elapsedcycle": floats("time:timestamp:elapsedcycle:seconds"),
    "time_timestamp_elapsedprev": floats("time:timestamp:elapsedprev:seconds"),
    "time_timestamp_month": floats("time:timestamp:month"),
    "time_timestamp_dayofyear": floats("time:timestamp:dayofyear"),
    "time_timestamp_day": floats("time:timestamp:day"),
    "time_timestamp_weekday": floats("time:timestamp:weekday"),
    "time_timestamp_hour": floats("time:timestamp:hour"),
    "time_timestamp_month_raw": floats("time:timestamp:month:raw"),
    "time_timestamp_dayofyear_raw": floats("time:timestamp:dayofyear:raw"),
    "time_timestamp_day_raw": floats("time:timestamp:day:raw"),
    "time_timestamp_weekday_raw": floats("time:timestamp:weekday:raw"),
    "time_timestamp_hour_raw": floats("time:timestamp:hour:raw"),
    "case_notify": column_vector("case:notify", 'int8'),
    "case_opened_by": case_tokens("case:opened_by"),
    "case_sys_created_by": case_tokens("case:sys_created_by"),
  }
  labels = {
    "next_activity": column_vector("label:concept:name:next", 'int16'),
    "next_time": column_vector("label:time:timestamp:next", 'float32'),
    "remaining_time": column_vector("label:time:timestamp:last", 'float32'),
  }
  return tf.data.Dataset.from_tensor_slices((features, labels))


ds_servicenow_test = _servicenow_dataset(
  df_servicenow_test_dyn,
  df_servicenow_test,
  resource_graph_embeddings_test,
  group_graph_embeddings_test,
)

ds_write_files(ds_servicenow_test, os.path.join(OUTPUT_DATA_DIR, 'incident_event_log_test_dataset'))

ds_servicenow_test

## Dataset: Dataset belonging to the help desk log of an Italian Company

The event log concerns the ticketing management process of the Help desk of an Italian software company. See also https://doi.org/10.4121/uuid:0c60edf1-6f83-4e75-9367-4c63b3e9d5bb.

- *Case ID*: the case identifier

- *Activity*: the activity name

- *Resource*: the resource who performed the action

- Complete Timestamp: the timestamp of the event. Format: YYYY/MM/DD hh:mm:ss.

- *Variant*: case variant

- Variant index: case variant in integer format

- *seriousness*: a seriousness level for the ticket

- *customer*: name of the customer

- *product*: name of the product

- *responsible_section*: name of the responsible section

- *seriousness_2*: a sub-seriousness level

- *service_level*: level of the service

- *service_type*: type of the service

- *support_section*: name of the support section

- *workgroup*: name of the workgroup

### Read Dataset belonging to the help desk log of an Italian Company

In [None]:
# Explicit dtype per CSV column. The two "Value 1".."Value 4" columns are read as
# ordered categoricals so their order is available to later steps.
dtypes_italy = {
    "Case ID": "string",
    "Activity": "category",
    "Resource": "category",
    "Variant": "category",
    "Variant index": "Int32",
    "seriousness": "category",
    "customer": "string",
    "product": "category",
    "responsible_section": "category",
    "seriousness_2": pd.CategoricalDtype(['Value 1', 'Value 2', 'Value 3', 'Value 4'], ordered=True),
    "service_level": pd.CategoricalDtype(['Value 1', 'Value 2', 'Value 3', 'Value 4'], ordered=True),
    "service_type": "category",
    "support_section": "category",
    "workgroup": "category"
}

# Load the raw help desk log; the first row holds the column names.
df_italy = pd.read_csv(
    os.path.join(INPUT_DATA_DIR, "finale.csv"),
    header=0,
    dtype=dtypes_italy
)

# Basic hygiene.
# NOTE(review): only df_drop_duplicate_cols has its result assigned; the three
# calls after it are presumably in-place helpers - confirm, otherwise their
# effect is lost.
df_italy = df_drop_duplicate_cols(df_italy)
df_drop_duplicate_rows(df_italy)
df_drop_na_rows_and_cols(df_italy)
df_drop_single_val_cols(df_italy)

# Convert timestamps
df_italy = df_convert_datetimes(
    df_italy,
    ["Complete Timestamp"],
    yearfirst=True
)
# Sort and normalize column names
# NOTE(review): result is not assigned; presumably modifies df_italy in place.
df_format_as_eventlog(df_italy, case_col="Case ID", activity_col="Activity", time_col="Complete Timestamp", resource_col="Resource", group_col="workgroup")

# Write as Pandas CSV
df_write_files(df_italy, os.path.join(INTERIM_DATA_DIR, "finale_processed"), skip_xes=False)

print(df_italy.dtypes)
df_italy

### Label Dataset belonging to the help desk log of an Italian Company

In [None]:
# Attach the three prediction targets: next activity (TOKEN_EOC passed as the
# end-of-case value), activity duration and remaining cycle time, both with
# unit='d'. A zero Timedelta is passed as the end-of-case value for the duration.
df_italy = df_label_next_activity(df_italy, eoc_token=TOKEN_EOC)
df_italy = df_label_activity_duration(df_italy, unit='d', eoc_token=pd.Timedelta(0))
df_italy = df_label_remaining_cycle_time(df_italy, unit='d')

# Persist the labeled log as an interim artifact.
df_write_files(df_italy, os.path.join(INTERIM_DATA_DIR, "finale_labeled"))
df_italy

### Descriptive Statistics for Dataset belonging to the help desk log of an Italian Company

In [None]:
# Summary statistics over all columns (numeric and non-numeric), written to the
# output directory with the statistic names kept as index.
df_stat = df_italy.describe(include='all')
df_write_files(df_stat, os.path.join(OUTPUT_DATA_DIR, "finale_describe"), index=True)
df_stat

In [None]:
# Histogram grid of the frame's columns with x tick labels rotated by 90 degrees;
# saved as SVG before being shown.
df_italy.hist(xrot=90, figsize=(10, 10))
plt.tight_layout()
plt.savefig(os.path.join(GRAPHIC_DIR, "finale_hist.svg"), bbox_inches='tight')
plt.show()

In [None]:
# Case-length statistics of the labeled log (presumably events per case - see helper).
df_case_length_stats(df_italy)

In [None]:
# Case-duration statistics of the labeled log (presumably time span per case - see helper).
df_case_duration_stats(df_italy)

### Clean Dataset belonging to the help desk log of an Italian Company

In [None]:
# Both variant columns are a-posteriori information, so they are dropped.
df_italy_clean = df_italy.drop(columns=["case:Variant", "case:Variant index"])

# Remove mostly empty columns
df_drop_threshold_na_cols(df_italy_clean, 0.95)

# Filter event log
df_italy_clean = df_filter_case_duration_range(df_italy_clean, max=0.95)

# Create time features: elapsed times in seconds first ...
df_italy_clean = df_extract_elapsed_cycle_time(
    df_italy_clean,
    f"{EVENTLOG_TIMESTAMP}{EVENTLOG_FEAT_TIME_ELAPSED_CYCLE_SUFFIX}:seconds",
    unit='s',
)
df_italy_clean = df_extract_activity_duration(
    df_italy_clean,
    f"{EVENTLOG_TIMESTAMP}{EVENTLOG_FEAT_TIME_ELAPSED_PREV_SUFFIX}:seconds",
    unit='s',
    na_token=0,
)

# ... then the calendar features: one pass with relative=True, followed by a
# second pass writing the ":raw" columns with the extractors' defaults.
_calendar_extractors = (
    (df_extract_month_of_year, "month"),
    (df_extract_day_of_year, "dayofyear"),
    (df_extract_day_of_month, "day"),
    (df_extract_day_of_week, "weekday"),
    (df_extract_hour_of_day, "hour"),
)
for _extract, _part in _calendar_extractors:
    df_italy_clean = _extract(df_italy_clean, f"{EVENTLOG_TIMESTAMP}:{_part}", relative=True)
for _extract, _part in _calendar_extractors:
    df_italy_clean = _extract(df_italy_clean, f"{EVENTLOG_TIMESTAMP}:{_part}:raw")

# Encode categorical labels
df_italy_clean = df_encode_label(df_italy_clean, cols=EVENTLOG_LABEL_NEXT_ACT)

# Transform bools to int, then ordered cats to int
df_italy_clean = df_convert_bool_to_int(df_italy_clean)
df_italy_clean = df_convert_ordered_cat_to_int(df_italy_clean, relative=True)

# Fill empty str values, then empty cat values
df_italy_clean = df_fillna_str(df_italy_clean, TOKEN_NA)
df_italy_clean = df_fillna_cat(df_italy_clean, TOKEN_NA)

df_write_files(df_italy_clean, os.path.join(OUTPUT_DATA_DIR, "finale_cleaned"))

df_italy_clean

In [None]:
# Split the cleaned log into train and test parts. 0.2 is presumably the test
# fraction (TODO confirm against df_strict_temporal_train_test_split); the label
# columns are passed along and end debiasing is switched off.
df_italy_train, df_italy_test = df_strict_temporal_train_test_split(
    df_italy_clean,
    0.2,
    df_find_labels(df_italy_clean),
    debias_end=False
)

# Plot the split and keep a copy as SVG.
axes = df_visualize_strict_temporal_splitting(df_italy_train, df_italy_test)
plt.tight_layout()
plt.savefig(os.path.join(GRAPHIC_DIR, "finale_train_test.svg"), bbox_inches='tight')
plt.show()

# Remove overlaps
# Drops, in place, every test row with a missing value in any label column.
df_italy_test.dropna(axis='index', how='any', subset=df_find_labels(df_italy_test), inplace=True)

df_write_files(df_italy_train, os.path.join(OUTPUT_DATA_DIR, "finale_train"))
df_write_files(df_italy_test, os.path.join(OUTPUT_DATA_DIR, "finale_test"))

In [None]:
# Prefix-pad the event attributes (labels excluded) over the re-combined train
# and test rows, so both splits share one padded layout.
df_italy_dyn = df_prefix_pad_attributes(
    pd.concat([df_italy_train, df_italy_test]),
    df_find_event_attributes(df_italy_clean, exclude_labels=True)
)

df_italy_dyn

In [None]:
# Select the padded rows whose index label is one of the training case ids.
df_italy_train_dyn = df_italy_dyn.loc[df_italy_train[EVENTLOG_CASE].unique()]
df_italy_train_dyn

In [None]:
# Select the padded rows whose index label is one of the test case ids.
df_italy_test_dyn = df_italy_dyn.loc[df_italy_test[EVENTLOG_CASE].unique()]
df_italy_test_dyn

In [None]:
# Naive regression metrics for the EVENTLOG_LABEL_NEXT_TIME label, computed from
# the train and test frames (the exact baseline is defined by the helper).
df_naive_regression_metrics(
    df_italy_train,
    df_italy_test,
    EVENTLOG_LABEL_NEXT_TIME
)

In [None]:
# Naive regression metrics for the EVENTLOG_LABEL_REM_TIME label, computed from
# the train and test frames (the exact baseline is defined by the helper).
df_naive_regression_metrics(
    df_italy_train,
    df_italy_test,
    EVENTLOG_LABEL_REM_TIME
)

In [None]:
# Naive classification metrics for the EVENTLOG_LABEL_NEXT_ACT label, computed
# from the train and test frames (the exact baseline is defined by the helper).
df_naive_classification_metrics(
    df_italy_train,
    df_italy_test,
    EVENTLOG_LABEL_NEXT_ACT
)

### Create Graphs for Dataset belonging to the help desk log of an Italian Company

#### Create Resource Graphs for Dataset belonging to the help desk log of an Italian Company

In [None]:
# Build the temporal graph data keyed on the EVENTLOG_RESOURCE column from the
# combined train and test event logs. `reverse_mapping` is indexed by resource
# value further below to select an embedding row; `mapping` is presumably the
# inverse direction (TODO confirm against df_to_pyg_temporal_data).
mapping, reverse_mapping, data = df_to_pyg_temporal_data(pd.concat([df_italy_train, df_italy_test], ignore_index=True), EVENTLOG_RESOURCE)
data

In [None]:
# Hold out 15% of the events each for validation and testing.
train_data, val_data, test_data = data.train_val_test_split(val_ratio=0.15, test_ratio=0.15)

# One loader per split; all three share the same batch size and negative
# sampling ratio.
train_loader, val_loader, test_loader = (
    pyg.loader.TemporalDataLoader(split, batch_size=32, neg_sampling_ratio=1.0)
    for split in (train_data, val_data, test_data)
)

# Neighbor store covering every node of the graph, with size=8.
neighbor_loader = pyg.nn.models.tgn.LastNeighborLoader(data.num_nodes, size=8)

In [None]:
# TGN memory module. It is placed on `device` like `gnn`, `link_pred` and `assoc`
# below, so that every tensor touched during training lives on the same device
# (leaving it on the default device breaks on a GPU runtime).
memory = pyg.nn.models.TGNMemory(
    data.num_nodes,
    data.msg.size(-1),
    memory_dim,
    time_dim,
    message_module=pyg.nn.models.tgn.IdentityMessage(data.msg.size(-1), memory_dim, time_dim),
    aggregator_module=pyg.nn.models.tgn.LastAggregator(),
).to(device)

# Embedding network; it is handed the memory's own time encoder.
gnn = GraphAttentionEmbedding(
    in_channels=memory_dim,
    out_channels=embedding_dim,
    msg_dim=data.msg.size(-1),
    time_enc=memory.time_enc,
).to(device)

link_pred = LinkPredictor(in_channels=embedding_dim).to(device)

# A set union is used so that a parameter exposed by more than one module
# (presumably the shared time encoder) is handed to the optimizer only once.
optimizer = torch.optim.Adam(set(memory.parameters()) | set(gnn.parameters()) | set(link_pred.parameters()), lr=0.00001, weight_decay=1e-5)
criterion = torch.nn.BCEWithLogitsLoss()

# Helper vector to map global node indices to local ones.
assoc = torch.empty(data.num_nodes, dtype=torch.long, device=device)

In [None]:
# Run epochs 1..49 (the upper bound of range is exclusive) and report the
# validation and test metrics after each one.
for epoch in range(1, 50):
    loss = train()
    print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}')
    (val_ap, val_auc), (test_ap, test_auc) = test(val_loader), test(test_loader)
    print(f'Val AP: {val_ap:.4f}, Val AUC: {val_auc:.4f}')
    print(f'Test AP: {test_ap:.4f}, Test AUC: {test_auc:.4f}')

# Persist both lookup tables next to the embeddings they index.
Path(os.path.join(OUTPUT_DATA_DIR, "italy_resource_mapping.json")).write_text(json.dumps(mapping))
Path(os.path.join(OUTPUT_DATA_DIR, "italy_resource_inverse_mapping.json")).write_text(json.dumps(reverse_mapping))

# Compute the final embeddings and store them as a plain (non-pickled) array.
embeddings = embed()
np.save(
    os.path.join(OUTPUT_DATA_DIR, "italy_resource_embeddings.npy"),
    embeddings,
    allow_pickle=False,
)

In [None]:
# Placeholder vector used wherever the resource value is None/NA.
arr_nan = np.full(EMBEDDING_DIM, -99.0, dtype='float32')

# Replace every resource value by its embedding vector; the result is an object
# array whose cells each hold one vector.
embeddings_df_train = (
    df_italy_train_dyn[EVENTLOG_RESOURCE]
    .map(
        lambda x: arr_nan if x is None or pd.isna(x) else embeddings[reverse_mapping[x]],
        na_action=None,
    )
    .to_numpy()
)

# Densify the object array into one float32 tensor, row by row.
resource_graph_embeddings_train = np.empty(
    (embeddings_df_train.shape[0], embeddings_df_train.shape[1], EMBEDDING_DIM),
    dtype='float32',
)
for i in range(embeddings_df_train.shape[0]):
  resource_graph_embeddings_train[i] = np.stack(embeddings_df_train[i])

resource_graph_embeddings_train.shape

In [None]:
# Replace every resource value of the test prefixes by its embedding vector;
# None/NA cells receive the `arr_nan` placeholder defined in the previous cell.
embeddings_df_test = (
    df_italy_test_dyn[EVENTLOG_RESOURCE]
    .map(
        lambda x: arr_nan if x is None or pd.isna(x) else embeddings[reverse_mapping[x]],
        na_action=None,
    )
    .to_numpy()
)

# Densify the object array into one float32 tensor, row by row.
resource_graph_embeddings_test = np.empty(
    (embeddings_df_test.shape[0], embeddings_df_test.shape[1], EMBEDDING_DIM),
    dtype='float32',
)
for i in range(embeddings_df_test.shape[0]):
  resource_graph_embeddings_test[i] = np.stack(embeddings_df_test[i])

resource_graph_embeddings_test.shape

#### Create Group Graphs for Dataset belonging to the help desk log of an Italian Company

In [None]:
# Build the temporal graph data keyed on the EVENTLOG_GROUP column from the
# combined train and test event logs. This rebinds `mapping`, `reverse_mapping`
# and `data`, replacing the resource-graph objects created earlier.
mapping, reverse_mapping, data = df_to_pyg_temporal_data(pd.concat([df_italy_train, df_italy_test], ignore_index=True), EVENTLOG_GROUP)
data

In [None]:
# Hold out 15% of the events each for validation and testing.
train_data, val_data, test_data = data.train_val_test_split(val_ratio=0.15, test_ratio=0.15)

# One loader per split; all three share the same batch size and negative
# sampling ratio.
train_loader, val_loader, test_loader = (
    pyg.loader.TemporalDataLoader(split, batch_size=32, neg_sampling_ratio=1.0)
    for split in (train_data, val_data, test_data)
)

# Neighbor store covering every node of the graph, with size=8.
neighbor_loader = pyg.nn.models.tgn.LastNeighborLoader(data.num_nodes, size=8)

In [None]:
# TGN memory module. It is placed on `device` like `gnn`, `link_pred` and `assoc`
# below, so that every tensor touched during training lives on the same device
# (leaving it on the default device breaks on a GPU runtime).
memory = pyg.nn.models.TGNMemory(
    data.num_nodes,
    data.msg.size(-1),
    memory_dim,
    time_dim,
    message_module=pyg.nn.models.tgn.IdentityMessage(data.msg.size(-1), memory_dim, time_dim),
    aggregator_module=pyg.nn.models.tgn.LastAggregator(),
).to(device)

# Embedding network; it is handed the memory's own time encoder.
gnn = GraphAttentionEmbedding(
    in_channels=memory_dim,
    out_channels=embedding_dim,
    msg_dim=data.msg.size(-1),
    time_enc=memory.time_enc,
).to(device)

link_pred = LinkPredictor(in_channels=embedding_dim).to(device)

# A set union is used so that a parameter exposed by more than one module
# (presumably the shared time encoder) is handed to the optimizer only once.
optimizer = torch.optim.Adam(set(memory.parameters()) | set(gnn.parameters()) | set(link_pred.parameters()), lr=0.00001, weight_decay=1e-5)
criterion = torch.nn.BCEWithLogitsLoss()

# Helper vector to map global node indices to local ones.
assoc = torch.empty(data.num_nodes, dtype=torch.long, device=device)

In [None]:
# Run epochs 1..49 (the upper bound of range is exclusive) and report the
# validation and test metrics after each one.
for epoch in range(1, 50):
    loss = train()
    print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}')
    (val_ap, val_auc), (test_ap, test_auc) = test(val_loader), test(test_loader)
    print(f'Val AP: {val_ap:.4f}, Val AUC: {val_auc:.4f}')
    print(f'Test AP: {test_ap:.4f}, Test AUC: {test_auc:.4f}')

# Persist both lookup tables next to the embeddings they index.
Path(os.path.join(OUTPUT_DATA_DIR, "italy_group_mapping.json")).write_text(json.dumps(mapping))
Path(os.path.join(OUTPUT_DATA_DIR, "italy_group_inverse_mapping.json")).write_text(json.dumps(reverse_mapping))

# Compute the final embeddings and store them as a plain (non-pickled) array.
embeddings = embed()
np.save(
    os.path.join(OUTPUT_DATA_DIR, "italy_group_embeddings.npy"),
    embeddings,
    allow_pickle=False,
)

In [None]:
# Placeholder vector used wherever the group value is None/NA.
arr_nan = np.full(EMBEDDING_DIM, -99.0, dtype='float32')

# Replace every group value by its embedding vector; the result is an object
# array whose cells each hold one vector.
embeddings_df_train = (
    df_italy_train_dyn[EVENTLOG_GROUP]
    .map(
        lambda x: arr_nan if x is None or pd.isna(x) else embeddings[reverse_mapping[x]],
        na_action=None,
    )
    .to_numpy()
)

# Densify the object array into one float32 tensor, row by row.
group_graph_embeddings_train = np.empty(
    (embeddings_df_train.shape[0], embeddings_df_train.shape[1], EMBEDDING_DIM),
    dtype='float32',
)
for i in range(embeddings_df_train.shape[0]):
  group_graph_embeddings_train[i] = np.stack(embeddings_df_train[i])

group_graph_embeddings_train.shape

In [None]:
# Replace every group value of the test prefixes by its embedding vector;
# None/NA cells receive the `arr_nan` placeholder defined in the previous cell.
embeddings_df_test = (
    df_italy_test_dyn[EVENTLOG_GROUP]
    .map(
        lambda x: arr_nan if x is None or pd.isna(x) else embeddings[reverse_mapping[x]],
        na_action=None,
    )
    .to_numpy()
)

# Densify the object array into one float32 tensor, row by row.
group_graph_embeddings_test = np.empty(
    (embeddings_df_test.shape[0], embeddings_df_test.shape[1], EMBEDDING_DIM),
    dtype='float32',
)
for i in range(embeddings_df_test.shape[0]):
  group_graph_embeddings_test[i] = np.stack(embeddings_df_test[i])

group_graph_embeddings_test.shape

### Build Dataset for Dataset belonging to the help desk log of an Italian Company

In [None]:
def _italy_dataset(df_dyn, df_flat, resource_graph, group_graph):
  """Assemble the (features, labels) dataset for one help desk split.

  Args:
    df_dyn: Prefix-padded event attributes (the ``*_dyn`` frame).
    df_flat: Frame supplying the ``case:*`` attributes and ``label:*`` columns.
    resource_graph: Resource graph embedding array for the split.
    group_graph: Group graph embedding array for the split.

  Returns:
    The ``tf.data.Dataset`` produced by slicing the feature and label dicts.
  """
  def tokens(col):
    # Token-valued attribute; missing entries become TOKEN_PADDING.
    return df_dyn[col].to_numpy(na_value=TOKEN_PADDING)

  def floats(col):
    # Numeric attribute as float32; missing entries become -1.
    return df_dyn[col].to_numpy(dtype='float32', na_value=-1)

  def case_tokens(col):
    # Case-level attribute with a trailing axis; missing -> TOKEN_PADDING.
    return np.expand_dims(df_flat[col].to_numpy(na_value=TOKEN_PADDING), axis=-1)

  def label(col, dtype):
    # Label column in the given dtype with a trailing axis.
    return np.expand_dims(df_flat[col].to_numpy(dtype=dtype), axis=-1)

  features = {
    "activity": tokens("concept:name"),
    "org_resource": tokens("org:resource"),
    "customer": tokens("customer"),
    "product": tokens("product"),
    "seriousness_2": floats("seriousness_2"),
    "service_level": floats("service_level"),
    "service_type": tokens("service_type"),
    "org_group": tokens("org:group"),
    "org_resource_graph": resource_graph,
    "org_group_graph": group_graph,
    "time_timestamp_elapsedcycle": floats("time:timestamp:elapsedcycle:seconds"),
    "time_timestamp_elapsedprev": floats("time:timestamp:elapsedprev:seconds"),
    "time_timestamp_month": floats("time:timestamp:month"),
    "time_timestamp_dayofyear": floats("time:timestamp:dayofyear"),
    "time_timestamp_day": floats("time:timestamp:day"),
    "time_timestamp_weekday": floats("time:timestamp:weekday"),
    "time_timestamp_hour": floats("time:timestamp:hour"),
    "time_timestamp_month_raw": floats("time:timestamp:month:raw"),
    "time_timestamp_dayofyear_raw": floats("time:timestamp:dayofyear:raw"),
    "time_timestamp_day_raw": floats("time:timestamp:day:raw"),
    "time_timestamp_weekday_raw": floats("time:timestamp:weekday:raw"),
    "time_timestamp_hour_raw": floats("time:timestamp:hour:raw"),
    "case_responsible_section": case_tokens("case:responsible_section"),
    "case_support_section": case_tokens("case:support_section"),
  }
  labels = {
    "next_activity": label("label:concept:name:next", 'int16'),
    "next_time": label("label:time:timestamp:next", 'float32'),
    "remaining_time": label("label:time:timestamp:last", 'float32'),
  }
  return tf.data.Dataset.from_tensor_slices((features, labels))


ds_italy_train = _italy_dataset(
  df_italy_train_dyn,
  df_italy_train,
  resource_graph_embeddings_train,
  group_graph_embeddings_train,
)

ds_write_files(ds_italy_train, os.path.join(OUTPUT_DATA_DIR, 'finale_train_dataset'))

ds_italy_train

In [None]:
def _italy_dataset(df_dyn, df_flat, resource_graph, group_graph):
  """Assemble the (features, labels) dataset for one help desk split.

  Args:
    df_dyn: Prefix-padded event attributes (the ``*_dyn`` frame).
    df_flat: Frame supplying the ``case:*`` attributes and ``label:*`` columns.
    resource_graph: Resource graph embedding array for the split.
    group_graph: Group graph embedding array for the split.

  Returns:
    The ``tf.data.Dataset`` produced by slicing the feature and label dicts.
  """
  def tokens(col):
    # Token-valued attribute; missing entries become TOKEN_PADDING.
    return df_dyn[col].to_numpy(na_value=TOKEN_PADDING)

  def floats(col):
    # Numeric attribute as float32; missing entries become -1.
    return df_dyn[col].to_numpy(dtype='float32', na_value=-1)

  def case_tokens(col):
    # Case-level attribute with a trailing axis; missing -> TOKEN_PADDING.
    return np.expand_dims(df_flat[col].to_numpy(na_value=TOKEN_PADDING), axis=-1)

  def label(col, dtype):
    # Label column in the given dtype with a trailing axis.
    return np.expand_dims(df_flat[col].to_numpy(dtype=dtype), axis=-1)

  features = {
    "activity": tokens("concept:name"),
    "org_resource": tokens("org:resource"),
    "customer": tokens("customer"),
    "product": tokens("product"),
    "seriousness_2": floats("seriousness_2"),
    "service_level": floats("service_level"),
    "service_type": tokens("service_type"),
    "org_group": tokens("org:group"),
    "org_resource_graph": resource_graph,
    "org_group_graph": group_graph,
    "time_timestamp_elapsedcycle": floats("time:timestamp:elapsedcycle:seconds"),
    "time_timestamp_elapsedprev": floats("time:timestamp:elapsedprev:seconds"),
    "time_timestamp_month": floats("time:timestamp:month"),
    "time_timestamp_dayofyear": floats("time:timestamp:dayofyear"),
    "time_timestamp_day": floats("time:timestamp:day"),
    "time_timestamp_weekday": floats("time:timestamp:weekday"),
    "time_timestamp_hour": floats("time:timestamp:hour"),
    "time_timestamp_month_raw": floats("time:timestamp:month:raw"),
    "time_timestamp_dayofyear_raw": floats("time:timestamp:dayofyear:raw"),
    "time_timestamp_day_raw": floats("time:timestamp:day:raw"),
    "time_timestamp_weekday_raw": floats("time:timestamp:weekday:raw"),
    "time_timestamp_hour_raw": floats("time:timestamp:hour:raw"),
    "case_responsible_section": case_tokens("case:responsible_section"),
    "case_support_section": case_tokens("case:support_section"),
  }
  labels = {
    "next_activity": label("label:concept:name:next", 'int16'),
    "next_time": label("label:time:timestamp:next", 'float32'),
    "remaining_time": label("label:time:timestamp:last", 'float32'),
  }
  return tf.data.Dataset.from_tensor_slices((features, labels))


ds_italy_test = _italy_dataset(
  df_italy_test_dyn,
  df_italy_test,
  resource_graph_embeddings_test,
  group_graph_embeddings_test,
)

ds_write_files(ds_italy_test, os.path.join(OUTPUT_DATA_DIR, 'finale_test_dataset'))

ds_italy_test

## Dataset: BPIC 2014

This dataset is provided by Rabobank ICT and contains information about the employed ITIL processes. See https://www.win.tue.nl/bpi/2014/challenge.html and https://data.4tu.nl/collections/dff0e630-9c91-4b8e-806d-ec9a3a0f2206

### Incident Details

Based on an estimated Impact and Urgency, done by the SDA, an Incident-record is prioritized and gets a deadline to resolve the service disruption. A Team Leader within the Assignment Group assigns the records to an Operator. The Operator resolves the issue for the customer, or reassigns the record to a colleague if other or more knowledge is needed. After solving the issue for the customer, the Operator relates the Incident-record to the Configuration Item (CausedBy CI) that caused the service disruption. After closing the Incident-record, the customer receives an email to inform that the issue is resolved. See also https://doi.org/10.4121/uuid:3cfa2260-f5c5-44be-afe1-b70d35288d6d.

- **Dataset attributes:**  
  - *CI Name (aff):* Configuration Item (CI) where a disruption of an ICT Service is noticed, this is what we call the "Affected CI". When a Service Desk Agent decides to create an Incident from an Interaction, the Affected CI is copied from the Interaction-record into the Incident-record.

  - *CI Type (aff):* Every CI in the CMDB is related to an Entity Type.

  - *CI Subtype (aff):* Every CI in the CMDB is related to a Subtype, which is related to a CI Type.

  - *Service Comp WBS (aff):* Every CI in the CMDB is related to 1 Service Component, in order to identify which Product Manager is responsible for the CI. A Service Component is equal to a product in the Bill of Material and is part of one or more Services.

  - *Incident ID:* The unique ID of an Incident-record in the Service Management tool.

  - *Status:* Status of the Incident-record.

  - *Impact:* Impact of the service disruption to the customer.

  - *Urgency:* Indication of how urgent the customer needs a solution.

  - *Priority:* Impact and Urgency lead to a Priority for the Assignment Group to resolve the service disruption.

  - *Category:* In order to select and compare similar Incidents in the Service Management tool, all records are categorized. The categorization is derived from the Knowledge Document.

  - *KM number:* A Knowledge Document contains default attribute values for the Interaction-record and a set of questions for a Service Desk Agent to derive which Configuration Item is disrupted and to determine Impact and Urgency for the customer.

  - *Open Time:* Date and time the Incident record was opened in the Service Management tool.

  - *Reopen Time:* Date and time the Incident record was reopened in the Service Management tool. This option is used when an Incident record was closed and within a short period of time it is discovered that the resolution is not effective for the customer.

  - *Resolved Time:* Date and time the Service disruption is resolved.

  - *Closed Time:* Date and time the Incident record is closed in the Service Management tool.

  - *Handle Time (secs):* Time registered to resolve the service disruption.

  - *Closure Code:* Short code to classify the type of Service disruption.
  - *Alert Status:* Alert status of the Incident-record, during its lifecycle, based on defined Service Levels in the Service Management tool.

  - *Reassignments:* Number of Incident Activities with Activity Type "Reassignment".

  - *Related Interactions:* Number of related Interactions to this Incident. Related Interaction Record-number if only one Interaction is related to this Incident.

  - *Related Incidents:* Number of similar Incidents, related to this record. The related Incidents are what we call child-records for this parent Incident-record, which is used for logging all Activities to resolve the service disruption.

  - *Related Changes:* Number of related Changes to this Incident. Related Change Record-number if only one Change is related to this Incident.

  - *CI Name (CBy):* Configuration Item (CI) which caused the disruption of an ICT Service, this is what we call the "CausedBy CI". When an Operator resolves an Incident, the CausedBy CI must be registered before closing the Incident-record.

  - *CI Type (CBy):* Every CI in the CMDB is related to an Entity Type.

  - *CI Subtype (CBy):* Every CI in the CMDB is related to a Subtype, which is related to a CI Type.

  - *ServiceComp WBS (CBy):* Every CI in the CMDB is related to 1 Service Component, in order to identify which Product Manager is responsible for the CI. A Service Component is equal to a product in the Bill of Material.


#### Read BPIC 2014 Incident Details

In [None]:
# Columns of Detail_Incident.csv grouped by the dtype they are parsed with.
# Timestamp and duration columns are not listed here; they are converted
# explicitly after loading.
_incident_category_cols = [
    'CI Name (aff)', 'CI Type (aff)', 'CI Subtype (aff)', 'Service Component WBS (aff)',
    'Category', 'KM number', 'Alert Status', 'Closure Code',
    'CI Name (CBy)', 'CI Type (CBy)', 'CI Subtype (CBy)', 'ServiceComp WBS (CBy)',
]
_incident_count_cols = [
    '# Reassignments', '# Related Interactions', '# Related Incidents', '# Related Changes',
]
_incident_id_cols = ['Incident ID', 'Related Interaction', 'Related Change']

dtypes_bpic14_incident = {
    **dict.fromkeys(_incident_category_cols, 'category'),
    **dict.fromkeys(_incident_count_cols, 'Int16'),
    **dict.fromkeys(_incident_id_cols, 'string'),
    # Ordered scales
    'Status': pd.CategoricalDtype(['Work in progress', 'Closed'], ordered=True),
    'Impact': pd.CategoricalDtype(range(1, 6), ordered=True),
    # The raw data also contains the label '5 - Very Low'; it is merged into '5' below
    'Urgency': pd.CategoricalDtype(['1', '2', '3', '4', '5', '5 - Very Low'], ordered=True),
    'Priority': pd.CategoricalDtype(range(1, 6), ordered=True),
}

# Load the semicolon-separated file (decimal comma); placeholder strings become NA
_incident_csv = os.path.join(INPUT_DATA_BPIC2014_DIR, 'Detail_Incident.csv')
df_bpic14_incident = pd.read_csv(
    _incident_csv,
    sep=';',
    decimal=',',
    header=0,
    na_values=['#MULTIVALUE', '#N/B', 'Unknown'],
    dtype=dtypes_bpic14_incident,
)

# Clean-up helpers are called for their effect on the frame (return values unused)
for _cleanup in (df_drop_duplicate_rows, df_drop_na_rows_and_cols, df_drop_single_val_cols):
    _cleanup(df_bpic14_incident)

# Parse day-first timestamps
_incident_time_cols = ["Open Time", "Reopen Time", "Resolved Time", "Close Time"]
df_bpic14_incident = df_convert_datetimes(df_bpic14_incident, _incident_time_cols, dayfirst=True)

# Handle time is recorded in hours
df_bpic14_incident = df_convert_timedeltas(df_bpic14_incident, ['Handle Time (Hours)'], unit="hours")

# Harmonize inconsistent category spellings
_subtype_cols = ['CI Subtype (aff)', 'CI Subtype (CBy)']
df_bpic14_incident = df_rename_cat_values(df_bpic14_incident, _subtype_cols, 'Iptelephony', 'IPtelephony')
df_bpic14_incident = df_rename_cat_values(df_bpic14_incident, ['Urgency'], '5 - Very Low', '5')

# Order by incident and opening time, renumbering the index
df_bpic14_incident = df_bpic14_incident.sort_values(by=['Incident ID', 'Open Time'], ignore_index=True)

_incident_out = os.path.join(INTERIM_DATA_DIR, "Detail_Incident_processed")
df_write_files(df_bpic14_incident, _incident_out)

print(df_bpic14_incident.dtypes)
df_bpic14_incident

#### Descriptive Statistics for BPIC 2014 Incident Details

In [None]:
# Summary statistics over every column (numeric and non-numeric), persisted with the index
_incident_describe_out = os.path.join(OUTPUT_DATA_DIR, "Detail_Incident_describe")
df_stat = df_bpic14_incident.describe(include='all')
df_write_files(df_stat, _incident_describe_out, index=True)
df_stat

In [None]:
# Histogram grid of the incident details, saved as SVG and shown inline
_incident_hist_svg = os.path.join(GRAPHIC_DIR, "Detail_Incident_hist.svg")
df_bpic14_incident.hist(figsize=(10, 10), xrot=90)
plt.tight_layout()
plt.savefig(_incident_hist_svg, bbox_inches='tight')
plt.show()

### Change Details

If particular service disruptions reoccur more often than usual, a Problem investigation is started, which will lead to an analysis and improvement plan to prevent the service disruption from happening again. The improvement plan leads to a Request for Change (RfC) on the CausedBy CI. All CIs are related to a Service Component; Risk Impact Analysis is done by an Implementation Manager assigned to changes related to the specific Service Component. See also https://doi.org/10.4121/uuid:d5ccb355-ca67-480f-8739-289b9b593aaf.

- **Dataset attributes:**
  - *CI Name (aff):* Configuration Item (CI) which will be affected by this change. A change can be related to more than one Configuration Item.

  - *CI Type (aff):* Every CI in the CMDB is related to an Entity Type.

  - *CI Subtype (aff):* Every CI in the CMDB is related to a Subtype, which is related to a CI Type.

  - *Service Comp WBS (aff):* Every CI in the CMDB is related to 1 Service Component, in order to identify which Product Manager is responsible for the CI. A Service Component is equal to a product in the Bill of Material and is part of one or more Services.

  - *Change ID:* The unique ID of a Change-record in the Service Management tool.

  - *Change Type:* In order to select and compare similar Changes in the Service Management tool, all records are categorized by Change Type.

  - *Risk Assessment:* Impact of change: Major Business Change, Business Change or Minor Change.

  - *Emergency Change:* Indication if the change is an emergency fix.

  - *CAB-approval needed:* Indication if the change needs approval by the Change Advisory Board before implementation.

  - *Planned Start Date:* Date and time the change implementation is planned to start.

  - *Planned End Date:* Date and time the change implementation is planned to end.

  - *Scheduled Downtime Start Date:* Date and time the Service Downtime is scheduled to start.

  - *Scheduled Downtime End Date:* Date and time the Service Downtime is scheduled to end.

  - *Actual Start Date:* Date and time the change implementation is actually started.

  - *Actual End Date:* Date and time the change implementation actually ended.

  - *Requested End Date:* Date and time before the change requestor wants the change to be implemented.

  - *Change record Open Time:* Date and time the Change record was opened in the Service Management tool.

  - *Change record Close Time:* Date and time the Change record is closed in the Service Management tool.

  - *Originated from:* Indication if the change originated from, for instance, Problem research, or is a quick fix for an Incident.

  - *# Related Interactions:* Number of Interactions caused by this change.

  - *# Related Incidents:* Number of Incidents caused by this change.


#### Read BPIC 2014 Change Details

In [None]:
# Columns of Detail_Change.csv grouped by the dtype they are parsed with.
# Date columns are not listed here; they are converted explicitly after loading.
_change_category_cols = [
    'CI Name (aff)', 'CI Type (aff)', 'CI Subtype (aff)', 'Service Component WBS (aff)',
    'Change Type', 'Originated from',
]
_change_flag_cols = ['Emergency Change', 'CAB-approval needed']
_change_count_cols = ['# Related Interactions', '# Related Incidents']

dtypes_bpic14_change = {
    **dict.fromkeys(_change_category_cols, 'category'),
    **dict.fromkeys(_change_flag_cols, 'boolean'),
    **dict.fromkeys(_change_count_cols, 'Int16'),
    'Change ID': 'string',
    # Ordered from lowest to highest risk
    'Risk Assessment': pd.CategoricalDtype(
        ['Minor Change', 'Business Change', 'Major Business Change'],
        ordered=True,
    ),
}

# Load the semicolon-separated file; 'Y'/'N' are read as booleans, '#N/B' as NA
_change_csv = os.path.join(INPUT_DATA_BPIC2014_DIR, 'Detail_Change.csv')
df_bpic14_change = pd.read_csv(
    _change_csv,
    sep=';',
    header=0,
    na_values=['#N/B'],
    true_values=['Y'],
    false_values=['N'],
    dtype=dtypes_bpic14_change,
)

# Parse day-first dates
_change_date_cols = [
    'Planned Start', 'Planned End',
    'Scheduled Downtime Start', 'Scheduled Downtime End',
    'Actual Start', 'Actual End',
    'Requested End Date',
    'Change record Open Time', 'Change record Close Time',
]
df_bpic14_change = df_convert_datetimes(df_bpic14_change, _change_date_cols, dayfirst=True)

# Clean-up helpers are called for their effect on the frame (return values unused)
for _cleanup in (df_drop_duplicate_rows, df_drop_na_rows_and_cols, df_drop_single_val_cols):
    _cleanup(df_bpic14_change)

# Order by Change ID and the time the change record was opened, renumbering the index
df_bpic14_change = df_bpic14_change.sort_values(['Change ID', 'Change record Open Time'], ignore_index=True)

_change_out = os.path.join(INTERIM_DATA_DIR, "Detail_Change_processed")
df_write_files(df_bpic14_change, _change_out)

print(df_bpic14_change.dtypes)
df_bpic14_change

#### Descriptive Statistics for BPIC 2014 Change Details

In [None]:
# Summary statistics over every column (numeric and non-numeric), persisted with the index
_change_describe_out = os.path.join(OUTPUT_DATA_DIR, "Detail_Change_describe")
df_stat = df_bpic14_change.describe(include='all')
df_write_files(df_stat, _change_describe_out, index=True)
df_stat

In [None]:
# Histogram grid of the change details, saved as SVG and shown inline
_change_hist_svg = os.path.join(GRAPHIC_DIR, "Detail_Change_hist.svg")
df_bpic14_change.hist(figsize=(10, 10), xrot=90)
plt.tight_layout()
plt.savefig(_change_hist_svg, bbox_inches='tight')
plt.show()

### Interaction Details

In order to manage calls or mails from customers (Rabobank colleagues) to the Service Desk concerning disruptions of ICT-services, a Service Desk Agent (SDA) logs calls/mails in an Interaction-record and relates it to an Affected Configuration Item (CI). The SDA can either resolve the issue for the customer directly (First Call Resolution) or create an Incident-record to assign the issue to an Assignment Group with more technical knowledge to resolve the service disruption. If similar calls/mails are received by the Service Desk, an SDA can decide to relate multiple Interaction-records to one Incident-record. Further logging of Activities to resolve the service disruption will be done in the Incident-record. See also https://doi.org/10.4121/uuid:3d5ae0ce-198c-4b5c-b0f9-60d3035d07bf.


- **Dataset attributes:**
  - *CI Name (aff):* Configuration Item (CI) where a disruption of an ICT Service is noticed, this is what we call the "Affected CI". A Service Desk Agent always uses questions in a Knowledge Document (identified by a KM number) to find the correct CI in the Configuration Item Database (CMDB).

  - *CI Type (aff):* Every CI in the CMDB is related to an Entity Type.

  - *CI Subtype (aff):* Every CI in the CMDB is related to a Subtype, which is related to a CI Type.

  - *Service Comp WBS (aff):* Every CI in the CMDB is related to 1 Service Component, in order to identify which Product Manager is responsible for the CI. A Service Component is equal to a product in the Bill of Material and is part of one or more Services.

  - *Interaction ID:* The unique ID of an Interaction-record in the Service Management tool.

  - *Status:* Status of the Interaction-record.

  - *Impact:* Impact of the service disruption to the customer.

  - *Urgency:* Indication of how urgent the customer needs a solution.

  - *Priority:* Impact and Urgency lead to a Priority for the Assignment Group to resolve the service disruption.

  - *Category:* In order to select and compare similar Interactions in the Service Management tool, all records are categorized. The categorization is derived from the Knowledge Document.

  - *KM number:* A Knowledge Document contains default attribute values for the Interaction-record and a set of questions for a Service Desk Agent to derive which Configuration Item is disrupted and to determine Impact and Urgency for the customer.

  - *Open Time (First Touch):* Date and time the Interaction record was opened in the Service Management tool.

  - *Close Time:* Date and time the Interaction record is closed in the Service Management tool.

  - *Closure Code:* Short code to classify the type of service disruption.

  - *First Call Resolution:* Flag which indicates if the Service Desk Agent was able to provide the customer with a workaround for the Service disruption (Y) or if it was necessary to assign the record to a specialist by creating an Incident record.

  - *Handle Time (secs):* Time registered to resolve the service disruption.

  - *Related Incident Record:* Number of the Incident created from this Interaction-record.

#### Read BPIC 2014 Interaction Details

In [None]:
# Columns of Detail_Interaction.csv grouped by the dtype they are parsed with.
# Timestamp and duration columns are not listed here; they are converted
# explicitly after loading.
_interaction_category_cols = [
    'CI Name (aff)', 'CI Type (aff)', 'CI Subtype (aff)', 'Service Comp WBS (aff)',
    'Status', 'Impact', 'Urgency', 'Category', 'KM number', 'Closure Code',
]
_interaction_id_cols = ['Interaction ID', 'Related Incident']

dtypes_bpic14_interaction = {
    **dict.fromkeys(_interaction_category_cols, 'category'),
    **dict.fromkeys(_interaction_id_cols, 'string'),
    'Priority': pd.CategoricalDtype(range(1, 6), ordered=True),
    'First Call Resolution': 'boolean',
}

# Load the semicolon-separated file; 'Y'/'N' are read as booleans, placeholders as NA
_interaction_csv = os.path.join(INPUT_DATA_BPIC2014_DIR, 'Detail_Interaction.csv')
df_bpic14_interaction = pd.read_csv(
    _interaction_csv,
    sep=';',
    header=0,
    na_values=['#MULTIVALUE', '#N/B'],
    true_values=['Y'],
    false_values=['N'],
    dtype=dtypes_bpic14_interaction,
)

# Parse day-first timestamps
_interaction_time_cols = ["Open Time (First Touch)", "Close Time"]
df_bpic14_interaction = df_convert_datetimes(df_bpic14_interaction, _interaction_time_cols, dayfirst=True)

# Handle time is recorded in seconds
df_bpic14_interaction = df_convert_timedeltas(df_bpic14_interaction, ["Handle Time (secs)"], unit="seconds")

# Clean-up helpers are called for their effect on the frame (return values unused)
for _cleanup in (df_drop_duplicate_rows, df_drop_na_rows_and_cols, df_drop_single_val_cols):
    _cleanup(df_bpic14_interaction)

# Harmonize inconsistent spellings of the closure codes
df_bpic14_interaction = df_rename_cat_values(df_bpic14_interaction, 'Closure Code', 'unknown', 'Unknown')
df_bpic14_interaction = df_rename_cat_values(df_bpic14_interaction, 'Closure Code', ['SOFTWARE', 'software'], 'Software')

# Order by Interaction ID and first-touch open time, renumbering the index
df_bpic14_interaction = df_bpic14_interaction.sort_values(
    ['Interaction ID', 'Open Time (First Touch)'],
    ignore_index=True,
)

_interaction_out = os.path.join(INTERIM_DATA_DIR, "Detail_Interaction_processed")
df_write_files(df_bpic14_interaction, _interaction_out)

print(df_bpic14_interaction.dtypes)
df_bpic14_interaction

#### Descriptive Statistics for BPIC 2014 Interaction Details

In [None]:
# Summary statistics over every column (numeric and non-numeric), persisted with the index
_interaction_describe_out = os.path.join(OUTPUT_DATA_DIR, "Detail_Interaction_describe")
df_stat = df_bpic14_interaction.describe(include='all')
df_write_files(df_stat, _interaction_describe_out, index=True)
df_stat

In [None]:
# Histogram grid of the interaction details, saved as SVG and shown inline
_interaction_hist_svg = os.path.join(GRAPHIC_DIR, "Detail_Interaction_hist.svg")
df_bpic14_interaction.hist(figsize=(10, 10), xrot=90)
plt.tight_layout()
plt.savefig(_interaction_hist_svg, bbox_inches='tight')
plt.show()

### Incident Activity

- *Incident ID:* The unique ID of an Incident-record in the Service Management tool.

- *DateStamp:* Date and time when this specific Incident Activity started.

- *IncidentActivity_Number:* Unique ID for an Incident Activity.

- *IncidentActivity_Type:* Short code to identify which type of Incident Activity took place.

- *Interaction ID:* The unique ID of an Interaction-record in the Service Management tool.

- *Assignment Group:* The team responsible for this Incident Activity.

- *KM number:* A Knowledge Document contains default attribute values for the Interaction-record and a set of questions for a Service Desk Agent to derive which Configuration Item is disrupted and to determine Impact and Urgency for the customer.

#### Read BPIC 2014 Incident Activity

See https://doi.org/10.4121/uuid:86977bac-f874-49cf-8337-80f26bf5d2ef.

In [None]:
# Column dtypes for Detail_Incident_Activity.csv; DateStamp is converted after loading
dtypes_bpic14 = {
    **dict.fromkeys(['Incident ID', 'IncidentActivity_Number', 'Interaction ID'], 'string'),
    **dict.fromkeys(['IncidentActivity_Type', 'Assignment Group', 'KM number'], 'category'),
}

# Load the semicolon-separated activity log (decimal comma); '#N/B' is read as NA
_activity_csv = os.path.join(INPUT_DATA_BPIC2014_DIR, 'Detail_Incident_Activity.csv')
df_bpic14 = pd.read_csv(
    _activity_csv,
    sep=';',
    decimal=',',
    header=0,
    na_values=['#N/B'],
    dtype=dtypes_bpic14,
)

# Parse day-first timestamps
df_bpic14 = df_convert_datetimes(df_bpic14, ["DateStamp"], dayfirst=True)

# Enrich every activity with the detail tables, one left join after another.
# Each entry is (lookup table indexed by its ID and prefixed by origin, join column).
# The change and interaction joins rely on columns contributed by the incident join,
# so the order matters. No `validate` constraint is applied to any of the joins.
_detail_lookups = [
    (df_bpic14_incident.set_index('Incident ID').add_prefix("incident_"), 'Incident ID'),
    (df_bpic14_change.set_index('Change ID').add_prefix("change_"), 'incident_Related Change'),
    (df_bpic14_interaction.set_index('Interaction ID').add_prefix("interaction_"), 'incident_Related Interaction'),
]
for _lookup, _join_col in _detail_lookups:
    df_bpic14 = df_bpic14.join(_lookup, how='left', on=_join_col)

# Clean-up helpers are called for their effect on the frame (return values unused)
for _cleanup in (df_drop_duplicate_rows, df_drop_na_rows_and_cols, df_drop_single_val_cols):
    _cleanup(df_bpic14)

# Bring the frame into event log form (called for its effect; return value unused)
df_format_as_eventlog(
    df_bpic14,
    case_col="Incident ID",
    activity_col="IncidentActivity_Type",
    time_col="DateStamp",
    group_col="Assignment Group",
)

# Persist the processed log; XES export is explicitly not skipped
_activity_out = os.path.join(INTERIM_DATA_DIR, "Detail_Incident_Activity_processed")
df_write_files(df_bpic14, _activity_out, skip_xes=False)

print(df_bpic14.dtypes)
df_bpic14

#### Label BPIC 2014 Incident Activity

In [None]:
# Attach the prediction targets in sequence: next activity, activity duration
# and remaining cycle time (both durations expressed in days).
df_bpic14 = (
    df_bpic14
    .pipe(df_label_next_activity, eoc_token=TOKEN_EOC)
    .pipe(df_label_activity_duration, unit='d', eoc_token=pd.Timedelta(0))
    .pipe(df_label_remaining_cycle_time, unit='d')
)

# The labeled log is intentionally not written to disk here:
#df_write_files(df_bpic14, os.path.join(INTERIM_DATA_DIR, "Detail_Incident_Activity_labeled"))
df_bpic14


#### Descriptive Statistics for BPIC 2014 Incident Activity

In [None]:
# Summary statistics over every column (numeric and non-numeric), persisted with the index
_activity_describe_out = os.path.join(OUTPUT_DATA_DIR, "Detail_Incident_Activity_describe")
df_stat = df_bpic14.describe(include='all')
df_write_files(df_stat, _activity_describe_out, index=True)
df_stat

In [None]:
# Histogram grid of the joined activity log (larger canvas), saved as SVG and shown inline
_activity_hist_svg = os.path.join(GRAPHIC_DIR, "Detail_Incident_Activity_hist.svg")
df_bpic14.hist(figsize=(20, 20), xrot=90)
plt.tight_layout()
plt.savefig(_activity_hist_svg, bbox_inches='tight')
plt.show()

In [None]:
# Case length statistics for the BPIC 2014 event log, produced by the
# df_case_length_stats helper (not shown in this section; presumably events per case).
df_case_length_stats(df_bpic14)

In [None]:
# Case duration statistics for the BPIC 2014 event log, produced by the
# df_case_duration_stats helper (not shown in this section).
df_case_duration_stats(df_bpic14)

#### Clean BPIC 2014 Incident Activities

In [None]:
# Filter open incidents
df_bpic14_clean = df_bpic14[df_bpic14["case:incident_Status"] == "Closed"]

df_bpic14_clean = df_bpic14_clean.drop(columns=[
    "IncidentActivity_Number", # ID
    "case:incident_# Reassignments", # Correlated to change_CI Name (aff)
    "case:change_CAB-approval needed", # Correlated to change_CI Name (aff) and case:change_Change Type
    "case:interaction_CI Name (aff)", # Correlated to case:incident_CI Name (aff)
    "case:interaction_CI Type (aff)", # Correlated to case:incident_CI Type (aff)
    "case:interaction_CI Subtype (aff)", # Correlated to case:incident_CI Subtype (aff)
    "case:incident_ServiceComp WBS (CBy)", # Correlated to case:incident_Service Component WBS (aff); a-posteriori
    "case:interaction_Service Comp WBS (aff)", # Correlated to 'case:incident_Service Component WBS (aff)'
    "case:incident_Impact", # Correlated to case:incident_Urgency and case:incident_Priority
    "case:incident_Urgency", # Correlated to case:incident_Impact and case:incident_Priority
    "case:interaction_Category", # Correlated to case:interaction_Category
    "case:interaction_Closure Code", # Correlated to case:interaction_Closure Code
    "case:interaction_Impact", # Correlated to case:interaction_Urgency and case:interaction_Priority
    "case:interaction_Urgency", # Correlated to case:interaction_Impact and case:interaction_Priority
    "case:interaction_KM number", # Correlated to case:incident_KM number
    "case:incident_Open Time", # In sequence encoded
    "case:incident_Reopen Time", # In sequence encoded, a-posteriori
    "case:incident_Resolved Time", # A-posteriori
    "case:incident_Close Time", # A-posteriori
    "case:incident_Handle Time (Hours)", # A-posteriori,
    "case:incident_Closure Code", # A-posteriori
    "case:incident_# Related Interactions", # A-posteriori
    "case:incident_Related Interaction", # ID
    "case:incident_# Related Incidents", # A-posteriori
    "case:incident_# Related Changes", # A-posteriori
    "case:incident_Related Change", # ID
    "case:interaction_Related Incident", # ID
    "case:interaction_Handle Time (secs)", # A-posteriori
    "case:Interaction ID", # ID
    "case:incident_Status", # Single value
    "case:change_Actual Start",
    "case:change_Actual End",
    "case:change_Change record Open Time",
    "change_Change record Close Time",
    "case:interaction_Open Time (First Touch)",
    "case:change_Planned Start",
    "case:change_Planned End",
    "case:change_Scheduled Downtime Start",
    "case:change_Scheduled Downtime End",
    "case:change_Requested End Date",
    "case:interaction_Close Time"
])

# Filter event log
df_bpic14_clean = df_filter_case_duration_range(df_bpic14_clean, max=0.95)
df_bpic14_clean = df_filter_date_range(df_bpic14_clean, min="2013-09-01 00:00:00", mode='traces_included')

# Remove mostly empty columns
df_drop_threshold_na_cols(df_bpic14_clean, 0.95)

# Create time features
df_bpic14_clean = df_extract_elapsed_cycle_time(df_bpic14_clean, f"{EVENTLOG_TIMESTAMP}{EVENTLOG_FEAT_TIME_ELAPSED_CYCLE_SUFFIX}:seconds", unit='s')
df_bpic14_clean = df_extract_activity_duration(df_bpic14_clean, f"{EVENTLOG_TIMESTAMP}{EVENTLOG_FEAT_TIME_ELAPSED_PREV_SUFFIX}:seconds", unit='s', na_token=0)

df_bpic14_clean = df_extract_month_of_year(df_bpic14_clean, f"{EVENTLOG_TIMESTAMP}:month", relative=True)
df_bpic14_clean = df_extract_day_of_year(df_bpic14_clean, f"{EVENTLOG_TIMESTAMP}:dayofyear", relative=True)
df_bpic14_clean = df_extract_day_of_month(df_bpic14_clean, f"{EVENTLOG_TIMESTAMP}:day", relative=True)
df_bpic14_clean = df_extract_day_of_week(df_bpic14_clean, f"{EVENTLOG_TIMESTAMP}:weekday", relative=True)
df_bpic14_clean = df_extract_hour_of_day(df_bpic14_clean, f"{EVENTLOG_TIMESTAMP}:hour", relative=True)

df_bpic14_clean = df_extract_month_of_year(df_bpic14_clean, f"{EVENTLOG_TIMESTAMP}:month:raw")
df_bpic14_clean = df_extract_day_of_year(df_bpic14_clean, f"{EVENTLOG_TIMESTAMP}:dayofyear:raw")
df_bpic14_clean = df_extract_day_of_month(df_bpic14_clean, f"{EVENTLOG_TIMESTAMP}:day:raw")
df_bpic14_clean = df_extract_day_of_week(df_bpic14_clean, f"{EVENTLOG_TIMESTAMP}:weekday:raw")
df_bpic14_clean = df_extract_hour_of_day(df_bpic14_clean, f"{EVENTLOG_TIMESTAMP}:hour:raw")

# Encode categorical labels
df_bpic14_clean = df_encode_label(df_bpic14_clean, cols=EVENTLOG_LABEL_NEXT_ACT)

# Transform bools to int
df_bpic14_clean = df_convert_bool_to_int(df_bpic14_clean)

# Transform ordered cats to int
df_bpic14_clean = df_convert_ordered_cat_to_int(df_bpic14_clean, relative=True)

# Fill empty str values
df_bpic14_clean = df_fillna_str(df_bpic14_clean, TOKEN_NA)

# Fill empty cat values
df_bpic14_clean = df_fillna_cat(df_bpic14_clean, TOKEN_NA)

df_write_files(df_bpic14_clean, os.path.join(OUTPUT_DATA_DIR, "Detail_Incident_Activity_cleaned"))

df_bpic14_clean

In [None]:
# Strict temporal split of the cleaned log with a test fraction of 0.2
_clean_label_cols = df_find_labels(df_bpic14_clean)
df_bpic14_train, df_bpic14_test = df_strict_temporal_train_test_split(
    df_bpic14_clean, 0.2, _clean_label_cols, debias_end=False
)

# Plot the split and keep a copy of the figure
axes = df_visualize_strict_temporal_splitting(df_bpic14_train, df_bpic14_test)
_split_svg = os.path.join(GRAPHIC_DIR, "Detail_Incident_Activity_train_test.svg")
plt.tight_layout()
plt.savefig(_split_svg, bbox_inches='tight')
plt.show()

# Remove overlaps: test events with a missing value in any label column are dropped
df_bpic14_test.dropna(subset=df_find_labels(df_bpic14_test), how='any', axis='index', inplace=True)

# Exclude two very long cases from the training data
_excluded_cases = ["IM0000944", "IM0018374"]
_is_excluded = df_bpic14_train[EVENTLOG_CASE].isin(_excluded_cases)
df_bpic14_train = df_bpic14_train[~_is_excluded]

for _split_df, _split_name in (
    (df_bpic14_train, "Detail_Incident_Activity_train"),
    (df_bpic14_test, "Detail_Incident_Activity_test"),
):
    df_write_files(_split_df, os.path.join(OUTPUT_DATA_DIR, _split_name))

In [None]:
# Prefix-pad the event attributes (labels excluded) over the train and test events together
_bpic14_split_events = pd.concat([df_bpic14_train, df_bpic14_test])
_bpic14_event_attrs = df_find_event_attributes(df_bpic14_clean, exclude_labels=True)
df_bpic14_dyn = df_prefix_pad_attributes(_bpic14_split_events, _bpic14_event_attrs)

df_bpic14_dyn

In [None]:
# Select the padded rows belonging to the training cases
_train_case_ids = df_bpic14_train[EVENTLOG_CASE].unique()
df_bpic14_train_dyn = df_bpic14_dyn.loc[_train_case_ids]
df_bpic14_train_dyn

In [None]:
# Select the padded rows belonging to the test cases
_test_case_ids = df_bpic14_test[EVENTLOG_CASE].unique()
df_bpic14_test_dyn = df_bpic14_dyn.loc[_test_case_ids]
df_bpic14_test_dyn

In [None]:
# Naive regression baseline on the next-time label
df_naive_regression_metrics(df_bpic14_train, df_bpic14_test, EVENTLOG_LABEL_NEXT_TIME)

In [None]:
# Naive regression baseline on the remaining-time label
df_naive_regression_metrics(df_bpic14_train, df_bpic14_test, EVENTLOG_LABEL_REM_TIME)

In [None]:
# Naive classification baseline on the next-activity label
df_naive_classification_metrics(df_bpic14_train, df_bpic14_test, EVENTLOG_LABEL_NEXT_ACT)

#### Create Graphs for BPIC 2014

##### Create Group Graphs for BPIC 2014

In [None]:
# Build the temporal graph data over the group column from all train and test events
_bpic14_graph_events = pd.concat([df_bpic14_train, df_bpic14_test], ignore_index=True)
mapping, reverse_mapping, data = df_to_pyg_temporal_data(_bpic14_graph_events, EVENTLOG_GROUP)
data

In [None]:
# 70 % / 15 % / 15 % split of the temporal graph data
train_data, val_data, test_data = data.train_val_test_split(val_ratio=0.15, test_ratio=0.15)

# One loader per split, all with the same batch size and negative sampling ratio
train_loader, val_loader, test_loader = (
    pyg.loader.TemporalDataLoader(_split, batch_size=32, neg_sampling_ratio=1.0)
    for _split in (train_data, val_data, test_data)
)

# NOTE(review): no device is passed to the neighbor loader, unlike the modules
# created for training -- confirm this is intended when training on an accelerator.
neighbor_loader = pyg.nn.models.tgn.LastNeighborLoader(data.num_nodes, size=8)

In [None]:
# Set up the TGN components for link prediction on the group graph:
# node memory, attention-based embedding network, link predictor, optimizer and loss.

# Width of the messages stored in the temporal data
_msg_dim = data.msg.size(-1)

# The memory is moved to `device` like the other modules. It shares its time
# encoder with `gnn`, and leaving it on the CPU while `gnn`, `link_pred` and
# `assoc` live on `device` causes a device mismatch on non-CPU devices.
memory = pyg.nn.models.TGNMemory(
    data.num_nodes,
    _msg_dim,
    memory_dim,
    time_dim,
    message_module=pyg.nn.models.tgn.IdentityMessage(_msg_dim, memory_dim, time_dim),
    aggregator_module=pyg.nn.models.tgn.LastAggregator(),
).to(device)

gnn = GraphAttentionEmbedding(
    in_channels=memory_dim,
    out_channels=embedding_dim,
    msg_dim=_msg_dim,
    time_enc=memory.time_enc,
).to(device)

link_pred = LinkPredictor(in_channels=embedding_dim).to(device)

# The set union ensures a parameter reachable from more than one module is passed only once
optimizer = torch.optim.Adam(set(memory.parameters()) | set(gnn.parameters()) | set(link_pred.parameters()), lr=0.00001, weight_decay=1e-5)
criterion = torch.nn.BCEWithLogitsLoss()

# Helper vector to map global node indices to local ones.
assoc = torch.empty(data.num_nodes, dtype=torch.long, device=device)

In [None]:
# Train and evaluate after every epoch.
# NOTE: range(1, 50) covers epochs 1..49, i.e. 49 epochs in total.
for epoch in range(1, 50):
    loss = train()
    print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}')
    val_ap, val_auc = test(val_loader)
    test_ap, test_auc = test(test_loader)
    print(f'Val AP: {val_ap:.4f}, Val AUC: {val_auc:.4f}')
    print(f'Test AP: {test_ap:.4f}, Test AUC: {test_auc:.4f}')

# Persist both group mappings as JSON
for _json_name, _json_obj in (
    ("bpic14_group_mapping.json", mapping),
    ("bpic14_group_inverse_mapping.json", reverse_mapping),
):
    with open(os.path.join(OUTPUT_DATA_DIR, _json_name), "w") as f:
        json.dump(_json_obj, f)

# Compute the embeddings and store them as a plain (non-pickled) NumPy file
embeddings = embed()
_embeddings_npy = os.path.join(OUTPUT_DATA_DIR, "bpic14_group_embeddings.npy")
np.save(_embeddings_npy, embeddings, allow_pickle=False)

In [None]:
# Sentinel vector (all -99.0) used wherever no group is available
arr_nan = np.full(EMBEDDING_DIM, -99.0, dtype='float32')


def _lookup_group_embedding(x):
    """Return the embedding of group ``x``, or the sentinel vector for missing values."""
    if x is None or pd.isna(x):
        return arr_nan
    return embeddings[int(reverse_mapping[x])]


# Replace every cell of the padded group columns by its embedding vector
embeddings_df_train = df_bpic14_train_dyn[EVENTLOG_GROUP].map(_lookup_group_embedding, na_action=None).to_numpy()

# Stack the per-cell vectors into one dense float32 array, row by row
_n_rows, _n_steps = embeddings_df_train.shape[:2]
group_graph_embeddings_train = np.empty((_n_rows, _n_steps, EMBEDDING_DIM), dtype='float32')
for _row_idx, _row in enumerate(embeddings_df_train):
    group_graph_embeddings_train[_row_idx] = np.stack(_row)

group_graph_embeddings_train.shape

In [None]:
# Sentinel vector (all -99.0) used wherever no group is available
arr_nan = np.full(EMBEDDING_DIM, -99.0, dtype='float32')


def _lookup_group_embedding(x):
    """Return the embedding of group ``x``, or the sentinel vector for missing values."""
    if x is None or pd.isna(x):
        return arr_nan
    return embeddings[int(reverse_mapping[x])]


# Replace every cell of the padded group columns by its embedding vector
embeddings_df_test = df_bpic14_test_dyn[EVENTLOG_GROUP].map(_lookup_group_embedding, na_action=None).to_numpy()

# Stack the per-cell vectors into one dense float32 array, row by row
_n_rows, _n_steps = embeddings_df_test.shape[:2]
group_graph_embeddings_test = np.empty((_n_rows, _n_steps, EMBEDDING_DIM), dtype='float32')
for _row_idx, _row in enumerate(embeddings_df_test):
    group_graph_embeddings_test[_row_idx] = np.stack(_row)

group_graph_embeddings_test.shape

#### Create Dataset from BPIC 2014

In [None]:
# Event-level numeric features: dataset key -> column of the prefix-padded frame.
# Missing values are encoded as -1.
_seq_numeric_cols = {
    "time_timestamp_elapsedcycle": "time:timestamp:elapsedcycle:seconds",
    "time_timestamp_elapsedprev": "time:timestamp:elapsedprev:seconds",
    "time_timestamp_month": "time:timestamp:month",
    "time_timestamp_dayofyear": "time:timestamp:dayofyear",
    "time_timestamp_day": "time:timestamp:day",
    "time_timestamp_weekday": "time:timestamp:weekday",
    "time_timestamp_hour": "time:timestamp:hour",
    "time_timestamp_month_raw": "time:timestamp:month:raw",
    "time_timestamp_dayofyear_raw": "time:timestamp:dayofyear:raw",
    "time_timestamp_day_raw": "time:timestamp:day:raw",
    "time_timestamp_weekday_raw": "time:timestamp:weekday:raw",
    "time_timestamp_hour_raw": "time:timestamp:hour:raw",
}

# Case-level features in output order: (dataset key, column, is_numeric).
# Numeric columns become float32 with -1 for NA; the others use the padding token for NA.
_case_cols = [
    ("case_km_number", "case:KM number", False),
    ("case_incident_ci_type_aff", "case:incident_CI Type (aff)", False),
    ("case_incident_ci_subtype_aff", "case:incident_CI Subtype (aff)", False),
    ("case_incident_service_component_aff", "case:incident_Service Component WBS (aff)", False),
    ("case_incident_priority", "case:incident_Priority", True),
    ("case_incident_category", "case:incident_Category", False),
    ("case_incident_ci_name_cby", "case:incident_CI Name (CBy)", False),
    ("case_incident_ci_type_cby", "case:incident_CI Type (CBy)", False),
    ("case_incident_ci_subtype_cby", "case:incident_CI Subtype (CBy)", False),
    ("case_interaction_priority", "case:interaction_Priority", True),
]

# Targets: (dataset key, label column, dtype)
_label_cols = [
    ("next_activity", "label:concept:name:next", 'int16'),
    ("next_time", "label:time:timestamp:next", 'float32'),
    ("remaining_time", "label:time:timestamp:last", 'float32'),
]

# Inputs: padded activity and group sequences plus the group graph embeddings ...
_train_inputs = {
    "activity": df_bpic14_train_dyn["concept:name"].to_numpy(na_value=TOKEN_PADDING),
    "org_group": df_bpic14_train_dyn["org:group"].to_numpy(na_value=TOKEN_PADDING),
    "org_group_graph": group_graph_embeddings_train,
}
# ... followed by the padded time features ...
for _key, _col in _seq_numeric_cols.items():
    _train_inputs[_key] = df_bpic14_train_dyn[_col].to_numpy(dtype='float32', na_value=-1)
# ... and the case attributes, each with a trailing axis of length 1
for _key, _col, _is_numeric in _case_cols:
    if _is_numeric:
        _values = df_bpic14_train[_col].to_numpy(dtype='float32', na_value=-1)
    else:
        _values = df_bpic14_train[_col].to_numpy(na_value=TOKEN_PADDING)
    _train_inputs[_key] = np.expand_dims(_values, axis=-1)

_train_targets = {
    _key: np.expand_dims(df_bpic14_train[_col].to_numpy(dtype=_dtype), axis=-1)
    for _key, _col, _dtype in _label_cols
}

ds_bpic14_train = tf.data.Dataset.from_tensor_slices((_train_inputs, _train_targets))

# Batch dataset to avoid OOM error
_train_ds_out = os.path.join(OUTPUT_DATA_DIR, 'Detail_Incident_Activity_train_dataset')
ds_write_files(ds_bpic14_train.batch(128), _train_ds_out)

ds_bpic14_train

In [None]:
# Event-level numeric features: dataset key -> column of the prefix-padded frame.
# Missing values are encoded as -1.
_seq_numeric_cols = {
    "time_timestamp_elapsedcycle": "time:timestamp:elapsedcycle:seconds",
    "time_timestamp_elapsedprev": "time:timestamp:elapsedprev:seconds",
    "time_timestamp_month": "time:timestamp:month",
    "time_timestamp_dayofyear": "time:timestamp:dayofyear",
    "time_timestamp_day": "time:timestamp:day",
    "time_timestamp_weekday": "time:timestamp:weekday",
    "time_timestamp_hour": "time:timestamp:hour",
    "time_timestamp_month_raw": "time:timestamp:month:raw",
    "time_timestamp_dayofyear_raw": "time:timestamp:dayofyear:raw",
    "time_timestamp_day_raw": "time:timestamp:day:raw",
    "time_timestamp_weekday_raw": "time:timestamp:weekday:raw",
    "time_timestamp_hour_raw": "time:timestamp:hour:raw",
}

# Case-level features in output order: (dataset key, column, is_numeric).
# Numeric columns become float32 with -1 for NA; the others use the padding token for NA.
_case_cols = [
    ("case_km_number", "case:KM number", False),
    ("case_incident_ci_type_aff", "case:incident_CI Type (aff)", False),
    ("case_incident_ci_subtype_aff", "case:incident_CI Subtype (aff)", False),
    ("case_incident_service_component_aff", "case:incident_Service Component WBS (aff)", False),
    ("case_incident_priority", "case:incident_Priority", True),
    ("case_incident_category", "case:incident_Category", False),
    ("case_incident_ci_name_cby", "case:incident_CI Name (CBy)", False),
    ("case_incident_ci_type_cby", "case:incident_CI Type (CBy)", False),
    ("case_incident_ci_subtype_cby", "case:incident_CI Subtype (CBy)", False),
    ("case_interaction_priority", "case:interaction_Priority", True),
]

# Targets: (dataset key, label column, dtype)
_label_cols = [
    ("next_activity", "label:concept:name:next", 'int16'),
    ("next_time", "label:time:timestamp:next", 'float32'),
    ("remaining_time", "label:time:timestamp:last", 'float32'),
]

# Inputs: padded activity and group sequences plus the group graph embeddings ...
_test_inputs = {
    "activity": df_bpic14_test_dyn["concept:name"].to_numpy(na_value=TOKEN_PADDING),
    "org_group": df_bpic14_test_dyn["org:group"].to_numpy(na_value=TOKEN_PADDING),
    "org_group_graph": group_graph_embeddings_test,
}
# ... followed by the padded time features ...
for _key, _col in _seq_numeric_cols.items():
    _test_inputs[_key] = df_bpic14_test_dyn[_col].to_numpy(dtype='float32', na_value=-1)
# ... and the case attributes, each with a trailing axis of length 1
for _key, _col, _is_numeric in _case_cols:
    if _is_numeric:
        _values = df_bpic14_test[_col].to_numpy(dtype='float32', na_value=-1)
    else:
        _values = df_bpic14_test[_col].to_numpy(na_value=TOKEN_PADDING)
    _test_inputs[_key] = np.expand_dims(_values, axis=-1)

_test_targets = {
    _key: np.expand_dims(df_bpic14_test[_col].to_numpy(dtype=_dtype), axis=-1)
    for _key, _col, _dtype in _label_cols
}

ds_bpic14_test = tf.data.Dataset.from_tensor_slices((_test_inputs, _test_targets))

# The test dataset is written unbatched
_test_ds_out = os.path.join(OUTPUT_DATA_DIR, 'Detail_Incident_Activity_test_dataset')
ds_write_files(ds_bpic14_test, _test_ds_out)

ds_bpic14_test

## Dataset: Helpdesk

This data set contains helpdesk event logs and is available here: https://data.mendeley.com/datasets/39bp3vv62t/1  

### Read Helpdesk

In [None]:
# Read both identifier columns as strings
dtypes_helpdesk = dict(CaseID='string', ActivityID='string')
df_helpdesk = pd.read_csv(os.path.join(INPUT_DATA_DIR, 'helpdesk.csv'), header=0, dtype=dtypes_helpdesk)

# Basic clean-up; the results are not reassigned, so these helpers presumably modify the frame in place
df_drop_duplicate_rows(df_helpdesk)
df_drop_na_rows_and_cols(df_helpdesk)
df_drop_single_val_cols(df_helpdesk)

# Parse the completion timestamp column
df_helpdesk = df_convert_datetimes(df_helpdesk, ["CompleteTimestamp"], yearfirst=True)

# Map the raw columns onto the event-log roles (case / activity / timestamp)
df_format_as_eventlog(
    df_helpdesk,
    case_col="CaseID",
    activity_col="ActivityID",
    time_col="CompleteTimestamp",
)

# Persist the processed log (XES export is explicitly not skipped here)
df_write_files(df_helpdesk, os.path.join(INTERIM_DATA_DIR, "helpdesk_processed"), skip_xes=False)

print(df_helpdesk.dtypes)
df_helpdesk

### Label Helpdesk

In [None]:
# Attach the three prediction targets: next activity, time to next event and remaining cycle time (both in days)
df_helpdesk = (
    df_helpdesk
    .pipe(df_label_next_activity, eoc_token=TOKEN_EOC)
    .pipe(df_label_activity_duration, unit='d', eoc_token=pd.Timedelta(0))
    .pipe(df_label_remaining_cycle_time, unit='d')
)

df_write_files(df_helpdesk, os.path.join(INTERIM_DATA_DIR, "helpdesk_labeled"))
df_helpdesk

### Descriptive Statistics for Helpdesk

In [None]:
# Summary statistics over all columns (numeric and non-numeric), persisted together with their index
df_stat = df_helpdesk.describe(include='all')
df_write_files(df_stat, os.path.join(OUTPUT_DATA_DIR, "helpdesk_describe"), index=True)
df_stat

In [None]:
# Histogram per column of the labeled Helpdesk log, saved as SVG before being shown
df_helpdesk.hist(xrot=90, figsize=(10, 10))
plt.tight_layout()
plt.savefig(os.path.join(GRAPHIC_DIR, "helpdesk_hist.svg"), bbox_inches='tight')
plt.show()

In [None]:
# Case length statistics for the labeled Helpdesk log (helper is not defined in this section)
df_case_length_stats(df_helpdesk)

In [None]:
# Case duration statistics for the labeled Helpdesk log (helper is not defined in this section)
df_case_duration_stats(df_helpdesk)

### Clean Helpdesk

In [None]:
# Work on a copy so the labeled log stays untouched; every step receives the
# frame returned by the previous one.
df_helpdesk_clean = (
    df_helpdesk.copy()
    # Elapsed-time features in seconds
    .pipe(df_extract_elapsed_cycle_time, f"{EVENTLOG_TIMESTAMP}{EVENTLOG_FEAT_TIME_ELAPSED_CYCLE_SUFFIX}:seconds", unit='s')
    .pipe(df_extract_activity_duration, f"{EVENTLOG_TIMESTAMP}{EVENTLOG_FEAT_TIME_ELAPSED_PREV_SUFFIX}:seconds", unit='s', na_token=0)
    # Calendar features extracted with relative=True
    .pipe(df_extract_month_of_year, f"{EVENTLOG_TIMESTAMP}:month", relative=True)
    .pipe(df_extract_day_of_year, f"{EVENTLOG_TIMESTAMP}:dayofyear", relative=True)
    .pipe(df_extract_day_of_month, f"{EVENTLOG_TIMESTAMP}:day", relative=True)
    .pipe(df_extract_day_of_week, f"{EVENTLOG_TIMESTAMP}:weekday", relative=True)
    .pipe(df_extract_hour_of_day, f"{EVENTLOG_TIMESTAMP}:hour", relative=True)
    # The same calendar features with the helpers' default settings (":raw" columns)
    .pipe(df_extract_month_of_year, f"{EVENTLOG_TIMESTAMP}:month:raw")
    .pipe(df_extract_day_of_year, f"{EVENTLOG_TIMESTAMP}:dayofyear:raw")
    .pipe(df_extract_day_of_month, f"{EVENTLOG_TIMESTAMP}:day:raw")
    .pipe(df_extract_day_of_week, f"{EVENTLOG_TIMESTAMP}:weekday:raw")
    .pipe(df_extract_hour_of_day, f"{EVENTLOG_TIMESTAMP}:hour:raw")
    # Case duration filter with max=0.95 (presumably a quantile cut-off -- confirm in the helper)
    .pipe(df_filter_case_duration_range, max=0.95)
    # Encode the categorical next-activity label
    .pipe(df_encode_label, cols=EVENTLOG_LABEL_NEXT_ACT)
    # Booleans -> int
    .pipe(df_convert_bool_to_int)
    # Ordered categoricals -> int
    .pipe(df_convert_ordered_cat_to_int, relative=True)
    # Fill missing string values, then missing categorical values
    .pipe(df_fillna_str, TOKEN_NA)
    .pipe(df_fillna_cat, TOKEN_NA)
)

df_write_files(df_helpdesk_clean, os.path.join(OUTPUT_DATA_DIR, "helpdesk_cleaned"))

df_helpdesk_clean

In [None]:
# Strict temporal train/test split of the cleaned log; 0.2 is the split size
# argument (presumably the test share -- confirm in the helper) and the label
# columns are handed over explicitly.
df_helpdesk_train, df_helpdesk_test = df_strict_temporal_train_test_split(
    df_helpdesk_clean,
    0.2,
    df_find_labels(df_helpdesk_clean),
    debias_end=False
)

# Plot the split and store the figure before the test frame is filtered below
axes = df_visualize_strict_temporal_splitting(df_helpdesk_train, df_helpdesk_test)
plt.tight_layout()
plt.savefig(os.path.join(GRAPHIC_DIR, "helpdesk_train_test.svg"), bbox_inches='tight')
plt.show()

# Remove overlaps: drop test rows (in place) that miss a value in any label column
df_helpdesk_test.dropna(axis='index', how='any', subset=df_find_labels(df_helpdesk_test), inplace=True)

df_write_files(df_helpdesk_train, os.path.join(OUTPUT_DATA_DIR, "helpdesk_train"))
df_write_files(df_helpdesk_test, os.path.join(OUTPUT_DATA_DIR, "helpdesk_test"))

In [None]:
# Prefix-pad the event attributes (labels excluded) over train and test together,
# so both splits go through the same padding call
df_helpdesk_dyn = df_prefix_pad_attributes(
    pd.concat([df_helpdesk_train, df_helpdesk_test]),
    df_find_event_attributes(df_helpdesk_clean, exclude_labels=True)
)

df_helpdesk_dyn

In [None]:
# Select the padded rows whose index labels are case ids of the training split
df_helpdesk_train_dyn = df_helpdesk_dyn.loc[df_helpdesk_train[EVENTLOG_CASE].unique()]
df_helpdesk_train_dyn

In [None]:
# Select the padded rows whose index labels are case ids of the test split
df_helpdesk_test_dyn = df_helpdesk_dyn.loc[df_helpdesk_test[EVENTLOG_CASE].unique()]
df_helpdesk_test_dyn

In [None]:
# Display the Helpdesk training split for inspection
df_helpdesk_train

In [None]:
# Naive regression baseline for the next-event-time label (train vs. test split)
df_naive_regression_metrics(
    df_helpdesk_train,
    df_helpdesk_test,
    EVENTLOG_LABEL_NEXT_TIME
)

In [None]:
# Naive regression baseline for the remaining-time label (train vs. test split)
df_naive_regression_metrics(
    df_helpdesk_train,
    df_helpdesk_test,
    EVENTLOG_LABEL_REM_TIME
)

In [None]:
# Naive classification baseline for the next-activity label (train vs. test split)
df_naive_classification_metrics(
    df_helpdesk_train,
    df_helpdesk_test,
    EVENTLOG_LABEL_NEXT_ACT
)

### Create Dataset from Helpdesk

In [None]:
def _make_helpdesk_train_dataset():
  """Assemble the Helpdesk training split as a (features, labels) tf.data.Dataset.

  Inputs come from the prefix-padded frame df_helpdesk_train_dyn, labels from
  df_helpdesk_train (with a trailing axis of size one). Missing activities
  become TOKEN_PADDING, missing time features -1.
  """
  time_columns = {
    "time_timestamp_elapsedcycle": "time:timestamp:elapsedcycle:seconds",
    "time_timestamp_elapsedprev": "time:timestamp:elapsedprev:seconds",
    "time_timestamp_month": "time:timestamp:month",
    "time_timestamp_dayofyear": "time:timestamp:dayofyear",
    "time_timestamp_day": "time:timestamp:day",
    "time_timestamp_weekday": "time:timestamp:weekday",
    "time_timestamp_hour": "time:timestamp:hour",
    "time_timestamp_month_raw": "time:timestamp:month:raw",
    "time_timestamp_dayofyear_raw": "time:timestamp:dayofyear:raw",
    "time_timestamp_day_raw": "time:timestamp:day:raw",
    "time_timestamp_weekday_raw": "time:timestamp:weekday:raw",
    "time_timestamp_hour_raw": "time:timestamp:hour:raw",
  }
  label_columns = {
    "next_activity": ("label:concept:name:next", 'int16'),
    "next_time": ("label:time:timestamp:next", 'float32'),
    "remaining_time": ("label:time:timestamp:last", 'float32'),
  }

  features = {
    "activity": df_helpdesk_train_dyn["concept:name"].to_numpy(na_value=TOKEN_PADDING),
  }
  for key, column in time_columns.items():
    features[key] = df_helpdesk_train_dyn[column].to_numpy(dtype='float32', na_value=-1)

  labels = {
    key: np.expand_dims(df_helpdesk_train[column].to_numpy(dtype=dtype), axis=-1)
    for key, (column, dtype) in label_columns.items()
  }
  return tf.data.Dataset.from_tensor_slices((features, labels))


ds_helpdesk_train = _make_helpdesk_train_dataset()

ds_write_files(ds_helpdesk_train, os.path.join(OUTPUT_DATA_DIR, 'helpdesk_train_dataset'))

ds_helpdesk_train

In [None]:
def _make_helpdesk_test_dataset():
  """Assemble the Helpdesk test split as a (features, labels) tf.data.Dataset.

  Inputs come from the prefix-padded frame df_helpdesk_test_dyn, labels from
  df_helpdesk_test (with a trailing axis of size one). Missing activities
  become TOKEN_PADDING, missing time features -1.
  """
  time_columns = {
    "time_timestamp_elapsedcycle": "time:timestamp:elapsedcycle:seconds",
    "time_timestamp_elapsedprev": "time:timestamp:elapsedprev:seconds",
    "time_timestamp_month": "time:timestamp:month",
    "time_timestamp_dayofyear": "time:timestamp:dayofyear",
    "time_timestamp_day": "time:timestamp:day",
    "time_timestamp_weekday": "time:timestamp:weekday",
    "time_timestamp_hour": "time:timestamp:hour",
    "time_timestamp_month_raw": "time:timestamp:month:raw",
    "time_timestamp_dayofyear_raw": "time:timestamp:dayofyear:raw",
    "time_timestamp_day_raw": "time:timestamp:day:raw",
    "time_timestamp_weekday_raw": "time:timestamp:weekday:raw",
    "time_timestamp_hour_raw": "time:timestamp:hour:raw",
  }
  label_columns = {
    "next_activity": ("label:concept:name:next", 'int16'),
    "next_time": ("label:time:timestamp:next", 'float32'),
    "remaining_time": ("label:time:timestamp:last", 'float32'),
  }

  features = {
    "activity": df_helpdesk_test_dyn["concept:name"].to_numpy(na_value=TOKEN_PADDING),
  }
  for key, column in time_columns.items():
    features[key] = df_helpdesk_test_dyn[column].to_numpy(dtype='float32', na_value=-1)

  labels = {
    key: np.expand_dims(df_helpdesk_test[column].to_numpy(dtype=dtype), axis=-1)
    for key, (column, dtype) in label_columns.items()
  }
  return tf.data.Dataset.from_tensor_slices((features, labels))


ds_helpdesk_test = _make_helpdesk_test_dataset()

ds_write_files(ds_helpdesk_test, os.path.join(OUTPUT_DATA_DIR, 'helpdesk_test_dataset'))

ds_helpdesk_test

## Dataset: BPIC 2013

This data set contains information about the IT service management processes at Volvo IT. See http://www.win.tue.nl/bpi/2013/challenge and https://data.4tu.nl/collections/BPI_Challenge_2013/5065448 for further details.

### Incidents

**Dataset attributes**:
- *Involved ST*: The actual team that will try to solve the incident
- *Owner Country*: The country taking ownership of the incident
- *Owner First Name*: The owning person's first name
- *Involved Org line 3*: The business area of the user reporting the problem to the helpdesk
- *Involved ST Function Div*: The IT organization is divided into functions (mostly technology wise)
- *Status*: Current activity of the incident
- *SR Latest Impact*: Impact is a measure of the business criticality of an Incident often equal to the extent to which an Incident leads to degradation of agreed service levels.
- *Product*: Affected product
- *Country*: Country as a code
- *Sub Status*: Lifecycle transition of the incident
- *SR Number*: The case id as the incident's identifier

#### Read BPIC 2013 Incidents

In [None]:
# Column dtypes for the incidents CSV: the case id is a string, the impact is an
# ordered categorical (Low < Medium < High < Major), the rest plain categoricals.
dtypes_bpic13 = {
    column: "category"
    for column in (
        "Involved ST",
        "Owner Country",
        "Owner First Name",
        "Involved Org line 3",
        "Involved ST Function Div",
        "Status",
    )
}
dtypes_bpic13["SR Latest Impact"] = pd.CategoricalDtype(['Low', 'Medium', 'High', 'Major'], ordered=True)
dtypes_bpic13.update({
    "Product": "category",
    "Country": "category",
    "Sub Status": "category",
    "SR Number": "string",
})

df_bpic13 = pd.read_csv(
    os.path.join(INPUT_DATA_BPIC2013_DIR, "VINST cases incidents.csv"),
    header=0, sep=';', encoding='cp1252', dtype=dtypes_bpic13,
)

# Parse the change timestamp column
df_bpic13 = df_convert_datetimes(df_bpic13, ["Change Date+Time"], yearfirst=True)

# The activity name is "<Status>_<Sub Status>"
df_bpic13[EVENTLOG_ACTIVITY] = (
    df_bpic13["Status"].astype('string')
    + "_"
    + df_bpic13["Sub Status"].astype('string')
).astype('string')

# Basic clean-up; the results are not reassigned, so these helpers presumably modify the frame in place
df_drop_duplicate_rows(df_bpic13)
df_drop_na_rows_and_cols(df_bpic13)
df_drop_single_val_cols(df_bpic13)

# Unify differently spelled category values
df_bpic13 = (
    df_bpic13
    .pipe(df_rename_cat_values, 'Country', 'SE', 'se')
    .pipe(df_rename_cat_values, 'Owner First Name', 'Perjohan', 'Per-Johan')
    .pipe(df_rename_cat_values, 'Owner First Name', 'Jan Erik', 'Jan-Erik')
)

# Map the raw columns onto the event-log roles (case, time, group, role, resource)
df_format_as_eventlog(
    df_bpic13,
    case_col="SR Number",
    time_col="Change Date+Time",
    group_col="Involved ST",
    role_col="Involved ST Function Div",
    resource_col="Owner First Name",
)

# Persist the processed log
df_write_files(df_bpic13, os.path.join(INTERIM_DATA_DIR, "BPI_Challenge_2013_incidents_processed"))

print(df_bpic13.dtypes)
df_bpic13

#### Label BPIC 2013 Incidents

In [None]:
# Attach the three prediction targets: next activity, time to next event and remaining cycle time (both in days)
df_bpic13 = (
    df_bpic13
    .pipe(df_label_next_activity, eoc_token=TOKEN_EOC)
    .pipe(df_label_activity_duration, unit='d', eoc_token=pd.Timedelta(0))
    .pipe(df_label_remaining_cycle_time, unit='d')
)

df_write_files(df_bpic13, os.path.join(INTERIM_DATA_DIR, "BPI_Challenge_2013_incidents_labeled"))
df_bpic13

#### Descriptive Statistics for BPIC 2013

In [None]:
# Print column dtypes, non-null counts and memory usage of the labeled log
df_bpic13.info()

In [None]:
# Summary statistics over all columns (numeric and non-numeric), persisted together with their index
df_stat = df_bpic13.describe(include='all')
df_write_files(df_stat, os.path.join(OUTPUT_DATA_DIR, "BPI_Challenge_2013_incidents_describe"), index=True)
df_stat

In [None]:
# Histogram per column of the labeled BPIC 2013 log, saved as SVG before being shown
df_bpic13.hist(xrot=90, figsize=(10, 10))
plt.tight_layout()
plt.savefig(os.path.join(GRAPHIC_DIR, "BPI_Challenge_2013_incidents_hist.svg"), bbox_inches='tight')
plt.show()

In [None]:
# Case length statistics for the labeled BPIC 2013 log (helper is not defined in this section)
df_case_length_stats(df_bpic13)

In [None]:
# Case duration statistics for the labeled BPIC 2013 log (helper is not defined in this section)
df_case_duration_stats(df_bpic13)

#### Clean BPIC 2013 Incidents

In [None]:
# Work on a copy so the labeled log stays untouched, and filter the event log first
df_bpic13_clean = (
    df_bpic13.copy()
    # Case duration filter with max=0.95 (presumably a quantile cut-off -- confirm in the helper)
    .pipe(df_filter_case_duration_range, max=0.95)
    .pipe(df_filter_date_range, min="2012-04-01 00:00:00", mode='traces_included')
)

# Remove mostly empty columns (threshold 0.95); the result is not reassigned,
# so the helper presumably modifies the frame in place
df_drop_threshold_na_cols(df_bpic13_clean, 0.95)

df_bpic13_clean = (
    df_bpic13_clean
    # Elapsed-time features in seconds
    .pipe(df_extract_elapsed_cycle_time, f"{EVENTLOG_TIMESTAMP}{EVENTLOG_FEAT_TIME_ELAPSED_CYCLE_SUFFIX}", unit='s')
    .pipe(df_extract_activity_duration, f"{EVENTLOG_TIMESTAMP}{EVENTLOG_FEAT_TIME_ELAPSED_PREV_SUFFIX}", unit='s', na_token=0)
    # Calendar features extracted with relative=True
    .pipe(df_extract_month_of_year, f"{EVENTLOG_TIMESTAMP}:month", relative=True)
    .pipe(df_extract_day_of_year, f"{EVENTLOG_TIMESTAMP}:dayofyear", relative=True)
    .pipe(df_extract_day_of_month, f"{EVENTLOG_TIMESTAMP}:day", relative=True)
    .pipe(df_extract_day_of_week, f"{EVENTLOG_TIMESTAMP}:weekday", relative=True)
    .pipe(df_extract_hour_of_day, f"{EVENTLOG_TIMESTAMP}:hour", relative=True)
    # The same calendar features with the helpers' default settings (":raw" columns)
    .pipe(df_extract_month_of_year, f"{EVENTLOG_TIMESTAMP}:month:raw")
    .pipe(df_extract_day_of_year, f"{EVENTLOG_TIMESTAMP}:dayofyear:raw")
    .pipe(df_extract_day_of_month, f"{EVENTLOG_TIMESTAMP}:day:raw")
    .pipe(df_extract_day_of_week, f"{EVENTLOG_TIMESTAMP}:weekday:raw")
    .pipe(df_extract_hour_of_day, f"{EVENTLOG_TIMESTAMP}:hour:raw")
    # Encode the categorical next-activity label
    .pipe(df_encode_label, cols=EVENTLOG_LABEL_NEXT_ACT)
    # Booleans -> int
    .pipe(df_convert_bool_to_int)
    # Ordered categoricals -> int
    .pipe(df_convert_ordered_cat_to_int, relative=True)
    # Fill missing string values, then missing categorical values
    .pipe(df_fillna_str, TOKEN_NA)
    .pipe(df_fillna_cat, TOKEN_NA)
)

df_write_files(df_bpic13_clean, os.path.join(OUTPUT_DATA_DIR, "BPI_Challenge_2013_incidents_cleaned"))

df_bpic13_clean

In [None]:
# Strict temporal train/test split of the cleaned log; 0.2 is the split size
# argument (presumably the test share -- confirm in the helper).
# NOTE(review): the next cell reassigns df_bpic13_train / df_bpic13_test via
# df_temporal_train_test_split before anything is written, so the result of this
# split (including the dropna below) is discarded when the notebook runs top to
# bottom -- confirm which split is intended.
df_bpic13_train, df_bpic13_test = df_strict_temporal_train_test_split(
    df_bpic13_clean,
    0.2,
    df_find_labels(df_bpic13_clean),
    debias_end=False
)

# Plot the split and store the figure before the test frame is filtered below
axes = df_visualize_strict_temporal_splitting(df_bpic13_train, df_bpic13_test)
plt.tight_layout()
plt.savefig(os.path.join(GRAPHIC_DIR, "BPI_Challenge_2013_incidents_train_test.svg"), bbox_inches='tight')
plt.show()

# Remove overlaps: drop test rows (in place) that miss a value in any label column
df_bpic13_test.dropna(axis='index', how='any', subset=df_find_labels(df_bpic13_test), inplace=True)



In [None]:
# Temporal split by case start, filtering whole traces, with 0.25 as the split size argument.
# NOTE(review): this overwrites the train/test frames produced by the strict
# temporal split in the previous cell; the train/test figure saved there no
# longer matches the frames written afterwards -- confirm this is intended.
df_bpic13_train, df_bpic13_test = df_temporal_train_test_split(df_bpic13_clean, 0.25, split_mode='case_start', filter_mode='traces')

In [None]:
# Persist and display the BPIC 2013 training split
df_write_files(df_bpic13_train, os.path.join(OUTPUT_DATA_DIR, "BPI_Challenge_2013_incidents_train"))
df_bpic13_train

In [None]:
# Persist and display the BPIC 2013 test split
df_write_files(df_bpic13_test, os.path.join(OUTPUT_DATA_DIR, "BPI_Challenge_2013_incidents_test"))
df_bpic13_test

In [None]:
# Prefix-pad the event attributes (labels excluded) over train and test together,
# so both splits go through the same padding call
df_bpic13_dyn = df_prefix_pad_attributes(
    pd.concat([df_bpic13_train, df_bpic13_test]),
    df_find_event_attributes(df_bpic13_clean, exclude_labels=True)
)

df_bpic13_dyn

In [None]:
# Select the padded rows whose index labels are case ids of the training split
df_bpic13_train_dyn = df_bpic13_dyn.loc[df_bpic13_train[EVENTLOG_CASE].unique()]
df_bpic13_train_dyn

In [None]:
# Select the padded rows whose index labels are case ids of the test split
df_bpic13_test_dyn = df_bpic13_dyn.loc[df_bpic13_test[EVENTLOG_CASE].unique()]
df_bpic13_test_dyn

In [None]:
# Naive regression baseline for the next-event-time label (train vs. test split)
df_naive_regression_metrics(
    df_bpic13_train,
    df_bpic13_test,
    EVENTLOG_LABEL_NEXT_TIME
)

In [None]:
# Naive regression baseline for the remaining-time label (train vs. test split)
df_naive_regression_metrics(
    df_bpic13_train,
    df_bpic13_test,
    EVENTLOG_LABEL_REM_TIME
)

In [None]:
# Naive classification baseline for the next-activity label (train vs. test split)
df_naive_classification_metrics(
    df_bpic13_train,
    df_bpic13_test,
    EVENTLOG_LABEL_NEXT_ACT
)

#### Create Graphs for BPIC 2013

##### Create Resource Graphs for BPIC 2013

In [None]:
# Build the temporal graph data for the resource attribute from train and test together.
# The helper also returns two lookup tables; which one maps names to node ids
# is not visible here -- check df_to_pyg_temporal_data before relying on them.
mapping, reverse_mapping, data = df_to_pyg_temporal_data(pd.concat([df_bpic13_train, df_bpic13_test], ignore_index=True), EVENTLOG_RESOURCE)
data

In [None]:
# 70 / 15 / 15 split of the temporal graph data
train_data, val_data, test_data = data.train_val_test_split(val_ratio=0.15, test_ratio=0.15)

# One loader per split, all with the same batch size and one negative sample per positive
train_loader, val_loader, test_loader = (
    pyg.loader.TemporalDataLoader(split, batch_size=32, neg_sampling_ratio=1.0)
    for split in (train_data, val_data, test_data)
)

# Neighbor loader sized for all nodes, with size=8
neighbor_loader = pyg.nn.models.tgn.LastNeighborLoader(data.num_nodes, size=8)

In [None]:
# TGN memory module; the raw message size is taken from the graph data.
# It is moved to `device` like the other modules below: its parameters are
# optimised together with theirs, and a memory left on the CPU would mismatch
# modules and index tensors placed on a non-CPU device.
memory = pyg.nn.models.TGNMemory(
    data.num_nodes,
    data.msg.size(-1),
    memory_dim,
    time_dim,
    message_module=pyg.nn.models.tgn.IdentityMessage(data.msg.size(-1), memory_dim, time_dim),
    aggregator_module=pyg.nn.models.tgn.LastAggregator(),
).to(device)

# Embedding network on top of the memory; it reuses the memory's time encoder
gnn = GraphAttentionEmbedding(
    in_channels=memory_dim,
    out_channels=embedding_dim,
    msg_dim=data.msg.size(-1),
    time_enc=memory.time_enc,
).to(device)

# Link predictor operating on the node embeddings
link_pred = LinkPredictor(in_channels=embedding_dim).to(device)

# A single optimizer over the parameters of all three modules
optimizer = torch.optim.Adam(set(memory.parameters()) | set(gnn.parameters()) | set(link_pred.parameters()), lr=0.00001, weight_decay=1e-5)
criterion = torch.nn.BCEWithLogitsLoss()

# Helper vector to map global node indices to local ones.
assoc = torch.empty(data.num_nodes, dtype=torch.long, device=device)

In [None]:
# Train for 49 epochs (range(1, 50) stops before 50) and report validation and
# test metrics after every epoch; train/test/embed are defined outside this section.
for epoch in range(1, 50):
    loss = train()
    print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}')
    val_ap, val_auc = test(val_loader)
    test_ap, test_auc = test(test_loader)
    print(f'Val AP: {val_ap:.4f}, Val AUC: {val_auc:.4f}')
    print(f'Test AP: {test_ap:.4f}, Test AUC: {test_auc:.4f}')

# Persist both lookup tables returned by df_to_pyg_temporal_data as JSON
with open(os.path.join(OUTPUT_DATA_DIR, "bpic13_resource_mapping.json"), "w") as f:
  json.dump(mapping, f)
with open(os.path.join(OUTPUT_DATA_DIR, "bpic13_resource_inverse_mapping.json"), "w") as f:
  json.dump(reverse_mapping, f)

# Compute the embeddings with the trained model and store them as a plain (non-pickled) .npy file
embeddings = embed()
np.save(os.path.join(OUTPUT_DATA_DIR, "bpic13_resource_embeddings.npy"), embeddings, allow_pickle=False)

In [None]:
# Placeholder vector (all -99) used wherever no resource value is present
arr_nan = np.full(EMBEDDING_DIM, -99.0, dtype='float32')


def _resource_train_embedding(x):
  """Return the embedding row looked up via reverse_mapping, or arr_nan for missing values."""
  if x is None or pd.isna(x):
    return arr_nan
  return embeddings[reverse_mapping[x]]


# Element-wise lookup yields a 2-D object array whose cells hold embedding vectors
embeddings_df_train = df_bpic13_train_dyn[EVENTLOG_RESOURCE].map(_resource_train_embedding, na_action=None).to_numpy()

# Stack the per-cell vectors into one dense float32 tensor with a trailing embedding axis
resource_graph_embeddings_train = np.empty(
    (embeddings_df_train.shape[0], embeddings_df_train.shape[1], EMBEDDING_DIM),
    dtype='float32',
)
for i, row in enumerate(embeddings_df_train):
  resource_graph_embeddings_train[i] = np.stack(row)

resource_graph_embeddings_train.shape

In [None]:
def _resource_test_embedding(x):
  """Return the embedding row looked up via reverse_mapping, or arr_nan for missing values."""
  if x is None or pd.isna(x):
    return arr_nan
  return embeddings[reverse_mapping[x]]


# Element-wise lookup yields a 2-D object array whose cells hold embedding vectors
embeddings_df_test = df_bpic13_test_dyn[EVENTLOG_RESOURCE].map(_resource_test_embedding, na_action=None).to_numpy()

# Stack the per-cell vectors into one dense float32 tensor with a trailing embedding axis
resource_graph_embeddings_test = np.empty(
    (embeddings_df_test.shape[0], embeddings_df_test.shape[1], EMBEDDING_DIM),
    dtype='float32',
)
for i, row in enumerate(embeddings_df_test):
  resource_graph_embeddings_test[i] = np.stack(row)

resource_graph_embeddings_test.shape

##### Create Group Graphs for BPIC 2013

In [None]:
# Build the temporal graph data for the group attribute from train and test together.
# This rebinds mapping / reverse_mapping / data, replacing the resource-graph objects.
mapping, reverse_mapping, data = df_to_pyg_temporal_data(pd.concat([df_bpic13_train, df_bpic13_test], ignore_index=True), EVENTLOG_GROUP)
data

In [None]:
# 70 / 15 / 15 split of the temporal graph data
train_data, val_data, test_data = data.train_val_test_split(val_ratio=0.15, test_ratio=0.15)

# One loader per split, all with the same batch size and one negative sample per positive
train_loader, val_loader, test_loader = (
    pyg.loader.TemporalDataLoader(split, batch_size=32, neg_sampling_ratio=1.0)
    for split in (train_data, val_data, test_data)
)

# Neighbor loader sized for all nodes, with size=8
neighbor_loader = pyg.nn.models.tgn.LastNeighborLoader(data.num_nodes, size=8)

In [None]:
# TGN memory module; the raw message size is taken from the graph data.
# It is moved to `device` like the other modules below: its parameters are
# optimised together with theirs, and a memory left on the CPU would mismatch
# modules and index tensors placed on a non-CPU device.
memory = pyg.nn.models.TGNMemory(
    data.num_nodes,
    data.msg.size(-1),
    memory_dim,
    time_dim,
    message_module=pyg.nn.models.tgn.IdentityMessage(data.msg.size(-1), memory_dim, time_dim),
    aggregator_module=pyg.nn.models.tgn.LastAggregator(),
).to(device)

# Embedding network on top of the memory; it reuses the memory's time encoder
gnn = GraphAttentionEmbedding(
    in_channels=memory_dim,
    out_channels=embedding_dim,
    msg_dim=data.msg.size(-1),
    time_enc=memory.time_enc,
).to(device)

# Link predictor operating on the node embeddings
link_pred = LinkPredictor(in_channels=embedding_dim).to(device)

# A single optimizer over the parameters of all three modules
optimizer = torch.optim.Adam(set(memory.parameters()) | set(gnn.parameters()) | set(link_pred.parameters()), lr=0.00001, weight_decay=1e-5)
criterion = torch.nn.BCEWithLogitsLoss()

# Helper vector to map global node indices to local ones.
assoc = torch.empty(data.num_nodes, dtype=torch.long, device=device)

In [None]:
# Train for 49 epochs (range(1, 50) stops before 50) and report validation and
# test metrics after every epoch; train/test/embed are defined outside this section.
for epoch in range(1, 50):
    loss = train()
    print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}')
    val_ap, val_auc = test(val_loader)
    test_ap, test_auc = test(test_loader)
    print(f'Val AP: {val_ap:.4f}, Val AUC: {val_auc:.4f}')
    print(f'Test AP: {test_ap:.4f}, Test AUC: {test_auc:.4f}')

# Persist both lookup tables returned by df_to_pyg_temporal_data as JSON
with open(os.path.join(OUTPUT_DATA_DIR, "bpic13_group_mapping.json"), "w") as f:
  json.dump(mapping, f)
with open(os.path.join(OUTPUT_DATA_DIR, "bpic13_group_inverse_mapping.json"), "w") as f:
  json.dump(reverse_mapping, f)

# Compute the embeddings with the trained model and store them as a plain (non-pickled) .npy file
embeddings = embed()
np.save(os.path.join(OUTPUT_DATA_DIR, "bpic13_group_embeddings.npy"), embeddings, allow_pickle=False)

In [None]:
# Placeholder vector (all -99) used wherever no group value is present
arr_nan = np.full(EMBEDDING_DIM, -99.0, dtype='float32')


def _group_train_embedding(x):
  """Return the embedding row looked up via reverse_mapping, or arr_nan for missing values."""
  if x is None or pd.isna(x):
    return arr_nan
  return embeddings[reverse_mapping[x]]


# Element-wise lookup yields a 2-D object array whose cells hold embedding vectors
embeddings_df_train = df_bpic13_train_dyn[EVENTLOG_GROUP].map(_group_train_embedding, na_action=None).to_numpy()

# Stack the per-cell vectors into one dense float32 tensor with a trailing embedding axis
group_graph_embeddings_train = np.empty(
    (embeddings_df_train.shape[0], embeddings_df_train.shape[1], EMBEDDING_DIM),
    dtype='float32',
)
for i, row in enumerate(embeddings_df_train):
  group_graph_embeddings_train[i] = np.stack(row)

group_graph_embeddings_train.shape

In [None]:
def _group_test_embedding(x):
  """Return the embedding row looked up via reverse_mapping, or arr_nan for missing values."""
  if x is None or pd.isna(x):
    return arr_nan
  return embeddings[reverse_mapping[x]]


# Element-wise lookup yields a 2-D object array whose cells hold embedding vectors
embeddings_df_test = df_bpic13_test_dyn[EVENTLOG_GROUP].map(_group_test_embedding, na_action=None).to_numpy()

# Stack the per-cell vectors into one dense float32 tensor with a trailing embedding axis
group_graph_embeddings_test = np.empty(
    (embeddings_df_test.shape[0], embeddings_df_test.shape[1], EMBEDDING_DIM),
    dtype='float32',
)
for i, row in enumerate(embeddings_df_test):
  group_graph_embeddings_test[i] = np.stack(row)

group_graph_embeddings_test.shape

#### Create Dataset from BPIC 2013 Incidents

In [None]:
def _make_bpic13_train_dataset():
  """Assemble the BPIC 2013 incidents training split as a (features, labels) tf.data.Dataset.

  Event-level inputs come from the prefix-padded frame df_bpic13_train_dyn,
  case-level inputs and labels from df_bpic13_train (with a trailing axis of
  size one). Missing categorical values become TOKEN_PADDING, missing numeric
  values -1.
  """
  event_columns = {
    "activity": "concept:name",
    "status": "Status",
    "sub_status": "Sub Status",
    "org_role": "org:role",
    "org_line": "Involved Org line 3",
    "org_group": "org:group",
    "owner_country": "Owner Country",
    "org_resource": "org:resource",
  }
  time_columns = {
    "time_timestamp_elapsedcycle": "time:timestamp:elapsedcycle",
    "time_timestamp_elapsedprev": "time:timestamp:elapsedprev",
    "time_timestamp_month": "time:timestamp:month",
    "time_timestamp_dayofyear": "time:timestamp:dayofyear",
    "time_timestamp_day": "time:timestamp:day",
    "time_timestamp_weekday": "time:timestamp:weekday",
    "time_timestamp_hour": "time:timestamp:hour",
    "time_timestamp_month_raw": "time:timestamp:month:raw",
    "time_timestamp_dayofyear_raw": "time:timestamp:dayofyear:raw",
    "time_timestamp_day_raw": "time:timestamp:day:raw",
    "time_timestamp_weekday_raw": "time:timestamp:weekday:raw",
    "time_timestamp_hour_raw": "time:timestamp:hour:raw",
  }
  label_columns = {
    "next_activity": ("label:concept:name:next", 'int16'),
    "next_time": ("label:time:timestamp:next", 'float32'),
    "remaining_time": ("label:time:timestamp:last", 'float32'),
  }

  # Event-level categorical inputs, followed by the precomputed graph embeddings
  features = {
    key: df_bpic13_train_dyn[column].to_numpy(na_value=TOKEN_PADDING)
    for key, column in event_columns.items()
  }
  features["org_resource_graph"] = resource_graph_embeddings_train
  features["org_group_graph"] = group_graph_embeddings_train

  # Event-level time features
  for key, column in time_columns.items():
    features[key] = df_bpic13_train_dyn[column].to_numpy(dtype='float32', na_value=-1)

  # Case-level inputs
  features["case_product"] = np.expand_dims(df_bpic13_train["case:Product"].to_numpy(na_value=TOKEN_PADDING), axis=-1)
  features["case_country"] = np.expand_dims(df_bpic13_train["case:Country"].to_numpy(na_value=TOKEN_PADDING), axis=-1)
  features["case_latest_impact"] = np.expand_dims(df_bpic13_train["case:SR Latest Impact"].to_numpy(dtype='float32', na_value=-1), axis=-1)

  # Prediction targets
  labels = {
    key: np.expand_dims(df_bpic13_train[column].to_numpy(dtype=dtype), axis=-1)
    for key, (column, dtype) in label_columns.items()
  }
  return tf.data.Dataset.from_tensor_slices((features, labels))


ds_bpic13_train = _make_bpic13_train_dataset()

ds_write_files(ds_bpic13_train, os.path.join(OUTPUT_DATA_DIR, 'BPI_Challenge_2013_incidents_train_dataset'))

print(len(ds_bpic13_train))
ds_bpic13_train

In [None]:
def _make_bpic13_test_dataset():
  """Assemble the BPIC 2013 incidents test split as a (features, labels) tf.data.Dataset.

  Event-level inputs come from the prefix-padded frame df_bpic13_test_dyn,
  case-level inputs and labels from df_bpic13_test (with a trailing axis of
  size one). Missing categorical values become TOKEN_PADDING, missing numeric
  values -1.
  """
  event_columns = {
    "activity": "concept:name",
    "status": "Status",
    "sub_status": "Sub Status",
    "org_role": "org:role",
    "org_line": "Involved Org line 3",
    "org_group": "org:group",
    "owner_country": "Owner Country",
    "org_resource": "org:resource",
  }
  time_columns = {
    "time_timestamp_elapsedcycle": "time:timestamp:elapsedcycle",
    "time_timestamp_elapsedprev": "time:timestamp:elapsedprev",
    "time_timestamp_month": "time:timestamp:month",
    "time_timestamp_dayofyear": "time:timestamp:dayofyear",
    "time_timestamp_day": "time:timestamp:day",
    "time_timestamp_weekday": "time:timestamp:weekday",
    "time_timestamp_hour": "time:timestamp:hour",
    "time_timestamp_month_raw": "time:timestamp:month:raw",
    "time_timestamp_dayofyear_raw": "time:timestamp:dayofyear:raw",
    "time_timestamp_day_raw": "time:timestamp:day:raw",
    "time_timestamp_weekday_raw": "time:timestamp:weekday:raw",
    "time_timestamp_hour_raw": "time:timestamp:hour:raw",
  }
  label_columns = {
    "next_activity": ("label:concept:name:next", 'int16'),
    "next_time": ("label:time:timestamp:next", 'float32'),
    "remaining_time": ("label:time:timestamp:last", 'float32'),
  }

  # Event-level categorical inputs, followed by the precomputed graph embeddings
  features = {
    key: df_bpic13_test_dyn[column].to_numpy(na_value=TOKEN_PADDING)
    for key, column in event_columns.items()
  }
  features["org_resource_graph"] = resource_graph_embeddings_test
  features["org_group_graph"] = group_graph_embeddings_test

  # Event-level time features
  for key, column in time_columns.items():
    features[key] = df_bpic13_test_dyn[column].to_numpy(dtype='float32', na_value=-1)

  # Case-level inputs
  features["case_product"] = np.expand_dims(df_bpic13_test["case:Product"].to_numpy(na_value=TOKEN_PADDING), axis=-1)
  features["case_country"] = np.expand_dims(df_bpic13_test["case:Country"].to_numpy(na_value=TOKEN_PADDING), axis=-1)
  features["case_latest_impact"] = np.expand_dims(df_bpic13_test["case:SR Latest Impact"].to_numpy(dtype='float32', na_value=-1), axis=-1)

  # Prediction targets
  labels = {
    key: np.expand_dims(df_bpic13_test[column].to_numpy(dtype=dtype), axis=-1)
    for key, (column, dtype) in label_columns.items()
  }
  return tf.data.Dataset.from_tensor_slices((features, labels))


ds_bpic13_test = _make_bpic13_test_dataset()

ds_write_files(ds_bpic13_test, os.path.join(OUTPUT_DATA_DIR, 'BPI_Challenge_2013_incidents_test_dataset'))

print(len(ds_bpic13_test))
ds_bpic13_test

# Data Export

In [None]:
# Timestamped archive name.
# NOTE(review): datetime.now() returns a naive datetime, for which %z formats
# to an empty string, so the name carries no UTC offset.
output_file = f"results_{datetime.datetime.now().strftime('%Y-%m-%d_%H.%M.%S%z')}.zip"

# Zip the data, graphic and model directories recursively into the archive
!zip -r "$output_file" "$DATA_DIR" "$GRAPHIC_DIR" "$MODEL_DIR"

## A: Export to Google Drive

In [None]:
drive.mount("/content/drive")

Path(GDRIVE_OUTPUT_DIR).mkdir(exist_ok=True)

!cp "$output_file" "$GDRIVE_OUTPUT_DIR"

drive.flush_and_unmount()

## B: Download to Local Machine

In [None]:
# Download the archive through the Colab `files` helper
files.download(output_file)