In [3]:
import pandas as pd
import numpy as np

# Directory (relative to the notebook) that holds the task CSV files.
DATA_PATH = "../data/"


def _read_task_csv(file_name: str) -> pd.DataFrame:
    """Read one CSV file located under DATA_PATH into a DataFrame."""
    return pd.read_csv(DATA_PATH + file_name)


# One frame for the corpus plus one frame per task dataset.
corpus_df = _read_task_csv("process_behavior_corpus.csv")
T_SAD_df = _read_task_csv("T_SAD.csv")
A_SAD_df = _read_task_csv("A_SAD.csv")
S_NAP_df = _read_task_csv("S_NAP.csv")

In [10]:
"""
A-SAD: Given an eventually-follows relation ef = a ≺ b of a trace σ, decide if ef represents a valid execution order of the two activities a and b that are executed in a process or not, without knowing the behavior allowed in the process.
Each row contains an eventually-follows relation (column eventually_follows) with a corresponding label (column out_of_order) indicating whether the two activities of the relation were executed in an invalid order (TRUE) or in a valid order (FALSE) according to the underlying process (model). The set of activities that can occur in the process is also given (column unique_activities).
"""
# Valid? -> ds_labels is the logical negation of out_of_order,
# i.e. True marks a relation whose activities ran in a valid order.
out_of_order_flags = A_SAD_df["out_of_order"]
A_SAD_df["ds_labels"] = (~out_of_order_flags).astype(bool)

A_SAD_df.head()

Unnamed: 0,model_id,revision_id,out_of_order,unique_activities,eventually_follows,id,ds_labels
0,e8990a10f516495e89a1eabf9627b9e9,295ab34872b6424aad0b4633abb621a9,True,"{'Set up access rights, hardware and software'...","('Procure work equipment', 'Add personal data')",e8990a10f516495e89a1eabf9627b9e9_295ab34872b64...,False
1,e8990a10f516495e89a1eabf9627b9e9,295ab34872b6424aad0b4633abb621a9,False,"{'Set up access rights, hardware and software'...","('Set up access rights, hardware and software'...",e8990a10f516495e89a1eabf9627b9e9_295ab34872b64...,True
2,e8990a10f516495e89a1eabf9627b9e9,295ab34872b6424aad0b4633abb621a9,False,"{'Set up access rights, hardware and software'...","('Select necessary work equipment', 'Set up ac...",e8990a10f516495e89a1eabf9627b9e9_295ab34872b64...,True
3,e8990a10f516495e89a1eabf9627b9e9,295ab34872b6424aad0b4633abb621a9,True,"{'Set up access rights, hardware and software'...","('Procure work equipment', 'Select necessary w...",e8990a10f516495e89a1eabf9627b9e9_295ab34872b64...,False
4,e8990a10f516495e89a1eabf9627b9e9,295ab34872b6424aad0b4633abb621a9,True,"{'Set up access rights, hardware and software'...","('Set up access rights, hardware and software'...",e8990a10f516495e89a1eabf9627b9e9_295ab34872b64...,False


In [11]:
"""
T-SAD: Given a trace σ, decide if σ is a valid execution of the underlying process or not, without knowing the behavior allowed in the process. Each row contains a trace (column trace) with a corresponding label (column anomalous) indicating whether the trace is anomalous (TRUE) or represents a valid execution of the underlying process (FALSE). The set of activities that can occur in the process is also given (column unique_activities).
"""
# Valid? -> ds_labels is the logical negation of anomalous,
# i.e. True marks a trace that is a valid execution.
anomalous_flags = T_SAD_df["anomalous"]
T_SAD_df["ds_labels"] = (~anomalous_flags).astype(bool)

T_SAD_df.head()

Unnamed: 0,model_id,revision_id,trace,label,unique_activities,anomalous,id,ds_labels
0,e8990a10f516495e89a1eabf9627b9e9,295ab34872b6424aad0b4633abb621a9,"['Select necessary work equipment', 'Add perso...","['Select necessary work equipment', 'Add perso...","{'Set up access rights, hardware and software'...",True,e8990a10f516495e89a1eabf9627b9e9_295ab34872b64...,False
1,e8990a10f516495e89a1eabf9627b9e9,295ab34872b6424aad0b4633abb621a9,"['Add personal data', 'Set up access rights, h...","['Set up access rights, hardware and software'...","{'Set up access rights, hardware and software'...",True,e8990a10f516495e89a1eabf9627b9e9_295ab34872b64...,False
2,e8990a10f516495e89a1eabf9627b9e9,295ab34872b6424aad0b4633abb621a9,"['Add personal data', 'Select necessary work e...",False,"{'Set up access rights, hardware and software'...",False,e8990a10f516495e89a1eabf9627b9e9_295ab34872b64...,True
3,e8990a10f516495e89a1eabf9627b9e9,295ab34872b6424aad0b4633abb621a9,"['Select necessary work equipment', 'Add perso...","['Select necessary work equipment', 'Add perso...","{'Set up access rights, hardware and software'...",True,e8990a10f516495e89a1eabf9627b9e9_295ab34872b64...,False
4,e8990a10f516495e89a1eabf9627b9e9,295ab34872b6424aad0b4633abb621a9,"['Select necessary work equipment', 'Add perso...","['Select necessary work equipment', 'Add perso...","{'Set up access rights, hardware and software'...",True,e8990a10f516495e89a1eabf9627b9e9_295ab34872b64...,False


In [12]:
def _count_by_flag(frame, flag_col):
    """
    Return (total, valid, anomalous) row counts for a task frame.

    `flag_col` names the boolean column that is True for anomalous rows and
    False for valid rows. Counts are plain Python ints so that the
    `isinstance(x, int)` check used when formatting below still matches.
    """
    flags = frame[flag_col]
    return len(frame), int(flags.eq(False).sum()), int(flags.eq(True).sum())


T_total, T_valid, T_anomalous = _count_by_flag(T_SAD_df, 'anomalous')
A_total, A_valid, A_anomalous = _count_by_flag(A_SAD_df, 'out_of_order')

# S-NAP has no anomaly flag: every row counts as valid and the
# anomalous cell is shown as a dash.
SN_total = len(S_NAP_df)
SN_valid = SN_total
SN_anomalous = '-'

summary_table = pd.DataFrame({
    'Task Dataset': ['T-SAD', 'A-SAD', 'S-NAP'],
    'Total': [T_total, A_total, SN_total],
    'Valid': [T_valid, A_valid, SN_valid],
    'Anomalous': [T_anomalous, A_anomalous, SN_anomalous]
})

# Compute the percentage while the counts are still numeric, instead of
# stripping the thousands separators back out of the formatted strings.
valid_share = np.round((summary_table['Valid'] / summary_table['Total']) * 100, 2)

# Format the counts with thousands separators for display.
summary_table['Total'] = summary_table['Total'].apply(lambda x: f"{x:,}")
summary_table['Valid'] = summary_table['Valid'].apply(lambda x: f"{x:,}")
summary_table['Anomalous'] = summary_table['Anomalous'].apply(lambda x: f"{x:,}" if isinstance(x, int) else x)
summary_table['Valid (%)'] = valid_share

summary_table

Unnamed: 0,Task Dataset,Total,Valid,Anomalous,Valid (%)
0,T-SAD,291251,150301,140950,51.61
1,A-SAD,316308,158154,158154,50.0
2,S-NAP,1289081,1289081,-,100.0


In [21]:
from typing import Optional
from datasets.arrow_dataset import Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import pickle
import ast
import sys

def remove_duplicates(pair_df):
    """
    Drop rows that repeat an earlier row on the identifying columns.

    The identifying columns are always 'revision_id', 'model_id' and
    'unique_activities'; 'trace', 'eventually_follows' and 'prefix' are
    added to the key whenever the DataFrame has them. Returns the
    de-duplicated DataFrame; the input frame is left as it was.
    """
    optional_keys = ("trace", "eventually_follows", "prefix")
    key_columns = ["revision_id", "model_id", "unique_activities"]
    # Extend the key with whichever task-specific columns are present.
    key_columns.extend(name for name in optional_keys if name in pair_df.columns)
    return pair_df.drop_duplicates(subset=key_columns)


def setify(x: str):
    """
    Converts a string representation of a set into an actual Python set.

    The string is parsed with ast.literal_eval rather than eval: the values
    come from CSV files, and eval would execute any expression stored in a
    cell. literal_eval only accepts Python literals (including set displays
    such as "{'a', 'b'}" and the empty-set spelling "set()").

    A value that already is a set is returned unchanged, so applying the
    function twice to the same column is harmless.

    Raises:
        AssertionError: if the parsed value is not a set.
        ValueError / SyntaxError: if the string is not a valid Python literal.
    """
    if isinstance(x, set):
        return x
    set_: set[str] = ast.literal_eval(x)
    assert isinstance(set_, set), f"Conversion failed for {x}"
    return set_

def stratified_sample(df, label_col, frac, random_state=42) -> pd.DataFrame:
    """
    Shrink `df` to roughly `frac` of its rows, stratifying on `label_col`.

    Implemented with train_test_split: the share `1 - frac` is split off as
    the "test" part and discarded, and the remaining part is returned.
    """
    # train_test_split returns (kept part, split-off part); keep the first.
    kept_part = train_test_split(
        df,
        test_size=1-frac,
        random_state=random_state,
        stratify=df[label_col],
    )[0]
    return kept_part

def split_by_model(df, task, pkl_path="data/train_val_test.pkl", frac: Optional[float] = None) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Splits a DataFrame into train, validation, and test subsets based on IDs.
    Only includes rows with more than one unique activity.

    Args:
        df: frame with 'model_id', 'revision_id' and 'unique_activities'
            columns (plus 'ds_labels' when stratified sampling is used).
            The caller's frame is not modified.
        task: task name; "TRACE_ANOMALY" and "OUT_OF_ORDER" are sub-sampled
            with stratification on 'ds_labels', any other task is
            sub-sampled uniformly at random.
        pkl_path: pickle file holding a (train_ids, val_ids, test_ids)
            triple; ids are matched against "<model_id>_<revision_id>".
        frac: optional share of each split to keep. Sub-sampling only
            happens when 0 < frac < 1; None or any other value keeps the
            splits as they are.

    Returns:
        (train_df, val_df, test_df). Each carries the helper columns 'id'
        and 'num_unique_activities' in addition to the input columns.
    """
    # Build the helper columns on a new frame so the caller's DataFrame is
    # not changed as a side effect of splitting.
    df = df.assign(
        id=df["model_id"].astype(str) + "_" + df["revision_id"].astype(str),
        # len() of each cell -- assumes the cells are already sets, not
        # their string form; TODO confirm callers always setify first.
        num_unique_activities=df["unique_activities"].apply(len),
    )

    # only keep rows with at least 2 activities
    df = df[df["num_unique_activities"] > 1]

    # Load pre-defined train/val/test IDs.
    # NOTE(security): pickle.load can execute arbitrary code; only point
    # pkl_path at files from a trusted source.
    with open(pkl_path, "rb") as file:
        train_ids, val_ids, test_ids = pickle.load(file)

    # Split the DataFrame into train, validation, and test sets
    splits = [df[df["id"].isin(ids)] for ids in (train_ids, val_ids, test_ids)]

    # Perform stratified sampling for tasks with binary labels
    # if frac is not None, otherwise perform random sampling
    if frac is not None and 0 < frac < 1:
        if task in ["TRACE_ANOMALY", "OUT_OF_ORDER"]:
            splits = [stratified_sample(part, label_col="ds_labels", frac=frac) for part in splits]
        else:
            # For S-NAP, perform random sampling
            splits = [part.sample(frac=frac, random_state=42) for part in splits]

    train_df, val_df, test_df = splits
    return train_df, val_df, test_df

def next_label(row):
    """
    Map the activity in row["next"] to an integer class label.

    The label is the 1-based position of the activity within the sorted
    row["unique_activities"]; 0 is returned when the activity is not one
    of them.
    """
    ordered_activities = sorted(row["unique_activities"])
    try:
        return ordered_activities.index(row["next"]) + 1
    except ValueError:
        # row["next"] is not among the known activities.
        return 0

def _parse_sequence(value) -> tuple:
    """Turn a CSV cell holding a sequence (or its string form) into a hashable tuple."""
    if isinstance(value, str):
        # read_csv returns the cell as text such as "['a', 'b']";
        # literal_eval rebuilds the list without executing code.
        value = ast.literal_eval(value)
    return tuple(value)


def load_dataset(file_name: str, task: str, frac: Optional[float] = None) -> tuple[Dataset, Dataset, Dataset]:
    """
    Dynamically loads and processes a dataset based on the file name and task.

    Args:
        file_name: path of the CSV file to read.
        task: one of "TRACE_ANOMALY" (T-SAD), "OUT_OF_ORDER" (A-SAD) or
            "NEXT_ACTIVITY" (S-NAP).
        frac: optional sub-sampling share forwarded to split_by_model;
            None keeps every row.

    Returns:
        (train, validation, test) as Dataset objects built from the frames
        returned by split_by_model, each with a fresh 0..n-1 index.

    Raises:
        ValueError: if `task` is not one of the supported task names.
    """
    df = pd.read_csv(file_name)

    if task == "TRACE_ANOMALY":
        # T-SAD
        df["ds_labels"] = (~df["anomalous"]).astype(bool)  # Invert labels
        # Tuples are hashable, which drop_duplicates needs for the key columns.
        df["trace"] = df["trace"].apply(_parse_sequence)
        df = remove_duplicates(df)
        df["unique_activities"] = df["unique_activities"].apply(setify)
        columns = ["model_id", "revision_id", "unique_activities", "trace", "ds_labels"]
        df = df.loc[:, columns]

    elif task == "OUT_OF_ORDER":
        # A-SAD
        df["ds_labels"] = (~df["out_of_order"]).astype(bool)  # Invert labels
        df = remove_duplicates(df)
        df["unique_activities"] = df["unique_activities"].apply(setify)
        columns = ["model_id", "revision_id", "unique_activities", "ds_labels", "eventually_follows"]
        df = df.loc[:, columns]

    elif task == "NEXT_ACTIVITY":
        # S-NAP
        # Parse the prefix the same way as the T-SAD trace: calling tuple()
        # on the raw CSV string would split it into single characters.
        # NOTE(review): presumes prefix cells are list literals like the
        # T-SAD trace column -- confirm against S_NAP.csv.
        df["prefix"] = df["prefix"].apply(_parse_sequence)
        df = remove_duplicates(df)
        df["prefix"] = df["prefix"].apply(list)
        df["unique_activities"] = df["unique_activities"].apply(setify)
        # Rows whose next step is the end marker are dropped; .copy() makes
        # the filtered frame independent so the column assignment below does
        # not write into a slice of the unfiltered frame.
        df = df[df["next"] != "[END]"].copy()
        df["ds_labels"] = df.apply(next_label, axis=1)
        columns = ["model_id", "revision_id", "trace", "prefix", "next", "unique_activities", "ds_labels"]
        df = df.loc[:, columns]
    else:
        raise ValueError(f"Unsupported task: {task}")

    # split into train/val/test
    train_df, val_df, test_df = split_by_model(df, task=task, frac=frac)

    # NOTE(review): 'unique_activities' holds Python sets at this point --
    # confirm Dataset.from_pandas can serialize that column.
    return (
        Dataset.from_pandas(train_df.reset_index(drop=True)),
        Dataset.from_pandas(val_df.reset_index(drop=True)),
        Dataset.from_pandas(test_df.reset_index(drop=True)),
    )

def print_stats(split_name, split_df):
    """
    Print size and label balance of one data split.

    Reads the 'ds_labels' entry of `split_df` and prints, in order: a
    header with `split_name`, the number of samples, the count per label,
    and the share per label in percent (rounded to two decimals).
    """
    total_samples = len(split_df)
    label_counts = pd.Series(split_df['ds_labels']).value_counts()
    label_percentages = label_counts / total_samples * 100

    report_lines = (
        f"--- {split_name} Split Statistics ---",
        f"Total samples: {total_samples}",
        f"Label distribution:\n{label_counts}",
        f"Label percentages:\n{label_percentages.round(2)}%\n",
    )
    for line in report_lines:
        print(line)

def format_time(seconds):
    """
    Render a duration given in seconds as "<d>d <h>h <m>m <s>s".

    Each unit is peeled off from the largest to the smallest, and every
    component is truncated to an integer for display.
    """
    days, seconds = divmod(seconds, 24 * 3600)
    hours, seconds = divmod(seconds, 3600)
    minutes, seconds = divmod(seconds, 60)
    return f"{int(days)}d {int(hours)}h {int(minutes)}m {int(seconds)}s"

In [40]:
# Shares of each task dataset to tabulate (1.0 = the full dataset).
fractions = [0.05, 0.1, 0.2, 0.5, 0.6, 0.7, 0.8, 1.0]
datasets = {
    "T-SAD": len(T_SAD_df),
    "A-SAD": len(A_SAD_df),
    "S-NAP": len(S_NAP_df)
}

# Nominal train / eval / test proportions applied to every sub-sample.
train_ratio = 0.8
eval_ratio = 0.15
test_ratio = 0.05

split_results = {"Task": [], "Fraction": [], "Train": [], "Eval": [], "Test": [], "Total": []}

for task, total_samples in datasets.items():
    for frac in fractions:
        # Each size is truncated with int(), so Train + Eval + Test can be
        # slightly smaller than Total.
        samples = int(total_samples * frac)
        train_samples = int(samples * train_ratio)
        eval_samples = int(samples * eval_ratio)
        test_samples = int(samples * test_ratio)

        split_results["Task"].append(task)
        # round() instead of int(): frac * 100 is a float product that can
        # land just below the intended integer (0.29 * 100 is
        # 28.999999999999996), which int() would label one percent too low.
        split_results["Fraction"].append(f"{round(frac * 100)}%")
        split_results["Train"].append(train_samples)
        split_results["Eval"].append(eval_samples)
        split_results["Test"].append(test_samples)
        split_results["Total"].append(samples)

split_df = pd.DataFrame(split_results)

split_df

Unnamed: 0,Task,Fraction,Train,Eval,Test,Total
0,T-SAD,5%,11649,2184,728,14562
1,T-SAD,10%,23300,4368,1456,29125
2,T-SAD,20%,46600,8737,2912,58250
3,T-SAD,50%,116500,21843,7281,145625
4,T-SAD,60%,139800,26212,8737,174750
5,T-SAD,70%,163100,30581,10193,203875
6,T-SAD,80%,186400,34950,11650,233000
7,T-SAD,100%,233000,43687,14562,291251
8,A-SAD,5%,12652,2372,790,15815
9,A-SAD,10%,25304,4744,1581,31630
