In [1]:
import os
# Switch the working directory to the parent of the directory the kernel was
# started in (presumably the project root that holds the local modules and the
# relative `datasets/` output path used below — confirm against the repo layout).
# The starting directory is captured only on the first execution, so re-running
# this cell is idempotent instead of climbing one level higher on every run.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_START_DIR, os.pardir))

In [2]:
import pandas as pd
import random
import importlib
from instruction_tuning_util import get_formatted_prompt, get_formatted_answer, generate_inverted_answers
from data_loading_util import (
    load_trace_data, load_pairs_data, load_next_activity_data, load_discovery_data
)
from datasets import concatenate_datasets

# Load prompt templates.
# The code further down reads these attributes from the loaded module:
# GENERAL_INTRO, TASK_PROMPTS_VARIANTS, TASK_PROMPTS_INVERTED_NEGATIVE and
# TASK_PROMPTS_INVERTED_POSITIVE.
prompts = importlib.import_module("prompts.prompts_all")

In [None]:
# Task and dataset configurations.
# Keyed by the task id used in the output file name. Each entry provides:
#   "loader"     - callable invoked with no arguments; its result is unpacked
#                  into three splits that are concatenated.
#   "task_name"  - key used to look up prompt templates and to decide which
#                  task-specific columns are written.
#   "subset_key" - optional; when present, rows whose value in that column is
#                  None are dropped and the column is copied to "gold_answer".
TASK_CONFIG = {
    "A-SAD": {"loader": load_pairs_data, "task_name": "activity_anomaly"},
    "T-SAD": {"loader": load_trace_data, "task_name": "trace_anomaly"},
    "S-NAP": {"loader": load_next_activity_data, "task_name": "next_activity"},
    "S-DFD": {"loader": load_discovery_data, "task_name": "dfg", "subset_key": "dfg"},
    "S-PTD": {"loader": load_discovery_data, "task_name": "process_tree", "subset_key": "pt"},
}

# Table-based instruction type probabilities.
# NOTE(review): in the code visible in this notebook only "Neg.Inv." and
# "Pos.Inv." are read, and only for tasks other than the two anomaly tasks
# (A-SAD / T-SAD); those two use a fixed per-label rate inside select_prompt.
# The "Normal" entry is never read directly - it is the implicit remainder.
PROBABILITIES = {
    "A-SAD": {"Normal": 0.8, "Neg.Inv.": 0.1, "Pos.Inv.": 0.1},
    "T-SAD": {"Normal": 0.8, "Neg.Inv.": 0.1, "Pos.Inv.": 0.1},
    "S-NAP": {"Normal": 0.8, "Neg.Inv.": 0.1, "Pos.Inv.": 0.1},
    "S-DFD": {"Normal": 0.8, "Neg.Inv.": 0.2, "Pos.Inv.": 0.0},
    "S-PTD": {"Normal": 1.0, "Neg.Inv.": 0.0, "Pos.Inv.": 0.0},
}

def select_prompt(task_name, gold_answer, prob_cfg, rng):
    """Pick a prompt template and the instruction type for one example.

    Args:
        task_name: Key into the template tables of the ``prompts`` module.
        gold_answer: The example's label as a string; only consulted for the
            anomaly tasks, where it is compared against "False" and "True".
        prob_cfg: Mapping with "Neg.Inv." and "Pos.Inv." probabilities; only
            consulted for tasks other than the two anomaly tasks.
        rng: Object providing ``random()`` and ``randrange(n)``.

    Returns:
        A ``(template, instruction_type, index)`` tuple where
        ``instruction_type`` is "normal", "neg_inv" or "pos_inv" and ``index``
        is the zero-based position of ``template`` in the chosen template list.
    """
    chosen_type = "normal"
    # Looked up unconditionally, so an unknown task_name fails here.
    candidates = prompts.TASK_PROMPTS_VARIANTS[task_name]

    if task_name in {"trace_anomaly", "activity_anomaly"}:
        # NOTE(review): the 0.2 rate is fixed per label and prob_cfg is not
        # read on this path - confirm that this is intended.
        # At most one random draw is consumed, and none for any other label.
        if gold_answer == "False":
            if rng.random() < 0.2:
                chosen_type = "neg_inv"
        elif gold_answer == "True":
            if rng.random() < 0.2:
                chosen_type = "pos_inv"
    else:
        # One draw, compared against cumulative thresholds.
        draw = rng.random()
        if draw < prob_cfg["Neg.Inv."]:
            chosen_type = "neg_inv"
        elif draw < prob_cfg["Neg.Inv."] + prob_cfg["Pos.Inv."]:
            chosen_type = "pos_inv"

    # Swap in the inverted template list only when an inversion was selected.
    if chosen_type == "neg_inv":
        candidates = prompts.TASK_PROMPTS_INVERTED_NEGATIVE[task_name]
    elif chosen_type == "pos_inv":
        candidates = prompts.TASK_PROMPTS_INVERTED_POSITIVE[task_name]

    pick = rng.randrange(len(candidates))
    return candidates[pick], chosen_type, pick


def process_task(task_key, config, prob_cfg, rng):
    """Generate the instruction CSV for a single task.

    Loads the task's three splits, merges them, derives a ``gold_answer``
    column, picks a prompt template per example and writes one row per example
    to ``datasets/instructions/<task_key>_instructions.csv`` (relative to the
    current working directory).

    Args:
        task_key: Task id; used in log messages and in the output file name.
        config: Mapping with "loader" and "task_name", plus an optional
            "subset_key".
        prob_cfg: Probability mapping forwarded to ``select_prompt``.
        rng: Random generator forwarded to ``select_prompt`` and
            ``generate_inverted_answers``.

    Returns:
        None. The result is the CSV file written to disk.
    """
    print(f"Processing {task_key}...")
    loader = config["loader"]
    task_name = config["task_name"]
    subset_key = config.get("subset_key")

    # The loader must yield exactly three splits; they are merged into one set.
    train_split, val_split, test_split = loader()
    dataset = concatenate_datasets([train_split, val_split, test_split])

    if subset_key:
        # Keep only rows that carry a value for this subset and use it as label.
        dataset = dataset.filter(lambda row: row[subset_key] is not None)
        dataset = dataset.add_column("gold_answer", dataset[subset_key])
    else:
        # Otherwise the first label column that exists becomes "gold_answer".
        for label_column in ("ds_labels", "next"):
            if label_column in dataset.column_names:
                dataset = dataset.rename_column(label_column, "gold_answer")
                break

    dataset = dataset.add_column("task_name", [task_name] * len(dataset))

    # Task-specific output columns as (csv column, example key) pairs, in the
    # order they are appended after the common fields.
    extra_columns = {
        "activity_anomaly": (
            ("eventually_follows", "eventually_follows"),
            ("is_valid", "gold_answer"),
        ),
        "trace_anomaly": (
            ("trace", "trace"),
            ("is_valid", "gold_answer"),
        ),
        "next_activity": (
            ("prefix", "prefix"),
            ("trace", "trace"),
            ("next", "gold_answer"),
        ),
        "dfg": (("dfg", "dfg"),),
        "process_tree": (("pt", "pt"),),
    }.get(task_name, ())

    records = []
    for example in dataset:
        # Labels are compared and written as strings from here on.
        example["gold_answer"] = str(example["gold_answer"])
        template, instruction_type, variant_index = select_prompt(
            task_name, example["gold_answer"], prob_cfg, rng
        )

        if instruction_type != "normal":
            example = generate_inverted_answers(example, task_name, rng)

        # Format before reading any other field, mirroring the helper call order.
        formatted_prompt = get_formatted_prompt(example, template["template"])
        formatted_answer = get_formatted_answer(example, template["answer"])

        # Fields shared by every task; "variant" is stored one-based.
        record = {
            "id": example["id"],
            "unique_activities": example["unique_activities"],
            "instruction": prompts.GENERAL_INTRO + "\n\n" + formatted_prompt,
            "output": formatted_answer,
            "instruction_type": instruction_type,
            "variant": variant_index + 1,
        }
        for csv_column, example_key in extra_columns:
            record[csv_column] = example.get(example_key)

        records.append(record)

    target_dir = "datasets/instructions"
    os.makedirs(target_dir, exist_ok=True)
    csv_name = f"{task_key}_instructions.csv"

    pd.DataFrame(records).to_csv(os.path.join(target_dir, csv_name), index=False)
    print(f"Saved: {csv_name}")

In [None]:
# Run processing for all tasks.
# One seeded generator is shared by every task in the loop, so each task's
# random draws depend on the iteration order of TASK_CONFIG.
rng = random.Random(42)

for task_id, cfg in TASK_CONFIG.items():
    # process_task requires the generator as its fourth argument; omitting it
    # raises TypeError before any task is processed.
    process_task(task_id, cfg, PROBABILITIES[task_id], rng)

In [23]:
# or per task
# The generator is re-seeded here, so this cell does not depend on how many
# draws earlier cells consumed. The draws for this task will generally differ
# from those it receives in the all-tasks loop, where the generator has already
# been advanced by the tasks processed before it.
rng = random.Random(42)

task_id = "S-PTD"
process_task(task_id, TASK_CONFIG[task_id], PROBABILITIES[task_id], rng)

Processing S-PTD...


Filter:   0%|          | 0/15580 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/15580 [00:00<?, ? examples/s]

Saved: S-PTD_instructions.csv
