In [2]:
import pandas as pd
import ast
from datasets import Dataset
from llama_fine_tuning_util import load_trace_data, load_pairs_data, load_next_activity_data

def calculate_tsad_statistics(dataset: Dataset):
    """Print label counts and trace-length statistics for a T-SAD dataset.

    Args:
        dataset: Object exposing ``to_pandas()``. The resulting frame must
            contain a ``ds_labels`` column and a ``trace`` column whose
            entries support ``len()``.

    Labels are compared against the strings ``"True"`` (valid) and
    ``"False"`` (anomalous); rows carrying any other label only contribute
    to the total. The summary is written to stdout and nothing is returned.
    """
    frame = dataset.to_pandas()
    labels = frame["ds_labels"]
    lengths = frame["trace"].apply(len)

    # Everything is computed before anything is printed, so a missing column
    # fails without emitting a partial report.
    report = [
        "T-SAD Statistics:",
        f"Total samples: {len(frame)}",
        f"Valid samples: {labels.eq('True').sum()}",
        f"Anomalous samples: {labels.eq('False').sum()}",
        (
            f"Trace length: min={lengths.min()}, max={lengths.max()}, "
            f"mean={lengths.mean():.2f}, median={lengths.median()}"
        ),
    ]
    print("\n".join(report))

def calculate_asad_statistics(dataset: Dataset):
    """Print the sample count and label balance of an A-SAD dataset.

    Args:
        dataset: Object exposing ``to_pandas()``. The resulting frame must
            contain a ``ds_labels`` column.

    Labels are compared against the strings ``"True"`` (valid) and
    ``"False"`` (anomalous). The summary is written to stdout, preceded by a
    blank line, and nothing is returned.
    """
    frame = dataset.to_pandas()
    labels = frame["ds_labels"]

    report = [
        "\nA-SAD Statistics:",
        f"Total samples: {len(frame)}",
        f"Valid samples: {labels.eq('True').sum()}",
        f"Anomalous samples: {labels.eq('False').sum()}",
    ]
    print("\n".join(report))

def calculate_snap_statistics(dataset: Dataset):
    """Print the sample count and prefix-length statistics of an S-NAP dataset.

    Args:
        dataset: Object exposing ``to_pandas()``. The resulting frame must
            contain a ``prefix`` column whose entries support ``len()``.

    The summary is written to stdout, preceded by a blank line, and nothing
    is returned.
    """
    frame = dataset.to_pandas()
    lengths = frame["prefix"].apply(len)

    report = [
        "\nS-NAP Statistics:",
        f"Total samples: {len(frame)}",
        (
            f"Prefix length: min={lengths.min()}, max={lengths.max()}, "
            f"mean={lengths.mean():.2f}, median={lengths.median()}"
        ),
    ]
    print("\n".join(report))


In [3]:
# Each loader returns three datasets, unpacked here as train / val / test.
(trace_train, trace_val, trace_test) = load_trace_data()
(pairs_train, pairs_val, pairs_test) = load_pairs_data()
(snap_train, snap_val, snap_test) = load_next_activity_data()

In [10]:
from datasets import concatenate_datasets

label_column = "ds_labels"

# Split the validation traces by label.
positive_samples_full = trace_val.filter(lambda row: row[label_column] == "True")
negative_samples_full = trace_val.filter(lambda row: row[label_column] == "False")

# The smaller class bounds how many rows each class may contribute.
samples_per_class = min(positive_samples_full.num_rows, negative_samples_full.num_rows)
max_samples_per_class = 2 * samples_per_class

# Draw the same number of rows from each class (fixed shuffle seed) and
# replace trace_val with the balanced result.
positive_samples = positive_samples_full.shuffle(seed=4).select(range(samples_per_class))
negative_samples = negative_samples_full.shuffle(seed=4).select(range(samples_per_class))
trace_val = concatenate_datasets([positive_samples, negative_samples])

Filter:   0%|          | 0/26978 [00:00<?, ? examples/s]

Filter:   0%|          | 0/26978 [00:00<?, ? examples/s]

In [11]:
from datasets import concatenate_datasets

label_column = "ds_labels"

# Split the test traces by label.
positive_samples_full = trace_test.filter(lambda row: row[label_column] == "True")
negative_samples_full = trace_test.filter(lambda row: row[label_column] == "False")

# The smaller class bounds how many rows each class may contribute.
samples_per_class = min(positive_samples_full.num_rows, negative_samples_full.num_rows)
max_samples_per_class = 2 * samples_per_class

# Draw the same number of rows from each class (fixed shuffle seed) and
# replace trace_test with the balanced result.
positive_samples = positive_samples_full.shuffle(seed=4).select(range(samples_per_class))
negative_samples = negative_samples_full.shuffle(seed=4).select(range(samples_per_class))
trace_test = concatenate_datasets([positive_samples, negative_samples])

Filter:   0%|          | 0/10820 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10820 [00:00<?, ? examples/s]

In [14]:
# Build the combined datasets in this cell. In notebook order this cell comes
# before the one that otherwise assigns trace_all / pairs_all / snap_all, so a
# fresh top-to-bottom run would hit a NameError on the prints below.
trace_all = concatenate_datasets([trace_train, trace_val, trace_test])
pairs_all = concatenate_datasets([pairs_train, pairs_val, pairs_test])
snap_all = concatenate_datasets([snap_train, snap_val, snap_test])

# Show the schema and row count of each combined dataset.
print(trace_all)
print(pairs_all)
print(snap_all)

Dataset({
    features: ['model_id', 'revision_id', 'unique_activities', 'trace', 'ds_labels', 'id', 'num_unique_activities'],
    num_rows: 184304
})
Dataset({
    features: ['model_id', 'revision_id', 'unique_activities', 'eventually_follows', 'ds_labels', 'id', 'num_unique_activities'],
    num_rows: 316308
})
Dataset({
    features: ['model_id', 'revision_id', 'unique_activities', 'trace', 'prefix', 'next', 'id', 'num_unique_activities'],
    num_rows: 575339
})


In [13]:
# Merge the train, validation and test splits of each task into one dataset.
trace_all = concatenate_datasets(
    [trace_train, trace_val, trace_test]
)
pairs_all = concatenate_datasets(
    [pairs_train, pairs_val, pairs_test]
)
snap_all = concatenate_datasets(
    [snap_train, snap_val, snap_test]
)

In [None]:
# Print summary statistics for each task's combined dataset.
calculate_tsad_statistics(trace_all)
# Breakdown of the T-SAD label counts printed by the call above:
#   val + test (balanced per class): 16549 valid + 16549 anomalous
#   train:                           76866 valid + 74340 anomalous

calculate_asad_statistics(pairs_all)
calculate_snap_statistics(snap_all)

T-SAD Statistics:
Total samples: 184304
Valid samples: 93415
Anomalous samples: 90889
Trace length: min=2, max=10, mean=7.26, median=8.0

A-SAD Statistics:
Total samples: 316308
Valid samples: 158154
Anomalous samples: 158154

S-NAP Statistics:
Total samples: 575339
Prefix length: min=1, max=9, mean=5.67, median=6.0


In [21]:
import pandas as pd
import ast
from datasets import Dataset, concatenate_datasets
from llama_fine_tuning_util import load_discovery_data

def calculate_sdfd_statistics(dataset: Dataset):
    """Print the sample count and edge-count statistics of an S-DFD dataset.

    Args:
        dataset: Object exposing ``to_pandas()``. The resulting frame must
            contain a ``dfg`` column.

    Each ``dfg`` entry that is a string is parsed with ``ast.literal_eval``;
    any other entry is used as-is. The ``len()`` of the resulting object is
    taken as that sample's number of edges. The summary is written to
    stdout, preceded by a blank line, and nothing is returned.

    Raises:
        ValueError, SyntaxError: Propagated from ``ast.literal_eval`` when a
            string entry is not a valid Python literal.
    """
    def _as_container(raw):
        # Only string entries need parsing; everything else is already usable.
        if isinstance(raw, str):
            return ast.literal_eval(raw)
        return raw

    frame = dataset.to_pandas()
    edges_per_sample = frame["dfg"].apply(_as_container).apply(len)

    edge_summary = (
        f"Edges: min={edges_per_sample.min()}, max={edges_per_sample.max()}, "
        f"mean={edges_per_sample.mean():.2f}, median={edges_per_sample.median()}"
    )

    print("\nS-DFD Statistics:")
    print(f"Total samples: {len(frame)}")
    print(edge_summary)

def calculate_sptd_operator_distribution(dataset):
    """Print how often each operator symbol occurs in the ``pt`` column.

    Args:
        dataset: Object exposing ``to_pandas()``. The resulting frame must
            contain a ``pt`` column.

    For every string entry of ``pt`` the occurrences of ``"->"``, ``"+"``,
    ``"X"`` and ``"*"`` are counted with ``str.count``; non-string entries
    are skipped. Each symbol's count and its share of all counted symbols
    are written to stdout (the share is 0 when nothing was counted).
    Nothing is returned.
    """
    pt_column = dataset.to_pandas()["pt"]
    pt_strings = [entry for entry in pt_column if isinstance(entry, str)]

    # NOTE(review): these are plain substring counts, so a symbol appearing
    # inside an activity label (e.g. a capital "X") is counted as well.
    # Confirm labels in `pt` cannot contain these symbols.
    operators = {
        symbol: sum(text.count(symbol) for text in pt_strings)
        for symbol in ("->", "+", "X", "*")
    }
    total_ops = sum(operators.values())

    print("=== S-PTD Operator Distribution ===")
    for op, count in operators.items():
        if total_ops > 0:
            percentage = (count / total_ops) * 100
        else:
            percentage = 0
        print(f"{op}: {count} occurrences ({percentage:.2f}%)")


In [18]:
# The discovery loader returns three datasets, unpacked as train / val / test.
(discovery_train, discovery_val, discovery_test) = load_discovery_data()

In [19]:
# Show the schema and row count of each discovery split, one per line block.
print(discovery_train, discovery_val, discovery_test, sep="\n")

Dataset({
    features: ['model_id', 'revision_id', 'unique_activities', 'dfg', 'pt', 'id', 'num_unique_activities'],
    num_rows: 11311
})
Dataset({
    features: ['model_id', 'revision_id', 'unique_activities', 'dfg', 'pt', 'id', 'num_unique_activities'],
    num_rows: 2745
})
Dataset({
    features: ['model_id', 'revision_id', 'unique_activities', 'dfg', 'pt', 'id', 'num_unique_activities'],
    num_rows: 1524
})


In [22]:

# Merge the three discovery splits into a single dataset.
discovery_all = concatenate_datasets(
    [discovery_train, discovery_val, discovery_test]
)

# Report DFG edge-count statistics, then the operator symbol frequencies
# found in the `pt` column.
calculate_sdfd_statistics(discovery_all)
calculate_sptd_operator_distribution(discovery_all)


S-DFD Statistics:
Total samples: 15580
Edges: min=1, max=87, mean=5.22, median=4.0
=== S-PTD Operator Distribution ===
->: 18676 occurrences (70.15%)
+: 3537 occurrences (13.29%)
X: 4202 occurrences (15.78%)
*: 209 occurrences (0.79%)
