In [1]:
import os
import re
from typing import Dict, Optional, List
from dataclasses import dataclass
import pandas as pd
import  torch

In [2]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [3]:
def fixed_read_csv(path):
    """Read a 9-column CSV whose last field may contain unquoted commas.

    Each data line is split on its first 8 commas only, so everything after
    the 8th comma is treated as a single trailing field (the Description
    field of event.csv). When that field contains commas and is not already
    quoted, it is wrapped in double quotes, with any embedded double quotes
    doubled so the result is valid CSV. The repaired text is then parsed
    with pandas.

    Data lines with fewer than 9 fields (including blank lines) are dropped.

    Args:
        path: Path of the CSV file; it is opened as UTF-8 text.

    Returns:
        pandas.DataFrame parsed from the repaired lines.
    """
    from io import StringIO

    n_columns = 9  # expected number of columns; the last one is the free-text field

    fixed_lines = []
    with open(path, 'r', encoding='utf-8') as f:
        # The header line is passed through unchanged (apart from stripping).
        header = f.readline().strip()
        fixed_lines.append(header)
        for line in f:
            line = line.rstrip('\n')
            # Split only on the first (n_columns - 1) commas so that commas
            # inside the trailing field stay together in parts[-1].
            parts = line.split(',', n_columns - 1)
            if len(parts) != n_columns:
                continue
            desc = parts[-1]
            if not desc.startswith('"') and ',' in desc:
                # Double embedded quotes before wrapping; otherwise a quote
                # inside the text would end the quoted field early and the
                # remaining commas would be read as extra columns.
                escaped = desc.replace('"', '""')
                parts[-1] = f'"{escaped}"'
            fixed_lines.append(','.join(parts))

    # Parse the corrected text with pandas.
    return pd.read_csv(StringIO('\n'.join(fixed_lines)))

In [8]:
def extract_structure_from_folder(dirname):
    """Derive label metadata from a trace-set folder name.

    The name is split on underscores and read as:
      * ``AN_<category>_<fault>_<workload>_<variables...>`` for abnormal sets,
      * ``NM_<category>_<workload>_<variables...>`` for normal sets.
    The prefix match is case-insensitive. The workload is the first token,
    after the category/fault tokens, that is one of
    ``r``, ``w``, ``rw``, ``rpc``, ``rwrpc`` (compared in lower case);
    everything after it is re-joined with underscores as ``variables``.

    Args:
        dirname: Folder name; leading/trailing ``/`` characters are ignored.

    Returns:
        dict with keys ``is_abnormal`` (1/0), ``class_label``
        (``'Abnormal'``/``'Normal'``), ``category``, ``fault_type``,
        ``workload`` and ``variables``. Any part that cannot be found is
        ``None``; when the name has neither an ``AN_`` nor an ``NM_`` prefix,
        every value is ``None``.
    """
    _ABN_PREFIX = re.compile(r"^(AN|NM)_", re.IGNORECASE)
    d = dirname.strip('/')
    m = _ABN_PREFIX.match(d)
    if not m:
        # Fallback for names without a recognised prefix (e.g. COM).
        return {
            'is_abnormal': None,
            'class_label': None,
            'category': None,
            'fault_type': None,
            'workload': None,
            'variables': None,
        }

    is_abn = (m.group(1).upper() == 'AN')
    parts = d.split('_')

    # Every optional field starts as None so that a name lacking it yields
    # None instead of an UnboundLocalError at the return statement.
    category = None
    fault = None
    workload = None
    variables = None

    if is_abn:
        # Abnormal names carry both a category and a fault type.
        if len(parts) >= 3:
            category = parts[1]
            fault = parts[2]
        start_idx = 3
    else:
        # Normal names carry only a category.
        if len(parts) >= 2:
            category = parts[1]
        start_idx = 2

    # Workload is the first recognised token; the remainder are the variables.
    allowed_workloads = {'r', 'w', 'rw', 'rpc', 'rwrpc'}
    for i in range(start_idx, len(parts)):
        tok = parts[i].lower()
        if tok in allowed_workloads:
            workload = tok
            if i + 1 < len(parts):
                variables = '_'.join(parts[i + 1:])
            break

    return {
        'is_abnormal': int(is_abn),
        'class_label': 'Abnormal' if is_abn else 'Normal',
        'category': category,
        'fault_type': fault,
        'workload': workload,
        'variables': variables,
    }

In [9]:
@dataclass
class TraceSet:
    """One trace-set folder: its name, parsed labels and its four CSV tables."""
    # Folder base name, as computed by load_trace_set.
    name: str
    # Path of the folder the tables were read from.
    dirpath: str
    # Result of extract_structure_from_folder(name).
    # NOTE(review): that dict also holds an int under 'is_abnormal', so the
    # Optional[str] value annotation is narrower than the actual contents.
    labels: Dict[str, Optional[str]]
    # Contents of trace.csv.
    trace: pd.DataFrame
    # Contents of event.csv (read through fixed_read_csv).
    event: pd.DataFrame
    # Contents of edge.csv.
    edge: pd.DataFrame
    # Contents of operation.csv.
    operation: pd.DataFrame


def load_trace_set(dirpath):
    """Load the four CSV tables of one trace-set folder into a TraceSet.

    The folder's base name is parsed with extract_structure_from_folder to
    obtain the labels. event.csv goes through fixed_read_csv; the other three
    files are read with pandas directly.

    Args:
        dirpath: Path of the folder holding trace.csv, event.csv, edge.csv
            and operation.csv.

    Returns:
        TraceSet built from the folder name, path, labels and the four tables.

    Raises:
        Exception: any error raised while reading is reported on stdout
            together with the folder path and then re-raised unchanged.
    """
    name = os.path.basename(os.path.normpath(dirpath))
    labels = extract_structure_from_folder(name)

    def _in_folder(filename):
        # Path of one table file inside this trace-set folder.
        return os.path.join(dirpath, filename)

    try:
        print(f"Loading: {name}")

        trace_table = pd.read_csv(_in_folder("trace.csv"))
        event_table = fixed_read_csv(_in_folder("event.csv"))
        edge_table = pd.read_csv(_in_folder("edge.csv"))
        operation_table = pd.read_csv(_in_folder("operation.csv"))

        return TraceSet(
            name=name,
            dirpath=dirpath,
            labels=labels,
            trace=trace_table,
            event=event_table,
            edge=edge_table,
            operation=operation_table,
        )

    except Exception as err:
        # Say which folder failed before letting the error propagate.
        print(f"Error reading files in folder: {dirpath}")
        print(f"Error message: {type(err).__name__}: {err}")
        raise

def iter_trace_sets(root):
    """List the sub-folders of ``root`` that contain all four table files.

    Args:
        root: Directory whose immediate children are inspected.

    Returns:
        list of ``os.path.join(root, name)`` paths, in sorted order of
        ``name``, for every child directory that contains trace.csv,
        event.csv, edge.csv and operation.csv. Other entries are skipped.
    """
    needed_files = {"trace.csv", "event.csv", "edge.csv", "operation.csv"}

    def _is_complete(path):
        # Only directories are listed; plain files short-circuit to False.
        return os.path.isdir(path) and needed_files.issubset(os.listdir(path))

    candidates = (os.path.join(root, entry) for entry in sorted(os.listdir(root)))
    return [path for path in candidates if _is_complete(path)]

In [10]:
def build_master_tables(sets):
    """Stack the per-set tables into four master DataFrames.

    Each set's trace table gets six extra columns (Label, IsAbnormal,
    FaultType, Category, Workload, Variables) filled from its ``labels``
    mapping; a missing or empty mapping gives ``None`` in all six. The event,
    edge and operation tables are stacked unchanged.

    Args:
        sets: Iterable of objects exposing ``labels``, ``trace``, ``event``,
            ``edge`` and ``operation`` (e.g. TraceSet instances).

    Returns:
        Tuple ``(traces_df, events_df, edges_df, ops_df)``, each produced by
        ``pd.concat(..., ignore_index=True)``. An empty ``sets`` is not
        handled here; the collected lists are passed to pd.concat as-is.
    """
    # (output column, key in the labels mapping), in output column order.
    label_columns = (
        ("Label", "class_label"),
        ("IsAbnormal", "is_abnormal"),
        ("FaultType", "fault_type"),
        ("Category", "category"),
        ("Workload", "workload"),
        ("Variables", "variables"),
    )

    trace_parts, event_parts, edge_parts, op_parts = [], [], [], []
    for trace_set in sets:
        folder_labels = trace_set.labels if trace_set.labels else {}
        extra_columns = {column: folder_labels.get(key) for column, key in label_columns}
        trace_parts.append(trace_set.trace.assign(**extra_columns))
        event_parts.append(trace_set.event)
        edge_parts.append(trace_set.edge)
        op_parts.append(trace_set.operation)

    def _stack(frames):
        # Row-wise concatenation with a fresh 0..n-1 index.
        return pd.concat(frames, ignore_index=True)

    return _stack(trace_parts), _stack(event_parts), _stack(edge_parts), _stack(op_parts)

In [11]:
def load_tracebench(root_dir):
    """Load every complete trace set under ``root_dir`` and merge the tables.

    Folders are discovered with iter_trace_sets, loaded one by one with
    load_trace_set and combined with build_master_tables. The number of
    loaded sets and the shape of each master table are printed.

    Args:
        root_dir: Directory containing the trace-set folders.

    Returns:
        Tuple ``(traces_df, events_df, edges_df, ops_df)`` as returned by
        build_master_tables.
    """
    trace_sets = []
    for set_dir in iter_trace_sets(root_dir):
        trace_sets.append(load_trace_set(set_dir))
    print(f"Loaded {len(trace_sets)} trace sets from {root_dir}")

    master_tables = build_master_tables(trace_sets)
    traces_df, events_df, edges_df, ops_df = master_tables
    print(f"Loaded {len(trace_sets)} sets from {root_dir}")
    print(f"traces_df:   {traces_df.shape}")
    print(f"events_df:   {events_df.shape}")
    print(f"edges_df:    {edges_df.shape}")
    print(f"ops_df:      {ops_df.shape}")

    return traces_df, events_df, edges_df, ops_df


In [12]:
# Root folder of the dataset, given relative to the working directory.
DATA_PATH = "tracebench"
# Load every complete trace set under DATA_PATH into four master tables;
# load_tracebench prints one "Loading: ..." line per folder.
traces_df, events_df, edges_df, ops_df = load_tracebench(DATA_PATH)

Loading: AN_Data_corruptBlk_r_00FDN_30C_1to19B_0to120INT_15RT_5WT
Loading: AN_Data_corruptBlk_r_01FDN_30C_1to19B_0to120INT_15RT_5WT
Loading: AN_Data_corruptBlk_r_02FDN_30C_1to19B_0to120INT_15RT_5WT
Loading: AN_Data_corruptBlk_r_03FDN_30C_1to19B_0to120INT_15RT_5WT
Loading: AN_Data_corruptBlk_r_04FDN_30C_1to19B_0to120INT_15RT_5WT
Loading: AN_Data_corruptBlk_r_05FDN_30C_1to19B_0to120INT_15RT_5WT
Loading: AN_Data_corruptBlk_r_10FDN_30C_1to19B_0to120INT_15RT_5WT
Loading: AN_Data_corruptBlk_r_15FDN_30C_1to19B_0to120INT_15RT_5WT
Loading: AN_Data_corruptBlk_r_20FDN_30C_1to19B_0to120INT_15RT_5WT
Loading: AN_Data_corruptBlk_r_25FDN_30C_1to19B_0to120INT_15RT_5WT
Loading: AN_Data_corruptBlk_r_30FDN_30C_1to19B_0to120INT_15RT_5WT
Loading: AN_Data_corruptBlk_r_35FDN_30C_1to19B_0to120INT_15RT_5WT
Loading: AN_Data_corruptBlk_r_40FDN_30C_1to19B_0to120INT_15RT_5WT
Loading: AN_Data_corruptBlk_r_45FDN_30C_1to19B_0to120INT_15RT_5WT
Loading: AN_Data_corruptBlk_r_50FDN_30C_1to19B_0to120INT_15RT_5WT
Loading: A

In [16]:
# Number of trace rows without an IsAbnormal label. build_master_tables copies
# this value from the folder labels, and extract_structure_from_folder leaves
# it as None for folder names that do not start with AN_ or NM_.
traces_df['IsAbnormal'].isna().sum()

49115

In [17]:
# (rows, columns) of the concatenated event table.
events_df.shape

(14777715, 9)

In [22]:
def build_encoder_sequences(events_df, traces_df, join_cols=("OpName","Description"), order_cols=("StartTime","TID")):
    """Build one text sequence per TaskID from its events and attach labels.

    Args:
        events_df: Event table; needs a ``TaskID`` column plus the columns
            named in ``order_cols``. It is copied, not modified.
        traces_df: Trace table providing ``TaskID``, ``IsAbnormal``,
            ``FaultType`` and ``Category``.
        join_cols: Columns whose non-null values are joined with ``" : "`` to
            form the text of one event; names absent from the row are skipped.
        order_cols: Columns used, after ``TaskID``, to sort the events. A
            falsy value keeps the incoming row order.

    Returns:
        DataFrame with ``TaskID``, ``EventText`` (the non-empty event texts of
        that task joined by single spaces) and the three label columns,
        left-merged from ``traces_df`` on ``TaskID``.
    """
    ordered_events = events_df.copy()

    # Put the events of each task in order before concatenating their text.
    if order_cols:
        ordered_events = ordered_events.sort_values(["TaskID", *order_cols])

    def _event_text(row):
        # Text of a single event: its non-null join columns, " : "-separated.
        pieces = []
        for col in join_cols:
            if col in row and pd.notna(row[col]):
                pieces.append(str(row[col]))
        return " : ".join(pieces)

    ordered_events["EventText"] = ordered_events.apply(_event_text, axis=1)

    def _task_text(texts):
        # Text of a whole task: its non-empty event texts, space-separated.
        return " ".join(text for text in texts if text)

    per_task = ordered_events.groupby("TaskID")["EventText"].apply(_task_text)
    sequences = per_task.reset_index()

    # Attach the labels that live on the trace table.
    label_columns = ["TaskID", "IsAbnormal", "FaultType", "Category"]
    return sequences.merge(traces_df[label_columns], on="TaskID", how="left")


In [24]:
# One row per TaskID: its ordered event text plus the IsAbnormal / FaultType /
# Category labels, built with the default join and order columns.
encoder_seq = build_encoder_sequences(events_df, traces_df)

In [25]:
# Persist the encoder sequences. The output folder is created first so the
# write also succeeds on a fresh checkout where it does not exist yet.
os.makedirs('master_tables', exist_ok=True)
encoder_seq.to_csv('master_tables/mon_fichier.csv', index=False)

In [None]:
import numpy as np

def build_graph_data_for_trace(events_df, edges_df, traces_df, task_id, opname_to_ix=None):
    """Assemble the graph arrays (node features, edges, label) of one trace.

    Nodes are the events of ``task_id`` in the row order of ``events_df``;
    edges come from the ``FatherTID`` -> ``ChildTID`` rows of ``edges_df``
    for the same task. TIDs are compared as strings on both sides, so events
    and edges match even when the TID columns are not stored as strings.
    NOTE(review): if one table stores TIDs as floats (e.g. because of missing
    values) and the other as ints, their string forms still differ; confirm
    the dtypes of the loaded tables.

    Args:
        events_df: Event table with ``TaskID``, ``TID``, ``StartTime``,
            ``EndTime`` and ``OpName`` columns.
        edges_df: Edge table with ``TaskID``, ``FatherTID`` and ``ChildTID``.
        traces_df: Trace table with ``TaskID`` and ``IsAbnormal``.
        task_id: Identifier of the trace to extract.
        opname_to_ix: Mapping from operation name (as ``str``) to an integer
            index. Names missing from it map to 0; ``None`` is treated as an
            empty mapping, so every node gets index 0.

    Returns:
        dict with
          * ``x_num``: float32 array ``[N, 4]`` with columns normalised start
            time, normalised duration, in-degree, out-degree;
          * ``op_idx``: int64 array ``[N]`` of operation indices;
          * ``edge_index``: int64 array ``[2, E]`` of (source, target) node
            positions, keeping only edges whose two endpoints are events of
            this trace;
          * ``y``: int64 array ``[1]`` holding the trace's ``IsAbnormal``;
          * ``TraceId``: ``task_id``.

    Raises:
        IndexError: if ``task_id`` has no row in ``traces_df``.
        TypeError, ValueError: if that row's ``IsAbnormal`` is missing
            (None/NaN) and cannot be converted with ``int``.
    """
    # Rows belonging to this trace. Nothing below mutates them, so no copy.
    ev = events_df[events_df["TaskID"] == task_id]
    ed = edges_df[edges_df["TaskID"] == task_id]
    tr = traces_df[traces_df["TaskID"] == task_id].iloc[0]

    op_vocab = opname_to_ix if opname_to_ix is not None else {}

    # Node identifiers and their positions, keyed by the string form of TID.
    node_ids = ev["TID"].astype(str)
    node_index = {tid: i for i, tid in enumerate(node_ids)}

    # Timing features, both scaled to [0, 1]; max(1, ...) avoids dividing by
    # zero when all values are equal.
    st = ev["StartTime"].astype("int64")
    et = ev["EndTime"].astype("int64")
    dur = (et - st).clip(lower=0)
    stn = (st - st.min()) / max(1, (st.max() - st.min()))
    dun = dur / max(1, dur.max())

    # One pass over the edges: degree counts and the edge index together.
    # A known endpoint is counted even if the other endpoint is not an event
    # of this trace; the edge itself is kept only when both are known.
    in_degree = dict.fromkeys(node_index, 0)
    out_degree = dict.fromkeys(node_index, 0)
    src, dst = [], []
    fathers = ed["FatherTID"].astype(str).tolist()
    children = ed["ChildTID"].astype(str).tolist()
    for u, v in zip(fathers, children):
        u_known = u in node_index
        v_known = v in node_index
        if u_known:
            out_degree[u] += 1
        if v_known:
            in_degree[v] += 1
        if u_known and v_known:
            src.append(node_index[u])
            dst.append(node_index[v])

    # Numeric node features, one row per event.
    x_num = np.c_[stn.to_numpy(), dun.to_numpy(),
                  node_ids.map(in_degree).to_numpy(),
                  node_ids.map(out_degree).to_numpy()]

    # Categorical feature: operation index, 0 for names not in the mapping.
    op_ix = ev["OpName"].map(lambda s: op_vocab.get(str(s), 0)).to_numpy()

    data = {
        "x_num": x_num.astype("float32"),
        "op_idx": op_ix.astype("int64"),
        "edge_index": np.array([src, dst], dtype="int64"),
        "y": np.array([int(tr["IsAbnormal"])], dtype="int64"),
        "TraceId": task_id,
    }
    return data


In [None]:
# Vocabulary of operation names: every distinct OpName (as a string, in sorted
# order) is numbered from 1 upwards.
unique_ops = sorted(events_df["OpName"].astype(str).unique())
opname_to_ix = dict(zip(unique_ops, range(1, len(unique_ops) + 1)))
opname_to_ix["<UNK>"] = 0   # index 0 is reserved for unknown operations

# Build and display the graph data of one example trace.
build_graph_data_for_trace(
    events_df,
    edges_df,
    traces_df,
    task_id="B076E6516B275ABB",
    opname_to_ix=opname_to_ix,
)