In [1]:
from __future__ import annotations

import json, hashlib, random, sys, platform, heapq
from dataclasses import dataclass, asdict, replace
from pathlib import Path
from typing import Any, Dict, List, Tuple

import numpy as np
import pandas as pd

import networkx as nx

from scipy import sparse
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import IsolationForest

from collections import deque

In [2]:
@dataclass(frozen=True)
class BaselineConfig:
    # Data / split
    golden_id: str = "episode_016"
    dataset_path: str = "episodes_all_baseline.parquet"   
    alert_csv_path: str = "Golden/Alert_1.csv"           

    # Feature extraction
    max_text_features: int = 5000
    min_df: int = 5
    ngram_range: tuple = (1, 2)
    text_col: str = "masked_message_cl"
    cat_cols: tuple = ()

    # Isolation Forest
    if_n_estimators: int = 100
    if_contamination: str = "auto"
    if_random_state: int = 42

    # Graph
    max_gap_host: int = 60
    max_gap_actor: int = 120              
    use_host_edges: bool = True          
    use_actor_edges: bool = True

    # Alert attach
    attach_strategy: str = "prefer_sshd_success_then_closest"

    # RCA walk
    rca_max_nodes: int = 200
    rca_max_hops: int | None = None
    rca_max_back_seconds: int = 30 * 60
    rca_forward_slack_seconds: int = 60
    rca_priority: str = "(-score_norm, abs(dt))"

    # Evaluation @k
    ks: tuple = (5, 10, 20, 50)

cfg = BaselineConfig()

In [3]:
NOTEBOOK_DIR = Path.cwd()
PROJECT_DIR = NOTEBOOK_DIR.parent

RUNS_DIR = PROJECT_DIR / "runs" / "baseline"
RUNS_DIR.mkdir(parents=True, exist_ok=True)

def seed_everything(seed: int) -> None:
    random.seed(seed)
    np.random.seed(seed)

def get_env_info() -> Dict[str, Any]:
    info = {
        "python": sys.version.replace("\n", " "),
        "platform": platform.platform(),
    }
    try:
        import sklearn
        info["sklearn"] = sklearn.__version__
    except Exception:
        pass
    try:
        import pandas
        info["pandas"] = pandas.__version__
    except Exception:
        pass
    try:
        import numpy
        info["numpy"] = numpy.__version__
    except Exception:
        pass
    try:
        import networkx as nx_
        info["networkx"] = nx_.__version__
    except Exception:
        pass
    try:
        import scipy
        info["scipy"] = scipy.__version__
    except Exception:
        pass
    return info

def save_json(obj: Any, path: Path) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as f:
        json.dump(obj, f, indent=2, ensure_ascii=False, default=str)

def file_sha256_12(path: Path) -> str:
    b = path.read_bytes()
    return hashlib.sha256(b).hexdigest()[:12]

seed_everything(cfg.if_random_state)

In [4]:
dataset_path = PROJECT_DIR / "data" / cfg.dataset_path
assert dataset_path.exists(), f"Missing dataset parquet: {dataset_path}"

episodes_df = pd.read_parquet(dataset_path)
episodes_df = episodes_df.reset_index(drop=True)
episodes_df["node_id"] = episodes_df.index.astype(int)

dataset_fingerprint = file_sha256_12(dataset_path)

cfg_fingerprint = hashlib.sha256(json.dumps(asdict(cfg), sort_keys=True).encode("utf-8")).hexdigest()[:12]
RUN_DIR = RUNS_DIR / f"run_{cfg_fingerprint}_{dataset_fingerprint}"
RUN_DIR.mkdir(parents=True, exist_ok=True)

save_json(asdict(cfg), RUN_DIR / "baseline_config.json")
save_json(get_env_info(), RUN_DIR / "environment.json")
save_json({"dataset_path": str(dataset_path), "dataset_sha256_12": dataset_fingerprint}, RUN_DIR / "dataset_fingerprint.json")

print("Loaded:", dataset_path)
print("Rows:", len(episodes_df))
print("Run dir:", RUN_DIR)

Loaded: C:\Users\patri\OneDrive\Documentos\MASTER THESIS\FRAMEWORK\2025-10-16T07_27Z_ssh_alert_01\Data Extraction\data\episodes_all_baseline.parquet
Rows: 569803
Run dir: C:\Users\patri\OneDrive\Documentos\MASTER THESIS\FRAMEWORK\2025-10-16T07_27Z_ssh_alert_01\Data Extraction\runs\baseline\run_6377dd375892_ae702f1c8f84


In [5]:
@dataclass
class BaselineFeatureBuilder:
    cfg: BaselineConfig

    def __post_init__(self):
        self.text_vectorizer = TfidfVectorizer(
            max_features=self.cfg.max_text_features,
            min_df=self.cfg.min_df,
            ngram_range=self.cfg.ngram_range,
        )

        transformers = [
            ("text", self.text_vectorizer, self.cfg.text_col),
        ]

        cat_cols = list(self.cfg.cat_cols) if getattr(self.cfg, "cat_cols", None) else []
        if len(cat_cols) > 0:
            try:
                self.cat_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
            except TypeError:
                self.cat_encoder = OneHotEncoder(handle_unknown="ignore", sparse=True)

            transformers.append(("cat", self.cat_encoder, cat_cols))
        
        self.preprocessor = ColumnTransformer(transformers=transformers)
        self.pipeline = Pipeline(steps=[("preprocess", self.preprocessor)])

    def fit_transform(self, df: pd.DataFrame) -> sparse.csr_matrix:
        return self.pipeline.fit_transform(df)

    def transform(self, df: pd.DataFrame) -> sparse.csr_matrix:
        return self.pipeline.transform(df)

train_df = episodes_df[episodes_df["episode_id"] != cfg.golden_id].copy()
golden_df = episodes_df[episodes_df["episode_id"] == cfg.golden_id].copy()

train_df[cfg.text_col] = train_df[cfg.text_col].fillna("").astype(str)
golden_df[cfg.text_col] = golden_df[cfg.text_col].fillna("").astype(str)

assert len(golden_df) > 0, f"No rows for golden_id={cfg.golden_id}"

feat_builder = BaselineFeatureBuilder(cfg=cfg)
X_train = feat_builder.fit_transform(train_df)
X_gold  = feat_builder.transform(golden_df)

iso = IsolationForest(
    n_estimators=cfg.if_n_estimators,
    contamination=cfg.if_contamination,
    random_state=cfg.if_random_state,
    n_jobs=-1,
)
iso.fit(X_train)

raw_scores = iso.decision_function(X_gold)   # higher = more normal
golden_df["baseline_score_iso"] = -raw_scores  # higher = more suspicious

episodes_df.loc[golden_df.index, "baseline_score_iso"] = golden_df["baseline_score_iso"].values
print(golden_df["baseline_score_iso"].describe())

count    29998.000000
mean        -0.117049
std          0.034239
min         -0.154611
25%         -0.139395
50%         -0.128928
75%         -0.101724
max          0.109014
Name: baseline_score_iso, dtype: float64


In [6]:
def add_normalized_score(df: pd.DataFrame, col: str = "baseline_score_iso", out_col: str = "score_norm") -> pd.DataFrame:
    df = df.copy()
    s = df[col]
    mask = s.notna()
    if mask.sum() == 0:
        df[out_col] = 0.0
        return df
    s_valid = s[mask]
    s_min, s_max = float(s_valid.min()), float(s_valid.max())
    if s_max == s_min:
        df[out_col] = 0.5
        return df
    score = (s - s_min) / (s_max - s_min)
    score[~mask] = 0.0
    df[out_col] = score
    return df

golden_df = add_normalized_score(golden_df, col="baseline_score_iso", out_col="score_norm")

In [7]:
episodes_df.loc[golden_df.index, "score_norm"] = golden_df["score_norm"].values

In [8]:
def build_episode_graph(df: pd.DataFrame, cfg: BaselineConfig) -> nx.DiGraph:
    df = df.sort_values("timestamp").copy()
    G = nx.DiGraph()

    for _, row in df.iterrows():
        nid = int(row["node_id"])
        G.add_node(
            nid,
            timestamp=row["timestamp"],
            stream=row["stream"],
            masked_message_cl=row["masked_message_cl"],
            actor_ip_anon=row.get("actor_ip_anon", "none"),
            host_anon=row.get("host_anon", "none"),
            baseline_score_iso=float(row.get("baseline_score_iso", np.nan)),
            score_norm=float(row.get("score_norm", 0.0)),
        )

    def add_temporal_edges_for_key(key_col: str, max_gap: int, edge_type: str):
        for key, group in df.groupby(key_col):
            if str(key) in ("none", "", "nan", "None"):
                continue
            group = group.sort_values("timestamp")
            prev_nid, prev_ts = None, None
            for _, r in group.iterrows():
                nid = int(r["node_id"])
                ts = r["timestamp"]
                if prev_nid is not None:
                    dt = (ts - prev_ts).total_seconds()
                    if 0 <= dt <= max_gap:
                        G.add_edge(prev_nid, nid, kind=edge_type, dt=float(dt))
                prev_nid, prev_ts = nid, ts

    if cfg.use_actor_edges:
        add_temporal_edges_for_key("actor_ip_anon", cfg.max_gap_actor, "actor_ip_temporal")
    if cfg.use_host_edges:
        add_temporal_edges_for_key("host_anon", cfg.max_gap_host, "host_temporal")

    return G

G_golden = build_episode_graph(golden_df, cfg)
print("Golden graph:", G_golden.number_of_nodes(), "nodes,", G_golden.number_of_edges(), "edges")

Golden graph: 29998 nodes, 33485 edges


In [9]:
ALERT_NODE_ID = "ALERT_NODE"

def short_hash(x):
    return "none" if pd.isna(x) or not str(x).strip() else hashlib.sha256(str(x).encode()).hexdigest()[:12]

alert_path = PROJECT_DIR / cfg.alert_csv_path
assert alert_path.exists(), f"Missing alert csv: {alert_path}"
alert_df = pd.read_csv(alert_path)
alert_row = alert_df.iloc[0]

alert_ts = pd.to_datetime(alert_row["@timestamp"], utc=True)
alert_src_ip = alert_row.get("source.ip", None)
alert_name   = alert_row.get("kibana.alert.rule.name", "")
alert_reason = alert_row.get("kibana.alert.reason", "")
alert_desc   = alert_row.get("description", "")

def attach_alert_node(G: nx.DiGraph, df: pd.DataFrame, alert_ts: pd.Timestamp, alert_src_ip=None,
                      alert_name: str = "", alert_reason: str = "", alert_desc: str = "") -> list[int]:
    G.add_node(
        ALERT_NODE_ID,
        timestamp=alert_ts,
        stream="alert",
        masked_message_cl=f"[alert] name={alert_name} reason={alert_reason} desc={alert_desc}",
        baseline_score_iso=0.0,
        score_norm=1.0,
    )

    attacker_ip_str = None
    if alert_src_ip is not None:
        attacker_ip_str = str(alert_src_ip).strip()
        if not attacker_ip_str or attacker_ip_str.lower() in ("nan", "none"):
            attacker_ip_str = None

    df_sorted = df.sort_values("timestamp").copy()
    df_sorted["dt_abs"] = (df_sorted["timestamp"] - alert_ts).abs().dt.total_seconds()

    best = pd.DataFrame()

    # 1) prefer sshd success from attacker ip
    if attacker_ip_str is not None:
        attacker_hash = short_hash(attacker_ip_str)
        cand1 = df_sorted[
            (df_sorted["stream"] == "system.auth") &
            (df_sorted["masked_message_cl"].astype(str).str.contains("out=success", na=False)) &
            (df_sorted["masked_message_cl"].astype(str).str.contains("proc=sshd", na=False)) &
            (df_sorted.get("actor_ip_anon", "") == attacker_hash)
        ]
        if not cand1.empty:
            best = cand1.nsmallest(1, "dt_abs")

    # 2) else any sshd success
    if best.empty:
        cand2 = df_sorted[
            (df_sorted["stream"] == "system.auth") &
            (df_sorted["masked_message_cl"].astype(str).str.contains("out=success", na=False)) &
            (df_sorted["masked_message_cl"].astype(str).str.contains("proc=sshd", na=False))
        ]
        if not cand2.empty:
            best = cand2.nsmallest(1, "dt_abs")

    # 3) else closest event
    if best.empty:
        best = df_sorted.nsmallest(1, "dt_abs")

    attached = [int(best.iloc[0]["node_id"])]
    for nid in attached:
        dt = (G.nodes[nid]["timestamp"] - alert_ts).total_seconds()
        G.add_edge(ALERT_NODE_ID, nid, kind="alert_to_log", dt=float(dt))

    return attached

attached_log_ids = attach_alert_node(G_golden, golden_df, alert_ts, alert_src_ip, alert_name, alert_reason, alert_desc)
save_json({"attached_log_ids": attached_log_ids}, RUN_DIR / "alert_attach.json")
print("Attached node_ids:", attached_log_ids)

Attached node_ids: [560913]


In [10]:
def rca_walk(G: nx.DiGraph, cfg: BaselineConfig, alert_node_id: str = ALERT_NODE_ID) -> tuple[nx.DiGraph, set, list]:
    alert_ts = G.nodes[alert_node_id]["timestamp"]

    def get_score_norm(node_id):
        if node_id == alert_node_id:
            return 1.0
        try:
            return float(G.nodes[node_id].get("score_norm", 0.0))
        except Exception:
            return 0.0

    selected = set([alert_node_id])
    visited  = set([alert_node_id])
    ranked   = [alert_node_id]

    heap = []
    for succ in G.successors(alert_node_id):
        ts = G.nodes[succ]["timestamp"]
        dt = (alert_ts - ts).total_seconds()
        if dt > cfg.rca_max_back_seconds:
            continue
        if dt < -cfg.rca_forward_slack_seconds:
            continue
        score = get_score_norm(succ)
        heapq.heappush(heap, ((-score, abs(dt)), 1, succ))

    while heap and len(selected) < cfg.rca_max_nodes:
        (_, hops, node_id) = heapq.heappop(heap)
        if node_id in visited:
            continue
        if cfg.rca_max_hops is not None and hops > cfg.rca_max_hops:
            continue

        visited.add(node_id)
        selected.add(node_id)
        ranked.append(node_id)

        for pred in G.predecessors(node_id):
            if pred in visited or pred == alert_node_id:
                continue
            ts = G.nodes[pred]["timestamp"]
            dt = (alert_ts - ts).total_seconds()
            if dt > cfg.rca_max_back_seconds:
                continue
            if dt < -cfg.rca_forward_slack_seconds:
                continue
            score = get_score_norm(pred)
            heapq.heappush(heap, ((-score, abs(dt)), hops + 1, pred))

    subG = G.subgraph(selected).copy()
    return subG, selected, ranked

subG, selected_nodes, rca_ranked = rca_walk(G_golden, cfg, alert_node_id=ALERT_NODE_ID)
print("RCA subgraph:")
print("  Nodes:", subG.number_of_nodes())
print("  Edges:", subG.number_of_edges())

RCA subgraph:
  Nodes: 200
  Edges: 227


In [11]:
def rca_walk_score_with_hops(G: nx.DiGraph, cfg, alert_node_id: str = ALERT_NODE_ID):
    if alert_node_id not in G:
        raise KeyError(
            f"{alert_node_id} not in graph. Did you call attach_alert_node(G, ...)? "
            f"Available example nodes: {list(G.nodes)[:5]}"
        )

    alert_ts = G.nodes[alert_node_id]["timestamp"]

    def get_score_norm(node_id):
        if node_id == alert_node_id:
            return 1.0
        return float(G.nodes[node_id].get("score_norm", 0.0) or 0.0)

    def priority_key(score: float, dt: float):
        return (-score, abs(dt))

    selected = {alert_node_id}
    visited  = {alert_node_id}
    ranked   = [alert_node_id]
    hop_of   = {alert_node_id: 0}

    heap = []
    for succ in G.successors(alert_node_id):
        ts = G.nodes[succ]["timestamp"]
        dt = (alert_ts - ts).total_seconds()

        if dt > cfg.rca_max_back_seconds:
            continue
        if dt < -cfg.rca_forward_slack_seconds:
            continue

        score = get_score_norm(succ)
        heapq.heappush(heap, (priority_key(score, dt), 1, succ))

    while heap and len(selected) < cfg.rca_max_nodes:
        (_, hops, node_id) = heapq.heappop(heap)

        if node_id in visited:
            continue
        if cfg.rca_max_hops is not None and hops > cfg.rca_max_hops:
            continue

        visited.add(node_id)
        selected.add(node_id)
        ranked.append(node_id)
        hop_of[node_id] = hops

        for pred in G.predecessors(node_id):
            if pred in visited or pred == alert_node_id:
                continue

            ts = G.nodes[pred]["timestamp"]
            dt = (alert_ts - ts).total_seconds()

            if dt > cfg.rca_max_back_seconds:
                continue
            if dt < -cfg.rca_forward_slack_seconds:
                continue

            score = get_score_norm(pred)
            heapq.heappush(heap, (priority_key(score, dt), hops + 1, pred))

    subG = G.subgraph(selected).copy()
    return subG, selected, ranked, hop_of

In [12]:
cfg_tmp = replace(cfg, rca_max_hops=None, rca_max_nodes=5000)

In [13]:
_, selected, ranked, hop_of = rca_walk_score_with_hops(G_golden, cfg_tmp)

# EVALUATION

In [14]:
golden_by_id = golden_df.set_index("node_id", drop=False)

def _filter_ranked_nodes(rca_ranked: List[Any], alert_node_id: Any) -> List[int]:
    return [int(n) for n in rca_ranked if n != alert_node_id and n != "ALERT_NODE"]

def _ranked_to_items(ranked_nodes: List[int], df_by_id: pd.DataFrame, use_or_duplicates: bool, evidence_col: str="evidence_id") -> List[str]:
    if not use_or_duplicates:
        return [str(n) for n in ranked_nodes]
    ranked_items, seen = [], set()
    for n in ranked_nodes:
        eid = df_by_id.loc[n, evidence_col]
        eid = "missing_evidence" if pd.isna(eid) else str(eid)
        if eid not in seen:
            seen.add(eid)
            ranked_items.append(eid)
    return ranked_items

def _gt_items(df: pd.DataFrame, gt_col: str, use_or_duplicates: bool, evidence_col: str="evidence_id") -> set:
    gt_mask = df[gt_col].astype(bool)
    if not use_or_duplicates:
        return set(map(str, df.loc[gt_mask, "node_id"].tolist()))
    return set(map(str, df.loc[gt_mask, evidence_col].dropna().tolist()))

def _prf(returned_set: set, gt_set: set) -> Tuple[int, float, float, float]:
    tp = len(returned_set & gt_set)
    p = tp / max(1, len(returned_set))
    r = tp / max(1, len(gt_set))
    f1 = 0.0 if (p + r) == 0 else (2 * p * r / (p + r))
    return tp, p, r, f1

def _pr_at_k(ranked_items: List[str], gt_set: set, k: int) -> Tuple[int, float, float, int]:
    effective_k = min(k, len(ranked_items))
    topk = set(ranked_items[:effective_k])
    tp = len(topk & gt_set)
    p = tp / max(1, effective_k)
    r = tp / max(1, len(gt_set))
    return tp, p, r, effective_k

def _hit_at_k(ranked_items: List[str], gt_set: set, k: int) -> Tuple[int, int]:
    k_used = min(k, len(ranked_items))
    if k_used == 0:
        return 0, 0
    topk = set(ranked_items[:k_used])
    hit = 1 if len(topk & gt_set) > 0 else 0
    return hit, k_used

def evaluate_rca_episode(
    df: pd.DataFrame,
    df_by_id: pd.DataFrame,
    rca_ranked: List[Any],
    cfg: BaselineConfig,
    use_or_duplicates: bool = True,
    evidence_col: str = "evidence_id",
    compute_hit_for: str = "core", 
) -> Dict[str, Any]:

    ranked_nodes = _filter_ranked_nodes(rca_ranked, ALERT_NODE_ID)
    ranked_items = _ranked_to_items(ranked_nodes, df_by_id, use_or_duplicates, evidence_col=evidence_col)
    returned_set = set(ranked_items)

    gt_core = _gt_items(df, "gt_core", use_or_duplicates, evidence_col=evidence_col)
    gt_ext  = _gt_items(df, "gt_extended", use_or_duplicates, evidence_col=evidence_col)

    core_tp, core_p, core_r, core_f1 = _prf(returned_set, gt_core)
    ext_tp,  ext_p,  ext_r,  ext_f1  = _prf(returned_set, gt_ext)

    out = {
        "mode": "or_duplicates" if use_or_duplicates else "node_level",
        "S_nodes": len(ranked_nodes),
        "S_items": len(returned_set),
        "returned_items_total_ranked": len(ranked_items),
        "gt_core_size": len(gt_core),
        "gt_ext_size": len(gt_ext),
        "core_tp": core_tp, "core_precision": core_p, "core_recall": core_r, "core_f1": core_f1,
        "ext_tp":  ext_tp,  "ext_precision":  ext_p,  "ext_recall":  ext_r,  "ext_f1":  ext_f1,
    }

    for k in cfg.ks:
        tp, p, r, k_used = _pr_at_k(ranked_items, gt_core, k)
        out[f"core_tp@{k}"] = tp
        out[f"core_P@{k}"] = p
        out[f"core_R@{k}"] = r
        out[f"core_k_used@{k}"] = k_used

        tp, p, r, k_used = _pr_at_k(ranked_items, gt_ext, k)
        out[f"ext_tp@{k}"] = tp
        out[f"ext_P@{k}"] = p
        out[f"ext_R@{k}"] = r
        out[f"ext_k_used@{k}"] = k_used

        if compute_hit_for in ("core"):
            hit, k_used = _hit_at_k(ranked_items, gt_core, k)
            out[f"core_Hit@{k}"] = hit
            out[f"core_hit_k_used@{k}"] = k_used

        if compute_hit_for in ("ext"):
            hit, k_used = _hit_at_k(ranked_items, gt_ext, k)
            out[f"ext_Hit@{k}"] = hit
            out[f"ext_hit_k_used@{k}"] = k_used

    return out

metrics_or   = evaluate_rca_episode(golden_df, golden_by_id, rca_ranked, cfg, use_or_duplicates=True, compute_hit_for="core")
metrics_node = evaluate_rca_episode(golden_df, golden_by_id, rca_ranked, cfg, use_or_duplicates=False, compute_hit_for="core")

save_json(metrics_or, RUN_DIR / "metrics_or_duplicates.json")
save_json(metrics_node, RUN_DIR / "metrics_node_level.json")
save_json({"rca_ranked": [str(x) for x in rca_ranked]}, RUN_DIR / "rca_ranked.json")

metrics_or

{'mode': 'or_duplicates',
 'S_nodes': 199,
 'S_items': 174,
 'returned_items_total_ranked': 174,
 'gt_core_size': 32,
 'gt_ext_size': 40,
 'core_tp': 24,
 'core_precision': 0.13793103448275862,
 'core_recall': 0.75,
 'core_f1': 0.23300970873786406,
 'ext_tp': 32,
 'ext_precision': 0.1839080459770115,
 'ext_recall': 0.8,
 'ext_f1': 0.29906542056074764,
 'core_tp@5': 1,
 'core_P@5': 0.2,
 'core_R@5': 0.03125,
 'core_k_used@5': 5,
 'ext_tp@5': 4,
 'ext_P@5': 0.8,
 'ext_R@5': 0.1,
 'ext_k_used@5': 5,
 'core_Hit@5': 1,
 'core_hit_k_used@5': 5,
 'core_tp@10': 4,
 'core_P@10': 0.4,
 'core_R@10': 0.125,
 'core_k_used@10': 10,
 'ext_tp@10': 9,
 'ext_P@10': 0.9,
 'ext_R@10': 0.225,
 'ext_k_used@10': 10,
 'core_Hit@10': 1,
 'core_hit_k_used@10': 10,
 'core_tp@20': 11,
 'core_P@20': 0.55,
 'core_R@20': 0.34375,
 'core_k_used@20': 20,
 'ext_tp@20': 17,
 'ext_P@20': 0.85,
 'ext_R@20': 0.425,
 'ext_k_used@20': 20,
 'core_Hit@20': 1,
 'core_hit_k_used@20': 20,
 'core_tp@50': 21,
 'core_P@50': 0.42,
 '

In [15]:
def per_stream_breakdown(golden_df: pd.DataFrame, selected_node_ids: set, gt_col: str = "gt_core") -> pd.DataFrame:
    df = golden_df.copy()
    df["selected"] = df["node_id"].isin(selected_node_ids)
    out = (
        df.groupby("stream")
        .apply(lambda g: pd.Series({
            "tp": int((g[gt_col].astype(bool) & g["selected"]).sum()),
            "fp": int((~g[gt_col].astype(bool) & g["selected"]).sum()),
            "fn": int((g[gt_col].astype(bool) & ~g["selected"]).sum()),
            "selected": int(g["selected"].sum()),
            "gt_total": int(g[gt_col].astype(bool).sum()),
            "total_logs": len(g),
        }))
    )
    out["precision"] = out["tp"] / (out["tp"] + out["fp"]).replace({0: pd.NA})
    out["recall"]    = out["tp"] / (out["tp"] + out["fn"]).replace({0: pd.NA})
    return out.fillna(0.0).sort_values(["gt_total", "selected"], ascending=False)

def get_missed_and_extra_tables(golden_df: pd.DataFrame, selected_node_ids: set, gt_col: str="gt_core", top_n: int=20):
    df = golden_df.copy()
    df["selected"] = df["node_id"].isin(selected_node_ids)
    missed = df[(df[gt_col].astype(bool)) & (~df["selected"])].copy()
    extra  = df[(~df[gt_col].astype(bool)) & (df["selected"])].copy()

    cols = ["timestamp", "stream", "masked_message_cl", "baseline_score_iso", "score_norm", "evidence_id", "node_id"]
    missed = missed.sort_values(["score_norm", "timestamp"], ascending=[False, True])[cols].head(top_n)
    extra  = extra.sort_values(["score_norm", "timestamp"], ascending=[False, True])[cols].head(top_n)
    return missed, extra

selected_node_ids = set([int(n) for n in rca_ranked if n != ALERT_NODE_ID and n != "ALERT_NODE"])

per_stream_core = per_stream_breakdown(golden_df, selected_node_ids, gt_col="gt_core")
per_stream_ext  = per_stream_breakdown(golden_df, selected_node_ids, gt_col="gt_extended")

missed_core, extra_non_core = get_missed_and_extra_tables(golden_df, selected_node_ids, gt_col="gt_core", top_n=20)

save_json(per_stream_core.to_dict(orient="index"), RUN_DIR / "per_stream_core.json")
save_json(per_stream_ext.to_dict(orient="index"), RUN_DIR / "per_stream_ext.json")

print("Saved diagnostics to:", RUN_DIR)

Saved diagnostics to: C:\Users\patri\OneDrive\Documentos\MASTER THESIS\FRAMEWORK\2025-10-16T07_27Z_ssh_alert_01\Data Extraction\runs\baseline\run_6377dd375892_ae702f1c8f84


  .apply(lambda g: pd.Series({
  return out.fillna(0.0).sort_values(["gt_total", "selected"], ascending=False)
  .apply(lambda g: pd.Series({
  return out.fillna(0.0).sort_values(["gt_total", "selected"], ascending=False)


In [16]:
metrics_or

{'mode': 'or_duplicates',
 'S_nodes': 199,
 'S_items': 174,
 'returned_items_total_ranked': 174,
 'gt_core_size': 32,
 'gt_ext_size': 40,
 'core_tp': 24,
 'core_precision': 0.13793103448275862,
 'core_recall': 0.75,
 'core_f1': 0.23300970873786406,
 'ext_tp': 32,
 'ext_precision': 0.1839080459770115,
 'ext_recall': 0.8,
 'ext_f1': 0.29906542056074764,
 'core_tp@5': 1,
 'core_P@5': 0.2,
 'core_R@5': 0.03125,
 'core_k_used@5': 5,
 'ext_tp@5': 4,
 'ext_P@5': 0.8,
 'ext_R@5': 0.1,
 'ext_k_used@5': 5,
 'core_Hit@5': 1,
 'core_hit_k_used@5': 5,
 'core_tp@10': 4,
 'core_P@10': 0.4,
 'core_R@10': 0.125,
 'core_k_used@10': 10,
 'ext_tp@10': 9,
 'ext_P@10': 0.9,
 'ext_R@10': 0.225,
 'ext_k_used@10': 10,
 'core_Hit@10': 1,
 'core_hit_k_used@10': 10,
 'core_tp@20': 11,
 'core_P@20': 0.55,
 'core_R@20': 0.34375,
 'core_k_used@20': 20,
 'ext_tp@20': 17,
 'ext_P@20': 0.85,
 'ext_R@20': 0.425,
 'ext_k_used@20': 20,
 'core_Hit@20': 1,
 'core_hit_k_used@20': 20,
 'core_tp@50': 21,
 'core_P@50': 0.42,
 '

# EXPERIMENTS

## EXPERIMENT A: Connectivity/Reachability

In [17]:
def compute_reachable_by_walk(G: nx.DiGraph, cfg: BaselineConfig, alert_node_id: str = ALERT_NODE_ID) -> set[int]:
    alert_ts = G.nodes[alert_node_id]["timestamp"]

    def dt_ok(nid) -> bool:
        ts = G.nodes[nid]["timestamp"]
        dt = (alert_ts - ts).total_seconds()
        if dt > cfg.rca_max_back_seconds:
            return False
        if dt < -cfg.rca_forward_slack_seconds:
            return False
        return True

    visited = set([alert_node_id])
    reachable = set()
    q = deque()

    for succ in G.successors(alert_node_id):
        if succ == alert_node_id:
            continue
        if not dt_ok(succ):
            continue
        q.append((succ, 1))

    while q:
        nid, hops = q.popleft()

        if nid in visited:
            continue
        if cfg.rca_max_hops is not None and hops > cfg.rca_max_hops:
            continue

        visited.add(nid)
        if nid != alert_node_id:
            reachable.add(int(nid))

        for pred in G.predecessors(nid):
            if pred in visited or pred == alert_node_id:
                continue
            if not dt_ok(pred):
                continue
            q.append((pred, hops + 1))

    return reachable


def _evidence_series_with_fallback(df: pd.DataFrame) -> pd.Series:
    ev = df["evidence_id"] if "evidence_id" in df.columns else pd.Series([pd.NA] * len(df), index=df.index)
    ev = ev.where(ev.notna(), df["node_id"].apply(lambda x: f"node_{int(x)}"))
    ev = ev.astype(str)
    ev = ev.replace({"nan": "", "none": "", "None": ""})
    return ev


def node_ids_to_evidence_set(golden_df: pd.DataFrame, node_ids: set[int]) -> set[str]:
    sub = golden_df.loc[golden_df["node_id"].isin(node_ids), ["node_id", "evidence_id"]].copy()
    if sub.empty:
        return set()
    sub["evidence_id"] = _evidence_series_with_fallback(sub)
    ev_set = set(sub["evidence_id"].tolist())
    ev_set.discard("")  
    return ev_set


def gt_evidence_set(golden_df: pd.DataFrame, gt_col: str) -> set[str]:
    sub = golden_df.loc[golden_df[gt_col].astype(bool), ["node_id", "evidence_id"]].copy()
    if sub.empty:
        return set()
    sub["evidence_id"] = _evidence_series_with_fallback(sub)
    ev_set = set(sub["evidence_id"].tolist())
    ev_set.discard("")
    return ev_set


def reachable_recall_evidence(
    golden_df: pd.DataFrame,
    reachable_node_ids: set[int],
    gt_col: str,
) -> tuple[int, int, float]:
    gt_ev = gt_evidence_set(golden_df, gt_col)
    if len(gt_ev) == 0:
        return 0, 0, 0.0
    reachable_ev = node_ids_to_evidence_set(golden_df, reachable_node_ids)
    hit = len(gt_ev & reachable_ev)
    return hit, len(gt_ev), hit / len(gt_ev)

def build_graph_and_attach(
    golden_df: pd.DataFrame,
    cfg_variant: BaselineConfig,
    alert_ts: pd.Timestamp,
    alert_src_ip,
    alert_name: str,
    alert_reason: str,
    alert_desc: str,
) -> nx.DiGraph:
    Gv = build_episode_graph(golden_df, cfg_variant)
    attach_alert_node(Gv, golden_df, alert_ts, alert_src_ip, alert_name, alert_reason, alert_desc)
    return Gv

def run_reachability_ablation_A_evidence(
    golden_df: pd.DataFrame,
    cfg_base: BaselineConfig,
    alert_ts: pd.Timestamp,
    alert_src_ip,
    alert_name: str,
    alert_reason: str,
    alert_desc: str,
    actor_gaps=(30, 60, 120, 300, 600), 
    host_gaps=(30, 60, 120, 300),        
):
    rows = []

    core_gt_ev = gt_evidence_set(golden_df, "gt_core")
    ext_gt_ev  = gt_evidence_set(golden_df, "gt_extended")

    edge_variants = [
        ("both_edges", True, True,  cfg_base.max_gap_actor, cfg_base.max_gap_host),
        ("actor_only", True, False, cfg_base.max_gap_actor, cfg_base.max_gap_host),
        ("host_only",  False, True, cfg_base.max_gap_actor, cfg_base.max_gap_host),
        ("no_edges",   False, False,cfg_base.max_gap_actor, cfg_base.max_gap_host),
    ]

    for name, ua, uh, a_gap, h_gap in edge_variants:
        cfgv = replace(
            cfg_base,
            use_actor_edges=ua,
            use_host_edges=uh,
            max_gap_actor=a_gap,
            max_gap_host=h_gap,
        )

        Gv = build_graph_and_attach(golden_df, cfgv, alert_ts, alert_src_ip, alert_name, alert_reason, alert_desc)
        reachable_nodes = compute_reachable_by_walk(Gv, cfgv, ALERT_NODE_ID)

        reachable_ev = node_ids_to_evidence_set(golden_df, reachable_nodes)

        core_hit, core_tot, rr_core = reachable_recall_evidence(golden_df, reachable_nodes, "gt_core")
        ext_hit,  ext_tot,  rr_ext  = reachable_recall_evidence(golden_df, reachable_nodes, "gt_extended")

        rows.append({
            "ablation": "edge_type",
            "variant": name,
            "use_actor_edges": ua,
            "use_host_edges": uh,
            "actor_gap_s": int(a_gap),
            "host_gap_s": int(h_gap),
            "nodes": Gv.number_of_nodes(),
            "edges": Gv.number_of_edges(),
            "reachable_nodes": int(len(reachable_nodes)),
            "reachable_evidence": int(len(reachable_ev)),
            "core_gt_evidence_total": int(len(core_gt_ev)),
            "ext_gt_evidence_total": int(len(ext_gt_ev)),
            "ReachRec_core_evidence": rr_core,
            "core_reachable_evidence": f"{core_hit}/{core_tot}",
            "ReachRec_ext_evidence": rr_ext,
            "ext_reachable_evidence": f"{ext_hit}/{ext_tot}",
        })

    for a_gap in actor_gaps:
        cfgv = replace(
            cfg_base,
            use_actor_edges=True,
            use_host_edges=True,
            max_gap_actor=int(a_gap),
            max_gap_host=int(cfg_base.max_gap_host),
        )

        Gv = build_graph_and_attach(golden_df, cfgv, alert_ts, alert_src_ip, alert_name, alert_reason, alert_desc)
        reachable_nodes = compute_reachable_by_walk(Gv, cfgv, ALERT_NODE_ID)
        reachable_ev = node_ids_to_evidence_set(golden_df, reachable_nodes)

        core_hit, core_tot, rr_core = reachable_recall_evidence(golden_df, reachable_nodes, "gt_core")
        ext_hit,  ext_tot,  rr_ext  = reachable_recall_evidence(golden_df, reachable_nodes, "gt_extended")

        rows.append({
            "ablation": "actor_gap_sweep",
            "variant": f"actor_gap={a_gap}",
            "use_actor_edges": True,
            "use_host_edges": True,
            "actor_gap_s": int(a_gap),
            "host_gap_s": int(cfg_base.max_gap_host),
            "nodes": Gv.number_of_nodes(),
            "edges": Gv.number_of_edges(),
            "reachable_nodes": int(len(reachable_nodes)),
            "reachable_evidence": int(len(reachable_ev)),
            "core_gt_evidence_total": int(len(core_gt_ev)),
            "ext_gt_evidence_total": int(len(ext_gt_ev)),
            "ReachRec_core_evidence": rr_core,
            "core_reachable_evidence": f"{core_hit}/{core_tot}",
            "ReachRec_ext_evidence": rr_ext,
            "ext_reachable_evidence": f"{ext_hit}/{ext_tot}",
        })

    for h_gap in host_gaps:
        cfgv = replace(
            cfg_base,
            use_actor_edges=True,
            use_host_edges=True,
            max_gap_actor=int(cfg_base.max_gap_actor),
            max_gap_host=int(h_gap),
        )

        Gv = build_graph_and_attach(golden_df, cfgv, alert_ts, alert_src_ip, alert_name, alert_reason, alert_desc)
        reachable_nodes = compute_reachable_by_walk(Gv, cfgv, ALERT_NODE_ID)
        reachable_ev = node_ids_to_evidence_set(golden_df, reachable_nodes)

        core_hit, core_tot, rr_core = reachable_recall_evidence(golden_df, reachable_nodes, "gt_core")
        ext_hit,  ext_tot,  rr_ext  = reachable_recall_evidence(golden_df, reachable_nodes, "gt_extended")

        rows.append({
            "ablation": "host_gap_sweep",
            "variant": f"host_gap={h_gap}",
            "use_actor_edges": True,
            "use_host_edges": True,
            "actor_gap_s": int(cfg_base.max_gap_actor),
            "host_gap_s": int(h_gap),
            "nodes": Gv.number_of_nodes(),
            "edges": Gv.number_of_edges(),
            "reachable_nodes": int(len(reachable_nodes)),
            "reachable_evidence": int(len(reachable_ev)),
            "core_gt_evidence_total": int(len(core_gt_ev)),
            "ext_gt_evidence_total": int(len(ext_gt_ev)),
            "ReachRec_core_evidence": rr_core,
            "core_reachable_evidence": f"{core_hit}/{core_tot}",
            "ReachRec_ext_evidence": rr_ext,
            "ext_reachable_evidence": f"{ext_hit}/{ext_tot}",
        })

    out = pd.DataFrame(rows)

    out = out.sort_values(
        ["ablation", "ReachRec_core_evidence", "ReachRec_ext_evidence", "reachable_evidence", "edges"],
        ascending=[True, False, False, False, False],
    )
    return out

ablation_A_evidence_baseline = run_reachability_ablation_A_evidence(
    golden_df=golden_df,
    cfg_base=cfg,
    alert_ts=alert_ts,
    alert_src_ip=alert_src_ip,
    alert_name=alert_name,
    alert_reason=alert_reason,
    alert_desc=alert_desc,
)

display(ablation_A_evidence_baseline)

Unnamed: 0,ablation,variant,use_actor_edges,use_host_edges,actor_gap_s,host_gap_s,nodes,edges,reachable_nodes,reachable_evidence,core_gt_evidence_total,ext_gt_evidence_total,ReachRec_core_evidence,core_reachable_evidence,ReachRec_ext_evidence,ext_reachable_evidence
8,actor_gap_sweep,actor_gap=600,True,True,600,60,29999,34377,7349,6438,32,40,1.0,32/32,1.0,40/40
7,actor_gap_sweep,actor_gap=300,True,True,300,60,29999,34051,6707,6093,32,40,1.0,32/32,1.0,40/40
6,actor_gap_sweep,actor_gap=120,True,True,120,60,29999,33486,6628,6014,32,40,1.0,32/32,1.0,40/40
5,actor_gap_sweep,actor_gap=60,True,True,60,60,29999,32764,99,80,32,40,0.75,24/32,0.75,30/40
4,actor_gap_sweep,actor_gap=30,True,True,30,60,29999,31768,97,78,32,40,0.75,24/32,0.75,30/40
0,edge_type,both_edges,True,True,120,60,29999,33486,6628,6014,32,40,1.0,32/32,1.0,40/40
1,edge_type,actor_only,True,False,120,60,29999,14541,241,227,32,40,1.0,32/32,1.0,40/40
2,edge_type,host_only,False,True,120,60,29999,22933,28,28,32,40,0.65625,21/32,0.525,21/40
3,edge_type,no_edges,False,False,120,60,29999,1,1,1,32,40,0.03125,1/32,0.025,1/40
12,host_gap_sweep,host_gap=300,True,True,120,300,29999,34707,8684,7213,32,40,1.0,32/32,1.0,40/40


In [19]:
ablation_A_path_baseline = RUN_DIR / "ablation_A_reachability_baseline.csv"
ablation_A_evidence_baseline.to_csv(ablation_A_path_baseline, index=False)
print("Saved:", ablation_A_path_baseline)

Saved: C:\Users\patri\OneDrive\Documentos\MASTER THESIS\FRAMEWORK\2025-10-16T07_27Z_ssh_alert_01\Data Extraction\runs\baseline\run_6377dd375892_ae702f1c8f84\ablation_A_reachability_baseline.csv


## EXPERIMENT C: Walk budget + hop limit

In [18]:
def _run_walk_eval(
    G_golden,
    golden_df,
    cfgv,
    alert_node_id=ALERT_NODE_ID,
    mode_name="baseline",
    use_or_duplicates=True,
    compute_hop_diagnostics=True,
):
    hop_of = None

    if compute_hop_diagnostics and ("rca_walk_score_with_hops" in globals()):
        subGv, selected_v, ranked_v, hop_of = rca_walk_score_with_hops(
            G_golden, cfgv, alert_node_id=alert_node_id
        )
    else:
        subGv, selected_v, ranked_v = rca_walk(G_golden, cfgv, alert_node_id=alert_node_id)

    m = evaluate_rca_episode(
        golden_df,
        golden_by_id,
        ranked_v,
        cfgv,
        use_or_duplicates=use_or_duplicates,
    )

    row = {
        "model": mode_name,
        "rca_max_nodes": int(cfgv.rca_max_nodes),
        "rca_max_hops": ("None" if cfgv.rca_max_hops is None else int(cfgv.rca_max_hops)),
        "selected_nodes": len([n for n in selected_v if n != alert_node_id]),
    }
    row.update(m)

    # Hop diagnostics
    if hop_of is not None:
        sel_wo_alert = [n for n in selected_v if n != alert_node_id]
        hops_vals = [hop_of.get(n) for n in sel_wo_alert if hop_of.get(n) is not None]
        if len(hops_vals) > 0:
            row["max_hop_selected"] = int(np.max(hops_vals))
            row["p90_hop_selected"] = float(np.quantile(hops_vals, 0.90))
            row["mean_hop_selected"] = float(np.mean(hops_vals))
        else:
            row["max_hop_selected"] = 0
            row["p90_hop_selected"] = 0.0
            row["mean_hop_selected"] = 0.0

    return row


def _sort_ablation(df: pd.DataFrame) -> pd.DataFrame:
    sort_cols = []
    for c in ["core_f1", "ext_f1", "core_P@10", "ext_P@10"]:
        if c in df.columns:
            sort_cols.append(c)
    for c in ["S_items", "returned_items_total_ranked", "S_nodes", "selected_nodes"]:
        if c in df.columns:
            sort_cols.append(c)

    ascending = []
    for c in sort_cols:
        if c in ("S_items", "returned_items_total_ranked", "S_nodes", "selected_nodes"):
            ascending.append(True)
        else:
            ascending.append(False)

    if sort_cols:
        return df.sort_values(sort_cols, ascending=ascending)
    return df


# Hop sensitivity 
def run_hop_sensitivity(
    G_golden,
    golden_df,
    cfg_base,
    alert_node_id=ALERT_NODE_ID,
    fixed_max_nodes=5000,                   
    max_hops_grid=(1, 2, 3, 5, 10, 25, 50, 100, 150, None),
    mode_name="baseline",
    compute_hop_diagnostics=True,
):
    rows = []
    for max_hops in max_hops_grid:
        cfgv = replace(
            cfg_base,
            rca_max_nodes=int(fixed_max_nodes),
            rca_max_hops=max_hops,
        )
        row = _run_walk_eval(
            G_golden=G_golden,
            golden_df=golden_df,
            cfgv=cfgv,
            alert_node_id=alert_node_id,
            mode_name=mode_name,
            compute_hop_diagnostics=compute_hop_diagnostics,
        )
        row["ablation"] = "hop_sensitivity"
        rows.append(row)

    out = pd.DataFrame(rows)

    def _hop_key(x):
        return 10**9 if x == "None" else int(x)

    if "rca_max_hops" in out.columns:
        out = out.sort_values("rca_max_hops", key=lambda s: s.map(_hop_key))

    return out



# Node budget sensitivity
def run_budget_sensitivity(
    G_golden,
    golden_df,
    cfg_base,
    alert_node_id=ALERT_NODE_ID,
    max_nodes_grid=(25, 50, 100, 150, 200),
    fixed_max_hops=None,                     
    mode_name="baseline",
    compute_hop_diagnostics=True,
):
    rows = []
    for max_nodes in max_nodes_grid:
        cfgv = replace(
            cfg_base,
            rca_max_nodes=int(max_nodes),
            rca_max_hops=fixed_max_hops,
        )
        row = _run_walk_eval(
            G_golden=G_golden,
            golden_df=golden_df,
            cfgv=cfgv,
            alert_node_id=alert_node_id,
            mode_name=mode_name,
            compute_hop_diagnostics=compute_hop_diagnostics,
        )
        row["ablation"] = "budget_sensitivity"
        rows.append(row)

    out = pd.DataFrame(rows)
    if "rca_max_nodes" in out.columns:
        out = out.sort_values("rca_max_nodes")

    return out


hop_sens_baseline = run_hop_sensitivity(
    G_golden=G_golden,
    golden_df=golden_df,
    cfg_base=cfg,
    alert_node_id=ALERT_NODE_ID,
    fixed_max_nodes=5000,
    max_hops_grid=(1, 2, 3, 5, 10, 25, 50, 100, 150, None),
    mode_name="baseline",
    compute_hop_diagnostics=True,
)

budget_sens_baseline = run_budget_sensitivity(
    G_golden=G_golden,
    golden_df=golden_df,
    cfg_base=cfg,
    alert_node_id=ALERT_NODE_ID,
    max_nodes_grid=(25, 50, 100, 150, 200),
    fixed_max_hops=None, 
    mode_name="baseline",
    compute_hop_diagnostics=True,
)

display(_sort_ablation(hop_sens_baseline))
display(_sort_ablation(budget_sens_baseline))

Unnamed: 0,model,rca_max_nodes,rca_max_hops,selected_nodes,mode,S_nodes,S_items,returned_items_total_ranked,gt_core_size,gt_ext_size,...,ext_tp@50,ext_P@50,ext_R@50,ext_k_used@50,core_Hit@50,core_hit_k_used@50,max_hop_selected,p90_hop_selected,mean_hop_selected,ablation
5,baseline,5000,25.0,60,or_duplicates,60,46,46,32,40,...,27,0.586957,0.675,46,1,46,25,24.0,15.366667,hop_sensitivity
4,baseline,5000,10.0,29,or_duplicates,29,22,22,32,40,...,15,0.681818,0.375,22,1,22,10,10.0,6.482759,hop_sensitivity
6,baseline,5000,50.0,139,or_duplicates,139,120,120,32,40,...,27,0.54,0.675,50,1,50,50,49.0,29.705036,hop_sensitivity
3,baseline,5000,5.0,10,or_duplicates,10,9,9,32,40,...,5,0.555556,0.125,9,1,9,5,5.0,3.4,hop_sensitivity
0,baseline,5000,1.0,1,or_duplicates,1,1,1,32,40,...,1,1.0,0.025,1,1,1,1,1.0,1.0,hop_sensitivity
1,baseline,5000,2.0,3,or_duplicates,3,3,3,32,40,...,1,0.333333,0.025,3,1,3,2,2.0,1.666667,hop_sensitivity
2,baseline,5000,3.0,5,or_duplicates,5,5,5,32,40,...,2,0.4,0.05,5,1,5,3,3.0,2.2,hop_sensitivity
9,baseline,5000,,4999,or_duplicates,4999,4385,4385,32,40,...,27,0.54,0.675,50,1,50,527,292.0,124.966393,hop_sensitivity
7,baseline,5000,100.0,4809,or_duplicates,4809,4683,4683,32,40,...,27,0.54,0.675,50,1,50,100,97.0,78.364525,hop_sensitivity
8,baseline,5000,150.0,4999,or_duplicates,4999,4846,4846,32,40,...,27,0.54,0.675,50,1,50,150,101.0,81.541708,hop_sensitivity


Unnamed: 0,model,rca_max_nodes,rca_max_hops,selected_nodes,mode,S_nodes,S_items,returned_items_total_ranked,gt_core_size,gt_ext_size,...,ext_tp@50,ext_P@50,ext_R@50,ext_k_used@50,core_Hit@50,core_hit_k_used@50,max_hop_selected,p90_hop_selected,mean_hop_selected,ablation
1,baseline,50,,49,or_duplicates,49,41,41,32,40,...,27,0.658537,0.675,41,1,41,35,30.2,16.918367,budget_sensitivity
2,baseline,100,,99,or_duplicates,99,82,82,32,40,...,27,0.54,0.675,50,1,50,51,42.0,24.959596,budget_sensitivity
0,baseline,25,,24,or_duplicates,24,17,17,32,40,...,15,0.882353,0.375,17,1,17,16,14.0,9.416667,budget_sensitivity
3,baseline,150,,149,or_duplicates,149,132,132,32,40,...,27,0.54,0.675,50,1,50,89,74.2,37.926174,budget_sensitivity
4,baseline,200,,199,or_duplicates,199,174,174,32,40,...,27,0.54,0.675,50,1,50,126,116.2,56.211055,budget_sensitivity


In [20]:
ablation_C1_path_baseline = RUN_DIR / "ablation_C1_reachability_baseline.csv"
hop_sens_baseline.to_csv(ablation_C1_path_baseline, index=False)
print("Saved:", ablation_C1_path_baseline)

Saved: C:\Users\patri\OneDrive\Documentos\MASTER THESIS\FRAMEWORK\2025-10-16T07_27Z_ssh_alert_01\Data Extraction\runs\baseline\run_6377dd375892_ae702f1c8f84\ablation_C1_reachability_baseline.csv


In [21]:
ablation_C2_path_baseline = RUN_DIR / "ablation_C2_reachability_baseline.csv"
budget_sens_baseline.to_csv(ablation_C2_path_baseline, index=False)
print("Saved:", ablation_C2_path_baseline)

Saved: C:\Users\patri\OneDrive\Documentos\MASTER THESIS\FRAMEWORK\2025-10-16T07_27Z_ssh_alert_01\Data Extraction\runs\baseline\run_6377dd375892_ae702f1c8f84\ablation_C2_reachability_baseline.csv
