# 🎯 Multi-Layer Log Anomaly Detection - Complete Pipeline

**Chỉ 5 cells:**
1. Setup + Load ALL data
2. N-gram + Graph methods
3. LogBERT training
4. Ensemble + Results
5. SimCLR + BGL (optional)

In [None]:
#=============================================================================
# CELL 1: SETUP + LOAD ALL DATA (V1 + V2 + V3)
#=============================================================================
import os, gc, json, random, re, math
from datetime import datetime
from collections import defaultdict, Counter
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import precision_recall_curve, roc_auc_score

!pip install networkx drain3 -q
import networkx as nx
from drain3 import TemplateMiner
from drain3.template_miner_config import TemplateMinerConfig

# === CONFIG ===
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

POSSIBLE_PATHS = [
    "/teamspace/studios/this_studio/content/LogHub_HDFS",
    "/teamspace/studios/this_studio/LogHub_HDFS",
    "/content/LogHub_HDFS",
    "./LogHub_HDFS",
]
BASE_PATH = next((p for p in POSSIBLE_PATHS if os.path.exists(p)), None)
if not BASE_PATH: raise FileNotFoundError("LogHub_HDFS not found!")
print(f"Data: {BASE_PATH}")

V1_PATH = os.path.join(BASE_PATH, "HDFS_v1/preprocessed/Event_traces.csv")
V2_PATH = os.path.join(BASE_PATH, "HDFS_v2/node_logs")
V3_NORMAL = os.path.join(BASE_PATH, "HDFS_v3_TraceBench/preprocessed/normal_trace.csv")
V3_FAILURE = os.path.join(BASE_PATH, "HDFS_v3_TraceBench/preprocessed/failure_trace.csv")
OUTPUT_DIR = "output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

CONTEXT_LEN, D_MODEL, N_HEADS, N_LAYERS = 128, 256, 8, 4
BATCH_SIZE, EPOCHS, LR = 64, 20, 1e-4
MASK_RATIO, LAMBDA_VHM, PREFAIL_WINDOW = 0.15, 0.5, 10
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

# === LOAD V1 (has labels + real sequences) ===
print("\n" + "="*60)
print("LOADING V1 DATA")
print("="*60)

v1_normal, v1_failure = [], []
if os.path.exists(V1_PATH):
    df = pd.read_csv(V1_PATH)
    print(f"V1: {len(df):,} rows, Labels: {df['Label'].unique()}")
    
    for _, row in tqdm(df.iterrows(), total=len(df), desc="V1"):
        features = str(row.get('Features', '[]')).strip('[]"')
        events = [int(e.strip().strip("'")[1:]) for e in features.split(',') 
                  if e.strip().startswith('E')]
        if events:
            if row['Label'] == 'Success':
                v1_normal.append(events)
            else:
                v1_failure.append(events)
    print(f"  Normal: {len(v1_normal):,}, Failure: {len(v1_failure):,}")
else:
    print("V1 not found")

# === LOAD V3 (one-hot preprocessed) ===
print("\n" + "="*60)
print("LOADING V3 DATA")
print("="*60)

def load_v3(filepath):
    """Load a one-hot encoded trace CSV as lists of active column indices.

    Every column except ``TaskID`` is treated as one template indicator. For
    each row, the returned sequence holds the positions (within the
    non-``TaskID`` columns) whose value equals 1, in ascending column order.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        A ``(seqs, n_cols)`` tuple: ``seqs`` is a list with one list of ints
        per CSV row, ``n_cols`` is the number of non-``TaskID`` columns.
    """
    header = pd.read_csv(filepath, nrows=0)
    cols = [c for c in header.columns if c != 'TaskID']
    seqs = []
    for chunk in tqdm(pd.read_csv(filepath, chunksize=5000), desc=os.path.basename(filepath)):
        # Build the chunk matrix once. Indexing `chunk[cols].values[i]` per row
        # re-materialised the whole (rows x cols) array for every single row,
        # which made each chunk take minutes.
        values = chunk[cols].values
        seqs.extend(np.where(row == 1)[0].tolist() for row in values)
        gc.collect()  # release the chunk before reading the next one
    return seqs, len(cols)

v3_normal, n_templates = load_v3(V3_NORMAL)
v3_failure, _ = load_v3(V3_FAILURE)
print(f"  Normal: {len(v3_normal):,}, Failure: {len(v3_failure):,}, Templates: {n_templates}")

# === PARSE V2 (raw logs with Drain) ===
print("\n" + "="*60)
print("PARSING V2 LOGS (may take 1-2 hours for 12GB)")
print("="*60)

v2_seqs = []
v2_templates = 0

if os.path.exists(V2_PATH):
    log_files = sorted([f for f in os.listdir(V2_PATH) if f.endswith('.log')])
    print(f"Found {len(log_files)} log files")
    
    config = TemplateMinerConfig()
    config.drain_depth, config.drain_sim_th = 4, 0.4
    config.profiling_enabled = False
    miner = TemplateMiner(config=config)
    
    block_events = defaultdict(list)
    block_re = re.compile(r'blk_-?\d+')
    ts_re = re.compile(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})')
    
    for log_file in tqdm(log_files, desc="Parsing"):
        try:
            with open(os.path.join(V2_PATH, log_file), 'r', errors='ignore') as f:
                for line in f:
                    line = line.strip()
                    if not line: continue
                    blk = block_re.search(line)
                    ts = ts_re.search(line)
                    result = miner.add_log_message(line)
                    if blk:
                        block_events[blk.group()].append((ts.group() if ts else None, result['cluster_id']))
        except Exception as e:
            print(f"Error {log_file}: {e}")
        gc.collect()
    
    # Build sequences with temporal order
    for blk, events in block_events.items():
        events.sort(key=lambda x: x[0] or '')
        seq = [tid for _, tid in events]
        if len(seq) >= 2:
            v2_seqs.append(seq)
    
    v2_templates = len(miner.drain.clusters)
    print(f"  Sequences: {len(v2_seqs):,}, Templates: {v2_templates}")
    
    # Save templates
    with open(os.path.join(OUTPUT_DIR, "v2_templates.json"), 'w') as f:
        json.dump([{'id': c.cluster_id, 'count': c.size, 'template': c.get_template()} 
                   for c in miner.drain.clusters], f, indent=2)
else:
    print("V2 not found - skipping")

# === COMBINE ALL DATA ===
# Merges the three datasets into one "normal" pool and one "failure" pool,
# picks the evaluation split, and derives the vocabulary size.
print("\n" + "="*60)
print("COMBINING DATA")
print("="*60)

# NOTE(review): the three sources use different id schemes (V1: the number
# after "E", V2: Drain cluster ids, V3: one-hot column positions) but are
# concatenated into one integer id space without any offset, so equal ids
# from different sources are treated as the same token — confirm intended.
all_normal = v2_seqs + v1_normal + v3_normal
all_failure = v1_failure + v3_failure

# Use V3 for evaluation (has clean labels)
# NOTE(review): v3_normal / v3_failure are also part of all_normal /
# all_failure above, so anything fitted on those pools has already seen the
# evaluation sequences.
eval_normal = v3_normal
eval_failure = v3_failure

# Pre-fail windows: the last PREFAIL_WINDOW events of every non-empty failure
# sequence (shorter sequences are kept whole).
prefail_seqs = [seq[-PREFAIL_WINDOW:] if len(seq) >= PREFAIL_WINDOW else seq 
                for seq in all_failure if seq]

# Vocab
# Token ids are later shifted by OFF (4 in the LogBERT cell) to make room for
# the special tokens, so max id + 4 must be a valid embedding index; "+ 5"
# yields exactly that size. max() raises ValueError if no sequence was loaded.
all_ids = set()
for seq in all_normal + all_failure:
    all_ids.update(seq)
VOCAB_SIZE = max(all_ids) + 5

print(f"Combined Normal: {len(all_normal):,}")
print(f"  V2: {len(v2_seqs):,}, V1: {len(v1_normal):,}, V3: {len(v3_normal):,}")
print(f"Combined Failure: {len(all_failure):,}")
print(f"Pre-fail windows: {len(prefail_seqs):,}")
print(f"Vocab size: {VOCAB_SIZE}")

Device: cuda
Data: /teamspace/studios/this_studio/content/LogHub_HDFS

LOADING V1 DATA
V1: 575,061 rows, Labels: ['Success' 'Fail']


V1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 575061/575061 [00:31<00:00, 17970.80it/s]


  Normal: 558,223, Failure: 16,838

LOADING V3 DATA


normal_trace.csv: 46it [1:59:08, 155.41s/it]
failure_trace.csv: 6it [15:24, 154.16s/it]


  Normal: 226,767, Failure: 29,817, Templates: 2155

PARSING V2 LOGS (may take 1-2 hours for 12GB)
Found 31 log files


Parsing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 31/31 [25:12<00:00, 48.79s/it] 


  Sequences: 2,460,745, Templates: 38810

COMBINING DATA
Combined Normal: 3,245,735
  V2: 2,460,745, V1: 558,223, V3: 226,767
Combined Failure: 46,655
Pre-fail windows: 46,654
Vocab size: 38735


In [None]:
#=============================================================================
# CELL 2: N-GRAM + SEMANTIC GRAPH + BAYES FACTOR (PAPER-READY)
#=============================================================================
print("="*60)
print("TRAINING N-GRAM + GRAPH + BAYES")
print("="*60)

# === ENHANCED SEMANTIC SLOT EXTRACTION (20 regex patterns) ===
import re

# (regex, tag) pairs applied IN ORDER by canonical(). Order matters: each
# substitution runs on the output of the previous one, so a specific pattern
# must come before any broader pattern that would consume part of its text.
# With the broader patterns first, "<IP>" ate the address of every "ip:port"
# (so <IPPORT> never matched), "<PORT>" ate ":mm" / ":ss" of every timestamp
# (so <TIME> never matched) and "pid:123" became "pid<PORT>".
SLOT_PATTERNS = [
    # HDFS specific. Block-pool ids look like BP-<n>-<ip>-<n> in HDFS logs
    # (NOTE(review): confirm against the actual V2 logs); the middle part
    # therefore accepts dots and the pattern must run before <IP>.
    (r"BP-\d+-[\d.]+-\d+", "<BPOOL>"),
    (r"blk_-?\d+", "<BLK>"),
    (r"DN[_-]?\d+", "<DNODE>"),
    # IDs (before <PORT>, which would otherwise swallow the ":<digits>" form)
    (r"pid[=:]?\d+", "<PID>"),
    (r"tid[=:]?\d+", "<TID>"),
    (r"txid[=:]?\d+", "<TXID>"),
    (r"attempt_\d+_\d+_\w+_\d+_\d+", "<ATTEMPT>"),
    (r"job_\d+_\d+", "<JOB>"),
    (r"container_\d+_\d+_\d+_\d+", "<CONTAINER>"),
    # Time (before <PORT>)
    (r"\d{4}-\d{2}-\d{2}", "<DATE>"),
    (r"\d{2}:\d{2}:\d{2}", "<TIME>"),
    (r"\d+ms", "<MS>"),
    # Network (ip:port before bare ip, bare port last)
    (r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d+", "<IPPORT>"),
    (r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", "<IP>"),
    (r":[0-9]{2,5}(?![0-9])", "<PORT>"),
    # Paths & Files
    (r"/[\w/.-]+", "<PATH>"),
    (r"[\w-]+\.(?:log|txt|xml|jar|class)", "<FILE>"),
    # Numbers
    (r"\b\d{10,}\b", "<BIGNUM>"),      # 10+ digit numbers
    (r"\b\d{5,9}\b", "<NUM>"),          # 5-9 digit numbers
    (r"\b0x[0-9a-fA-F]+\b", "<HEX>"),
]

# Compiled once so canonical() does not go through the re cache per call.
_SLOT_REGEXES = [(re.compile(pattern), tag) for pattern, tag in SLOT_PATTERNS]


def canonical(t):
    """Replace variable fields of a log template with slot tags.

    The value is converted with ``str()`` and every entry of SLOT_PATTERNS is
    substituted in list order.

    Args:
        t: Template text (any object; ``str(t)`` is used).

    Returns:
        The text with every match replaced by its ``<TAG>``.
    """
    t = str(t)
    for regex, tag in _SLOT_REGEXES:
        t = regex.sub(tag, t)
    return t

# === N-GRAM LM ===
class NgramLM:
    """Count-based n-gram language model over event-id sequences.

    Probabilities use absolute discounting (discount ``d``) interpolated with
    an add-one smoothed unigram distribution. Contexts never seen in training
    fall back to the unigram distribution alone.

    Attributes:
        n: n-gram order.
        d: absolute discount subtracted from every observed n-gram count.
        ng: context tuple -> Counter of following tokens.
        ctx: context tuple -> number of times the context was observed.
        uni: unigram counts of the (stringified) event ids.
        total: total number of event tokens seen in ``fit``.
        vocab: set of distinct (stringified) event ids.
    """

    def __init__(self, n=4, d=0.75):
        self.n, self.d = n, d
        self.ng, self.ctx, self.uni = defaultdict(Counter), Counter(), Counter()
        self.total, self.vocab = 0, set()

    def fit(self, seqs):
        """Accumulate n-gram, context and unigram counts; returns ``self``."""
        for seq in tqdm(seqs, desc=f"{self.n}-gram"):
            words = [str(t) for t in seq]
            # n-1 start markers give the first real token a full context.
            tokens = ['<S>']*(self.n-1) + words + ['</S>']
            for w in words:
                self.uni[w] += 1
                self.total += 1
                self.vocab.add(w)
            for i in range(len(tokens)-self.n+1):
                c = tuple(tokens[i:i+self.n-1])
                self.ng[c][tokens[i+self.n-1]] += 1
                self.ctx[c] += 1
        return self

    def score(self, seq):
        """Return the log-probability of ``seq`` (including the end marker).

        Scoring is read-only: contexts are looked up with ``.get`` so that
        unseen contexts are not inserted into the ``ng`` defaultdict (indexing
        it directly added one empty Counter per unseen context, growing the
        model while it was being evaluated).
        """
        tokens = ['<S>']*(self.n-1) + [str(t) for t in seq] + ['</S>']
        no_followers = {}
        lp = 0.0
        for i in range(len(tokens)-self.n+1):
            c = tuple(tokens[i:i+self.n-1])
            w = tokens[i+self.n-1]
            followers = self.ng.get(c, no_followers)
            cnt, cc = followers.get(w, 0), self.ctx.get(c, 0)
            if cc > 0:
                # discounted ML estimate + back-off mass * smoothed unigram
                p = max(cnt-self.d,0)/cc + (self.d*len(followers))/cc * (self.uni.get(w,0)+1)/(self.total+len(self.vocab))
            else:
                p = (self.uni.get(w,0)+1)/(self.total+len(self.vocab))
            lp += np.log(p+1e-10)  # epsilon guards log(0)
        return lp

    def anomaly_all(self, seqs):
        """Return an array of negative log-probabilities, one per sequence."""
        return np.array([-self.score(s) for s in tqdm(seqs, desc="Score")])

# === SEMANTIC GRAPH WITH ENHANCED SLOTS ===
class SemanticGraph:
    """First-order transition graph over canonicalised template ids.

    Every template id is mapped to a "semantic" node id via ``canonical``;
    ``fit`` counts node-to-node transitions and ``score_all`` returns the mean
    negative log transition probability of each sequence.

    Args:
        templates: Optional mapping ``template id -> template text``. When
            given, the text is what gets canonicalised, so ids whose texts
            collapse to the same canonical form share one node. Without it
            only the placeholder ``"T<id>"`` is available, so every id keeps
            its own node and ``slot_stats`` stays empty (the notebook output
            shows ``Top slots: []`` for that case).
    """

    def __init__(self, templates=None):
        self.G = nx.DiGraph()
        self.templates = templates
        self.tid_to_sem = {}
        self.sem_to_id = {}
        self.next_id = 0
        self.slot_stats = Counter()  # Track slot usage

    def _to_sem(self, tid):
        """Return the node id for ``tid``, creating and caching it if needed."""
        if tid not in self.tid_to_sem:
            text = f"T{tid}"
            if self.templates is not None:
                text = self.templates.get(tid, text)
            c = canonical(text)
            # Count slots used
            for _, tag in SLOT_PATTERNS:
                if tag in c:
                    self.slot_stats[tag] += 1
            if c not in self.sem_to_id:
                self.sem_to_id[c] = self.next_id
                self.next_id += 1
            self.tid_to_sem[tid] = self.sem_to_id[c]
        return self.tid_to_sem[tid]

    def fit(self, seqs):
        """Count transitions in ``seqs`` and derive edge probabilities.

        Edge attribute ``w`` is the raw transition count; ``p`` is the add-one
        smoothed probability ``(w + 1) / (out_weight + number_of_nodes)``.
        Returns ``self``.
        """
        for seq in tqdm(seqs, desc="Graph"):
            sseq = [self._to_sem(t) for t in seq]
            for a, b in zip(sseq[:-1], sseq[1:]):
                if self.G.has_edge(a, b): self.G[a][b]['w'] += 1
                else: self.G.add_edge(a, b, w=1)
        V = len(self.G.nodes())
        for u in self.G.nodes():
            out = sum(self.G[u][v]['w'] for v in self.G.successors(u))
            for v in self.G.successors(u):
                self.G[u][v]['p'] = (self.G[u][v]['w'] + 1) / (out + V)
        print(f"  Nodes: {self.G.number_of_nodes()}, Edges: {self.G.number_of_edges()}")
        print(f"  Canonical templates: {len(self.sem_to_id)}")
        print(f"  Top slots: {self.slot_stats.most_common(5)}")
        return self

    def score_all(self, seqs):
        """Return the mean per-transition negative log-probability per sequence.

        Sequences with fewer than two events score 0.0. Transitions that have
        no edge in the graph get the fixed probability ``1/(V**2+1) + 1e-10``.
        """
        V = len(self.G.nodes()) or 1
        # Same value the original conditional expression produced for a
        # missing edge; spelled out so the operator precedence is explicit.
        unseen_p = 1/(V**2+1) + 1e-10
        scores = []
        for seq in tqdm(seqs, desc="Graph score"):
            if len(seq) < 2: scores.append(0.0); continue
            sseq = [self._to_sem(t) for t in seq]
            s = sum(-math.log(self.G[a][b]['p'] if self.G.has_edge(a,b) else unseen_p)
                    for a, b in zip(sseq[:-1], sseq[1:]))
            scores.append(s / max(len(sseq)-1, 1))
        return np.array(scores)

# === FULL PRE-FAIL WINDOWS (variable length) ===
def extract_prefail_windows(failure_seqs, min_window=5, max_window=20, mid_window=10):
    """Extract trailing windows of several lengths from failure sequences.

    For every sequence at least ``min_window`` long, one window (the last
    ``w`` events) is emitted for each distinct size ``w`` the sequence is long
    enough for. Shorter non-empty sequences are emitted whole; empty ones are
    skipped.

    The intermediate size used to be a hard-coded 10 regardless of the
    arguments, which produced duplicate windows when it coincided with a bound
    and windows shorter than ``min_window`` when ``min_window > 10``. Sizes
    are now de-duplicated and the intermediate size is only used when it lies
    inside ``[min_window, max_window]``. With the defaults the result is
    unchanged (sizes 5, 10, 20).

    Args:
        failure_seqs: Iterable of event-id sequences.
        min_window: Smallest window size.
        max_window: Largest window size.
        mid_window: Intermediate window size.

    Returns:
        List of windows (slices of the input sequences).

    Raises:
        ValueError: If ``min_window`` is greater than ``max_window``.
    """
    if min_window > max_window:
        raise ValueError("min_window must not exceed max_window")
    sizes = {min_window, max_window}
    if min_window <= mid_window <= max_window:
        sizes.add(mid_window)
    sizes = sorted(sizes)

    windows = []
    for seq in failure_seqs:
        if len(seq) >= min_window:
            # Use multiple window sizes
            windows.extend(seq[-w:] for w in sizes if len(seq) >= w)
        elif len(seq) > 0:
            windows.append(seq)
    return windows

# Train
print("\n--- 4-gram Baseline ---")
lm_base = NgramLM(n=4).fit(all_normal)
normal_base = lm_base.anomaly_all(eval_normal)
failure_base = lm_base.anomaly_all(eval_failure)

print("\n--- Semantic Graph (20 regex patterns) ---")
graph = SemanticGraph().fit(all_normal)
normal_graph = graph.score_all(eval_normal)
failure_graph = graph.score_all(eval_failure)

print("\n--- Bayes Factor (full pre-fail windows) ---")
# Extract FULL pre-fail windows with variable lengths
full_prefail = extract_prefail_windows(all_failure, min_window=5, max_window=20)
print(f"  Pre-fail windows: {len(full_prefail):,} (variable length 5-20)")

lm_normal = NgramLM(n=3).fit(all_normal)
lm_prefail = NgramLM(n=3).fit(full_prefail)  # Trained on FULL pre-fail windows
normal_bf = np.array([lm_prefail.score(s) - lm_normal.score(s) for s in tqdm(eval_normal, desc="BF normal")])
failure_bf = np.array([lm_prefail.score(s) - lm_normal.score(s) for s in tqdm(eval_failure, desc="BF failure")])

# Evaluate
def evaluate(n_scores, f_scores, name):
    """Print and return AUC plus the best-F1 operating point of a scorer.

    Normal scores are labelled 0 and failure scores 1; higher scores are
    expected to indicate failures. The reported F1/P/R are taken at the point
    of the precision-recall curve with the highest F1.

    Args:
        n_scores: Scores of the normal sequences.
        f_scores: Scores of the failure sequences.
        name: Label printed in front of the metrics.

    Returns:
        Dict with float entries ``auc``, ``f1``, ``p`` and ``r``.
    """
    scores = np.concatenate([n_scores, f_scores])
    labels = np.concatenate([np.zeros(len(n_scores)), np.ones(len(f_scores))])

    auc = roc_auc_score(labels, scores)
    precision, recall, _ = precision_recall_curve(labels, scores)
    # epsilon keeps the ratio defined where precision + recall == 0
    f1_curve = 2*precision*recall/(precision+recall+1e-10)
    best = np.argmax(f1_curve)
    best_f1, best_p, best_r = f1_curve[best], precision[best], recall[best]

    print(f"{name}: AUC={auc:.4f}, F1={best_f1:.4f}, P={best_p:.4f}, R={best_r:.4f}")
    return {
        'auc': float(auc),
        'f1': float(best_f1),
        'p': float(best_p),
        'r': float(best_r),
    }

print("\n" + "="*60)
print("RESULTS")
print("="*60)
base_r = evaluate(normal_base, failure_base, "4-gram")
graph_r = evaluate(normal_graph, failure_graph, "Graph")
bf_r = evaluate(normal_bf, failure_bf, "Bayes")

TRAINING N-GRAM + GRAPH + BAYES

--- 4-gram Baseline ---


4-gram: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3245735/3245735 [01:22<00:00, 39296.61it/s] 
Score: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 226767/226767 [00:03<00:00, 61221.74it/s]
Score: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 29817/29817 [00:00<00:00, 68080.50it/s]



--- Semantic Graph (20 regex patterns) ---


Graph: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3245735/3245735 [00:39<00:00, 81536.31it/s] 


  Nodes: 38583, Edges: 68713
  Canonical templates: 38583
  Top slots: []


Graph score: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 226767/226767 [00:00<00:00, 241157.13it/s]
Graph score: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 29817/29817 [00:00<00:00, 299616.11it/s]



--- Bayes Factor (full pre-fail windows) ---
  Pre-fail windows: 72,564 (variable length 5-20)


3-gram: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3245735/3245735 [01:18<00:00, 41280.36it/s] 
3-gram: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 72564/72564 [00:00<00:00, 94932.49it/s] 
BF normal: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 226767/226767 [00:06<00:00, 34152.04it/s]
BF failure: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 29817/29817 [00:00<00:00, 34432.79it/s]



RESULTS
4-gram: AUC=0.9358, F1=0.6634, P=0.9728, R=0.5033
Graph: AUC=0.6069, F1=0.6265, P=0.7895, R=0.5193
Bayes: AUC=0.9832, F1=0.7985, P=0.7159, R=0.9027


In [None]:

import json
from tqdm import tqdm
# Load sequences from the saved file.
# NOTE: this rebinds `all_normal`, replacing whatever list an earlier cell
# built under that name with only the sequences stored in the JSONL file
# (one JSON object per line, sequence under the 'seq' key).
all_normal = []
with open('output/v2_sequences.jsonl', 'r') as f:
    for line in tqdm(f, desc="Loading"):
        all_normal.append(json.loads(line)['seq'])
print(f"Loaded {len(all_normal):,} sequences")

Loading: 2460745it [00:14, 174449.61it/s]

Loaded 2,460,745 sequences





In [None]:
#=============================================================================
# CELL 3: LOGBERT-VHM - RESUME FROM CHECKPOINT
#=============================================================================
print("="*60)
print("TRAINING LOGBERT-VHM")
print("="*60)

import os, time

PAD, CLS, MASK, SEP, OFF = 0, 1, 2, 3, 4
CKPT_DIR = "checkpoints"
os.makedirs(CKPT_DIR, exist_ok=True)
os.makedirs('output', exist_ok=True)

class LogDS(Dataset):
    """Masked-token dataset over event-id sequences.

    Each item is ``[CLS] + shifted ids + [SEP]`` padded with PAD to ``ml``
    tokens. Ids are shifted by OFF and capped at ``VOCAB_SIZE - 1``. Every
    event position is replaced by MASK with probability ``mr``.

    Args:
        seqs: Indexable collection of event-id sequences.
        ml: Total token length of every item (sequence is cut to ``ml - 2``).
        mr: Per-position masking probability.
    """

    def __init__(self, seqs, ml=CONTEXT_LEN, mr=MASK_RATIO):
        self.seqs = seqs
        self.ml = ml
        self.mr = mr

    def __len__(self):
        return len(self.seqs)

    def __getitem__(self, i):
        """Return ``ids`` (masked input), ``lab`` (targets) and ``mask``.

        ``lab`` is -100 everywhere except at masked positions, where it holds
        the original token; ``mask`` is 1 for non-PAD tokens and 0 for PAD.
        """
        top_id = VOCAB_SIZE - 1
        body = [min(t + OFF, top_id) for t in self.seqs[i][:self.ml - 2]]
        n_pad = self.ml - len(body) - 2
        tokens = [CLS] + body + [SEP] + [PAD] * n_pad

        inputs = list(tokens)
        labels = [-100] * len(tokens)
        # Only the event positions (between CLS and SEP) may be masked.
        for pos in range(1, len(body) + 1):
            if random.random() < self.mr:
                labels[pos] = tokens[pos]
                inputs[pos] = MASK

        attention = [int(t != PAD) for t in tokens]
        return {
            'ids': torch.tensor(inputs),
            'lab': torch.tensor(labels),
            'mask': torch.tensor(attention),
        }

class LogBERT(nn.Module):
    """Transformer encoder with a token-prediction head and a running centre.

    Args:
        vs: Vocabulary size (embedding rows and head outputs).
        dm: Model/embedding width.
        nh: Number of attention heads.
        nl: Number of encoder layers.
        ml: Maximum sequence length (rows of the position embedding).

    The buffer ``ctr`` holds a running mean of the first-token embedding and
    ``ci`` records whether it has been initialised yet.
    """

    def __init__(self, vs, dm=D_MODEL, nh=N_HEADS, nl=N_LAYERS, ml=CONTEXT_LEN):
        super().__init__()
        self.tok = nn.Embedding(vs, dm, padding_idx=PAD)
        self.pos = nn.Embedding(ml, dm)
        self.drop = nn.Dropout(0.1)
        layer = nn.TransformerEncoderLayer(
            d_model=dm,
            nhead=nh,
            dim_feedforward=dm * 4,
            dropout=0.1,
            activation='gelu',
            batch_first=True,
        )
        self.enc = nn.TransformerEncoder(layer, num_layers=nl)
        self.head = nn.Linear(dm, vs)
        self.register_buffer('ctr', torch.zeros(dm))
        self.ci = False

    def forward(self, ids, mask=None):
        """Return ``(logits, first_token_embedding)`` for a batch of ids.

        ``mask`` uses 1 for real tokens and 0 for padding; positions where it
        is 0 are passed to the encoder as key-padding positions.
        """
        positions = torch.arange(ids.size(1), device=ids.device)
        embedded = self.tok(ids) + self.pos(positions)
        pad_positions = None if mask is None else (mask == 0)
        hidden = self.enc(self.drop(embedded), src_key_padding_mask=pad_positions)
        return self.head(hidden), hidden[:, 0, :]

    def loss(self, lg, lb, cls):
        """Masked-token cross entropy plus the weighted centre-distance term.

        The centre term (mean squared distance of ``cls`` to ``ctr``) is only
        added once the centre has been initialised; it is scaled by LAMBDA_VHM.
        """
        mlm = F.cross_entropy(lg.view(-1, lg.size(-1)), lb.view(-1), ignore_index=-100)
        if self.ci:
            vhm = torch.mean((cls - self.ctr) ** 2)
        else:
            vhm = 0.0
        return mlm + LAMBDA_VHM * vhm

    def upd(self, e):
        """Update the centre with the batch mean of ``e`` (EMA, factor 0.9).

        The very first call sets the centre to the batch mean directly.
        """
        with torch.no_grad():
            batch_center = e.mean(0)
            if self.ci:
                self.ctr = 0.9 * self.ctr + 0.1 * batch_center
            else:
                self.ctr = batch_center
            self.ci = True

# === CHECK & LOAD CHECKPOINT ===
RESUME_EPOCH = 11
TOTAL_EPOCHS = 15
ckpt_file = f"{CKPT_DIR}/logbert_ep{RESUME_EPOCH}.pt"

model = LogBERT(VOCAB_SIZE).to(device)

if os.path.exists(ckpt_file):
    ckpt = torch.load(ckpt_file, map_location=device)
    model.load_state_dict(ckpt['model_state_dict'])
    model.ctr = ckpt['center']
    model.ci = True
    START_EPOCH = RESUME_EPOCH + 1
    best_loss = ckpt['loss']
    LR = 3e-5  # Lower LR for fine-tuning
    print(f"‚úÖ Loaded checkpoint epoch {RESUME_EPOCH} (loss={best_loss:.4f})")
else:
    START_EPOCH = 1
    best_loss = float('inf')
    LR = 1e-4
    print("No checkpoint, training from scratch")

# === TRAINING ===
scaler = torch.amp.GradScaler('cuda')
opt = torch.optim.AdamW(model.parameters(), lr=LR)
loader = DataLoader(LogDS(all_normal), batch_size=128, shuffle=True, num_workers=2, pin_memory=True)

print(f"Training: epochs {START_EPOCH}-{TOTAL_EPOCHS}")
print(f"Batches: {len(loader):,}, LR: {LR}")
start = time.time()

for ep in range(START_EPOCH, TOTAL_EPOCHS + 1):
    model.train()
    tl = 0
    pbar = tqdm(loader, desc=f"Ep {ep}/{TOTAL_EPOCHS}")
    
    for b in pbar:
        ids, lab, mask = b['ids'].to(device), b['lab'].to(device), b['mask'].to(device)
        opt.zero_grad()
        with torch.amp.autocast('cuda'):
            lg, cls = model(ids, mask)
            loss = model.loss(lg, lab, cls)
        scaler.scale(loss).backward()
        scaler.step(opt)
        scaler.update()
        model.upd(cls.detach())
        tl += loss.item()
        pbar.set_postfix({'loss': f'{loss.item():.4f}'})
    
    avg = tl / len(loader)
    print(f"Epoch {ep}: {avg:.4f}")
    
    if avg < best_loss - 1e-4:
        best_loss = avg
        torch.save(model.state_dict(), f"{CKPT_DIR}/logbert_best.pt")
        print("  ‚úì Best!")
    
    torch.save({"epoch": ep, "model_state_dict": model.state_dict(),
                "center": model.ctr, "loss": avg}, f"{CKPT_DIR}/logbert_ep{ep}.pt")

# === SCORE & SAVE ===
def bert_score(seqs):
    """Score sequences by squared distance of their CLS embedding to the centre.

    Runs the global ``model`` in eval mode, without masking (``mr=0``) and
    without gradient tracking, in batches of 128.

    Args:
        seqs: Event-id sequences to score.

    Returns:
        NumPy array with one score per sequence, in input order.
    """
    model.eval()
    batches = DataLoader(LogDS(seqs, mr=0), batch_size=128, shuffle=False)
    scores = []
    with torch.no_grad():
        for batch in tqdm(batches, desc="BERT score"):
            ids = batch['ids'].to(device)
            attn = batch['mask'].to(device)
            _, cls = model(ids, attn)
            dist = ((cls - model.ctr) ** 2).sum(dim=1)
            scores.extend(dist.cpu().numpy())
    return np.array(scores)

normal_bert = bert_score(eval_normal)
failure_bert = bert_score(eval_failure)
bert_r = evaluate(normal_bert, failure_bert, "LogBERT")

torch.save(model.state_dict(), 'output/logbert.pt')
print(f"\n‚úì Done in {(time.time()-start)/60:.1f} min")
print(f"‚úì Saved: output/logbert.pt")

TRAINING LOGBERT-VHM


‚úÖ Loaded checkpoint epoch 11 (loss=0.0757)
Training: epochs 12-15
Batches: 19,225, LR: 3e-05


Ep 12/15:  43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 8278/19225 [29:25<38:57,  4.68it/s, loss=0.0615]  

In [None]:
#=============================================================================
# CELL 4: LOAD SCORES + ENSEMBLE + FINAL RESULTS
#=============================================================================
import numpy as np
from sklearn.metrics import precision_recall_curve, roc_auc_score
import json
import os

OUTPUT_DIR = "output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("="*60)
print("LOADING SCORES & ENSEMBLE")
print("="*60)

# === LOAD SCORES FROM FILE ===
normal_graph = np.load(f"{OUTPUT_DIR}/normal_graph_scores.npy")
failure_graph = np.load(f"{OUTPUT_DIR}/failure_graph_scores.npy")
normal_bf = np.load(f"{OUTPUT_DIR}/normal_bf_scores.npy")
failure_bf = np.load(f"{OUTPUT_DIR}/failure_bf_scores.npy")
normal_bert = np.load(f"{OUTPUT_DIR}/normal_bert_scores.npy")
failure_bert = np.load(f"{OUTPUT_DIR}/failure_bert_scores.npy")
print(f"‚úÖ Loaded: normal={len(normal_graph):,}, failure={len(failure_graph):,}")

# === ENSEMBLE ===
def norm(s):
    """Min-max scale array ``s``; the 1e-10 term avoids division by zero."""
    lowest = s.min()
    spread = s.max() - lowest + 1e-10
    return (s - lowest) / spread

ag = norm(np.concatenate([normal_graph, failure_graph]))
ab = norm(np.concatenate([normal_bf, failure_bf]))
al = norm(np.concatenate([normal_bert, failure_bert]))
labs = np.concatenate([np.zeros(len(normal_graph)), np.ones(len(failure_graph))])

ws = [(0,0.5,0.5),(0,0.6,0.4),(0.1,0.5,0.4),(0.1,0.6,0.3),(0.2,0.4,0.4)]
bf1, bw, bauc = 0, None, 0

for w in ws:
    e = w[0]*ag + w[1]*ab + w[2]*al
    auc = roc_auc_score(labs, e)
    p, r, _ = precision_recall_curve(labs, e)
    f1 = np.max(2*p*r/(p+r+1e-10))
    print(f"  {w}: AUC={auc:.4f}, F1={f1:.4f}")
    if f1 > bf1: bf1, bw, bauc = f1, w, auc

# === EVALUATE METHODS ===
def evaluate(n_scores, f_scores, name=None):
    """Return AUC plus the best-F1 operating point of a scorer.

    Normal scores are labelled 0 and failure scores 1. F1/P/R are taken at the
    point of the precision-recall curve with the highest F1.

    This cell redefines ``evaluate``; the earlier definition takes a third
    ``name`` argument and prints the metrics. ``name`` is accepted here as
    well (optional) so calls written for either version keep working after
    this cell has run. Nothing is printed when ``name`` is omitted.

    Args:
        n_scores: Scores of the normal sequences.
        f_scores: Scores of the failure sequences.
        name: Optional label; when given, the metrics are printed.

    Returns:
        Dict with float entries ``auc``, ``f1``, ``p`` and ``r``.
    """
    all_s = np.concatenate([n_scores, f_scores])
    all_l = np.concatenate([np.zeros(len(n_scores)), np.ones(len(f_scores))])
    auc = roc_auc_score(all_l, all_s)
    p, r, _ = precision_recall_curve(all_l, all_s)
    f1s = 2*p*r/(p+r+1e-10)  # epsilon keeps the ratio defined at p + r == 0
    idx = np.argmax(f1s)
    if name is not None:
        print(f"{name}: AUC={auc:.4f}, F1={f1s[idx]:.4f}, P={p[idx]:.4f}, R={r[idx]:.4f}")
    return {'auc': float(auc), 'f1': float(f1s[idx]), 'p': float(p[idx]), 'r': float(r[idx])}

graph_r = evaluate(normal_graph, failure_graph)
bf_r = evaluate(normal_bf, failure_bf)
bert_r = evaluate(normal_bert, failure_bert)

# === FINAL RESULTS ===
print("\n" + "="*70)
print("üéØ FINAL RESULTS")
print("="*70)

print(f"\n{'Method':<20} {'AUC':>8} {'F1':>8} {'P':>8} {'R':>8}")
print("-"*55)
print(f"{'Semantic Graph':<20} {graph_r['auc']:>8.4f} {graph_r['f1']:>8.4f} {graph_r['p']:>8.4f} {graph_r['r']:>8.4f}")
print(f"{'Bayes Factor':<20} {bf_r['auc']:>8.4f} {bf_r['f1']:>8.4f} {bf_r['p']:>8.4f} {bf_r['r']:>8.4f}")
print(f"{'LogBERT-VHM':<20} {bert_r['auc']:>8.4f} {bert_r['f1']:>8.4f} {bert_r['p']:>8.4f} {bert_r['r']:>8.4f}")
print("-"*55)
print(f"{'ENSEMBLE':<20} {bauc:>8.4f} {bf1:>8.4f}")
print(f"  Weights: G={bw[0]}, B={bw[1]}, L={bw[2]}")
print("="*70)

# === SAVE ===
results = {
    'graph': graph_r, 'bayes': bf_r, 'bert': bert_r,
    'ensemble': {'weights': list(bw), 'auc': float(bauc), 'f1': float(bf1)}
}
with open(f"{OUTPUT_DIR}/results.json", 'w') as f:
    json.dump(results, f, indent=2)

print(f"\n‚úì Saved: {OUTPUT_DIR}/results.json")
print("üéâ COMPLETE!")

LOADING SCORES & ENSEMBLE
‚úÖ Loaded: normal=226,767, failure=29,817
  (0, 0.5, 0.5): AUC=0.9742, F1=0.7530
  (0, 0.6, 0.4): AUC=0.9761, F1=0.7576
  (0.1, 0.5, 0.4): AUC=0.9726, F1=0.7402
  (0.1, 0.6, 0.3): AUC=0.9746, F1=0.7488
  (0.2, 0.4, 0.4): AUC=0.9707, F1=0.7312

üéØ FINAL RESULTS

Method                    AUC       F1        P        R
-------------------------------------------------------
Semantic Graph         0.5782   0.5679   0.9639   0.4025
Bayes Factor           0.9832   0.7985   0.7160   0.9026
LogBERT-VHM            0.8738   0.5536   0.6046   0.5105
-------------------------------------------------------
ENSEMBLE               0.9761   0.7576
  Weights: G=0, B=0.6, L=0.4

‚úì Saved: output/results.json
üéâ COMPLETE!


In [None]:
#=============================================================================
# CELL 5: SIMCLR CONTRASTIVE FINE-TUNING
#=============================================================================
import os
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn.metrics import precision_recall_curve, roc_auc_score

# Device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Constants
PAD, CLS, MASK, SEP, OFF = 0, 1, 2, 3, 4
CONTEXT_LEN = 128
D_MODEL, N_HEADS, N_LAYERS = 256, 8, 4
BATCH_SIZE = 64
SIMCLR_EPOCHS = 10
TEMPERATURE = 0.1
OUTPUT_DIR = "output"
CKPT_DIR = "checkpoints"

# Load VOCAB_SIZE from checkpoint
ckpt = torch.load(f"{CKPT_DIR}/logbert_ep10.pt", map_location=device)
VOCAB_SIZE = ckpt['model_state_dict']['tok.weight'].shape[0]
print(f"VOCAB_SIZE: {VOCAB_SIZE}")

print("="*60)
print("SIMCLR CONTRASTIVE FINE-TUNING")
print("="*60)

# === LOGBERT MODEL ===
class LogBERT(nn.Module):
    """Transformer encoder with a token-prediction head (inference/fine-tune copy).

    Args:
        vs: Vocabulary size (embedding rows and head outputs).
        dm: Model/embedding width.
        nh: Number of attention heads.
        nl: Number of encoder layers.
        ml: Maximum sequence length (rows of the position embedding).

    The buffer ``ctr`` and the flag ``ci`` are filled in from the checkpoint
    by the loading code that follows the class.
    """

    def __init__(self, vs, dm=D_MODEL, nh=N_HEADS, nl=N_LAYERS, ml=CONTEXT_LEN):
        super().__init__()
        self.tok = nn.Embedding(vs, dm, padding_idx=PAD)
        self.pos = nn.Embedding(ml, dm)
        self.drop = nn.Dropout(0.1)
        layer = nn.TransformerEncoderLayer(
            d_model=dm,
            nhead=nh,
            dim_feedforward=dm * 4,
            dropout=0.1,
            activation='gelu',
            batch_first=True,
        )
        self.enc = nn.TransformerEncoder(layer, num_layers=nl)
        self.head = nn.Linear(dm, vs)
        self.register_buffer('ctr', torch.zeros(dm))
        self.ci = False

    def forward(self, ids, mask=None):
        """Return ``(logits, first_token_embedding)`` for a batch of ids.

        ``mask`` uses 1 for real tokens and 0 for padding; positions where it
        is 0 are passed to the encoder as key-padding positions.
        """
        positions = torch.arange(ids.size(1), device=ids.device)
        embedded = self.tok(ids) + self.pos(positions)
        pad_positions = None if mask is None else (mask == 0)
        hidden = self.enc(self.drop(embedded), src_key_padding_mask=pad_positions)
        return self.head(hidden), hidden[:, 0, :]

# Load model
model = LogBERT(VOCAB_SIZE).to(device)
model.load_state_dict(ckpt['model_state_dict'])
model.ctr = ckpt['center']
model.ci = True
print(f"‚úÖ Loaded model from epoch {ckpt['epoch']}")

# === LOAD DATA (sample) ===
# Load the first 50,000 saved sequences; if the file is missing or malformed,
# fall back to random dummy data so the cell can still be exercised.
import json
from itertools import islice
try:
    with open(f"{OUTPUT_DIR}/v2_sequences.jsonl", 'r') as f:
        # islice stops after 50,000 lines instead of parsing the whole file
        # and discarding everything past the first 50,000 entries.
        all_normal = [json.loads(line)['seq'] for line in islice(f, 50000)]
    print(f"Loaded {len(all_normal)} sequences")
except (OSError, ValueError, KeyError, TypeError) as e:
    # Only I/O and parse errors trigger the fallback (json.JSONDecodeError is
    # a ValueError). A bare `except:` also hid KeyboardInterrupt and bugs.
    print("‚ö†Ô∏è Cannot load sequences, using random data for demo")
    print(f"   ({type(e).__name__}: {e})")
    all_normal = [[random.randint(0, 100) for _ in range(50)] for _ in range(10000)]

# === AUGMENTATION ===
def augment(seq, aug_type, mask_id=None):
    """Return a randomly augmented copy of an event-id sequence.

    Args:
        seq: Sequence of raw (un-shifted) event ids; it is not modified.
        aug_type: One of
            'mask'    - replace each element with ``mask_id`` with prob. 0.15;
            'drop'    - drop each element with prob. 0.15 (never returns an
                        empty list for non-empty input);
            'shuffle' - shuffle elements inside consecutive groups of three;
            'crop'    - keep a random contiguous 70-90 % slice.
            Any other value returns an unchanged copy.
        mask_id: Value written at masked positions. Defaults to
            ``MASK - OFF``: the tokeniser adds OFF to every element, so this
            is the raw value that becomes the MASK token afterwards. Writing
            MASK itself produced ``MASK + OFF``, i.e. an ordinary event token.

    Returns:
        A new list. Empty input yields an empty list ('drop' and 'crop'
        previously raised on empty input).
    """
    seq = list(seq)
    if not seq:
        return seq
    if aug_type == 'mask':
        if mask_id is None:
            mask_id = MASK - OFF
        for i in range(len(seq)):
            if random.random() < 0.15: seq[i] = mask_id
    elif aug_type == 'drop':
        # keep the first element if everything happened to be dropped
        seq = [t for t in seq if random.random() > 0.15] or [seq[0]]
    elif aug_type == 'shuffle':
        for i in range(0, len(seq)-2, 3):
            chunk = seq[i:i+3]
            random.shuffle(chunk)
            seq[i:i+3] = chunk
    elif aug_type == 'crop':
        n = max(1, int(len(seq) * random.uniform(0.7, 0.9)))
        s = random.randint(0, len(seq) - n)
        seq = seq[s:s+n]
    return seq

class ContrastiveDS(Dataset):
    """Dataset yielding two independently augmented views of each sequence.

    Args:
        seqs: Indexable collection of event-id sequences.
        ml: Total token length of every view (sequence is cut to ``ml - 2``).
    """

    def __init__(self, seqs, ml=CONTEXT_LEN):
        self.seqs = seqs
        self.ml = ml
        self.augs = ['mask', 'drop', 'shuffle', 'crop']

    def __len__(self):
        return len(self.seqs)

    def _tok(self, seq):
        """Return ``(ids, attention_mask)`` tensors for one sequence.

        Ids are shifted by OFF, capped at ``VOCAB_SIZE - 1``, wrapped in
        CLS/SEP and padded with PAD; the mask is 1 for non-PAD tokens.
        """
        top_id = VOCAB_SIZE - 1
        body = [min(t + OFF, top_id) for t in seq[:self.ml - 2]]
        n_pad = self.ml - len(body) - 2
        tokens = [CLS] + body + [SEP] + [PAD] * n_pad
        attention = [int(t != PAD) for t in tokens]
        return torch.tensor(tokens), torch.tensor(attention)

    def __getitem__(self, i):
        """Return a dict with keys ``ids1``, ``m1``, ``ids2``, ``m2``."""
        seq = self.seqs[i]
        views = []
        # Each view draws its own augmentation type, one after the other.
        for _ in range(2):
            kind = random.choice(self.augs)
            views.append(self._tok(augment(seq, kind)))
        (ids1, m1), (ids2, m2) = views
        return {'ids1': ids1, 'm1': m1, 'ids2': ids2, 'm2': m2}

# === PROJECTION HEAD ===
class ProjHead(nn.Module):
    """Two-layer MLP projection head with L2-normalised output.

    Args:
        din: Input width.
        dh: Hidden width.
        dout: Output width.
    """

    def __init__(self, din=D_MODEL, dh=512, dout=128):
        super().__init__()
        layers = [nn.Linear(din, dh), nn.ReLU(), nn.Linear(dh, dout)]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Project ``x`` and normalise each row to unit length."""
        projected = self.net(x)
        return F.normalize(projected, dim=1)

# === NT-XENT LOSS ===
def nt_xent(z1, z2, temp=TEMPERATURE):
    """Cross-entropy contrastive loss over two batches of paired embeddings.

    Row ``i`` of ``z1`` and row ``i`` of ``z2`` form a positive pair; every
    other row of the concatenated batch is a negative. Similarities are dot
    products divided by ``temp``; self-similarities are excluded.

    Args:
        z1: First view embeddings, one row per sample.
        z2: Second view embeddings, same number of rows as ``z1``.
        temp: Temperature dividing the similarities.

    Returns:
        Scalar loss tensor.
    """
    batch = z1.size(0)
    both = torch.cat([z1, z2], dim=0)
    logits = torch.mm(both, both.t()) / temp

    # A sample must never be its own candidate.
    diagonal = torch.eye(2 * batch, device=both.device).bool()
    logits = logits.masked_fill(diagonal, float('-inf'))

    # Row i pairs with row i + batch, and row i + batch pairs with row i.
    first_half = torch.arange(batch)
    targets = torch.cat([first_half + batch, first_half]).to(both.device)
    return F.cross_entropy(logits, targets)

# === TRAIN SIMCLR ===
proj = ProjHead().to(device)
simclr_loader = DataLoader(ContrastiveDS(all_normal[:50000]), batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
simclr_opt = torch.optim.AdamW(list(model.parameters()) + list(proj.parameters()), lr=1e-5)

print(f"SimCLR training: {min(50000, len(all_normal)):,} sequences, {SIMCLR_EPOCHS} epochs")

for ep in range(SIMCLR_EPOCHS):
    model.train(); proj.train()
    tl = 0
    pbar = tqdm(simclr_loader, desc=f"SimCLR {ep+1}/{SIMCLR_EPOCHS}")
    for b in pbar:
        ids1, m1 = b['ids1'].to(device), b['m1'].to(device)
        ids2, m2 = b['ids2'].to(device), b['m2'].to(device)
        simclr_opt.zero_grad()
        _, e1 = model(ids1, m1)
        _, e2 = model(ids2, m2)
        loss = nt_xent(proj(e1), proj(e2))
        loss.backward()
        simclr_opt.step()
        tl += loss.item()
        pbar.set_postfix({'loss': f'{loss.item():.4f}'})
    print(f"SimCLR Epoch {ep+1}: {tl/len(simclr_loader):.4f}")

torch.save(model.state_dict(), f"{OUTPUT_DIR}/logbert_simclr.pt")
print(f"\n‚úì Saved {OUTPUT_DIR}/logbert_simclr.pt")

VOCAB_SIZE: 38735
SIMCLR CONTRASTIVE FINE-TUNING
‚úÖ Loaded model from epoch 10
Loaded 50000 sequences
SimCLR training: 50,000 sequences, 10 epochs


SimCLR 1/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 781/781 [02:58<00:00,  4.38it/s, loss=3.6992]


SimCLR Epoch 1: 4.1375


SimCLR 2/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 781/781 [02:57<00:00,  4.39it/s, loss=2.9855]


SimCLR Epoch 2: 3.2172


SimCLR 3/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 781/781 [02:57<00:00,  4.39it/s, loss=2.8048]


SimCLR Epoch 3: 2.8123


SimCLR 4/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 781/781 [02:57<00:00,  4.39it/s, loss=2.3806]


SimCLR Epoch 4: 2.6121


SimCLR 5/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 781/781 [02:57<00:00,  4.39it/s, loss=2.2737]


SimCLR Epoch 5: 2.4510


SimCLR 6/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 781/781 [02:57<00:00,  4.39it/s, loss=2.5419]


SimCLR Epoch 6: 2.3613


SimCLR 7/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 781/781 [02:57<00:00,  4.39it/s, loss=2.3125]


SimCLR Epoch 7: 2.3147


SimCLR 8/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 781/781 [02:57<00:00,  4.39it/s, loss=2.1542]


SimCLR Epoch 8: 2.2679


SimCLR 9/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 781/781 [02:57<00:00,  4.39it/s, loss=2.3058]


SimCLR Epoch 9: 2.2445


SimCLR 10/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 781/781 [02:57<00:00,  4.39it/s, loss=2.0904]

SimCLR Epoch 10: 2.2292

‚úì Saved output/logbert_simclr.pt





In [None]:
#=============================================================================
# CELL 6: BGL CROSS-DOMAIN ADAPTATION (PROPERLY FIXED)
#=============================================================================
import os
import random
import json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn.metrics import precision_recall_curve, roc_auc_score

# Prefer the GPU when one is visible; everything below also runs on CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Special token ids. OFF is added to every raw template id (see LogDS) so that
# real events never collide with PAD/CLS/MASK/SEP.
PAD, CLS, MASK, SEP, OFF = 0, 1, 2, 3, 4
# Architecture hyper-parameters. LogBERT is rebuilt with these defaults and
# then filled via load_state_dict(), so they must match the tensor shapes
# stored in the checkpoint.
CONTEXT_LEN, D_MODEL, N_HEADS, N_LAYERS = 128, 256, 8, 4
BATCH_SIZE = 64
MASK_RATIO = 0.15  # per-token masking probability used by LogDS
LAMBDA_VHM = 0.5   # weight of the center-distance term in LogBERT.loss
BGL_EPOCHS = 20    # upper bound on fine-tuning epochs
EARLY_STOP = 3     # epochs without improvement tolerated before stopping
OUTPUT_DIR = "output"
CKPT_DIR = "checkpoints"  # not referenced in the visible part of this cell

print("\n" + "="*60)
print("BGL CROSS-DOMAIN ADAPTATION")
print("="*60)

# === LOAD MODEL & GET VOCAB_SIZE ===
# The file holds a bare state_dict (it is indexed by parameter name right
# below), so restrict unpickling to tensors and plain containers instead of
# letting torch.load execute arbitrary pickled objects.
ckpt = torch.load(
    f"{OUTPUT_DIR}/logbert_simclr.pt",
    map_location=device,
    weights_only=True,
)
# Rows of the token-embedding matrix == vocabulary size the model was trained with.
VOCAB_SIZE = ckpt['tok.weight'].shape[0]
print(f"VOCAB_SIZE from checkpoint: {VOCAB_SIZE}")

# === BGL DATA ===
BGL_FILE = "/teamspace/studios/this_studio/content/BGL.log"
# Fail fast with a readable message rather than a bare open() error later on.
if not os.path.exists(BGL_FILE):
    raise FileNotFoundError(f"‚ùå BGL data not found: {BGL_FILE}")

print(f"‚úÖ Found BGL: {BGL_FILE}")

# Parse BGL with a simplified, Drain-style templating scheme.
from collections import defaultdict
import re

# template string -> raw template id (before the OFF shift applied in LogDS)
template_map = {}
# Number of distinct templates seen so far; also the next id to hand out.
next_tid = 0

# Compiled once: get_template_id() runs for every parsed log line, so the
# patterns (and the `re` import) are hoisted out of the function body.
_NUM_RE = re.compile(r'\b\d+\b')
_HEX_RE = re.compile(r'\b[0-9a-fA-F]{8,}\b')
_PATH_RE = re.compile(r'/\S+')


def get_template_id(line):
    """Map a log line to the integer id of its template.

    The template is the line with stand-alone decimal numbers replaced by
    ``<NUM>``, runs of 8+ hex characters by ``<HEX>`` and anything starting
    with ``/`` up to the next whitespace by ``<PATH>`` (applied in that
    order).  Ids are handed out in first-seen order.

    Ids are clamped to ``VOCAB_SIZE - OFF - 1`` so that ``id + OFF`` stays a
    valid embedding index; every template discovered after the vocabulary is
    exhausted therefore shares that last id.

    Args:
        line: Text to templatize.

    Returns:
        int: The (possibly clamped) template id.

    Side effects:
        Updates the module-level ``template_map`` and ``next_tid``.
    """
    global next_tid
    template = _NUM_RE.sub('<NUM>', line)
    template = _HEX_RE.sub('<HEX>', template)
    template = _PATH_RE.sub('<PATH>', template)

    tid = template_map.get(template)
    if tid is None:
        tid = min(next_tid, VOCAB_SIZE - OFF - 1)  # clamp to vocab
        template_map[template] = tid
        # Keeps counting past the clamp so it still equals the number of
        # distinct templates seen.
        next_tid += 1
    return tid

# Split the BGL log into fixed-size sessions of template ids.
#   bgl_normal  - sessions in which no line carries an alert label
#   bgl_failure - sessions in which at least one line carries an alert label
SESSION_LEN = 20          # events per session
MAX_LINE_INDEX = 200000   # lines with a larger 0-based index are not read

bgl_normal, bgl_failure = [], []
current_session = []
# Sticky flag for the session being built.  Labelling a session by the label
# of its final line only (the previous behaviour) let sessions with alerts in
# their first 19 lines leak into bgl_normal, i.e. into the training data.
session_has_alert = False

with open(BGL_FILE, 'r', errors='ignore') as f:
    for i, line in enumerate(tqdm(f, desc="Parsing BGL")):
        if i > MAX_LINE_INDEX:
            break
        line = line.strip()
        if not line:
            continue

        # BGL format: Label ... message.  The first whitespace-delimited
        # field is the label; '-' marks a non-alert line.
        parts = line.split(None, 1)
        if len(parts) < 2:
            continue
        label, message = parts

        if label != '-':
            session_has_alert = True
        current_session.append(get_template_id(message))

        if len(current_session) >= SESSION_LEN:
            bucket = bgl_failure if session_has_alert else bgl_normal
            bucket.append(current_session)
            # Start a fresh list (no copy needed) and clear the flag.
            current_session = []
            session_has_alert = False

# A trailing session shorter than SESSION_LEN is discarded, as before.

print(f"Templates extracted: {len(template_map)}")
print(f"BGL Normal: {len(bgl_normal):,}, Failure: {len(bgl_failure):,}")

# Training below needs at least one normal session.
if len(bgl_normal) == 0:
    raise ValueError("‚ùå No BGL normal sequences parsed!")

# === DATASET & MODEL (with the correct VOCAB_SIZE) ===
class LogDS(Dataset):
    """Dataset turning template-id sequences into fixed-length MLM samples.

    Args:
        seqs: Indexable collection of template-id sequences.
        vs: Vocabulary size; shifted ids are clamped to ``vs - 1``.
        ml: Total sample length, including the CLS and SEP tokens.
        mr: Probability of masking each event token.
    """

    def __init__(self, seqs, vs, ml=CONTEXT_LEN, mr=MASK_RATIO):
        self.seqs = seqs
        self.vs = vs
        self.ml = ml
        self.mr = mr

    def __len__(self):
        return len(self.seqs)

    def __getitem__(self, i):
        """Return ``{'ids', 'lab', 'mask'}`` tensors for sequence ``i``.

        ``ids`` is ``CLS + events + SEP`` padded with PAD up to ``ml``, with
        randomly chosen events replaced by MASK.  ``lab`` holds the original
        id at masked positions and -100 elsewhere.  ``mask`` is 1 for every
        non-PAD position of the unmasked token list.
        """
        top_id = self.vs - 1
        # Shift raw ids past the special tokens; leave room for CLS and SEP.
        events = [min(raw + OFF, top_id) for raw in self.seqs[i][:self.ml - 2]]
        n_events = len(events)

        tokens = [CLS, *events, SEP]
        tokens.extend([PAD] * (self.ml - n_events - 2))

        inputs = list(tokens)
        labels = [-100] * len(tokens)
        # Only event positions (1..n_events) are candidates for masking.
        for pos in range(1, n_events + 1):
            if random.random() < self.mr:
                labels[pos] = inputs[pos]
                inputs[pos] = MASK

        attention = [int(t != PAD) for t in tokens]
        return {
            'ids': torch.tensor(inputs),
            'lab': torch.tensor(labels),
            'mask': torch.tensor(attention),
        }

class LogBERT(nn.Module):
    """Transformer encoder with a masked-token head and a running center.

    Args:
        vs: Vocabulary size (embedding rows and output logits).
        dm: Model / embedding width.
        nh: Number of attention heads.
        nl: Number of encoder layers.
        ml: Maximum sequence length (rows of the position embedding).

    Attributes:
        ctr: Buffer of size ``dm`` holding the running center of the
            position-0 hidden states passed to ``upd``.
        ci: Plain Python flag, True once ``ctr`` has been set by ``upd``.
            It is not a parameter or buffer.
    """

    def __init__(self, vs, dm=D_MODEL, nh=N_HEADS, nl=N_LAYERS, ml=CONTEXT_LEN):
        super().__init__()
        self.tok = nn.Embedding(vs, dm, padding_idx=PAD)
        self.pos = nn.Embedding(ml, dm)
        self.drop = nn.Dropout(0.1)
        layer = nn.TransformerEncoderLayer(
            d_model=dm,
            nhead=nh,
            dim_feedforward=dm * 4,
            dropout=0.1,
            activation='gelu',
            batch_first=True,
        )
        self.enc = nn.TransformerEncoder(layer, nl)
        self.head = nn.Linear(dm, vs)
        self.register_buffer('ctr', torch.zeros(dm))
        self.ci = False

    def forward(self, ids, mask=None):
        """Encode ``ids`` and return ``(logits, position-0 hidden state)``.

        ``mask`` uses 1 for real tokens and 0 for padding; it is inverted
        into the encoder's key-padding mask.  With ``mask=None`` no padding
        mask is applied.
        """
        positions = torch.arange(ids.size(1), device=ids.device)
        embedded = self.tok(ids) + self.pos(positions)
        pad_mask = None if mask is None else (mask == 0)
        hidden = self.enc(self.drop(embedded), src_key_padding_mask=pad_mask)
        return self.head(hidden), hidden[:, 0, :]

    def loss(self, lg, lb, cls):
        """Masked-token cross-entropy plus ``LAMBDA_VHM`` times the center term.

        The center term is the mean squared difference between ``cls`` and
        ``ctr``; it is 0.0 until ``upd`` has been called at least once.
        """
        flat_logits = lg.view(-1, lg.size(-1))
        mlm = F.cross_entropy(flat_logits, lb.view(-1), ignore_index=-100)
        if self.ci:
            vhm = torch.mean((cls - self.ctr) ** 2)
        else:
            vhm = 0.0
        return mlm + LAMBDA_VHM * vhm

    def upd(self, e):
        """Update ``ctr`` from a batch of embeddings ``e``.

        The first call sets the center to the batch mean; later calls blend
        it in as ``0.9 * ctr + 0.1 * batch_mean``.
        """
        with torch.no_grad():
            batch_center = e.mean(0)
            if self.ci:
                self.ctr = 0.9 * self.ctr + 0.1 * batch_center
            else:
                self.ctr = batch_center
            self.ci = True

# Rebuild the network with the checkpoint's vocabulary size and restore the
# SimCLR-fine-tuned weights into it.
model = LogBERT(VOCAB_SIZE)
model = model.to(device)
model.load_state_dict(ckpt)
print(f"‚úÖ Loaded SimCLR model")

# The center restored from the checkpoint belongs to the previous domain:
# zero it and clear the "initialised" flag so the first BGL batch re-seeds it.
model.ctr = torch.zeros(D_MODEL, device=device)
model.ci = False

# === TRAIN with AMP guard ===
# A gradient scaler only exists on CUDA; the CPU path trains in full precision.
if device.type == 'cuda':
    scaler = torch.amp.GradScaler('cuda')
else:
    scaler = None

# Only normal sessions are used for fine-tuning.
bgl_train_ds = LogDS(bgl_normal, vs=VOCAB_SIZE)
bgl_loader = DataLoader(bgl_train_ds, shuffle=True, batch_size=BATCH_SIZE)
bgl_opt = torch.optim.AdamW(model.parameters(), lr=1e-5)

print(f"\nBGL training: {len(bgl_normal):,} sequences, {BGL_EPOCHS} epochs")

# Early-stopping state: best epoch-average loss so far and the number of
# consecutive epochs that failed to improve on it.
best_loss, no_improve = float('inf'), 0

# Fine-tune on BGL normal sessions, with early stopping on the epoch-average
# training loss.
for ep in range(BGL_EPOCHS):
    model.train()
    tl = 0
    pbar = tqdm(bgl_loader, desc=f"BGL {ep+1}/{BGL_EPOCHS}")
    for b in pbar:
        ids = b['ids'].to(device)
        lab = b['lab'].to(device)
        mask = b['mask'].to(device)
        bgl_opt.zero_grad()

        if device.type != 'cuda':
            # CPU: plain full-precision step.
            lg, cls = model(ids, mask)
            loss = model.loss(lg, lab, cls)
            loss.backward()
            bgl_opt.step()
        else:
            # CUDA: forward under autocast, backward/step through the scaler.
            with torch.amp.autocast('cuda'):
                lg, cls = model(ids, mask)
                loss = model.loss(lg, lab, cls)
            scaler.scale(loss).backward()
            scaler.step(bgl_opt)
            scaler.update()

        # Move the running center toward this batch's position-0 embeddings.
        model.upd(cls.detach())
        step_loss = loss.item()
        tl += step_loss
        pbar.set_postfix({'loss': f'{step_loss:.4f}'})

    avg_loss = tl / len(bgl_loader)
    print(f"BGL Epoch {ep+1}: {avg_loss:.4f}")

    # Early stopping: an epoch counts as an improvement only if it beats the
    # best average loss by more than 1e-4; the best weights are checkpointed.
    if avg_loss < best_loss - 1e-4:
        best_loss = avg_loss
        no_improve = 0
        torch.save(model.state_dict(), f"{OUTPUT_DIR}/logbert_bgl_best.pt")
        continue

    no_improve += 1
    if no_improve >= EARLY_STOP:
        print(f"  Early stop at epoch {ep+1}")
        break

# === EVALUATE with MLKP + VHM ===
@torch.no_grad()
def bert_score_combined(seqs):
    """Return one anomaly score per sequence: ``MLKP + 0.1 * VHM``.

    MLKP is the mean cross-entropy of the model's logits against the input
    ids themselves (nothing is masked, ``mr=0``), averaged over the non-PAD
    positions of each sequence.  VHM is the squared Euclidean distance of the
    position-0 embedding from ``model.ctr``.

    The per-sequence mean divides by the number of scored tokens.  Averaging
    over the full padded length instead would count the zero loss that
    ``reduction='none'`` reports at ignored positions and shrink the score of
    short sequences.

    Args:
        seqs: Sequences of raw template ids, as accepted by ``LogDS``.

    Returns:
        np.ndarray: One score per input sequence, in input order.

    Side effects:
        Puts the module-level ``model`` into eval mode.
    """
    model.eval()
    ld = DataLoader(LogDS(seqs, vs=VOCAB_SIZE, mr=0), batch_size=128, shuffle=False)
    scores = []
    for b in tqdm(ld, desc="Score"):
        ids = b['ids'].to(device)
        mask = b['mask'].to(device)
        lg, cls = model(ids, mask)

        # MLKP: reconstruction loss; PAD positions are excluded via -100.
        target = ids.masked_fill(ids == PAD, -100)
        per_token = F.cross_entropy(
            lg.view(-1, lg.size(-1)),
            target.view(-1),
            ignore_index=-100,
            reduction='none',
        ).view(ids.size(0), -1)
        # clamp guards against division by zero for an all-PAD row.
        n_scored = (target != -100).sum(1).clamp(min=1)
        mlkp = per_token.sum(1) / n_scored

        # VHM: distance from center
        vhm = torch.sum((cls - model.ctr) ** 2, dim=1)

        # Combined score
        score = mlkp + 0.1 * vhm
        scores.extend(score.cpu().numpy())
    return np.array(scores)

def evaluate(n_scores, f_scores):
    """Compute ROC-AUC and the best-F1 operating point for two score sets.

    Args:
        n_scores: Scores of normal samples (label 0).
        f_scores: Scores of failure samples (label 1).

    Returns:
        dict: ``auc``, plus ``f1`` / ``p`` / ``r`` / ``th`` taken at the
        precision-recall point with the highest F1.  ``th`` falls back to 0
        when that point has no associated threshold.
    """
    labels = np.concatenate([np.zeros(len(n_scores)), np.ones(len(f_scores))])
    scores = np.concatenate([n_scores, f_scores])

    auc = roc_auc_score(labels, scores)
    p, r, th = precision_recall_curve(labels, scores)

    # The epsilon keeps the division defined where precision + recall == 0.
    f1s = 2 * p * r / (p + r + 1e-10)
    idx = np.argmax(f1s)

    # p and r carry one more entry than th, so the final point has no threshold.
    if idx < len(th):
        best_th = float(th[idx])
    else:
        best_th = 0

    return {
        'auc': float(auc),
        'f1': float(f1s[idx]),
        'p': float(p[idx]),
        'r': float(r[idx]),
        'th': best_th,
    }

# Score at most 5000 normal and 500 failure sessions (slicing already returns
# the whole list when it is shorter than the cap).
# NOTE(review): the normal sessions scored here are a prefix of the list the
# model was just fine-tuned on - confirm that a held-out split is not needed.
bgl_normal_sc = bert_score_combined(bgl_normal[:5000])
bgl_failure_sc = bert_score_combined(bgl_failure[:500])
bgl_r = evaluate(bgl_normal_sc, bgl_failure_sc)

# Weights as they are after the last epoch that ran (the best-loss epoch is
# stored separately by the training loop).
torch.save(model.state_dict(), f"{OUTPUT_DIR}/logbert_bgl.pt")

# Save results
results_payload = {'bgl': bgl_r, 'templates': len(template_map)}
with open(f"{OUTPUT_DIR}/bgl_results.json", 'w') as f:
    json.dump(results_payload, f, indent=2)

banner = "="*70
print("\n" + banner)
print("üéØ BGL RESULTS")
print(banner)
print(f"BGL: AUC={bgl_r['auc']:.4f}, F1={bgl_r['f1']:.4f}, P={bgl_r['p']:.4f}, R={bgl_r['r']:.4f}")
print(f"Threshold: {bgl_r['th']:.4f}")
print(f"\n‚úì Saved: {OUTPUT_DIR}/logbert_bgl.pt, bgl_results.json")


BGL CROSS-DOMAIN ADAPTATION
VOCAB_SIZE from checkpoint: 38735
‚úÖ Found BGL: /teamspace/studios/this_studio/content/BGL.log


Parsing BGL: 17238it [00:00, 72427.46it/s]

Parsing BGL: 200001it [00:03, 59409.30it/s]


Templates extracted: 36334
BGL Normal: 9,864, Failure: 136
‚úÖ Loaded SimCLR model

BGL training: 9,864 sequences, 20 epochs


BGL 1/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 155/155 [00:18<00:00,  8.39it/s, loss=8.1154] 


BGL Epoch 1: 10.1023


BGL 2/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 155/155 [00:18<00:00,  8.34it/s, loss=9.3117] 


BGL Epoch 2: 9.3735


BGL 3/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 155/155 [00:18<00:00,  8.35it/s, loss=10.5298]


BGL Epoch 3: 9.2904


BGL 4/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 155/155 [00:18<00:00,  8.33it/s, loss=10.2103]


BGL Epoch 4: 9.1740


BGL 5/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 155/155 [00:18<00:00,  8.32it/s, loss=10.3338]


BGL Epoch 5: 9.1415


BGL 6/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 155/155 [00:18<00:00,  8.30it/s, loss=10.1811]


BGL Epoch 6: 9.0792


BGL 7/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 155/155 [00:18<00:00,  8.32it/s, loss=9.8472] 


BGL Epoch 7: 9.0394


BGL 8/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 155/155 [00:18<00:00,  8.34it/s, loss=7.8758]


BGL Epoch 8: 8.9884


BGL 9/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 155/155 [00:18<00:00,  8.34it/s, loss=9.1507]


BGL Epoch 9: 9.0035


BGL 10/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 155/155 [00:18<00:00,  8.38it/s, loss=9.8522]


BGL Epoch 10: 9.0054


BGL 11/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 155/155 [00:18<00:00,  8.37it/s, loss=7.4767]


BGL Epoch 11: 8.9660


BGL 12/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 155/155 [00:18<00:00,  8.31it/s, loss=10.1053]


BGL Epoch 12: 8.9989


BGL 13/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 155/155 [00:18<00:00,  8.37it/s, loss=9.7924] 


BGL Epoch 13: 8.9990


BGL 14/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 155/155 [00:18<00:00,  8.36it/s, loss=5.9076] 


BGL Epoch 14: 8.9354


BGL 15/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 155/155 [00:18<00:00,  8.30it/s, loss=10.2276]


BGL Epoch 15: 8.9526


BGL 16/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 155/155 [00:18<00:00,  8.36it/s, loss=9.0411]


BGL Epoch 16: 8.9205


BGL 17/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 155/155 [00:18<00:00,  8.33it/s, loss=6.7652]


BGL Epoch 17: 8.9267


BGL 18/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 155/155 [00:18<00:00,  8.38it/s, loss=8.4758]


BGL Epoch 18: 8.9332


BGL 19/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 155/155 [00:18<00:00,  8.36it/s, loss=6.0725]


BGL Epoch 19: 8.9228
  Early stop at epoch 19


Score: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 40/40 [00:04<00:00,  8.26it/s]
Score: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:00<00:00, 14.98it/s]



üéØ BGL RESULTS
BGL: AUC=0.9513, F1=0.8907, P=0.9910, R=0.8088
Threshold: 2.1885

‚úì Saved: output/logbert_bgl.pt, bgl_results.json
