# 🎯 Phase 2: SimCLR Contrastive Fine-tuning + BGL Cross-Domain

**Prerequisites:** Run `complete_multi_layer.ipynb` first to get:
- `output/logbert_full.pt` - Pre-trained model
- `output/v2_sequences.jsonl` - HDFS sequences

**This notebook:**
1. SimCLR contrastive fine-tuning
2. BGL cross-domain adaptation
3. Compare results

In [None]:
#=============================================================================
# CELL 1: SETUP
#=============================================================================
import os, gc, json, random, copy
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import precision_recall_curve, roc_auc_score

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Device: {device}")

# --- Model geometry (used as constructor defaults by the model classes below) ---
CONTEXT_LEN = 128   # tokens per padded sequence, including CLS and SEP
D_MODEL = 256       # embedding / hidden width
N_HEADS = 8         # attention heads per encoder layer
N_LAYERS = 4        # stacked encoder layers

# --- Optimisation schedule ---
BATCH_SIZE = 64
SIMCLR_EPOCHS = 10
BGL_EPOCHS = 15
LR = 1e-5           # deliberately small: both stages fine-tune an existing model
TEMPERATURE = 0.1   # divisor applied to similarities in the SimCLR loss
SEED = 42

# Seed each RNG used by the notebook (Python, NumPy, PyTorch).
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# All checkpoints and result files are read from / written to this directory.
OUTPUT_DIR = "output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
#=============================================================================
# CELL 2: LOAD PRE-TRAINED MODEL
#=============================================================================
print("=" * 60, "LOADING PRE-TRAINED MODEL", "=" * 60, sep="\n")

# Reserved token ids. Raw event ids are shifted up by OFF when sequences are
# tokenized, so ids 0..3 stay free for the special tokens.
PAD = 0
CLS = 1
MASK = 2
SEP = 3
OFF = 4

class LogBERT(nn.Module):
    """Transformer encoder over log-event tokens with a per-position vocabulary head.

    Sub-module names (``tok``, ``pos``, ``drop``, ``enc``, ``head``) and the ``ctr``
    buffer are the keys of the checkpoint's state dict, so they must not be renamed.

    Args:
        vs: Vocabulary size (rows of the token embedding, width of the output head).
        dm: Hidden width.
        nh: Attention heads per encoder layer.
        nl: Number of encoder layers.
        ml: Number of learned position embeddings (longest supported sequence).
        lv: Stored on the instance as ``self.lv``; not read anywhere in this class.
    """

    def __init__(self, vs, dm=D_MODEL, nh=N_HEADS, nl=N_LAYERS, ml=CONTEXT_LEN, lv=0.5):
        super().__init__()
        self.lv = lv
        self.tok = nn.Embedding(vs, dm, padding_idx=PAD)
        self.pos = nn.Embedding(ml, dm)
        self.drop = nn.Dropout(0.1)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=dm,
            nhead=nh,
            dim_feedforward=dm * 4,
            dropout=0.1,
            activation='gelu',
            batch_first=True,
        )
        self.enc = nn.TransformerEncoder(encoder_layer, num_layers=nl)
        self.head = nn.Linear(dm, vs)
        # Running centre of CLS embeddings and its "has been initialised" flag; both
        # are maintained by the BGL fine-tuning loop, not by this class.
        self.register_buffer('ctr', torch.zeros(dm))
        self.ci = False

    def forward(self, ids, mask=None):
        """Encode a 2-D batch of token ids.

        Args:
            ids: Integer tensor with two dimensions (batch first).
            mask: Optional tensor of the same layout; positions equal to 0 are
                treated as padding by the encoder.

        Returns:
            Tuple ``(logits, cls)``: the vocabulary logits for every position and
            the encoder output at position 0.
        """
        _, seq_len = ids.shape
        positions = torch.arange(seq_len, device=ids.device).unsqueeze(0)
        embedded = self.tok(ids) + self.pos(positions)
        padding_mask = None if mask is None else (mask == 0)
        hidden = self.enc(self.drop(embedded), src_key_padding_mask=padding_mask)
        return self.head(hidden), hidden[:, 0, :]

    def get_embedding(self, ids, mask=None):
        """Return only the position-0 (CLS) output of ``forward``."""
        return self.forward(ids, mask)[1]

# Load the pre-trained checkpoint. Every later cell references `model`, so when the
# checkpoint is missing we still build a (randomly initialised) model instead of
# leaving the name undefined and failing later with a NameError.
model_path = os.path.join(OUTPUT_DIR, "logbert_full.pt")
if os.path.exists(model_path):
    # The vocabulary size is recovered from the token-embedding matrix in the checkpoint.
    state = torch.load(model_path, map_location='cpu')
    VOCAB_SIZE = state['tok.weight'].shape[0]
    print(f"Vocab size: {VOCAB_SIZE}")

    model = LogBERT(VOCAB_SIZE).to(device)
    model.load_state_dict(state)
    print("✓ Loaded pre-trained model")
else:
    print(f"ERROR: {model_path} not found!")
    print("Run complete_multi_layer.ipynb first.")
    VOCAB_SIZE = 2048  # Fallback
    model = LogBERT(VOCAB_SIZE).to(device)
    print(f"WARNING: continuing with a randomly initialised model "
          f"(vocab size {VOCAB_SIZE}); results will not be meaningful.")

---
# 🔄 PHASE 1: SimCLR Contrastive Learning

In [None]:
#=============================================================================
# CELL 3: SIMCLR AUGMENTATION
#=============================================================================

def augment_sequence(seq, aug_type='mask', mask_ratio=0.15):
    """Return a randomly perturbed copy of ``seq`` for contrastive learning.

    The input is never modified. ``MASK`` and ``SEP`` are inserted literally, so the
    caller decides which id space the sequence lives in.

    Args:
        seq: Sequence of token/event ids.
        aug_type: One of
            ``'mask'``    - replace each element with ``MASK`` with probability ``mask_ratio``;
            ``'drop'``    - remove each element with probability ``mask_ratio``
                            (falls back to ``[SEP]`` if everything was removed);
            ``'shuffle'`` - shuffle inside consecutive, non-overlapping windows of 3
                            (a trailing partial window is left untouched);
            ``'crop'``    - keep one contiguous slice covering 70-90% of the sequence.
            Any other value returns an unmodified copy.
        mask_ratio: Per-element probability used by ``'mask'`` and ``'drop'``.

    Returns:
        A new list with the augmented sequence.
    """
    seq = list(seq)  # work on a copy so the dataset's stored sequence is untouched

    if aug_type == 'mask':
        seq = [MASK if random.random() < mask_ratio else t for t in seq]

    elif aug_type == 'drop':
        seq = [t for t in seq if random.random() > mask_ratio]
        if not seq:
            seq = [SEP]  # never hand back an empty view

    elif aug_type == 'shuffle':
        window = 3
        for i in range(0, len(seq) - window + 1, window):
            chunk = seq[i:i + window]
            random.shuffle(chunk)
            seq[i:i + window] = chunk

    elif aug_type == 'crop':
        # An empty sequence has nothing to crop; without this guard the start index
        # would be drawn from randint(0, -1), which raises ValueError.
        if seq:
            keep_ratio = random.uniform(0.7, 0.9)
            new_len = max(1, int(len(seq) * keep_ratio))
            start = random.randint(0, len(seq) - new_len)
            seq = seq[start:start + new_len]

    return seq

class ContrastiveDataset(Dataset):
    """Dataset yielding two independently augmented views of each sequence (SimCLR).

    Raw event ids are shifted by ``OFF`` into token space *before* augmentation.
    ``augment_sequence`` inserts the literal ``MASK`` / ``SEP`` ids; if the shift were
    applied afterwards those would turn into ordinary event tokens
    (``MASK + OFF``, ``SEP + OFF``) instead of the special tokens.

    Args:
        sequences: Indexable collection of raw event-id sequences.
        max_len: Length of every returned tensor, including CLS and SEP.
    """

    def __init__(self, sequences, max_len=CONTEXT_LEN):
        self.seqs = sequences
        self.max_len = max_len
        self.aug_types = ['mask', 'drop', 'shuffle', 'crop']

    def __len__(self):
        return len(self.seqs)

    def _pack(self, token_ids):
        """Wrap ids that are already in token space with CLS/SEP and pad to ``max_len``.

        Returns:
            ``(ids, attention_mask)`` tensors; the mask is 1 wherever the id is not PAD.
        """
        body = list(token_ids[:self.max_len - 2])  # leave room for CLS and SEP
        tok = [CLS] + body + [SEP] + [PAD] * (self.max_len - len(body) - 2)
        mask = [1 if t != PAD else 0 for t in tok]
        return torch.tensor(tok), torch.tensor(mask)

    def _tokenize(self, seq):
        """Tokenize a sequence of *raw* event ids (shift by ``OFF``, then pack)."""
        return self._pack([t + OFF for t in seq])

    def __getitem__(self, idx):
        # Move into token space first so augmentation can insert real special tokens.
        shifted = [t + OFF for t in self.seqs[idx]]

        # The two augmentation types are drawn independently and may coincide; the
        # augmentations themselves are random, so the views still generally differ.
        aug1 = random.choice(self.aug_types)
        aug2 = random.choice(self.aug_types)

        view1 = augment_sequence(shifted, aug1)
        view2 = augment_sequence(shifted, aug2)

        ids1, mask1 = self._pack(view1)
        ids2, mask2 = self._pack(view2)

        return {
            'ids1': ids1, 'mask1': mask1,
            'ids2': ids2, 'mask2': mask2
        }

print("ContrastiveDataset defined.")

In [None]:
#=============================================================================
# CELL 4: SIMCLR LOSS (NT-Xent)
#=============================================================================

class ProjectionHead(nn.Module):
    """Two-layer MLP that maps encoder embeddings to unit-length SimCLR projections.

    Args:
        input_dim: Width of the incoming embedding.
        hidden_dim: Width of the hidden layer.
        output_dim: Width of the projection.
    """

    def __init__(self, input_dim=D_MODEL, hidden_dim=512, output_dim=128):
        super().__init__()
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Project ``x`` and L2-normalise the result along dimension 1."""
        projected = self.net(x)
        return F.normalize(projected, dim=1)

def nt_xent_loss(z1, z2, temperature=TEMPERATURE):
    """NT-Xent (normalised temperature-scaled cross entropy) loss used by SimCLR.

    Row ``i`` of ``z1`` and row ``i`` of ``z2`` form a positive pair; every other row
    in the concatenated batch acts as a negative.

    Args:
        z1: ``(B, D)`` projections of the first view.
        z2: ``(B, D)`` projections of the second view, same shape as ``z1``.
            The similarity is a plain dot product, so rows are expected to be
            unit-length already (``ProjectionHead.forward`` normalises them).
        temperature: Divisor applied to the similarity logits.

    Returns:
        Scalar tensor: cross entropy averaged over all ``2B`` anchors.
    """
    B = z1.shape[0]
    z = torch.cat([z1, z2], dim=0)  # 2B x D

    # Pairwise similarity logits, 2B x 2B.
    sim = torch.mm(z, z.t()) / temperature

    # An anchor must never be its own candidate: push the diagonal to -inf so it
    # receives zero probability in the softmax.
    self_mask = torch.eye(2 * B, dtype=torch.bool, device=z.device)
    sim.masked_fill_(self_mask, float('-inf'))

    # The positive for row i is row (i + B) mod 2B, i.e. the other view of the same
    # sample. Built directly on the target device.
    labels = (torch.arange(2 * B, device=z.device) + B) % (2 * B)

    return F.cross_entropy(sim, labels)

print("SimCLR loss defined.")

In [None]:
#=============================================================================
# CELL 5: LOAD HDFS SEQUENCES FOR SIMCLR
#=============================================================================
print("\n" + "="*60)
print("LOADING HDFS SEQUENCES")
print("="*60)

# Load from JSONL: one JSON object per line, the sequence is stored under 'seq'.
hdfs_seqs = []
jsonl_path = os.path.join(OUTPUT_DIR, "v2_sequences.jsonl")

if os.path.exists(jsonl_path):
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines (e.g. a trailing newline at EOF)
            hdfs_seqs.append(json.loads(line)['seq'])
    print(f"Loaded {len(hdfs_seqs):,} HDFS sequences")
else:
    print("v2_sequences.jsonl not found, creating dummy data...")
    # Create some dummy sequences for testing
    for _ in range(10000):
        seq_len = random.randint(5, 50)
        hdfs_seqs.append([random.randint(0, 100) for _ in range(seq_len)])

if hdfs_seqs:
    print(f"Avg sequence length: {np.mean([len(s) for s in hdfs_seqs]):.1f}")
else:
    # np.mean of an empty list would only emit a warning and print 'nan'.
    print("WARNING: no HDFS sequences were loaded; SimCLR fine-tuning has no data.")

In [None]:
#=============================================================================
# CELL 6: SIMCLR FINE-TUNING
#=============================================================================
print("\n" + "="*60)
print("SIMCLR CONTRASTIVE FINE-TUNING")
print("="*60)

# Projection head applied on top of the encoder's CLS embedding; it is trained
# here but not saved (only the encoder's state dict is written below).
projection = ProjectionHead(D_MODEL).to(device)

# Prepare data.
contrast_ds = ContrastiveDataset(hdfs_seqs)
# The ragged last batch is dropped so every step sees BATCH_SIZE samples. With fewer
# than BATCH_SIZE sequences that would drop *every* batch (no training at all and a
# division by zero below), so in that case the single partial batch is kept.
contrast_loader = DataLoader(
    contrast_ds,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2,
    drop_last=len(contrast_ds) >= BATCH_SIZE,
)

# One optimizer updates both the encoder and the projection head.
optimizer = torch.optim.AdamW(
    list(model.parameters()) + list(projection.parameters()),
    lr=LR
)

print(f"Training with SimCLR for {SIMCLR_EPOCHS} epochs")
print(f"Temperature: {TEMPERATURE}")

for epoch in range(SIMCLR_EPOCHS):
    model.train()
    projection.train()
    total_loss = 0

    pbar = tqdm(contrast_loader, desc=f"Epoch {epoch+1}/{SIMCLR_EPOCHS}")
    for batch in pbar:
        ids1 = batch['ids1'].to(device)
        mask1 = batch['mask1'].to(device)
        ids2 = batch['ids2'].to(device)
        mask2 = batch['mask2'].to(device)

        optimizer.zero_grad()

        # CLS embeddings of the two augmented views.
        emb1 = model.get_embedding(ids1, mask1)
        emb2 = model.get_embedding(ids2, mask2)

        # Project into the contrastive space.
        z1 = projection(emb1)
        z2 = projection(emb2)

        # Contrastive loss: matching rows of z1/z2 are the positive pairs.
        loss = nt_xent_loss(z1, z2, TEMPERATURE)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        pbar.set_postfix({'loss': f'{loss.item():.4f}'})

    # max(..., 1) keeps the report well-defined even if the loader yielded nothing.
    avg_loss = total_loss / max(len(contrast_loader), 1)
    print(f"Epoch {epoch+1}: SimCLR Loss = {avg_loss:.4f}")

# Save SimCLR fine-tuned model (encoder weights only).
torch.save(model.state_dict(), os.path.join(OUTPUT_DIR, "logbert_simclr.pt"))
print("\n✓ SimCLR fine-tuning complete!")
print("✓ Saved: logbert_simclr.pt")

---
# 🌐 PHASE 2: BGL Cross-Domain Adaptation

In [None]:
#=============================================================================
# CELL 7: LOAD BGL DATA
#=============================================================================
import zlib  # cell-local import: only the stable event-id hash below needs it

print("\n" + "="*60)
print("BGL CROSS-DOMAIN ADAPTATION")
print("="*60)

# Candidate dataset locations, checked in order; the first one that exists is used.
BGL_PATHS = [
    "/teamspace/studios/this_studio/content/LogHub_BGL",
    "/teamspace/studios/this_studio/LogHub_BGL",
    "/content/LogHub_BGL",
    "./LogHub_BGL",
]

BGL_PATH = next((p for p in BGL_PATHS if os.path.exists(p)), None)

bgl_normal, bgl_failure = [], []

# Label values (lower-cased, stripped) that mark a log line as a non-alert.
# '-' is the non-alert marker in the LogHub BGL release; the others cover
# pre-processed variants that use numeric/boolean/text labels or leave it empty.
_BGL_NORMAL_LABELS = frozenset({'', '-', '0', '0.0', 'false', 'normal', 'nan'})


def _stable_event_id(event, buckets=1000):
    """Map an event string to a bucket id that is the same in every Python process.

    The built-in ``hash`` of a str is salted per interpreter run, so ids derived
    from it would not match between the run that trained a model and a later run.
    """
    return zlib.crc32(str(event).encode('utf-8')) % buckets


def _is_failure_session(labels):
    """Return True when at least one line label in the session is an alert."""
    return any(str(v).strip().lower() not in _BGL_NORMAL_LABELS for v in labels)


if BGL_PATH:
    print(f"BGL found at: {BGL_PATH}")

    # Try the structured log first, then the pre-grouped sequence file.
    bgl_file = os.path.join(BGL_PATH, "BGL.log_structured.csv")
    if not os.path.exists(bgl_file):
        bgl_file = os.path.join(BGL_PATH, "BGL_sequence.csv")

    if os.path.exists(bgl_file):
        print(f"Loading {bgl_file}...")
        df = pd.read_csv(bgl_file, nrows=100000)  # Limit for speed
        print(f"Columns: {df.columns.tolist()}")

        # Pick the column that defines a session.
        if 'BlockId' in df.columns:
            group_col = 'BlockId'
        elif 'SessionWindow' in df.columns:
            group_col = 'SessionWindow'
        else:
            # No session column: fall back to fixed windows of 100 consecutive rows.
            df['Session'] = df.index // 100
            group_col = 'Session'

        event_col = 'EventId' if 'EventId' in df.columns else 'EventTemplate'
        label_col = 'Label' if 'Label' in df.columns else None

        # Build one event-id sequence per session.
        for _session_key, group in df.groupby(group_col):
            events = group[event_col].tolist()
            # String events are bucketed into integer ids with a process-independent hash.
            if isinstance(events[0], str):
                events = [_stable_event_id(e) for e in events]

            # A session counts as a failure if *any* of its lines carries an alert
            # label, not just the first one.
            if label_col and _is_failure_session(group[label_col]):
                bgl_failure.append(events)
            else:
                bgl_normal.append(events)

        print(f"BGL Normal: {len(bgl_normal):,}")
        print(f"BGL Failure: {len(bgl_failure):,}")
    else:
        print(f"BGL file not found at {bgl_file}")
else:
    print("BGL dataset not found. Creating synthetic data for demo...")

# Fall back to synthetic data whenever nothing could be loaded, including the case
# where the directory exists but neither CSV file does; otherwise the following
# cells would be handed empty datasets.
if not bgl_normal and not bgl_failure:
    if BGL_PATH:
        print("No BGL sequences were loaded. Creating synthetic data for demo...")
    # Create synthetic BGL-like data
    for _ in range(5000):
        seq_len = random.randint(10, 100)
        bgl_normal.append([random.randint(0, 500) for _ in range(seq_len)])
    for _ in range(500):
        seq_len = random.randint(10, 100)
        bgl_failure.append([random.randint(0, 500) for _ in range(seq_len)])
    print(f"Created synthetic BGL data: {len(bgl_normal)} normal, {len(bgl_failure)} failure")

In [None]:
#=============================================================================
# CELL 8: BGL DATASET
#=============================================================================

class BGLDataset(Dataset):
    """Masked-language-model dataset over BGL event-id sequences.

    Args:
        sequences: Indexable collection of raw event-id sequences.
        max_len: Length of every returned tensor, including CLS and SEP.
        mask_ratio: Probability of replacing each event token with ``MASK``.
    """

    def __init__(self, sequences, max_len=CONTEXT_LEN, mask_ratio=0.15):
        self.seqs = sequences
        self.max_len = max_len
        self.mr = mask_ratio

    def __len__(self):
        return len(self.seqs)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``labels`` (-100 where not masked) and ``attention_mask``."""
        budget = self.max_len - 2          # room left after CLS and SEP
        top_id = VOCAB_SIZE - 1
        # Shift raw ids past the special tokens; ids beyond the vocabulary are
        # clamped onto its last entry.
        body = [min(t + OFF, top_id) for t in self.seqs[idx][:budget]]
        tok = [CLS, *body, SEP, *([PAD] * (budget - len(body)))]

        inp = list(tok)
        lab = [-100] * len(tok)
        # Only the event positions (1..len(body)) are candidates for masking.
        for pos in range(1, len(body) + 1):
            if random.random() < self.mr:
                lab[pos] = inp[pos]
                inp[pos] = MASK

        attention = [int(t != PAD) for t in tok]
        return {
            'input_ids': torch.tensor(inp),
            'labels': torch.tensor(lab),
            'attention_mask': torch.tensor(attention)
        }

print(f"BGLDataset defined. Vocab size: {VOCAB_SIZE}")

In [None]:
#=============================================================================
# CELL 9: FINE-TUNE ON BGL (Domain Adaptation)
#=============================================================================
print("\nFine-tuning on BGL...")

# Reset the running CLS centre for the new domain.
model.ci = False
model.ctr = torch.zeros(D_MODEL, device=device)

bgl_loader = DataLoader(
    BGLDataset(bgl_normal),
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2
)

optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
# Mixed precision only makes sense on CUDA; disabling it explicitly on CPU avoids
# relying on the implicit "warn and disable" behaviour.
use_amp = device.type == 'cuda'
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

print(f"BGL training: {len(bgl_normal):,} normal sequences, {BGL_EPOCHS} epochs")

for epoch in range(BGL_EPOCHS):
    model.train()
    total_loss = 0
    pbar = tqdm(bgl_loader, desc=f"BGL Ep {epoch+1}/{BGL_EPOCHS}")

    for batch in pbar:
        ids = batch['input_ids'].to(device)
        lab = batch['labels'].to(device)
        attn = batch['attention_mask'].to(device)

        optimizer.zero_grad()

        with torch.cuda.amp.autocast(enabled=use_amp):
            logits, cls = model(ids, attn)
            # Masked-token prediction; positions labelled -100 are ignored.
            mlm_loss = F.cross_entropy(
                logits.view(-1, logits.size(-1)),
                lab.view(-1),
                ignore_index=-100
            )
            # VHM loss: pull CLS embeddings towards the running centre. It is
            # skipped until the centre has been initialised by the first batch.
            # cls may be half precision under autocast, so compare in float32.
            if model.ci:
                vhm_loss = torch.mean((cls.float() - model.ctr) ** 2)
            else:
                vhm_loss = 0.0
            loss = mlm_loss + 0.5 * vhm_loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Update the centre: first batch initialises it, later batches feed an
        # exponential moving average. The batch mean is taken in float32 so the
        # `ctr` buffer is not silently replaced by a half-precision tensor.
        with torch.no_grad():
            bc = cls.detach().float().mean(0)
            model.ctr = bc if not model.ci else 0.9 * model.ctr + 0.1 * bc
            model.ci = True

        total_loss += loss.item()
        pbar.set_postfix({'loss': f'{loss.item():.4f}'})

    print(f"BGL Epoch {epoch+1}: Loss = {total_loss/len(bgl_loader):.4f}")

# Save BGL-adapted model
torch.save(model.state_dict(), os.path.join(OUTPUT_DIR, "logbert_bgl.pt"))
print("\n✓ BGL fine-tuning complete!")
print("✓ Saved: logbert_bgl.pt")

In [None]:
#=============================================================================
# CELL 10: EVALUATE ON BGL
#=============================================================================
# Section banner for the evaluation cell.
print("\n" + "=" * 60, "EVALUATING ON BGL", "=" * 60, sep="\n")

@torch.no_grad()
def compute_scores(model, sequences, batch_size=128):
    """Score sequences by squared distance of their CLS embedding to ``model.ctr``.

    Args:
        model: Model whose forward returns ``(logits, cls)`` and that has a ``ctr`` tensor.
        sequences: Raw event-id sequences, wrapped in ``BGLDataset`` without masking.
        batch_size: Number of sequences scored per forward pass.

    Returns:
        NumPy array with one score per input sequence, in input order.
    """
    model.eval()
    dataset = BGLDataset(sequences, mask_ratio=0)  # mask_ratio=0: inputs are left unmasked
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    scores = []
    for batch in tqdm(loader, desc="Scoring"):
        input_ids = batch['input_ids'].to(device)
        attention = batch['attention_mask'].to(device)
        cls = model(input_ids, attention)[1]
        sq_dist = ((cls - model.ctr) ** 2).sum(dim=1)
        scores.extend(sq_dist.cpu().numpy())
    return np.array(scores)

# Score both populations with the adapted model.
bgl_normal_scores = compute_scores(model, bgl_normal)
bgl_failure_scores = compute_scores(model, bgl_failure)

# Stack the scores; normal sessions get label 0, failure sessions label 1.
all_scores = np.concatenate((bgl_normal_scores, bgl_failure_scores))
all_labels = np.concatenate((
    np.zeros(len(bgl_normal_scores)),
    np.ones(len(bgl_failure_scores)),
))

auc = roc_auc_score(all_labels, all_scores)
prec, rec, _ = precision_recall_curve(all_labels, all_scores)
# F1 at every point of the PR curve; the epsilon keeps 0/0 from producing NaN.
f1s = (2 * prec * rec) / (prec + rec + 1e-10)
best_idx = np.argmax(f1s)
best_f1 = f1s[best_idx]

print(f"\n=== BGL Results ===")
print(f"AUC-ROC: {auc:.4f}")
print(f"Best F1: {best_f1:.4f}")
print(f"Precision: {prec[best_idx]:.4f}")
print(f"Recall: {rec[best_idx]:.4f}")

In [None]:
#=============================================================================
# CELL 11: FINAL SUMMARY
#=============================================================================
print("\n" + "="*70)
print("🎯 SUMMARY")
print("="*70)

print("\n=== Models Saved ===")
print("✓ logbert_full.pt    - Pre-trained on HDFS V1+V2+V3")
print("✓ logbert_simclr.pt  - SimCLR contrastive fine-tuned")
print("✓ logbert_bgl.pt     - BGL domain adapted")

print("\n=== BGL Cross-Domain Performance ===")
print(f"AUC: {auc:.4f}")
print(f"F1:  {best_f1:.4f}")

# Collect the run configuration and the BGL metrics computed by the evaluation cell.
# NumPy scalars are converted to plain floats so that json can serialise them.
results = {
    'simclr': {
        'epochs': SIMCLR_EPOCHS,
        'temperature': TEMPERATURE
    },
    'bgl': {
        'epochs': BGL_EPOCHS,
        'normal': len(bgl_normal),
        'failure': len(bgl_failure),
        'auc': float(auc),
        'f1': float(best_f1),
        'precision': float(prec[best_idx]),
        'recall': float(rec[best_idx])
    }
}

# Explicit encoding so the file does not depend on the platform default.
with open(os.path.join(OUTPUT_DIR, "simclr_bgl_results.json"), 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2)

print("\n✓ Results saved to simclr_bgl_results.json")
print("\n" + "="*70)
print("🎉 COMPLETE!")
print("="*70)