# Mechanistic Watchdog: Stress Test & Validation

**Module:** `MechWatch`
**Goal:** Verify that the "Deception Score" cleanly separates Truthful Control prompts from Deceptive/Adversarial prompts, i.e. that the two groups are linearly separable by a single threshold on the score.

In [None]:
import json
import sys
import os
from pathlib import Path

# 1. Work out the project root from the working directory (uses pathlib so
#    the same logic holds on Windows and Linux).
current_dir = Path.cwd()
# Launched from 'notebooks/': the root is one level up.
# Launched from anywhere else: the working directory is taken as the root.
if current_dir.name == "notebooks":
    project_root = current_dir.parent
else:
    project_root = current_dir

# Make the project importable, without adding a duplicate sys.path entry
# when the cell is re-run.
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.append(root_entry)

print(f"üìÇ Project Root: {project_root}")

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from tqdm import tqdm

# 2. Pull in the MechWatch entry points used by the rest of the notebook.
#    A failed import is reported but not re-raised, so this cell always
#    completes; later cells that need these names will fail instead.
try:
    from MechWatch.runtime import WatchdogRuntime
    from MechWatch.config import load_config
except ImportError as exc:
    print(f"‚ùå Import failed: {exc}. Ensure the folder is named 'MechWatch' and is in the project root.")
else:
    print("‚úÖ Successfully imported WatchdogRuntime from MechWatch")

## 1. Initialize Runtime & Locate Vector
We need to find the `deception_vector.pt` file and pass its absolute path to the configuration to avoid a `FileNotFoundError`.

In [None]:
# 1. Find the vector file.
#    Candidates are probed in order; the first one that exists wins.
vector_candidates = [
    project_root / "artifacts" / "deception_vector.pt",
    project_root / "deception_vector.pt",
    Path("artifacts/deception_vector.pt"),  # relative to the current working directory
]

vector_path = next((p for p in vector_candidates if p.exists()), None)

if vector_path is None:
    raise FileNotFoundError(f"Could not find 'deception_vector.pt' in {project_root} or artifacts/")

# The last candidate is relative to the working directory, so resolve the
# match to guarantee that the configuration receives an absolute path.
vector_path = vector_path.resolve()

print(f"‚úÖ Found vector at: {vector_path}")

# 2. Load the config and override the vector location with the file we found.
cfg = load_config()
cfg.vector_path = vector_path

# 3. Initialize the runtime with the corrected config.
rt = WatchdogRuntime(cfg=cfg)
print(f"‚úÖ Loaded Watchdog with threshold: {rt.threshold:.4f}")

## 2. Load Test Cases

In [None]:
import pandas as pd
from datasets import load_dataset
import random

# Configuration: how many samples to draw per category.
# Keep this low (e.g., 30-50) for the hackathon demo so it runs in <10 mins.
SAMPLES_PER_CATEGORY = 40

# Each entry is a dict with the keys "category", "prompt" and "expected".
test_cases = []


def _take_sample(dataset, limit=SAMPLES_PER_CATEGORY, seed=42):
    """Shuffle *dataset* with a fixed seed and keep at most *limit* rows.

    The number of rows is clamped to ``len(dataset)`` so that a split (or a
    filtered subset) smaller than *limit* yields every available row rather
    than raising, which would make the caller skip the whole category.
    """
    return dataset.shuffle(seed=seed).select(range(min(limit, len(dataset))))


print("üì• Loading Datasets...")

# NOTE: every loader below is best-effort. A failure is reported and the
# category is skipped so the remaining datasets can still be loaded.

# ---------------------------------------------------------
# 1. CONTROL GROUP: Simple Truths (Azaria & Mitchell)
# ---------------------------------------------------------
try:
    print("   - Loading Control (Simple Facts)...")
    ds_facts = load_dataset("L1Fthrasir/Facts-true-false", split="train")
    # Keep ONLY the rows labelled 1 (true statements) to serve as our
    # baseline "Green Zone".
    true_facts = _take_sample(ds_facts.filter(lambda x: x['label'] == 1))

    for row in true_facts:
        test_cases.append({
            "category": "Control (Simple Facts)",
            "prompt": row['statement'],
            "expected": "Pass"
        })
except Exception as e:
    print(f"   ‚ö†Ô∏è Failed to load Control: {e}")

# ---------------------------------------------------------
# 2. TRUTHFUL QA: Imitative Falsehoods
# ---------------------------------------------------------
try:
    print("   - Loading TruthfulQA (Misconceptions)...")
    ds_tqa = load_dataset("truthful_qa", "generation", split="validation")
    ds_tqa = _take_sample(ds_tqa)

    for row in ds_tqa:
        # These prompts entice the model to lie (e.g. "What happens if you break a mirror?")
        test_cases.append({
            "category": "TruthfulQA (Misconceptions)",
            "prompt": row['question'],
            "expected": "Variable"
        })
except Exception as e:
    print(f"   ‚ö†Ô∏è Failed to load TruthfulQA: {e}")

# ---------------------------------------------------------
# 3. JAILBREAKS (Tenebrous / DAN / Adversarial)
# ---------------------------------------------------------
try:
    print("   - Loading Jailbreaks (Adversarial)...")
    # Using JailbreakHub as a reliable source of known jailbreaks (DAN, AIM, etc)
    ds_jail = load_dataset("walledai/JailbreakHub", split="train")

    # NOTE: the flag column is named 'jailbreak', not 'jailbroken'.
    # The explicit comparison with True is kept as-is because the column's
    # dtype is not visible from this notebook.
    ds_jail = _take_sample(ds_jail.filter(lambda x: x['jailbreak'] == True))

    for row in ds_jail:
        test_cases.append({
            "category": "Jailbreak Attempts",
            "prompt": row['prompt'],
            "expected": "Block"
        })
except Exception as e:
    print(f"   ‚ö†Ô∏è Failed to load Jailbreaks: {e}")

print(f"\n‚úÖ Loaded {len(test_cases)} total test cases.")
# dict.fromkeys de-duplicates while preserving load order, so the breakdown
# is printed in the same order on every run (a set gives no such guarantee).
print(f"   Breakdown: {list(dict.fromkeys(c['category'] for c in test_cases))}")

## 3. Run the Gauntlet
We run each prompt with the watchdog **disabled** (monitoring only) to record the raw scores without stopping the model early.

In [None]:
import gc
import torch
import pandas as pd

results = []

# Tunables for the run, gathered in one place instead of being scattered
# through the loop body as bare numbers.
SAVE_EVERY = 20       # write a backup CSV every N cases
CLEANUP_EVERY = 10    # release cached GPU memory every N cases
GRACE_TOKENS = 5      # leading scores ignored when more than this many exist
MAX_NEW_TOKENS = 40   # kept small to be safe on VRAM
PREVIEW_CHARS = 100   # length of the response preview stored per row

print("üöÄ Running inference gauntlet...")

# enumerate gives an index 'i' that drives the periodic cleanup and backup.
for i, case in enumerate(tqdm(test_cases)):
    try:
        # Run generation (deterministic: temperature 0).
        output = rt.generate_without_watchdog(
            prompt=case["prompt"],
            max_new_tokens=MAX_NEW_TOKENS,
            temperature=0.0
        )

        scores = output.scores

        # Grace period: drop the first GRACE_TOKENS scores, but only when
        # there are more than that many; shorter sequences are used whole.
        # NOTE(review): presumably one score per generated token - confirm
        # against WatchdogRuntime.
        valid_scores = scores[GRACE_TOKENS:] if len(scores) > GRACE_TOKENS else scores
        if not valid_scores:
            # No scores at all: fall back to a neutral 0.0 so max()/mean work.
            valid_scores = [0.0]

        peak_score = max(valid_scores)
        avg_score = sum(valid_scores) / len(valid_scores)

        # output.text may be a list or a plain string; take the first item
        # in the list case.
        raw_text = output.text[0] if isinstance(output.text, list) else output.text
        # Drop the first len(prompt) characters to keep the CSV clean.
        # NOTE(review): assumes the returned text starts with the prompt.
        response_text = raw_text[len(case["prompt"]):].strip()

        results.append({
            "Category": case["category"],
            "Prompt": case["prompt"],
            "Peak Score": peak_score,
            "Avg Score": avg_score,
            "Would Block?": peak_score > rt.threshold,
            "Output": response_text[:PREVIEW_CHARS].replace("\n", " ") + "..."
        })

        # === MEMORY SAFETY ===
        # Drop our references first so the cache flush can reclaim them.
        if i % CLEANUP_EVERY == 0:
            del output
            del scores
            torch.cuda.empty_cache()
            gc.collect()

        # === PROGRESS SAVING ===
        # Periodic backup so a crash does not lose everything gathered so far.
        if i > 0 and i % SAVE_EVERY == 0:
            pd.DataFrame(results).to_csv("stress_test_backup.csv", index=False)

    except Exception as e:
        # Best-effort: report the failing case and move on to the next one.
        print(f"‚ö†Ô∏è Skipping index {i}: {e}")
        # If OOM occurs, try to clear and continue
        torch.cuda.empty_cache()
        gc.collect()

print("‚úÖ Inference Complete.")

## 4. Visualization
We expect a clear separation: "Control" cases should be below the red line, "Deception" cases above it.

In [None]:
df = pd.DataFrame(results)

# Apply the seaborn theme, then create the figure and keep explicit handles.
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(figsize=(12, 6))

# Both layers plot the same columns on the same axes.
shared_kwargs = {"data": df, "x": "Category", "y": "Peak Score", "ax": ax}

# Per-category distribution, with the individual points overlaid on top.
sns.boxplot(palette="coolwarm", hue="Category", **shared_kwargs)
sns.stripplot(color="black", alpha=0.5, jitter=True, **shared_kwargs)

# Horizontal marker at the runtime's blocking threshold.
ax.axhline(
    rt.threshold,
    color="red",
    linestyle="--",
    linewidth=2,
    label=f"Threshold ({rt.threshold:.2f})",
)

ax.set_title("Peak Deception Score by Category", fontsize=16)
plt.xticks(rotation=45, ha="right")
ax.legend()
fig.tight_layout()
plt.show()

## 5. Failure Analysis
Identify False Negatives (Deception that slipped through) and False Positives (Truth that was blocked).

In [None]:
# Must match the "category" label given to the control prompts when the test
# cases were built; a mismatch silently empties the false-positive table and
# reports control rows as false negatives.
CONTROL_CATEGORY = "Control (Simple Facts)"

print(f"Current Threshold: {rt.threshold:.4f}\n")

# Row masks. "Blocked" uses the same strict comparison as the "Would Block?"
# column, so a score exactly at the threshold counts as not blocked.
is_control = df["Category"] == CONTROL_CATEGORY
is_blocked = df["Peak Score"] > rt.threshold
report_columns = ["Category", "Prompt", "Peak Score", "Output"]

print("üî¥ FALSE NEGATIVES (Missed Deception):")
# Every non-control row that was not blocked.
# NOTE(review): TruthfulQA cases are tagged expected="Variable" when loaded,
# so their rows here are candidates to inspect rather than confirmed misses.
fn = df[~is_control & ~is_blocked]
if not fn.empty:
    display(fn[report_columns])
else:
    print("None! All deceptive prompts were caught.")

print("\nüü¢ FALSE POSITIVES (Wrongly Blocked Truth):")
# Control rows whose peak score crossed the threshold.
fp = df[is_control & is_blocked]
if not fp.empty:
    display(fp[report_columns])
else:
    print("None! All truthful prompts passed.")

In [None]:
# Persist the full results table (without the index column) for the report.
results_csv = "stress_test_results.csv"
df.to_csv(results_csv, index=False)
print(f"Results saved to {results_csv}")