[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nawidayima/IPHR_Direction/blob/main/notebooks/pipeline_full_experiment.ipynb)

# Full Sycophancy Experiment Pipeline

**Goal:** Run the complete sycophancy experiment in a single notebook to minimize model loading overhead.

**Optimizations:**
- Single TransformerLens model for all Llama operations (gen + activation + steering)
- Steering experiment runs BEFORE DeepSeek (only 2 model loads instead of 3)
- Checkpoints after each section (resume if notebook crashes)
- nnsight fallback for DeepSeek if TransformerLens fails

**Estimated Runtime:** 3-4 hours on L4 GPU

**Sections:**
1. Setup & Config
2. Load Llama (TransformerLens)
3. Generate Llama Trajectories
4. Extract Llama Activations
5. Train Llama Probes
6. Llama Steering Experiment
7. DeepSeek Cross-Model Validation
8. Visualizations
9. Export & Download

**Setup:** Add `HF_TOKEN` to Colab Secrets (key icon in sidebar), run Cell 1.1, restart the runtime, then run the remaining cells starting from Cell 1.2.

---
## Section 1: Setup & Config
---

In [None]:
# Cell 1.1: Setup - Clone repo and install dependencies
# NOTE: After running this cell, RESTART RUNTIME then run from Cell 1.2
# The `!` and `%` lines are IPython shell/magic commands, so this cell only
# runs inside a notebook kernel (not as a plain Python script).

import os

# Clone repo (only if not already cloned)
# NOTE(review): `git clone` is given no target directory, so this presumably
# relies on the kernel's working directory being /content - confirm.
if not os.path.exists('/content/IPHR_Direction'):
    !git clone https://github.com/nawidayima/IPHR_Direction.git
    %cd /content/IPHR_Direction
else:
    # Checkout already present: enter it and pull the latest commits.
    %cd /content/IPHR_Direction
    !git pull

# Install dependencies
# numpy is the only package pinned to an exact version here.
!pip install numpy==1.26.4 -q
!pip install torch transformers accelerate pandas scipy tqdm matplotlib -q
!pip install transformer_lens -q
!pip install scikit-learn -q

# Install package in editable mode
# (makes the repository's own package importable; Cell 1.5 imports `src.sycophancy`)
!pip install -e . -q

print("="*60)
print("IMPORTANT: Restart runtime now!")
print("Runtime > Restart runtime, then run from Cell 1.2")
print("="*60)

In [None]:
# Cell 1.2: Mount Google Drive (with retry logic)
import os
import time

# Filled in below with the Drive folder, a local Colab folder, or a
# relative local folder when not running in Colab.
OUTPUT_DIR = None

try:
    from google.colab import drive
except ImportError:
    # No Colab runtime available: store outputs relative to the working directory.
    print("Not in Colab, using local storage")
    OUTPUT_DIR = 'experiments/pipeline_run'
    os.makedirs(OUTPUT_DIR, exist_ok=True)
else:
    drive_mounted = False

    # Up to three mount attempts, pausing five seconds after each failure.
    for attempt in range(3):
        try:
            drive.mount('/content/drive')
            OUTPUT_DIR = '/content/drive/MyDrive/MATS_sycophancy'
            os.makedirs(OUTPUT_DIR, exist_ok=True)
            print(f"Drive mounted! Output dir: {OUTPUT_DIR}")
        except Exception as e:
            print(f"Mount attempt {attempt+1} failed: {e}")
            time.sleep(5)
        else:
            drive_mounted = True
            break

    if not drive_mounted:
        # Every attempt failed: fall back to a folder inside the cloned repo.
        print("WARNING: Drive mount failed, using local storage")
        OUTPUT_DIR = '/content/IPHR_Direction/experiments/pipeline_run'
        os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"Output directory: {OUTPUT_DIR}")

In [None]:
# Cell 1.3: Imports and config
import torch
import pandas as pd
import numpy as np
import random
import json
import gc
from pathlib import Path
from datetime import datetime
from tqdm.auto import tqdm
from huggingface_hub import login

# `scipy.stats` does not export a `mcnemar` function, so an unguarded import
# aborts this cell before any configuration below is defined. Keep the name
# bound (to None) so the rest of the notebook can still run; any later cell
# that needs the test must supply an implementation.
try:
    from scipy.stats import mcnemar
except ImportError:
    mcnemar = None
    print("WARNING: scipy.stats has no `mcnemar`; McNemar test is unavailable in this session")

# Sklearn for probes
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import GroupShuffleSplit

# Will import TransformerLens after HF auth

# Set seeds (Python, PyTorch and NumPy RNGs) for reproducible sampling/splits
random.seed(42)
torch.manual_seed(42)
np.random.seed(42)

# Config
LLAMA_MODEL = "meta-llama/Meta-Llama-3-8B-Instruct"
DEEPSEEK_MODEL = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"

# Layer sweep (Arditi methodology)
LAYERS = [4, 8, 12, 14, 16, 18, 20, 22, 24, 26, 28, 31]

# Generation config
# NOTE(review): the generation helpers visible in this notebook pass their own
# literals (max_new_tokens=100, temperature=0) rather than these constants.
MAX_NEW_TOKENS = 100
TEMPERATURE = 0.0

# Dataset expansion
N_EXPANSION_RUNS = 3  # Additional runs with strong feedback

# Paths (OUTPUT_DIR is the string chosen in Cell 1.2; every checkpoint lives under it)
OUTPUT_DIR = Path(OUTPUT_DIR)
LLAMA_TRAJECTORIES_PATH = OUTPUT_DIR / "llama_trajectories.csv"
LLAMA_ACTIVATIONS_PATH = OUTPUT_DIR / "llama_activations.pt"
LLAMA_PROBES_PATH = OUTPUT_DIR / "llama_probes.pt"
STEERING_RESULTS_PATH = OUTPUT_DIR / "steering_results.json"
DEEPSEEK_TRAJECTORIES_PATH = OUTPUT_DIR / "deepseek_trajectories.csv"
DEEPSEEK_ACTIVATIONS_PATH = OUTPUT_DIR / "deepseek_activations.pt"
CROSS_MODEL_RESULTS_PATH = OUTPUT_DIR / "cross_model_results.json"

# Check GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")
if device == "cuda":
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

In [None]:
# Cell 1.4: HuggingFace Authentication
import os

hf_token = None

# Method 1: Colab Secrets
try:
    from google.colab import userdata
    hf_token = userdata.get('HF_TOKEN')
except ImportError:
    # Not running in Colab; fall through to the environment variable.
    pass
except Exception as e:
    # Secret missing or not accessible: report why instead of hiding it,
    # then fall through to the environment variable.
    print(f"Could not read HF_TOKEN from Colab Secrets: {e}")

if hf_token:
    print("Found HF_TOKEN in Colab Secrets")

# Method 2: Environment variable
if not hf_token and "HF_TOKEN" in os.environ:
    hf_token = os.environ["HF_TOKEN"]
    print("Found HF_TOKEN in environment")

if hf_token:
    login(token=hf_token)
    print("Logged in to HuggingFace")
else:
    # Stop here rather than let the model load fail later for lack of credentials.
    raise ValueError("No HF_TOKEN found. Add to Colab Secrets or environment.")

In [None]:
# Cell 1.5: Import sycophancy utilities
# `%cd` is an IPython magic; the import below is written relative to the repo root.
%cd /content/IPHR_Direction

from src.sycophancy import (
    QuestionCategory,
    FactualQuestion,
    SycophancyLabel,
    TrajectoryResult,
    SYSTEM_PROMPT,
    SCIENCE_QUESTIONS,
    GEOGRAPHY_QUESTIONS,
    get_all_questions,
    get_feedback,
    STRONG_NEGATIVE_FEEDBACK_TEMPLATES,
    extract_answer,
    check_answer,
    label_trajectory,
)

# Focus on science + geography (higher sycophancy rate from prior experiments)
# `questions` is the list every later generation loop iterates over.
questions = SCIENCE_QUESTIONS + GEOGRAPHY_QUESTIONS
print(f"Using {len(questions)} questions (science + geography)")
print(f"\nSample:")
for q in questions[:3]:
    print(f"  {q.question} -> {q.correct_answer}")

In [None]:
# Cell 1.6: Helper functions

def checkpoint_exists(path):
    """Return True when something exists on disk at ``path`` (str or Path)."""
    candidate = Path(path)
    return candidate.exists()


def clear_gpu_memory():
    """Run the garbage collector, empty the CUDA cache if CUDA is available, and report."""
    gc.collect()
    cuda_present = torch.cuda.is_available()
    if cuda_present:
        torch.cuda.empty_cache()
    print("GPU memory cleared")


def format_multiturn_prompt(model, row: pd.Series) -> str:
    """Render the chat up to (not including) the model's second reply.

    The conversation is: system prompt, the question, the stored first
    response, then the user feedback. The tokenizer's chat template is applied
    with the generation prompt appended, and the result is returned as text.
    """
    turns = (
        ("system", SYSTEM_PROMPT),
        ("user", row["question"]),
        ("assistant", row["first_response"]),
        ("user", row["feedback"]),
    )
    conversation = [{"role": role, "content": text} for role, text in turns]
    return model.tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        tokenize=False,
    )


print("Helper functions defined")

---
## Section 2: Load Llama (TransformerLens)

**Key optimization:** Using TransformerLens for BOTH generation AND activation extraction.
This model instance will be used for Sections 2-6.

---

In [None]:
# Cell 2.1: Load Llama with TransformerLens
from transformer_lens import HookedTransformer

print(f"Loading {LLAMA_MODEL} with TransformerLens...")
print("This may take 5-10 minutes...")

# Loading options: no LayerNorm folding, no weight/unembed centering,
# bfloat16 weights placed on the GPU.
llama_load_options = dict(
    fold_ln=False,
    center_writing_weights=False,
    center_unembed=False,
    device="cuda",
    dtype=torch.bfloat16,
)
llama_model = HookedTransformer.from_pretrained(LLAMA_MODEL, **llama_load_options)

# Report the loaded architecture.
llama_cfg = llama_model.cfg
print(f"\nModel loaded!")
print(f"  Layers: {llama_cfg.n_layers}")
print(f"  d_model: {llama_cfg.d_model}")
print(f"  Heads: {llama_cfg.n_heads}")

In [None]:
# Cell 2.2: Test generation (TransformerLens can generate!)
test_prompt = "The capital of France is"
print(f"Test prompt: {test_prompt}")

# Short greedy generation as a smoke test.
smoke_test_kwargs = {
    "max_new_tokens": 10,
    "temperature": 0,
    "stop_at_eos": True,
}
output = llama_model.generate(test_prompt, **smoke_test_kwargs)
print(f"Model output: {output}")

# Test cache access: one cached forward pass, then read the residual stream at layer 16.
tokens = llama_model.to_tokens(test_prompt)
_, cache = llama_model.run_with_cache(tokens)
print(f"\nCache access works! Shape at layer 16: {cache['resid_post', 16].shape}")

---
## Section 3: Generate Llama Trajectories

**Goal:** Generate multi-turn sycophancy trajectories using strong negative feedback.
Target: 30+ sycophantic examples for robust probe training.

---

In [None]:
# Cell 3.1: Check for existing checkpoint
# Sets SKIP_GENERATION, which gates Cells 3.2-3.3.
if not checkpoint_exists(LLAMA_TRAJECTORIES_PATH):
    print("No existing trajectories found. Will generate new ones.")
    SKIP_GENERATION = False
else:
    print(f"Loading existing trajectories from {LLAMA_TRAJECTORIES_PATH}")
    llama_df = pd.read_csv(LLAMA_TRAJECTORIES_PATH)
    print(f"Loaded {len(llama_df)} trajectories")
    print(f"Label distribution:")
    print(llama_df['label'].value_counts())
    SKIP_GENERATION = True

In [None]:
# Cell 3.2: Generation function using TransformerLens

def generate_response_tl(model, messages, max_new_tokens=100):
    """Generate one assistant reply with a TransformerLens model.

    Args:
        model: Model exposing ``tokenizer``, ``to_tokens`` and ``generate``
            (the notebook passes a ``HookedTransformer``).
        messages: Chat messages (dicts with ``role``/``content``) to render
            with the tokenizer's chat template.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The newly generated text only, stripped of surrounding whitespace and
        with special tokens removed.
    """
    prompt = model.tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=False,
    )

    # Tokenize up front and generate from tokens so the reply can be isolated
    # by token position. Slicing the decoded string by len(prompt) assumes that
    # decoding reproduces the prompt character-for-character, and it leaves any
    # generated special/stop token inside a reply that is later fed back as
    # assistant content.
    # presumably `to_tokens` with defaults yields the same ids `generate`
    # builds from a string prompt - TODO confirm for the installed version.
    # NOTE(review): if the chat template already emits a BOS token, the default
    # BOS prepending would duplicate it; left unchanged here to keep the model
    # input identical to what the rest of the notebook uses.
    prompt_tokens = model.to_tokens(prompt)

    output_tokens = model.generate(
        prompt_tokens,
        max_new_tokens=max_new_tokens,
        temperature=0,
        stop_at_eos=True,
    )

    # Keep only the positions after the prompt.
    new_tokens = output_tokens[0, prompt_tokens.shape[1]:]
    response = model.tokenizer.decode(new_tokens, skip_special_tokens=True)
    return response.strip()


def generate_trajectory_tl(
    model,
    question: FactualQuestion,
    feedback_type: str,
    question_id: str,
) -> TrajectoryResult:
    """Run question -> answer -> feedback -> second answer and package the result.

    For ``feedback_type == "negative"`` a random strong negative template is
    used as the feedback; any other type is resolved through ``get_feedback``.
    """
    # First turn: system prompt plus the question.
    conversation = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": question.question},
    ]
    first_response = generate_response_tl(model, conversation)
    first_answer = extract_answer(first_response, question.category)
    first_correct = check_answer(first_answer, question)

    # Pick the feedback message (strong negative for higher sycophancy rate).
    feedback = (
        random.choice(STRONG_NEGATIVE_FEEDBACK_TEMPLATES)
        if feedback_type == "negative"
        else get_feedback(feedback_type)
    )

    # Second turn: replay the first reply, add the feedback, ask again.
    conversation.extend([
        {"role": "assistant", "content": first_response},
        {"role": "user", "content": feedback},
    ])
    second_response = generate_response_tl(model, conversation)
    second_answer = extract_answer(second_response, question.category)

    # A change is only reported when both answers were extracted.
    answer_changed = False
    if first_answer and second_answer:
        answer_changed = first_answer.lower().strip() != second_answer.lower().strip()

    trajectory_label = label_trajectory(first_answer, second_answer, question, feedback_type)

    return TrajectoryResult(
        question_id=question_id,
        question=question.question,
        correct_answer=question.correct_answer,
        category=question.category.value,
        first_response=first_response,
        first_answer=first_answer,
        first_correct=first_correct,
        feedback_type=feedback_type,
        feedback=feedback,
        second_response=second_response,
        second_answer=second_answer,
        answer_changed=answer_changed,
        label=trajectory_label,
    )


# Quick test: one negative-feedback trajectory on the first question
# (skipped when trajectories were loaded from the checkpoint).
if not SKIP_GENERATION:
    test_q = questions[0]
    print(f"Testing with: {test_q.question}")
    test_traj = generate_trajectory_tl(
        model=llama_model,
        question=test_q,
        feedback_type="negative",
        question_id="test_001",
    )
    print(f"First answer: {test_traj.first_answer}")
    print(f"Second answer: {test_traj.second_answer}")
    print(f"Label: {test_traj.label.value}")

In [None]:
# Cell 3.3: Generate trajectories (with strong negative feedback)

if not SKIP_GENERATION:
    print(f"Generating trajectories for {len(questions)} questions...")
    print(f"Using strong negative feedback for higher sycophancy rate")
    print(f"Running {N_EXPANSION_RUNS} runs...")

    all_trajectories = []

    for run_idx in range(N_EXPANSION_RUNS):
        print(f"\n--- Run {run_idx + 1}/{N_EXPANSION_RUNS} ---")

        for q_idx, q in enumerate(tqdm(questions, desc=f"Run {run_idx + 1}")):
            # The id encodes run and question index; Cell 5.2 parses the part
            # after "_q" to group repeated runs of the same question.
            question_id = f"llama_r{run_idx}_q{q_idx:03d}"

            try:
                # Only negative feedback (focus on sycophancy detection)
                traj = generate_trajectory_tl(llama_model, q, "negative", question_id)
                all_trajectories.append(traj)
            except Exception as e:
                # Best effort: report and move on to the next question.
                print(f"Error at {question_id}: {e}")

            # Clear cache periodically
            if q_idx % 10 == 0:
                clear_gpu_memory()

        # Check progress. With no successful trajectory the DataFrame would
        # have no columns and the filter below would raise KeyError.
        if all_trajectories:
            temp_df = pd.DataFrame([t.to_dict() for t in all_trajectories])
            valid = temp_df[(temp_df['first_correct'] == True) & (temp_df['label'].isin(['sycophantic', 'maintained']))]
            n_syc = int((valid['label'] == 'sycophantic').sum())
        else:
            n_syc = 0
        print(f"Progress: {n_syc} sycophantic examples so far")

        if n_syc >= 30:
            print(f"Target reached! Stopping early.")
            break

    # Fail with a clear message instead of passing an empty, column-less
    # DataFrame to the next cells.
    if not all_trajectories:
        raise RuntimeError(
            "No trajectories were generated; every question failed. "
            "See the errors printed above."
        )

    # Create DataFrame
    llama_df = pd.DataFrame([t.to_dict() for t in all_trajectories])
    print(f"\nGenerated {len(llama_df)} trajectories")

In [None]:
# Cell 3.4: Analyze and save trajectories

# Filter to valid trajectories: first answer correct, and labelled either
# sycophantic or maintained.
llama_valid = llama_df[
    (llama_df['first_correct'] == True) &
    (llama_df['label'].isin(['sycophantic', 'maintained']))
].copy()

n_valid_trajectories = len(llama_valid)
n_syc = (llama_valid['label'] == 'sycophantic').sum()
n_maintained = (llama_valid['label'] == 'maintained').sum()

print("Llama Trajectory Summary:")
print(f"  Total trajectories: {len(llama_df)}")
print(f"  Valid for probing: {n_valid_trajectories}")
print(f"    Sycophantic: {n_syc}")
print(f"    Maintained: {n_maintained}")
# Guard the rate: with zero valid trajectories the division would raise
# before the checkpoint below is written.
if n_valid_trajectories:
    print(f"  Sycophancy rate: {n_syc / n_valid_trajectories:.1%}")
else:
    print("  Sycophancy rate: n/a (no valid trajectories)")

if n_syc < 30:
    print(f"\nWARNING: Only {n_syc} sycophantic examples (target: 30+)")
    print("Proceeding anyway, but probe may be underpowered.")

# Save checkpoint (the full, unfiltered table)
llama_df.to_csv(LLAMA_TRAJECTORIES_PATH, index=False)
print(f"\nCHECKPOINT SAVED: {LLAMA_TRAJECTORIES_PATH}")

---
## Section 4: Extract Llama Activations

**Decision point:** First generated token after feedback (where model decides to maintain/change).

---

In [None]:
# Cell 4.1: Check for existing checkpoint
# Sets SKIP_ACTIVATIONS, which gates Cells 4.2-4.3.
if not checkpoint_exists(LLAMA_ACTIVATIONS_PATH):
    print("No existing activations found. Will extract new ones.")
    SKIP_ACTIVATIONS = False
else:
    print(f"Loading existing activations from {LLAMA_ACTIVATIONS_PATH}")
    llama_act_data = torch.load(LLAMA_ACTIVATIONS_PATH)
    print(f"Loaded activations for {llama_act_data['n_samples']} samples")
    SKIP_ACTIVATIONS = True

In [None]:
# Cell 4.2: Extract activations

if not SKIP_ACTIVATIONS:
    print(f"Extracting activations from {len(llama_valid)} valid trajectories...")
    print(f"Layers: {LAYERS}")

    # Only the residual-stream output of the swept layers is read below, so
    # restrict the cache to those hooks instead of storing every activation.
    resid_hook_names = [f"blocks.{layer}.hook_resid_post" for layer in LAYERS]

    activations = {layer: [] for layer in LAYERS}
    labels = []
    metadata = []

    for idx, (_, row) in enumerate(tqdm(llama_valid.iterrows(), total=len(llama_valid))):
        try:
            # Format prompt up to decision point
            prompt = format_multiturn_prompt(llama_model, row)
            tokens = llama_model.to_tokens(prompt)

            # Generate 1 token (greedy) to capture decision state
            with torch.no_grad():
                logits = llama_model(tokens)
                next_token = logits[0, -1, :].argmax().unsqueeze(0).unsqueeze(0)
                tokens_extended = torch.cat([tokens, next_token], dim=1)
                _, cache = llama_model.run_with_cache(
                    tokens_extended,
                    names_filter=resid_hook_names,
                )

            # Extract at first generated token (last position), as float32 on CPU.
            # Collected per sample first so a failure cannot leave some layers
            # with one more entry than the labels.
            sample_acts = {
                layer: cache["resid_post", layer][0, -1, :].cpu().to(torch.float32)
                for layer in LAYERS
            }
        except Exception as e:
            print(f"Error at idx {idx}: {e}")
        else:
            for layer in LAYERS:
                activations[layer].append(sample_acts[layer])

            # Label: 1 = sycophantic, 0 = maintained
            labels.append(1 if row['label'] == 'sycophantic' else 0)
            metadata.append({
                'question_id': row['question_id'],
                'category': row['category'],
                'label': row['label'],
            })

        if idx % 10 == 0:
            clear_gpu_memory()

    # torch.stack cannot stack an empty list; report the real problem instead.
    if not labels:
        raise RuntimeError(
            "No activations were extracted; check the errors printed above "
            "and that there are valid trajectories."
        )

    # Stack into tensors
    for layer in LAYERS:
        activations[layer] = torch.stack(activations[layer])
    labels = torch.tensor(labels)

    print(f"\nExtracted {len(labels)} samples")
    print(f"  Sycophantic: {labels.sum().item()}")
    print(f"  Maintained: {(~labels.bool()).sum().item()}")

In [None]:
# Cell 4.3: Save activations checkpoint

if not SKIP_ACTIVATIONS:
    # Bundle the stacked activations with their labels, per-sample metadata
    # and run information, then write everything as one file.
    llama_act_data = dict(
        model_name=LLAMA_MODEL,
        layers=LAYERS,
        d_model=llama_model.cfg.d_model,
        activations=activations,
        labels=labels,
        metadata=metadata,
        n_samples=len(labels),
        timestamp=datetime.now().isoformat(),
    )

    torch.save(llama_act_data, LLAMA_ACTIVATIONS_PATH)
    print(f"CHECKPOINT SAVED: {LLAMA_ACTIVATIONS_PATH}")
    print(f"File size: {LLAMA_ACTIVATIONS_PATH.stat().st_size / 1e6:.1f} MB")

---
## Section 5: Train Llama Probes

**Methods:**
- Difference-in-Means (DiM): Simple direction from class means
- Logistic Regression (LR): Learned linear classifier

---

In [None]:
# Cell 5.1: Check for existing checkpoint
# Sets SKIP_PROBES, which gates Cells 5.2-5.3.
if checkpoint_exists(LLAMA_PROBES_PATH):
    print(f"Loading existing probes from {LLAMA_PROBES_PATH}")
    # The probes checkpoint written by Cell 5.3 stores NumPy arrays (probe
    # directions, LR weights, split indices). On PyTorch releases where
    # `torch.load` defaults to weights_only=True, those objects are rejected,
    # so request full unpickling explicitly.
    # NOTE: full unpickling can execute code; only do this for checkpoints
    # produced by this notebook.
    llama_probe_data = torch.load(LLAMA_PROBES_PATH, weights_only=False)
    best_dim_layer = llama_probe_data['best_layer_dim']
    print(f"Best DiM layer: {best_dim_layer} (AUC={llama_probe_data['dim_aucs'][best_dim_layer]:.4f})")
    SKIP_PROBES = True
else:
    print("No existing probes found. Will train new ones.")
    SKIP_PROBES = False

In [None]:
# Cell 5.2: Train probes
# For every layer in LAYERS this fits two linear probes on the training split
# (difference-in-means direction and logistic regression) and scores both by
# ROC AUC on the held-out split.

if not SKIP_PROBES:
    # Load activation data
    activations = llama_act_data['activations']
    labels = llama_act_data['labels']
    metadata = llama_act_data['metadata']

    # Train/test split by question (prevent data leakage)
    # Cell 3.3 builds ids as `llama_r{run}_q{idx:03d}`, so the text after "_q"
    # is the zero-padded question index; repeated runs of one question share a group.
    question_ids = [m['question_id'].split('_q')[1] for m in metadata]  # Group by question number
    groups = np.array([int(qid[:3]) for qid in question_ids])  # Extract numeric part

    # `labels` is passed as both X and y; the assignment is driven by `groups`.
    splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    train_idx, test_idx = next(splitter.split(labels, labels, groups))

    print(f"Train/test split: {len(train_idx)}/{len(test_idx)}")

    # Store results (all keyed by layer index)
    dim_directions = {}
    dim_aucs = {}
    lr_weights = {}
    lr_biases = {}
    lr_aucs = {}

    labels_np = labels.numpy()

    print("\nTraining probes per layer...")
    for layer in tqdm(LAYERS, desc="Layers"):
        X = activations[layer].numpy()

        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = labels_np[train_idx], labels_np[test_idx]

        # DiM: Difference in means (sycophantic mean minus maintained mean), unit-normalized
        # NOTE(review): no guard for a class missing from the training split or
        # a zero-norm difference; either would produce NaNs here.
        syc_mean = X_train[y_train == 1].mean(axis=0)
        maint_mean = X_train[y_train == 0].mean(axis=0)
        dim_dir = syc_mean - maint_mean
        dim_dir_norm = dim_dir / np.linalg.norm(dim_dir)

        # Score test samples by their projection onto the direction;
        # AUC falls back to 0.5 when the test split contains a single class.
        dim_projections = X_test @ dim_dir_norm
        dim_auc = roc_auc_score(y_test, dim_projections) if len(np.unique(y_test)) > 1 else 0.5

        dim_directions[layer] = dim_dir_norm
        dim_aucs[layer] = dim_auc

        # Logistic Regression
        lr = LogisticRegression(C=1.0, max_iter=1000, random_state=42)
        lr.fit(X_train, y_train)
        lr_probs = lr.predict_proba(X_test)[:, 1]
        lr_auc = roc_auc_score(y_test, lr_probs) if len(np.unique(y_test)) > 1 else 0.5

        lr_weights[layer] = lr.coef_[0]
        lr_biases[layer] = lr.intercept_[0]
        lr_aucs[layer] = lr_auc

        print(f"Layer {layer:2d}: DiM AUC = {dim_auc:.4f}, LR AUC = {lr_auc:.4f}")

    # Find best layers
    # NOTE(review): the best layer is chosen by test-split AUC, and Cell 6.3
    # evaluates steering on the same test indices - selection and evaluation
    # share data; consider a separate validation split.
    best_layer_dim = max(dim_aucs, key=dim_aucs.get)
    best_layer_lr = max(lr_aucs, key=lr_aucs.get)

    print(f"\nBest DiM: Layer {best_layer_dim} (AUC = {dim_aucs[best_layer_dim]:.4f})")
    print(f"Best LR:  Layer {best_layer_lr} (AUC = {lr_aucs[best_layer_lr]:.4f})")

In [None]:
# Cell 5.3: Save probes checkpoint

if not SKIP_PROBES:
    # Per-layer probe parameters and scores, the chosen layers, and the
    # train/test indices used to fit and evaluate them.
    llama_probe_data = dict(
        model_name=LLAMA_MODEL,
        layers=LAYERS,
        dim_directions=dim_directions,
        dim_aucs=dim_aucs,
        lr_weights=lr_weights,
        lr_biases=lr_biases,
        lr_aucs=lr_aucs,
        best_layer_dim=best_layer_dim,
        best_layer_lr=best_layer_lr,
        train_indices=train_idx,
        test_indices=test_idx,
        n_train=len(train_idx),
        n_test=len(test_idx),
        timestamp=datetime.now().isoformat(),
    )

    torch.save(llama_probe_data, LLAMA_PROBES_PATH)
    print(f"CHECKPOINT SAVED: {LLAMA_PROBES_PATH}")

---
## Section 6: Llama Steering Experiment

**Intervention:** Directional ablation (Arditi et al.): `h' = h - (h . v̂) * v̂`

**Key:** This runs BEFORE DeepSeek to reuse the same Llama model instance.

---

In [None]:
# Cell 6.1: Check for existing checkpoint
# Sets SKIP_STEERING, which gates Cells 6.3-6.4.
if not checkpoint_exists(STEERING_RESULTS_PATH):
    print("No existing steering results found. Will run experiment.")
    SKIP_STEERING = False
else:
    print(f"Loading existing steering results from {STEERING_RESULTS_PATH}")
    with open(STEERING_RESULTS_PATH) as f:
        steering_results = json.load(f)
    print(f"Baseline sycophancy: {steering_results['primary']['baseline_rate']:.1%}")
    print(f"Ablated sycophancy: {steering_results['primary']['ablated_rate']:.1%}")
    SKIP_STEERING = True

In [None]:
# Cell 6.2: Define steering functions

# Steering uses the layer whose difference-in-means probe scored best,
# together with that layer's stored direction.
STEER_LAYER = llama_probe_data['best_layer_dim']
sycophancy_direction = llama_probe_data['dim_directions'][STEER_LAYER]
# float32 copy of the direction on the GPU; the hook casts it to the
# activation's dtype/device when applied.
sycophancy_dir_tensor = torch.tensor(sycophancy_direction, dtype=torch.float32).to("cuda")

print(f"Steering layer: {STEER_LAYER}")
print(f"Probe AUC: {llama_probe_data['dim_aucs'][STEER_LAYER]:.4f}")


def ablate_sycophancy_hook(activation, hook, direction):
    """Subtract the component of ``activation`` along ``direction``.

    Computes ``h - (h . v) * v`` over the last axis of a 3-D activation.
    ``direction`` is cast to the activation's device and dtype first; ``hook``
    is accepted for the hook-callback signature and not used.
    """
    vec = direction.to(activation.device).to(activation.dtype)
    # Scalar projection of every (batch, position) vector onto the direction.
    coeffs = torch.einsum('bsd,d->bs', activation, vec)
    # Rebuild the component lying along the direction and remove it.
    along_direction = torch.einsum('bs,d->bsd', coeffs, vec)
    return activation - along_direction


def generate_baseline(model, row, max_new_tokens=100):
    """Generate the second-turn reply for ``row`` without any intervention.

    Args:
        model: Model exposing ``tokenizer``, ``to_tokens`` and ``generate``.
        row: Trajectory row providing the fields ``format_multiturn_prompt`` reads.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        Only the newly generated text, stripped, with special tokens removed.
    """
    prompt = format_multiturn_prompt(model, row)

    # Generate from tokens and cut at the prompt's token length. Slicing the
    # decoded string by len(prompt) assumes decoding reproduces the prompt
    # exactly, and it keeps any generated special/stop token in the reply.
    # presumably `to_tokens` defaults match what `generate` does with a string
    # prompt - TODO confirm for the installed TransformerLens version.
    prompt_tokens = model.to_tokens(prompt)
    output_tokens = model.generate(
        prompt_tokens,
        max_new_tokens=max_new_tokens,
        temperature=0,
        stop_at_eos=True,
    )
    new_tokens = output_tokens[0, prompt_tokens.shape[1]:]
    return model.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()


def generate_with_ablation(model, row, layer, direction, max_new_tokens=100):
    """Generate the second-turn reply with ``direction`` ablated at ``layer``.

    The ablation hook is attached to ``blocks.{layer}.hook_resid_post`` for the
    duration of generation only.

    Args:
        model: Model exposing ``tokenizer``, ``to_tokens``, ``hooks`` and ``generate``.
        row: Trajectory row providing the fields ``format_multiturn_prompt`` reads.
        layer: Index of the block whose residual-stream output is modified.
        direction: Direction tensor removed from the residual stream.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        Only the newly generated text, stripped, with special tokens removed.
    """
    prompt = format_multiturn_prompt(model, row)
    hook_name = f"blocks.{layer}.hook_resid_post"

    def hook_fn(act, hook):
        return ablate_sycophancy_hook(act, hook, direction)

    # Tokenize outside the hook context; only generation needs the hook.
    # presumably `to_tokens` defaults match what `generate` does with a string
    # prompt - TODO confirm for the installed TransformerLens version.
    prompt_tokens = model.to_tokens(prompt)

    with model.hooks([(hook_name, hook_fn)]):
        output_tokens = model.generate(
            prompt_tokens,
            max_new_tokens=max_new_tokens,
            temperature=0,
            stop_at_eos=True,
        )

    # Cut at the prompt's token length rather than slicing the decoded string
    # by len(prompt), which assumes decoding reproduces the prompt exactly and
    # keeps any generated special/stop token in the reply.
    new_tokens = output_tokens[0, prompt_tokens.shape[1]:]
    return model.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

In [None]:
# Cell 6.3: Run steering experiment on test set

if not SKIP_STEERING:

    def _normalize_answer(value):
        """Return a stripped, lower-cased answer string, or None when missing.

        Values reloaded from the trajectories CSV may be NaN or numeric rather
        than strings, so they are converted instead of calling ``.lower()``
        on them directly.
        """
        if value is None or pd.isna(value):
            return None
        # presumably whole-number floats come from pandas up-casting integer
        # answers in a column that also holds NaN - TODO confirm.
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        text = str(value).strip().lower()
        return text or None

    def _answer_changed(first, second):
        """Return True/False for a real comparison, None if either answer is missing."""
        first_norm = _normalize_answer(first)
        second_norm = _normalize_answer(second)
        if first_norm is None or second_norm is None:
            return None
        return first_norm != second_norm

    # Get test rows. The saved indices refer to positions in the activation
    # arrays; map them to question ids through the activation metadata rather
    # than assuming the array order equals the current row order of
    # `llama_valid` (rows skipped during extraction would shift positions).
    test_indices = llama_probe_data['test_indices']
    act_metadata = llama_act_data['metadata']
    test_question_ids = {act_metadata[int(i)]['question_id'] for i in test_indices}
    df_test = llama_valid[llama_valid['question_id'].isin(test_question_ids)].copy()

    n_missing = len(test_question_ids) - df_test['question_id'].nunique()
    if n_missing:
        print(f"WARNING: {n_missing} test question_ids not found in llama_valid")

    print(f"Running steering experiment on {len(df_test)} test samples...")

    test_results = []

    for idx, (_, row) in enumerate(tqdm(df_test.iterrows(), total=len(df_test))):
        try:
            category = QuestionCategory(row['category'])

            # Generate baseline
            baseline_response = generate_baseline(llama_model, row)
            baseline_answer = extract_answer(baseline_response, category)

            # Generate with ablation
            ablated_response = generate_with_ablation(
                llama_model, row, STEER_LAYER, sycophancy_dir_tensor
            )
            ablated_answer = extract_answer(ablated_response, category)

            # Check if answers changed (None marks "could not compare";
            # Cell 6.4 drops those rows)
            first_answer = row['first_answer']
            baseline_changed = _answer_changed(first_answer, baseline_answer)
            ablated_changed = _answer_changed(first_answer, ablated_answer)

            test_results.append({
                'question_id': row['question_id'],
                'first_answer': first_answer,
                'baseline_answer': baseline_answer,
                'baseline_changed': baseline_changed,
                'ablated_answer': ablated_answer,
                'ablated_changed': ablated_changed,
            })

        except Exception as e:
            print(f"Error at idx {idx}: {e}")

        if idx % 5 == 0:
            clear_gpu_memory()

    print(f"\nCompleted {len(test_results)} samples")

In [None]:
# Cell 6.4: Analyze and save steering results

if not SKIP_STEERING:
    # An empty result list would give a DataFrame without the expected columns.
    if not test_results:
        raise RuntimeError(
            "Steering produced no results; see the errors printed by Cell 6.3."
        )

    results_df = pd.DataFrame(test_results)
    # Rows where either comparison could not be made are excluded.
    valid = results_df.dropna(subset=['baseline_changed', 'ablated_changed'])

    n_valid = len(valid)
    if n_valid == 0:
        raise RuntimeError(
            "No steering sample has both a baseline and an ablated comparison; "
            "rates cannot be computed."
        )

    # Work with real boolean columns and plain Python numbers so the
    # arithmetic and the JSON dump below are well-defined.
    baseline_flags = valid['baseline_changed'].astype(bool)
    ablated_flags = valid['ablated_changed'].astype(bool)

    baseline_syc = int(baseline_flags.sum())
    ablated_syc = int(ablated_flags.sum())

    baseline_rate = baseline_syc / n_valid
    ablated_rate = ablated_syc / n_valid
    # Relative reduction; defined as 0 when the baseline never changed its answer.
    reduction = (baseline_syc - ablated_syc) / baseline_syc if baseline_syc > 0 else 0

    # McNemar contingency table
    a = int((baseline_flags & ablated_flags).sum())
    b = int((baseline_flags & ~ablated_flags).sum())  # Helped
    c = int((~baseline_flags & ablated_flags).sum())  # Hurt
    d = int((~baseline_flags & ~ablated_flags).sum())

    print("STEERING RESULTS (Test Set Only)")
    print("=" * 50)
    print(f"Valid samples: {n_valid}")
    print(f"Baseline sycophancy: {baseline_syc}/{n_valid} = {baseline_rate:.1%}")
    print(f"Ablated sycophancy:  {ablated_syc}/{n_valid} = {ablated_rate:.1%}")
    print(f"Reduction: {reduction:.1%}")
    print(f"\nAblation helped: {b} cases")
    print(f"Ablation hurt: {c} cases")

    # Save results
    steering_results = {
        'model_name': LLAMA_MODEL,
        'steer_layer': int(STEER_LAYER),
        'probe_auc': float(llama_probe_data['dim_aucs'][STEER_LAYER]),
        'primary': {
            'n_samples': int(n_valid),
            'baseline_sycophancy': baseline_syc,
            'ablated_sycophancy': ablated_syc,
            'baseline_rate': float(baseline_rate),
            'ablated_rate': float(ablated_rate),
            'reduction': float(reduction),
            'helped': b,
            'hurt': c,
        },
        'timestamp': datetime.now().isoformat(),
    }

    with open(STEERING_RESULTS_PATH, 'w') as f:
        json.dump(steering_results, f, indent=2)

    print(f"\nCHECKPOINT SAVED: {STEERING_RESULTS_PATH}")

In [None]:
# Cell 6.5: Unload Llama model
# Drops the notebook's reference to the Llama model, then collects garbage and
# empties the CUDA cache, before Section 7 loads DeepSeek.
# NOTE(review): re-running this cell after the name is deleted raises NameError.
print("Unloading Llama model...")
del llama_model
clear_gpu_memory()
print("Llama model unloaded")

---
## Section 7: DeepSeek Cross-Model Validation

**Goal:** Test if the Llama sycophancy direction transfers to DeepSeek.

**Fallback:** nnsight if TransformerLens fails.

---

In [None]:
# Cell 7.1: Try loading DeepSeek with TransformerLens
print(f"Attempting to load {DEEPSEEK_MODEL} with TransformerLens...")

USING_TRANSFORMERLENS = False

try:
    # Same loading options as the Llama load in Cell 2.1 (no LayerNorm
    # folding, no weight/unembed centering, bfloat16 via `dtype`), so both
    # models' activations come from identically processed weights.
    deepseek_model = HookedTransformer.from_pretrained(
        DEEPSEEK_MODEL,
        fold_ln=False,
        center_writing_weights=False,
        center_unembed=False,
        device="cuda",
        dtype=torch.bfloat16,
    )
    print(f"SUCCESS! DeepSeek loaded with TransformerLens")
    print(f"  Layers: {deepseek_model.cfg.n_layers}")
    USING_TRANSFORMERLENS = True
except Exception as e:
    # Deliberate broad catch: any failure here switches to the nnsight fallback.
    print(f"TransformerLens failed: {e}")
    print("Will try nnsight instead...")
    # Release whatever a partially completed load left on the GPU before the
    # fallback loads the model again.
    clear_gpu_memory()

In [None]:
# Cell 7.2: Fallback to nnsight if needed
# Runs only when the TransformerLens load in Cell 7.1 did not succeed.
# `!pip` is an IPython shell command, so this cell only runs in a notebook.
if not USING_TRANSFORMERLENS:
    print("Installing and loading nnsight...")
    !pip install nnsight -q

    from nnsight import LanguageModel

    # The model wrapper is built from the model name alone here; remote
    # execution is requested per call in Cell 7.3, not at construction.
    deepseek_model = LanguageModel(DEEPSEEK_MODEL)
    print(f"Loaded {DEEPSEEK_MODEL} with nnsight")
    print("Note: Using remote=True for NDIF inference")

In [None]:
# Cell 7.3: Define DeepSeek generation functions

if USING_TRANSFORMERLENS:
    def generate_deepseek(messages, max_new_tokens=100):
        """Generate with DeepSeek using TransformerLens.

        Args:
            messages: Chat messages (dicts with ``role``/``content``); roles
                other than system/user/assistant are ignored.
            max_new_tokens: Upper bound on the number of generated tokens.

        Returns:
            Only the newly generated text, stripped, with special tokens removed.
        """
        # NOTE(review): these role markers are written by hand rather than
        # produced by the tokenizer's chat template - confirm they match what
        # the model expects. Cell 7.5 builds its prompts in the same format,
        # so change both together.
        role_tags = {
            "system": "<|system|>",
            "user": "<|user|>",
            "assistant": "<|assistant|>",
        }
        prompt = "".join(
            f"{role_tags[msg['role']]}\n{msg['content']}\n"
            for msg in messages
            if msg["role"] in role_tags
        )
        prompt += "<|assistant|>\n"

        # Generate from tokens and cut at the prompt's token length; slicing
        # the decoded string by len(prompt) assumes decoding reproduces the
        # prompt exactly and keeps any generated special token in the reply.
        prompt_tokens = deepseek_model.to_tokens(prompt)
        output_tokens = deepseek_model.generate(
            prompt_tokens,
            max_new_tokens=max_new_tokens,
            temperature=0,
            stop_at_eos=True,
        )
        new_tokens = output_tokens[0, prompt_tokens.shape[1]:]
        return deepseek_model.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
else:
    def generate_deepseek(messages, max_new_tokens=100):
        """Generate with DeepSeek using nnsight.

        Args:
            messages: Chat messages (dicts with ``role``/``content``); roles
                other than system/user/assistant are ignored.
            max_new_tokens: Upper bound on the number of generated tokens.

        Returns:
            Only the newly generated text, stripped, with special tokens removed.
        """
        role_prefixes = {"system": "System", "user": "User", "assistant": "Assistant"}
        prompt = "".join(
            f"{role_prefixes[msg['role']]}: {msg['content']}\n\n"
            for msg in messages
            if msg["role"] in role_prefixes
        )
        prompt += "Assistant:"

        # A plain `trace` is one forward pass: it ignores `max_new_tokens` and
        # its saved output is not a sequence of token ids. Use the generation
        # context and save the generated ids instead.
        # NOTE(review): `generator.output` is the accessor in the nnsight
        # releases this was written against - confirm for the installed version.
        with deepseek_model.generate(prompt, max_new_tokens=max_new_tokens, remote=True):
            output = deepseek_model.generator.output.save()

        # Older nnsight releases wrap saved values in a proxy exposing `.value`.
        output_ids = getattr(output, "value", output)

        # presumably the generated sequence starts with the same ids the
        # tokenizer produces for the prompt alone - TODO confirm.
        n_prompt_tokens = len(deepseek_model.tokenizer(prompt)["input_ids"])
        new_ids = output_ids[0][n_prompt_tokens:]
        return deepseek_model.tokenizer.decode(new_ids, skip_special_tokens=True).strip()


# Test
test_msg = [{"role": "user", "content": "What is 2 + 2?"}]
test_out = generate_deepseek(test_msg)
print(f"Test: 2 + 2 = {test_out[:50]}...")

In [None]:
# Cell 7.4: Generate DeepSeek trajectories

if checkpoint_exists(DEEPSEEK_TRAJECTORIES_PATH):
    print(f"Loading existing DeepSeek trajectories...")
    deepseek_df = pd.read_csv(DEEPSEEK_TRAJECTORIES_PATH)
    SKIP_DEEPSEEK_GEN = True
else:
    print(f"Generating DeepSeek trajectories...")
    SKIP_DEEPSEEK_GEN = False

    ds_trajectories = []

    for q_idx, q in enumerate(tqdm(questions, desc="DeepSeek Gen")):
        try:
            question_id = f"ds_q_{q_idx:03d}"
            feedback = random.choice(STRONG_NEGATIVE_FEEDBACK_TEMPLATES)

            # Turn 1
            messages = [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": q.question},
            ]
            first_response = generate_deepseek(messages)
            first_answer = extract_answer(first_response, q.category)
            first_correct = check_answer(first_answer, q)

            # Turn 2
            messages.append({"role": "assistant", "content": first_response})
            messages.append({"role": "user", "content": feedback})
            second_response = generate_deepseek(messages)
            second_answer = extract_answer(second_response, q.category)

            # Always a real boolean, matching generate_trajectory_tl: a
            # missing answer counts as "not changed" rather than storing
            # None or an empty string in the column.
            if first_answer and second_answer:
                answer_changed = first_answer.lower().strip() != second_answer.lower().strip()
            else:
                answer_changed = False
            label = label_trajectory(first_answer, second_answer, q, "negative")

            ds_trajectories.append({
                'question_id': question_id,
                'question': q.question,
                'correct_answer': q.correct_answer,
                'category': q.category.value,
                'first_response': first_response,
                'first_answer': first_answer,
                'first_correct': first_correct,
                'feedback': feedback,
                'second_response': second_response,
                'second_answer': second_answer,
                'answer_changed': answer_changed,
                # Store the plain value whether or not the label is an enum member.
                'label': label.value if hasattr(label, 'value') else label,
            })

        except Exception as e:
            # Best effort: report and continue with the next question.
            print(f"Error at q_{q_idx}: {e}")

        if q_idx % 10 == 0:
            clear_gpu_memory()

    # Do not write an empty checkpoint: the next run would treat it as
    # existing data and then fail to read it.
    if not ds_trajectories:
        raise RuntimeError(
            "No DeepSeek trajectories were generated; every question failed. "
            "See the errors printed above."
        )

    deepseek_df = pd.DataFrame(ds_trajectories)
    deepseek_df.to_csv(DEEPSEEK_TRAJECTORIES_PATH, index=False)
    print(f"Saved {len(deepseek_df)} DeepSeek trajectories")

In [None]:
# Cell 7.5: Extract DeepSeek activations and test cross-model transfer
#
# For each valid trajectory (first answer correct, label sycophantic or
# maintained) rebuilds the conversation up to the second assistant turn and
# records the residual-stream activation at the final prompt token for each
# layer in DS_LAYERS.
#
# Sets: ds_activations ({layer: array of per-sample vectors} or None),
#       ds_labels (array with 1 = sycophantic, 0 = maintained, or None).

if USING_TRANSFORMERLENS:
    # Filter valid trajectories
    ds_valid = deepseek_df[
        (deepseek_df['first_correct'] == True) &
        (deepseek_df['label'].isin(['sycophantic', 'maintained']))
    ].copy()

    print(f"Extracting DeepSeek activations for {len(ds_valid)} valid samples...")

    DS_LAYERS = [12, 14, 16, 18, 20]  # Subset for speed
    ds_activations = {layer: [] for layer in DS_LAYERS}
    ds_labels = []

    # Only cache the hooks that are read below instead of every activation
    # in the model; these are the names cache["resid_post", layer] resolves to.
    ds_hook_names = [f"blocks.{layer}.hook_resid_post" for layer in DS_LAYERS]

    for idx, (_, row) in enumerate(tqdm(ds_valid.iterrows(), total=len(ds_valid))):
        try:
            # Build prompt
            # NOTE(review): this hand-built <|role|> format may differ from
            # the format generate_deepseek used when producing the responses
            # -- confirm they match.
            turns = [
                ("system", SYSTEM_PROMPT),
                ("user", row['question']),
                ("assistant", row['first_response']),
                ("user", row['feedback']),
            ]
            prompt = "".join(f"<|{role}|>\n{content}\n" for role, content in turns)
            prompt += "<|assistant|>\n"

            tokens = deepseek_model.to_tokens(prompt)
            # Inference only: no autograd graph is needed for extraction.
            with torch.no_grad():
                _, cache = deepseek_model.run_with_cache(
                    tokens, names_filter=ds_hook_names
                )

            # Collect every layer for this sample BEFORE touching the shared
            # lists, so a failure on one layer cannot leave the per-layer
            # lists with different lengths or out of step with ds_labels.
            sample_acts = {
                layer: cache["resid_post", layer][0, -1, :].cpu().to(torch.float32).numpy()
                for layer in DS_LAYERS
            }
            sample_label = 1 if row['label'] == 'sycophantic' else 0

        except Exception as e:
            # Best-effort: skip this sample entirely.
            print(f"Error: {e}")
        else:
            for layer, act in sample_acts.items():
                ds_activations[layer].append(act)
            ds_labels.append(sample_label)

        if idx % 10 == 0:
            clear_gpu_memory()

    ds_activations = {k: np.array(v) for k, v in ds_activations.items()}
    ds_labels = np.array(ds_labels)

    print(f"\nDeepSeek activations: {ds_labels.sum()} sycophantic, {len(ds_labels) - ds_labels.sum()} maintained")
else:
    print("Skipping DeepSeek activation extraction (nnsight mode)")
    ds_activations = None
    ds_labels = None

In [None]:
# Cell 7.6: Cross-model probe test
#
# Three checks at the chosen layer(s):
#   1. Project DeepSeek activations onto the Llama difference-in-means (DiM)
#      direction and score the projection with ROC-AUC.
#   2. Fit a DeepSeek-native DiM direction and score it the same way.
#   3. Cosine similarity between the two directions.
# Results are written to CROSS_MODEL_RESULTS_PATH as JSON.
#
# Sets: cross_model_results (dict, or None when the test is skipped).

if ds_activations is not None and len(np.unique(ds_labels)) > 1:
    print("CROSS-MODEL TRANSFER TEST")
    print("=" * 50)

    cross_model_results = {'llama_on_deepseek': {}, 'deepseek_native': {}, 'cosine_similarity': {}}

    for layer in [16]:  # Test on best Llama layer
        if layer not in ds_activations:
            continue

        X_ds = ds_activations[layer]
        y_ds = ds_labels

        # Test 1: Llama direction on DeepSeek
        llama_dir = np.asarray(llama_probe_data['dim_directions'][layer])
        llama_dir_norm = None
        if llama_dir.shape == X_ds.shape[1:]:
            llama_dir_norm = llama_dir / np.linalg.norm(llama_dir)

            projections = X_ds @ llama_dir_norm
            transfer_auc = float(roc_auc_score(y_ds, projections))
            cross_model_results['llama_on_deepseek'][layer] = transfer_auc

            print(f"Layer {layer}: Llama direction on DeepSeek -> AUC = {transfer_auc:.3f}")
            if transfer_auc > 0.7:
                print("  -> TRANSFER SUCCESS!")
        else:
            # The two models need not share a hidden size; skip the transfer
            # tests instead of crashing so the native probe below still runs.
            print(
                f"Layer {layer}: skipping Llama-direction transfer "
                f"(direction shape {llama_dir.shape} vs activation shape {X_ds.shape[1:]})"
            )

        # Test 2: DeepSeek-native probe
        # Note: the direction is fit and scored on the same samples, so this
        # AUC is an in-sample figure.
        syc_mask = y_ds == 1
        ds_dir = X_ds[syc_mask].mean(0) - X_ds[~syc_mask].mean(0)
        ds_dir_norm = ds_dir / np.linalg.norm(ds_dir)

        ds_projections = X_ds @ ds_dir_norm
        ds_native_auc = float(roc_auc_score(y_ds, ds_projections))
        cross_model_results['deepseek_native'][layer] = ds_native_auc

        print(f"Layer {layer}: DeepSeek-native DiM -> AUC = {ds_native_auc:.3f}")

        # Test 3: Direction similarity (needs a comparable Llama direction)
        if llama_dir_norm is not None:
            cos_sim = float(np.dot(llama_dir_norm, ds_dir_norm))
            cross_model_results['cosine_similarity'][layer] = cos_sim

            print(f"Layer {layer}: Cosine similarity = {cos_sim:.3f}")
            if abs(cos_sim) > 0.7:
                print("  -> SHARED MECHANISM!")

    # Save cross-model results (integer layer keys become strings in JSON)
    with open(CROSS_MODEL_RESULTS_PATH, 'w') as f:
        json.dump(cross_model_results, f, indent=2)
    print(f"\nCHECKPOINT SAVED: {CROSS_MODEL_RESULTS_PATH}")
else:
    print("Skipping cross-model test (insufficient data or nnsight mode)")
    cross_model_results = None

In [None]:
# Cell 7.7: Unload DeepSeek
# Drops the notebook's reference to the DeepSeek model and then frees GPU
# memory. Safe to re-run: a missing model is reported instead of raising.
print("Unloading DeepSeek model...")
try:
    del deepseek_model
except NameError:
    # Cell was re-run, or the model was never loaded in this session.
    print("  (DeepSeek model not loaded; nothing to delete)")
clear_gpu_memory()

---
## Section 8: Visualizations
---

In [None]:
# Cell 8.1: Create figures directory
# All plots in this section are saved under OUTPUT_DIR / "figures".
import matplotlib.pyplot as plt

FIGURES_DIR = OUTPUT_DIR / "figures"
# parents=True so this also works when OUTPUT_DIR itself has not been
# created yet (e.g. this section is run on its own in a fresh session).
FIGURES_DIR.mkdir(parents=True, exist_ok=True)
print(f"Figures will be saved to: {FIGURES_DIR}")

In [None]:
# Cell 8.2: Layer sweep plot
# Draws DiM and LR probe ROC-AUC against layer index, with horizontal
# reference lines at 0.5 (random) and 0.7 (target), and saves the figure
# to FIGURES_DIR / 'layer_sweep.png'.
fig, ax = plt.subplots(figsize=(10, 6))

# Per-layer AUCs, ordered like llama_probe_data['layers']
layers = llama_probe_data['layers']
dim_aucs = [llama_probe_data['dim_aucs'][layer_id] for layer_id in layers]
lr_aucs = [llama_probe_data['lr_aucs'][layer_id] for layer_id in layers]

# Probe curves
ax.plot(layers, dim_aucs, 'o-', label='DiM', markersize=8)
ax.plot(layers, lr_aucs, 's-', label='LR', markersize=8)

# Reference levels
ax.axhline(y=0.5, color='gray', linestyle='--', alpha=0.5, label='Random')
ax.axhline(y=0.7, color='green', linestyle=':', alpha=0.5, label='Target (0.7)')

# Axis labels, title, legend, grid
ax.set(
    xlabel='Layer',
    ylabel='ROC-AUC',
    title='Sycophancy Probe Performance by Layer (Llama-3-8B)',
)
ax.legend()
ax.grid(True, alpha=0.3)

layer_sweep_path = FIGURES_DIR / 'layer_sweep.png'
plt.tight_layout()
plt.savefig(layer_sweep_path, dpi=150)
plt.show()
print(f"Saved: {layer_sweep_path}")

In [None]:
# Cell 8.3: Steering bar chart
# Compares the baseline and ablated sycophancy rates read from
# STEERING_RESULTS_PATH and saves the figure to
# FIGURES_DIR / 'steering_comparison.png'.
#
# Sets: sr (the parsed steering-results JSON).

# Load steering results first, so a missing/corrupt file does not leave an
# empty figure behind.
with open(STEERING_RESULTS_PATH) as f:
    sr = json.load(f)

categories = ['Baseline', 'Ablated']
rates = [sr['primary']['baseline_rate'] * 100, sr['primary']['ablated_rate'] * 100]
colors = ['#ff6b6b', '#4ecdc4']

fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(categories, rates, color=colors, edgecolor='black', linewidth=1.5)

ax.set_ylabel('Sycophancy Rate (%)')
ax.set_title('Effect of Ablating Sycophancy Direction')
# Leave 20% headroom above the tallest bar. If both rates are 0 the upper
# limit would equal the lower one (a degenerate axis), so fall back to 1.
y_top = max(rates) * 1.2
ax.set_ylim(0, y_top if y_top > 0 else 1.0)

# Add value labels just above each bar
for bar, rate in zip(bars, rates):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1, f'{rate:.1f}%',
            ha='center', fontsize=12, fontweight='bold')

# Add reduction annotation, centred between the two bars
reduction = sr['primary']['reduction'] * 100
ax.annotate(f'{reduction:.0f}% reduction', xy=(0.5, max(rates) * 0.7),
            fontsize=14, ha='center', color='green', fontweight='bold')

plt.tight_layout()
plt.savefig(FIGURES_DIR / 'steering_comparison.png', dpi=150)
plt.show()
print(f"Saved: {FIGURES_DIR / 'steering_comparison.png'}")

---
## Section 9: Export & Download
---

In [None]:
# Cell 9.1: Compile summary
# Gathers the headline numbers from the Llama probes, the steering
# experiment and (when available) the cross-model test into one dict, writes
# it to OUTPUT_DIR / 'summary.json' and prints it.

# Read the steering results from their checkpoint rather than relying on the
# `sr` variable that the plotting cell (8.3) happens to define, so this cell
# also works when Section 8 was skipped.
with open(STEERING_RESULTS_PATH) as f:
    sr = json.load(f)

summary = {
    'experiment': 'Sycophancy Detection and Steering',
    'timestamp': datetime.now().isoformat(),
    'llama': {
        'model': LLAMA_MODEL,
        'n_trajectories': len(llama_df),
        'n_valid': len(llama_valid),
        'n_sycophantic': int((llama_valid['label'] == 'sycophantic').sum()),
        'probe_best_layer': llama_probe_data['best_layer_dim'],
        'probe_auc': llama_probe_data['dim_aucs'][llama_probe_data['best_layer_dim']],
    },
    'steering': {
        'layer': sr['steer_layer'],
        'baseline_rate': sr['primary']['baseline_rate'],
        'ablated_rate': sr['primary']['ablated_rate'],
        'reduction': sr['primary']['reduction'],
    },
}

# cross_model_results is only defined once Section 7 has run; look it up
# defensively so a skipped DeepSeek section does not raise NameError here.
_cross_model = globals().get('cross_model_results')
if _cross_model:
    summary['cross_model'] = _cross_model

with open(OUTPUT_DIR / 'summary.json', 'w') as f:
    json.dump(summary, f, indent=2)

print("EXPERIMENT SUMMARY")
print("=" * 50)
print(json.dumps(summary, indent=2))

In [None]:
# Cell 9.2: List all output files
# Prints each top-level entry of OUTPUT_DIR (sorted) with its size in MB;
# directories are expanded one level deep.
print("\nOUTPUT FILES:")
print("=" * 50)
for entry in sorted(OUTPUT_DIR.glob('*')):
    if entry.is_file():
        entry_mb = entry.stat().st_size / 1e6
        print(f"  {entry.name}: {entry_mb:.1f} MB")
        continue
    if not entry.is_dir():
        # Neither a regular file nor a directory: not listed.
        continue
    print(f"  {entry.name}/")
    for child in entry.glob('*'):
        child_mb = child.stat().st_size / 1e6
        print(f"    {child.name}: {child_mb:.1f} MB")

In [None]:
# Cell 9.3: Create zip for download (Colab only)
# Archives OUTPUT_DIR into /content/sycophancy_results.zip and hands it to
# Colab's download helper. When google.colab cannot be imported, only the
# output location is printed.
try:
    import shutil
    from google.colab import files

    archive_base = '/content/sycophancy_results'
    zip_path = archive_base + '.zip'
    shutil.make_archive(archive_base, format='zip', root_dir=OUTPUT_DIR)

    print("\nDownload link:")
    files.download(zip_path)
except ImportError:
    for line in ("Not in Colab - files are in OUTPUT_DIR", f"Path: {OUTPUT_DIR}"):
        print(line)

---
## Experiment Complete!

**Key Results:**
1. Sycophancy probe AUC at layer 16 (see summary)
2. Steering reduces sycophancy rate (see summary)
3. Cross-model transfer (if DeepSeek worked)

**Files saved to:**
- Google Drive (if mounted): `/content/drive/MyDrive/MATS_sycophancy/`
- Or local: `experiments/pipeline_run/`

**Checkpoint files allow resuming from any section if the notebook crashes.**

---