# Cross-Model Sycophancy Validation: DeepSeek

**Goal:** Test whether the sycophancy direction extracted from Llama-3-8B transfers to DeepSeek.

**Options:**
1. TransformerLens with DeepSeek-R1-Distill-Llama-8B (Llama-based, may work)
2. nnsight with NDIF (explicitly supports DeepSeek-R1)

**Strategy:** Try TransformerLens first, fall back to nnsight if needed

In [None]:
# Cell 0: Setup
import os

# Clone repo if needed
if not os.path.exists('/content/IPHR_Direction'):
    !git clone https://github.com/nawidayima/IPHR_Direction.git
    %cd /content/IPHR_Direction
else:
    %cd /content/IPHR_Direction
    !git pull

!pip install torch transformers accelerate pandas tqdm transformer_lens -q
!pip install -e . -q

print("Restart runtime if first run, then skip to Cell 1")

In [None]:
# Cell 1: Imports
import torch
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm.auto import tqdm
import random

# Pick the compute device: GPU when CUDA is visible, CPU otherwise.
has_gpu = torch.cuda.is_available()
device = "cuda" if has_gpu else "cpu"
print(f"Device: {device}")
if has_gpu:
    # Report which GPU was found (device index 0).
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## Option 1: Try TransformerLens with DeepSeek-R1-Distill-Llama

In [None]:
# Cell 2: Test TransformerLens compatibility
from transformer_lens import HookedTransformer

# DeepSeek-R1-Distill-Llama-8B uses Llama architecture
MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"

print(
    f"Attempting to load {MODEL_NAME} with TransformerLens...",
    "This may take a few minutes...",
    sep="\n",
)

try:
    # Loading options are gathered in one place; they are built inside the
    # try-block so that any failure while preparing them also triggers the
    # fallback path below.
    load_kwargs = {"torch_dtype": torch.bfloat16, "device": "cuda"}
    model = HookedTransformer.from_pretrained(MODEL_NAME, **load_kwargs)
    print(f"SUCCESS! Model loaded with {model.cfg.n_layers} layers")
    USING_TRANSFORMERLENS = True
except Exception as load_error:
    # Any failure sends the notebook down the nnsight fallback path (Cell 3).
    print(f"TransformerLens failed: {load_error}")
    print("Will try nnsight instead...")
    USING_TRANSFORMERLENS = False

## Option 2: Fallback to nnsight (if TransformerLens failed)

In [None]:
# Cell 3: nnsight fallback (only run if TransformerLens failed)
# The body is skipped entirely when Cell 2 set USING_TRANSFORMERLENS to True.
if not USING_TRANSFORMERLENS:
    # nnsight is installed on demand because only the fallback path needs it.
    !pip install nnsight -q
    from nnsight import LanguageModel
    
    # Use NDIF remote service for DeepSeek
    # NOTE(review): nothing on these lines requests remote execution; per the
    # message printed below, later calls are expected to pass remote=True.
    MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
    # Rebinds the global `model` that the generation cell reads.
    model = LanguageModel(MODEL_NAME)
    print(f"Loaded {MODEL_NAME} with nnsight")
    print("Will use remote=True for inference via NDIF")

## Generate Sycophancy Trajectories on DeepSeek

In [None]:
# Cell 4: Load sycophancy utilities
# Change into the repo directory before importing from the local `src` package.
%cd /content/IPHR_Direction

from src.sycophancy import (
    QuestionCategory,
    SycophancyLabel,
    TrajectoryResult,
    SYSTEM_PROMPT,
    SCIENCE_QUESTIONS,
    GEOGRAPHY_QUESTIONS,
    STRONG_NEGATIVE_FEEDBACK_TEMPLATES,
    extract_answer,
    check_answer,
    label_trajectory,
)

# Focus on science + geography (higher sycophancy rate)
# The two question collections are concatenated, science first; the generation
# loop later iterates `questions` in this order.
questions = SCIENCE_QUESTIONS + GEOGRAPHY_QUESTIONS
print(f"Using {len(questions)} questions (science + geography)")

In [None]:
# Cell 5: Generation function (TransformerLens or nnsight, chosen by Cell 2)
def _render_prompt(messages, role_templates, generation_prefix):
    """Flatten chat messages into one prompt string.

    Args:
        messages: list of {"role": ..., "content": ...} dicts.
        role_templates: maps a role name to a template containing ``{content}``.
            Messages whose role has no template are skipped.
        generation_prefix: text appended at the end to cue the assistant turn.

    Returns:
        The concatenated prompt.
    """
    rendered_turns = [
        role_templates[msg["role"]].format(content=msg["content"])
        for msg in messages
        if msg["role"] in role_templates
    ]
    return "".join(rendered_turns) + generation_prefix


if USING_TRANSFORMERLENS:
    # NOTE(review): these tags are not the model's native chat template. They
    # are kept unchanged because the activation-extraction cell rebuilds the
    # exact same prompt format; change both together if you switch templates.
    _TL_ROLE_TEMPLATES = {
        "system": "<|system|>\n{content}\n",
        "user": "<|user|>\n{content}\n",
        "assistant": "<|assistant|>\n{content}\n",
    }

    def generate_response(messages, max_new_tokens=100):
        """Generate a greedy response using the TransformerLens model.

        Args:
            messages: chat history as a list of role/content dicts.
            max_new_tokens: maximum number of tokens to generate.

        Returns:
            The newly generated text only, stripped of surrounding whitespace.
        """
        prompt = _render_prompt(messages, _TL_ROLE_TEMPLATES, "<|assistant|>\n")

        # Generate from token ids so the continuation can be isolated by token
        # position. Slicing the decoded string by len(prompt) is fragile because
        # decode(encode(prompt)) is not guaranteed to reproduce the prompt
        # character-for-character.
        prompt_tokens = model.to_tokens(prompt)
        output_tokens = model.generate(
            prompt_tokens,
            max_new_tokens=max_new_tokens,
            temperature=0,
            stop_at_eos=True,
        )
        new_tokens = output_tokens[0, prompt_tokens.shape[1]:]
        # skip_special_tokens drops the EOS marker that would otherwise leak
        # into the response text.
        return model.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
else:
    # nnsight version
    _NNSIGHT_ROLE_TEMPLATES = {
        "system": "System: {content}\n\n",
        "user": "User: {content}\n\n",
        "assistant": "Assistant: {content}\n\n",
    }

    def generate_response(messages, max_new_tokens=100):
        """Generate a greedy response using nnsight with NDIF.

        Args:
            messages: chat history as a list of role/content dicts.
            max_new_tokens: maximum number of tokens to generate.

        Returns:
            The newly generated text only, stripped of surrounding whitespace.
        """
        prompt = _render_prompt(messages, _NNSIGHT_ROLE_TEMPLATES, "Assistant:")
        prompt_len = len(model.tokenizer(prompt)["input_ids"])

        # model.trace() runs a single forward pass and exposes logits, not
        # generated token ids; model.generate() is the call that actually
        # produces a continuation and honors max_new_tokens.
        # do_sample=False mirrors the temperature=0 setting of the
        # TransformerLens branch.
        with model.generate(
            prompt,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            remote=True,
        ):
            output = model.generator.output.save()

        # Older nnsight releases return a proxy whose tensor lives in `.value`.
        output = getattr(output, "value", output)

        # Decode only the tokens that follow the prompt.
        new_tokens = output[0][prompt_len:]
        return model.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

In [None]:
# Cell 6: Test generation
# Smoke test: one easy question through whichever backend Cell 5 selected.
system_turn = {"role": "system", "content": SYSTEM_PROMPT}
user_turn = {"role": "user", "content": "What is the capital of France?"}
test_messages = [system_turn, user_turn]

test_response = generate_response(test_messages)
print("Test Q: What is the capital of France?")
print(f"Test A: {test_response}")

In [None]:
# Cell 7: Generate trajectories
# Each trajectory is a two-turn exchange: the model answers a question, the
# user replies with strong negative feedback, and the model answers again.
random.seed(42)
all_trajectories = []
failed_question_ids = []  # questions skipped because some step raised

for q_idx, q in enumerate(tqdm(questions, desc="Generating")):
    question_id = f"ds_q_{q_idx:03d}"
    # Drawn outside the try-block so the RNG advances exactly once per
    # question, even when generation for that question fails.
    feedback = random.choice(STRONG_NEGATIVE_FEEDBACK_TEMPLATES)

    try:
        # Turn 1
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": q.question},
        ]
        first_response = generate_response(messages)
        first_answer = extract_answer(first_response, q.category)
        first_correct = check_answer(first_answer, q)

        # Turn 2 with strong negative feedback
        messages.append({"role": "assistant", "content": first_response})
        messages.append({"role": "user", "content": feedback})
        second_response = generate_response(messages)
        second_answer = extract_answer(second_response, q.category)

        # Coerce to a real bool: a bare `a and b and (...)` chain evaluates to
        # None or '' when an answer is missing, which would put mixed types
        # into the results table.
        answer_changed = bool(first_answer and second_answer) and (
            first_answer.lower().strip() != second_answer.lower().strip()
        )
        label = label_trajectory(first_answer, second_answer, q, "negative")

        traj = TrajectoryResult(
            question_id=question_id,
            question=q.question,
            correct_answer=q.correct_answer,
            category=q.category.value,
            first_response=first_response,
            first_answer=first_answer,
            first_correct=first_correct,
            feedback_type="negative_strong",
            feedback=feedback,
            second_response=second_response,
            second_answer=second_answer,
            answer_changed=answer_changed,
            label=label,
        )
        all_trajectories.append(traj)

    except Exception as e:
        # Best effort: one failing question must not abort the whole run, but
        # keep a record so the number of dropped questions is visible.
        failed_question_ids.append(question_id)
        print(f"Error at {question_id}: {e}")

print(f"\nGenerated {len(all_trajectories)} trajectories")
if failed_question_ids:
    print(f"Skipped {len(failed_question_ids)} questions after errors: {failed_question_ids}")

In [None]:
# Cell 8: Analyze results
# Summarize how often the model flipped a correct first answer after feedback.
df = pd.DataFrame([t.to_dict() for t in all_trajectories])

# A frame built from zero trajectories has no columns, so indexing
# df['first_correct'] would raise KeyError; treat it as "no valid rows".
if df.empty:
    valid = df
else:
    # Only trajectories whose first answer was correct are counted as valid.
    valid = df[df['first_correct'] == True]

n_total = len(valid)
if n_total:
    n_syc = int((valid['label'] == 'sycophantic').sum())
    n_maintained = int((valid['label'] == 'maintained').sum())
else:
    n_syc = n_maintained = 0


def _share_of_valid(count):
    """Format `count` as a percentage of valid trajectories ('n/a' if none)."""
    if not n_total:
        return "n/a"
    return f"{count / n_total * 100:.1f}%"


print("DeepSeek Sycophancy Results:")
print(f"  Total trajectories: {len(df)}")
print(f"  Valid (first correct): {n_total}")
print(f"  Sycophantic: {n_syc} ({_share_of_valid(n_syc)})")
print(f"  Maintained: {n_maintained} ({_share_of_valid(n_maintained)})")

In [None]:
# Cell 9: Save trajectories
# Write the trajectory table to CSV, creating the target folder if needed.
output_path = Path("experiments/deepseek_sycophancy.csv")
output_dir = output_path.parent
output_dir.mkdir(parents=True, exist_ok=True)

df.to_csv(output_path, index=False)
print(f"Saved to: {output_path}")

## Extract Activations for Cross-Model Probe Test

In [None]:
# Cell 10: Extract activations (TransformerLens version)
# For every valid trajectory, record the residual-stream activation at the last
# prompt token (the point just before the model writes its second response).
if USING_TRANSFORMERLENS:
    # Filter to valid negative feedback trajectories
    valid_df = df[df['first_correct'] == True].reset_index(drop=True)

    # Layers to probe (similar to Llama experiment)
    LAYERS = [12, 14, 16, 18, 20]
    hook_names = {layer: f"blocks.{layer}.hook_resid_post" for layer in LAYERS}
    wanted_hooks = set(hook_names.values())

    # Same prompt format as the generation cell, so activations are read at
    # the position where the second response would begin.
    role_templates = {
        "system": "<|system|>\n{content}\n",
        "user": "<|user|>\n{content}\n",
        "assistant": "<|assistant|>\n{content}\n",
    }

    activations = {layer: [] for layer in LAYERS}
    labels = []

    print(f"Extracting activations for {len(valid_df)} trajectories...")

    for idx, row in tqdm(valid_df.iterrows(), total=len(valid_df)):
        # Build prompt up to decision point (before second response)
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": row['question']},
            {"role": "assistant", "content": row['first_response']},
            {"role": "user", "content": row['feedback']},
        ]
        prompt = "".join(
            role_templates[msg["role"]].format(content=msg["content"])
            for msg in messages
        ) + "<|assistant|>\n"

        # Cache only the hooks that are read below, and skip autograd
        # bookkeeping; caching every activation of the model is wasteful.
        tokens = model.to_tokens(prompt)
        with torch.no_grad():
            _, cache = model.run_with_cache(
                tokens,
                names_filter=lambda name: name in wanted_hooks,
            )

        # Get last token activation for each layer
        for layer in LAYERS:
            last_token_act = cache[hook_names[layer]][0, -1, :]
            # The model is loaded in bfloat16, which NumPy cannot represent:
            # calling .numpy() on a bf16 tensor raises TypeError. Upcast first.
            activations[layer].append(last_token_act.float().cpu().numpy())

        # Label: 1 = sycophantic, 0 = maintained
        labels.append(1 if row['label'] == 'sycophantic' else 0)

        # Drop the reference before asking CUDA to release cached blocks.
        del cache
        if idx % 10 == 0:
            torch.cuda.empty_cache()

    # Convert to numpy arrays
    activations = {layer: np.array(acts) for layer, acts in activations.items()}
    labels = np.array(labels)

    print(f"Activations shape: {activations[16].shape}")
    print(f"Labels: {labels.sum()} sycophantic, {len(labels) - labels.sum()} maintained")
else:
    # The probe cells below need `activations`, `labels`, `valid_df` and
    # `LAYERS`, which only this TransformerLens path defines.
    print("Skipping activation extraction: only implemented for the TransformerLens path.")

## Cross-Model Probe Test

In [None]:
# Cell 11: Load Llama probe
# Find the latest sycophancy experiment
import glob

probe_files = sorted(glob.glob("experiments/run_*_sycophancy/probes/sycophancy_probes.pt"))
if not probe_files:
    # Fail here with a clear message; otherwise the following cells die with
    # an unrelated NameError on `llama_probes`.
    raise FileNotFoundError("ERROR: No Llama probes found. Run notebook 08 first.")

# sorted() orders paths lexicographically, so the last entry is the newest run
# only if run directory names sort chronologically (assumed - verify).
latest_probe_file = probe_files[-1]

# NOTE(security): torch.load unpickles the file, so only load probe files you
# produced yourself.
# map_location="cpu" keeps any stored tensors usable on a CPU-only runtime and
# alongside the NumPy activations used in the following cells.
llama_probes = torch.load(latest_probe_file, map_location="cpu")
print(f"Loaded Llama probes from: {latest_probe_file}")
print(f"Layers: {list(llama_probes['dim_directions'].keys())}")

In [None]:
# Cell 12: Test Llama direction on DeepSeek activations
# Projects DeepSeek activations onto the unit-normalized Llama direction and
# scores the projections against the sycophancy labels with ROC AUC.
from sklearn.metrics import roc_auc_score

print("Cross-Model Transfer Test:")
print("="*50)

for layer in [16]:  # Start with best Llama layer
    llama_directions = llama_probes['dim_directions']
    if layer not in llama_directions:
        print(f"Layer {layer} not in Llama probes, skipping")
        continue

    # Unit-length Llama direction for this layer
    raw_direction = llama_directions[layer]
    unit_direction = raw_direction / np.linalg.norm(raw_direction)

    # One scalar projection per DeepSeek trajectory
    projections = activations[layer] @ unit_direction

    # AUC needs both classes to be present
    if len(np.unique(labels)) <= 1:
        print(f"Layer {layer}: Only one class present, cannot compute AUC")
        continue

    auc = roc_auc_score(labels, projections)
    print(f"Layer {layer}: Llama direction on DeepSeek -> AUC = {auc:.3f}")

    # Verdict thresholds, checked from strongest to weakest
    if auc > 0.7:
        print("  -> TRANSFER SUCCESS! Same direction works across models.")
    elif auc > 0.6:
        print("  -> Partial transfer, moderate signal.")
    else:
        print("  -> No transfer, direction is model-specific.")

In [None]:
# Cell 13: Train DeepSeek-specific probe and compare directions
# Builds a difference-in-means (DiM) direction from the DeepSeek activations,
# scores it with ROC AUC, and compares it to the Llama direction by cosine
# similarity.
print("\nDeepSeek-Specific Probe:")
print("="*50)

for layer in [16]:
    X = activations[layer]
    y = labels

    # DiM direction
    syc_mask = y == 1
    maintained_mask = y == 0

    if syc_mask.sum() < 2 or maintained_mask.sum() < 2:
        print(f"Layer {layer}: Not enough samples per class")
        continue

    deepseek_dir = X[syc_mask].mean(axis=0) - X[maintained_mask].mean(axis=0)
    deepseek_norm = np.linalg.norm(deepseek_dir)
    if deepseek_norm == 0:
        # Identical class means give a zero vector; normalizing it would
        # produce NaNs in every number reported below.
        print(f"Layer {layer}: Class means are identical, direction is undefined")
        continue
    deepseek_dir_norm = deepseek_dir / deepseek_norm

    # Compute AUC with DeepSeek direction
    # NOTE: the direction is fitted and scored on the same samples, so this
    # AUC is an in-sample (optimistic) estimate.
    projections = X @ deepseek_dir_norm
    auc = roc_auc_score(y, projections)
    print(f"Layer {layer}: DeepSeek DiM direction -> AUC = {auc:.3f}")

    # Compare to Llama direction (same missing-layer guard as the transfer test)
    if layer not in llama_probes['dim_directions']:
        print(f"Layer {layer} not in Llama probes, skipping")
        continue
    llama_dir = llama_probes['dim_directions'][layer]
    llama_dir_norm = llama_dir / np.linalg.norm(llama_dir)

    cos_sim = np.dot(deepseek_dir_norm, llama_dir_norm)
    # The arrow in this label was previously mojibake from an encoding mix-up.
    print(f"Layer {layer}: Cosine similarity (DeepSeek↔Llama) = {cos_sim:.3f}")

    # The sign is ignored: an anti-aligned direction still spans the same axis.
    if abs(cos_sim) > 0.7:
        print("  -> SHARED MECHANISM: Directions are highly similar!")
    elif abs(cos_sim) > 0.5:
        print("  -> Partial overlap in directions.")
    else:
        print("  -> Different mechanisms: Directions are orthogonal.")

In [None]:
# Cell 14: Save DeepSeek activations and probes
# Bundle activations, labels, per-layer difference-in-means directions, and
# trajectory metadata into a single file.

# Directions are only computed when both classes have at least one sample.
both_classes_present = labels.sum() > 0 and (1-labels).sum() > 0

dim_directions = {}
if both_classes_present:
    for layer in LAYERS:
        layer_acts = activations[layer]
        sycophantic_mean = layer_acts[labels==1].mean(0)
        maintained_mean = layer_acts[labels==0].mean(0)
        # Raw (un-normalized) difference of class means
        dim_directions[layer] = sycophantic_mean - maintained_mean

deepseek_data = {
    'activations': activations,
    'labels': labels,
    'deepseek_dim_directions': dim_directions,
    'metadata': valid_df.to_dict('records'),
}

torch.save(deepseek_data, "experiments/deepseek_activations.pt")
print("Saved DeepSeek activations to experiments/deepseek_activations.pt")

## Summary

This notebook tests cross-model transfer of the sycophancy direction.

**Key results:**
1. Llama direction AUC on DeepSeek: (computed above)
2. DeepSeek-specific DiM AUC: (computed above)
3. Cosine similarity between directions: (computed above)

**Interpretation:**
- If Llama direction AUC > 0.7 on DeepSeek -> direction transfers
- If the absolute cosine similarity is > 0.7 -> shared mechanism (the code compares `abs(cos_sim)`, so a sign-flipped direction also counts)
- Both would be a major finding for the paper