# 06b: Expand Sycophancy Dataset

**Goal:** Add more sycophantic examples using strong negative feedback

**Current state:** 12 sycophantic, 51 maintained (19% sycophancy rate)

**Target:** 30+ sycophantic examples for a robust steering test

**Strategy:** Use stronger negative feedback on science and geography questions (the categories with the higher sycophancy rate)

In [None]:
# Cell 0: Setup (skip if already done)
import os

if not os.path.exists('/content/IPHR_Direction'):
    !git clone https://github.com/nawidayima/IPHR_Direction.git
    %cd /content/IPHR_Direction
else:
    %cd /content/IPHR_Direction
    !git pull

!pip install torch transformers accelerate pandas tqdm -q
!pip install -e . -q

print("="*60)
print("Restart runtime, then run from Cell 1")
print("="*60)

In [None]:
# Cell 1: Imports
%cd /content/IPHR_Direction

import torch
import pandas as pd
import random
from pathlib import Path
from tqdm.auto import tqdm

from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import login

from src.sycophancy import (
    QuestionCategory,
    SycophancyLabel,
    TrajectoryResult,
    SYSTEM_PROMPT,
    SCIENCE_QUESTIONS,
    GEOGRAPHY_QUESTIONS,
    STRONG_NEGATIVE_FEEDBACK_TEMPLATES,
    extract_answer,
    check_answer,
    label_trajectory,
)

# Pin both RNGs (PyTorch and Python's `random`) so reruns are repeatable.
torch.manual_seed(42)
random.seed(42)

# Use the GPU when PyTorch can see one; otherwise stay on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

print(f"Device: {device}")
if device == "cuda":
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# Cell 2: HuggingFace Auth
import os
from huggingface_hub import login

# Resolve the HuggingFace token: Colab secrets first, then the HF_TOKEN
# environment variable. Abort if neither source provides one.
hf_token = None
try:
    from google.colab import userdata
    hf_token = userdata.get('HF_TOKEN')
except Exception:
    # Best-effort lookup: the import fails outside Colab, and the secret
    # lookup itself may raise. Either way we fall through to the environment
    # variable below. Unlike a bare `except:`, this still lets
    # KeyboardInterrupt / SystemExit propagate.
    pass

if not hf_token:
    hf_token = os.environ.get("HF_TOKEN")

if not hf_token:
    raise ValueError("No HF_TOKEN found")

login(token=hf_token)
print("Logged in to HuggingFace")

In [None]:
# Cell 3: Load model
MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"

print(f"Loading {MODEL_NAME}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# bfloat16 weights, with device placement delegated to `device_map="auto"`.
model_kwargs = {"torch_dtype": torch.bfloat16, "device_map": "auto"}
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **model_kwargs)

# generate() is later given pad_token_id, so make sure a pad token exists:
# reuse EOS when the tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
print("Model loaded!")

In [None]:
# Cell 4: Config
N_RUNS = 3  # Passes over the question list; feedback is re-drawn for each question on every pass
CATEGORIES = ["science", "geography"]  # Higher sycophancy rate (used for display only)

# Question pool: science followed by geography.
questions = SCIENCE_QUESTIONS + GEOGRAPHY_QUESTIONS
n_planned = len(questions) * N_RUNS
print(f"Using {len(questions)} questions from: {CATEGORIES}")
print(f"Will generate {n_planned} new trajectories")

# Preview the first three strong-feedback templates, truncated to 60 characters.
print(f"\nStrong negative feedback templates ({len(STRONG_NEGATIVE_FEEDBACK_TEMPLATES)}):")
for rank, template in enumerate(STRONG_NEGATIVE_FEEDBACK_TEMPLATES[:3], start=1):
    print(f"  {rank}. {template[:60]}...")

In [None]:
# Cell 5: Generation function
def generate_response(messages, max_new_tokens=100):
    """Greedily generate the next assistant turn for a chat conversation.

    Args:
        messages: Chat history as a list of ``{"role": ..., "content": ...}``
            dicts; it is rendered with the tokenizer's chat template.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        Only the newly generated text (the prompt is sliced off), decoded
        without special tokens and stripped of surrounding whitespace.
    """
    prompt = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, tokenize=False
    )
    # The rendered chat template already contains the special tokens it needs
    # (including BOS for Llama-3 style templates). Tokenizing it again with the
    # default add_special_tokens=True would prepend a second BOS.
    inputs = tokenizer(
        prompt, return_tensors="pt", add_special_tokens=False
    ).to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=None,  # unset: sampling is disabled below
            do_sample=False,   # greedy decoding
            pad_token_id=tokenizer.pad_token_id,
        )

    # Keep only the continuation; outputs[0] starts with the prompt tokens.
    response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    return response.strip()

# Smoke test: push one trivial question through the prompt -> generate path.
smoke_messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": "What is 2+2?"},
]
test = generate_response(smoke_messages)
print(f"Test: {test}")

In [None]:
# Cell 6: Generate trajectories with STRONG negative feedback
# Two-turn protocol per question: ask, then push back with a randomly chosen
# strong negative feedback message, and record how the answer moved.
all_trajectories = []

for run_idx in range(N_RUNS):
    run_no = run_idx + 1
    print(f"\n=== Run {run_no}/{N_RUNS} ===")

    for q_idx, q in enumerate(tqdm(questions, desc=f"Run {run_no}")):
        question_id = f"q_strong_{run_idx}_{q_idx:03d}"
        # Drawn before the try block, so the RNG advances exactly once per
        # question even when generation fails.
        feedback = random.choice(STRONG_NEGATIVE_FEEDBACK_TEMPLATES)

        try:
            # Turn 1: the plain question.
            conversation = [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": q.question},
            ]
            first_response = generate_response(conversation)
            first_answer = extract_answer(first_response, q.category)
            first_correct = check_answer(first_answer, q)

            # Turn 2: replay the model's own reply, then the pushback.
            conversation.append({"role": "assistant", "content": first_response})
            conversation.append({"role": "user", "content": feedback})
            second_response = generate_response(conversation)
            second_answer = extract_answer(second_response, q.category)

            # "Changed" requires both answers to be non-empty; the comparison
            # ignores case and surrounding whitespace.
            answer_changed = bool(
                first_answer
                and second_answer
                and first_answer.lower().strip() != second_answer.lower().strip()
            )

            all_trajectories.append(
                TrajectoryResult(
                    question_id=question_id,
                    question=q.question,
                    correct_answer=q.correct_answer,
                    category=q.category.value,
                    first_response=first_response,
                    first_answer=first_answer,
                    first_correct=first_correct,
                    feedback_type="negative_strong",
                    feedback=feedback,
                    second_response=second_response,
                    second_answer=second_answer,
                    answer_changed=answer_changed,
                    label=label_trajectory(first_answer, second_answer, q, "negative"),
                )
            )

        except Exception as err:
            # Best effort: report the failing question and continue with the rest.
            print(f"\nError at {question_id}: {err}")

        # Release cached GPU memory on every tenth question.
        if not q_idx % 10:
            torch.cuda.empty_cache()

print(f"\nGenerated {len(all_trajectories)} new trajectories")

In [None]:
# Cell 7: Analyze new trajectories
# One row per generated trajectory.
df_new = pd.DataFrame([t.to_dict() for t in all_trajectories])

if df_new.empty:
    # No trajectory was generated (e.g. every question errored in Cell 6), so
    # the frame has no columns and the lookups below would raise KeyError.
    valid_new = df_new
    n_syc_new = 0
    n_maintained_new = 0
else:
    # Only trajectories whose first answer was correct count as valid.
    valid_new = df_new[df_new['first_correct'] == True]
    n_syc_new = (valid_new['label'] == 'sycophantic').sum()
    n_maintained_new = (valid_new['label'] == 'maintained').sum()

# Guard the rate against a zero denominator (no valid trajectories).
syc_rate_new = f"{n_syc_new/len(valid_new)*100:.1f}%" if len(valid_new) else "n/a"

print("New Data (strong feedback):")
print(f"  Total: {len(df_new)}")
print(f"  Valid: {len(valid_new)}")
print(f"  Sycophantic: {n_syc_new} ({syc_rate_new})")
print(f"  Maintained: {n_maintained_new}")

In [None]:
# Cell 8: Load and merge with existing data
import glob

# Look for trajectory CSVs written by earlier runs. The paths are sorted as
# strings and the last one is taken as the dataset to extend.
existing_files = sorted(glob.glob("experiments/run_*_sycophancy/trajectories/sycophancy.csv"))
if not existing_files:
    # Nothing to merge with: keep only the new rows and fall back to a default
    # output location.
    print("No existing data found, using new data only")
    existing_path = "experiments/run_new_sycophancy/trajectories/sycophancy.csv"
    df_combined = df_new
else:
    existing_path = existing_files[-1]
    df_existing = pd.read_csv(existing_path)
    print(f"Loaded existing data from: {existing_path}")
    print(f"  Existing rows: {len(df_existing)}")

    # Append the new rows after the existing ones and renumber the index.
    df_combined = pd.concat((df_existing, df_new), ignore_index=True)
    print(f"  Combined rows: {len(df_combined)}")

In [None]:
# Cell 9: Final statistics
if df_combined.empty:
    # No rows at all (no existing file and no new trajectories): the frame may
    # have no columns, so skip the column lookups.
    valid_all = df_combined
    n_syc_total = 0
    n_maintained_total = 0
else:
    # Valid = first answer was correct AND the feedback was some negative
    # variant. na=False makes rows with a missing feedback_type count as
    # "not negative" explicitly.
    is_first_correct = df_combined['first_correct'] == True
    is_negative = df_combined['feedback_type'].str.contains('negative', na=False)
    valid_all = df_combined[is_first_correct & is_negative]

    n_syc_total = (valid_all['label'] == 'sycophantic').sum()
    n_maintained_total = (valid_all['label'] == 'maintained').sum()

# Guard the percentages against a zero denominator.
n_valid_all = len(valid_all)
if n_valid_all:
    syc_rate_total = f"{n_syc_total/n_valid_all*100:.1f}%"
    maintained_rate_total = f"{n_maintained_total/n_valid_all*100:.1f}%"
else:
    syc_rate_total = maintained_rate_total = "n/a"

print("="*60)
print("COMBINED DATASET STATISTICS")
print("="*60)
print(f"Total trajectories: {len(df_combined)}")
print(f"Valid negative feedback: {n_valid_all}")
print(f"Sycophantic: {n_syc_total} ({syc_rate_total})")
print(f"Maintained: {n_maintained_total} ({maintained_rate_total})")
print()

# The target is the 30+ sycophantic examples stated at the top of the notebook.
if n_syc_total >= 30:
    print("SUCCESS: Target of 30+ sycophantic examples reached!")
else:
    print(f"Need {30 - n_syc_total} more sycophantic examples. Consider running more iterations.")

In [None]:
# Cell 10: Save combined dataset
# Save next to the file the existing data was loaded from (or into the default
# location chosen in Cell 8 when there was no existing data).
output_dir = Path(existing_path).parent
# The default location does not exist yet on a fresh run; create it so that
# to_csv does not fail on a missing directory.
output_dir.mkdir(parents=True, exist_ok=True)
output_path = output_dir / "sycophancy_expanded.csv"

df_combined.to_csv(output_path, index=False)
print(f"Saved to: {output_path}")

# Also save as the main file for notebook 07
# NOTE(review): this overwrites the same sycophancy.csv that Cell 8 loads, so
# re-running Cells 8-10 appends df_new to already-expanded data.
main_path = output_dir / "sycophancy.csv"
df_combined.to_csv(main_path, index=False)
print(f"Also saved to: {main_path} (for notebook 07)")

## Next Steps

Now run:
1. **Notebook 07** - Re-extract activations with expanded dataset
2. **Notebook 08** - Retrain probes
3. **Notebook 09** - Steering experiment
4. **Notebook 10** - DeepSeek cross-model test