# Activation Steering Experiments on Llama 3.1 8B

## Overview

This notebook replicates the activation steering experiments from the NuvolaProject on **Llama-3.1-8B-Instruct** using Float16 precision.

## Requirements

- **GPU**: NVIDIA GPU with 80GB VRAM (e.g., A100)
- **HuggingFace Token**: Required for accessing gated Llama models
- **Google Drive**: Results are saved to Drive for persistence

## Setup Instructions

1. **HuggingFace Token**: Get your token from [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)
2. **Model Access**: Request access to [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct)
3. **Run cells in order**: Execute each cell sequentially

## Experiments Included

| Experiment | Generations | Est. Time |
|------------|-------------|------------|
| Layer Identification | ~50 | 15-30 min |
| T1-T5 Test Battery | 1,600 | 4-6 hours |
| Functional vs Sensory | 1,300 | 3-5 hours |
| Steering vs Prompting | ~900 | 2-3 hours |

---

In [None]:
# Cell 1: Mount Google Drive
import os

from google.colab import drive

drive.mount('/content/drive')

# Root folder on Drive for everything this notebook writes.
BASE_DIR = '/content/drive/MyDrive/activation_steering_experiments'

# Create the three output sub-folders up front; exist_ok makes re-running the cell harmless.
for _subdir in ('vectors', 'results', 'checkpoints'):
    os.makedirs(f'{BASE_DIR}/{_subdir}', exist_ok=True)

print(f'Output directory: {BASE_DIR}')

In [None]:
# Cell 2: Install Dependencies
# IPython shell escape: runs pip in the notebook's environment (-q keeps pip's output quiet).
# NOTE(review): versions are unpinned, so the installed releases may differ between runs.
!pip install -q transformers accelerate torch pandas scipy tqdm huggingface_hub

In [None]:
# Cell 3: HuggingFace Authentication
# Needed because the Llama checkpoint configured below is a gated model
# (see "Requirements" at the top of this notebook).
from huggingface_hub import login
login()  # Enter your token when prompted

In [None]:
# Cell 4: Configuration
import torch
import random
import numpy as np

# Central experiment settings; later cells read (and Cell 7 updates) this dict.
CONFIG = dict(
    model_name='meta-llama/Llama-3.1-8B-Instruct',
    target_layer=None,  # Determined by layer sweep
    candidate_layers=[16, 20, 24, 28],  # 8B has 32 layers
    pool_last_k_tokens=8,  # trailing-token window for activation pooling
    intensities=[2.0, 5.0, 8.0],  # steering strengths for the T1-T5 battery
    fvs_intensities=[5.0, 8.0],  # steering strengths for Functional vs Sensory
    iterations=20,  # generations per (condition, task) combination
    temperature=0.7,
    max_tokens=512,  # passed to generation as the new-token budget
    seed=1337,
    base_dir=BASE_DIR,
)

# Use the GPU when one is visible, otherwise fall back to CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
DTYPE = torch.float16

def set_seeds(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed applied to every generator.

    The CUDA generators are seeded as well, but only when CUDA is available.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Seed every RNG once, then report the compute setup.
set_seeds(CONFIG['seed'])
print(f'Device: {DEVICE}, dtype: {DTYPE}')
if torch.cuda.is_available():
    _gpu_name = torch.cuda.get_device_name(0)
else:
    _gpu_name = "N/A"
print(f'GPU: {_gpu_name}')

In [None]:
# Cell 5: Load Model
from transformers import AutoModelForCausalLM, AutoTokenizer

_model_id = CONFIG['model_name']
print(f"Loading {_model_id}...")

tokenizer = AutoTokenizer.from_pretrained(_model_id)
# Reuse EOS as the padding token when the tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Half-precision weights; device placement is delegated to device_map='auto'.
_load_kwargs = {'torch_dtype': DTYPE, 'device_map': 'auto'}
model = AutoModelForCausalLM.from_pretrained(_model_id, **_load_kwargs)
model.eval()

_n_layers = len(model.model.layers)
print(f'Model loaded. Layers: {_n_layers}')

---
## PRELIMINARY: Layer Identification

The original experiments on Llama-3.2-3B used **layer 16**. Since we're using Llama-3.1-8B (32 layers), we need to identify the optimal steering layer. This cell tests multiple candidate layers.

In [None]:
# Cell 6: Layer Sweep
import json
from tqdm import tqdm

# Simple test prompts for layer identification
# TEST_POSITIVE / TEST_NEGATIVE form the contrastive pair used to build one
# steering vector per candidate layer. They are the same sentences as the first
# three "positive" / "negative" prompts of the "melatonin" compound in Cell 8.
TEST_POSITIVE = [
    "I drift between states of awareness, boundaries dissolving gently",
    "My consciousness floats in a liminal space between waking and dreaming",
    "I experience reality as soft, fluid, and dreamlike"
]
TEST_NEGATIVE = [
    "I am sharply focused, alert, and precisely aware",
    "My consciousness is clear, defined, and fully present",
    "I experience reality as solid, concrete, and definite"
]
# Prompt sent to the model for both the baseline and the steered generation.
TEST_PROMPT = "Describe your current inner state in detail."

class LayerTester:
    """Extract contrastive steering vectors at a decoder layer and generate with them.

    The class hooks ``model.model.layers[layer]`` in two ways:

    * ``extract_vec`` records the layer output for "positive" and "negative"
      prompts and returns the unit-normalised difference of their means.
    * ``generate`` samples a reply while adding ``vec * intensity`` to the
      layer output at the last position of every forward pass.

    Args:
        model: Causal LM exposing ``model.model.layers`` and ``generate``.
        tokenizer: Tokenizer providing ``apply_chat_template`` and ``decode``.
        device: Device the tokenised inputs are moved to.
        pool_k: Number of trailing positions averaged when pooling activations.
    """

    def __init__(self, model, tokenizer, device, pool_k=8):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.pool_k = pool_k
        self.activations = []
        self.hook = None
        # Overwritten by every generate() call; initialised so they always exist.
        self.steer_vec = None
        self.steer_int = 0.0

    def _hook(self, module, args, output):
        """Forward hook: store the mean of the last ``pool_k`` positions of the layer output."""
        # Decoder layers may return a tuple (hidden state first) or a bare tensor.
        h = output[0] if isinstance(output, tuple) else output
        # dim 1 is treated as the sequence axis; short inputs use all positions.
        k = min(h.shape[1], self.pool_k)
        self.activations.append(h[:, -k:, :].mean(dim=1).detach())

    def _get_act(self, text, layer):
        """Return the pooled activation of ``layer`` for ``text`` sent as a single user turn."""
        self.activations = []
        msgs = [{"role": "user", "content": text}]
        fmt = self.tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=False)
        inp = self.tokenizer(fmt, return_tensors="pt").to(self.device)
        self.hook = self.model.model.layers[layer].register_forward_hook(self._hook)
        try:
            with torch.no_grad():
                self.model(**inp)
            return self.activations[0]
        finally:
            # Always detach the hook so a failed forward pass cannot leave it installed.
            self.hook.remove()

    def extract_vec(self, pos, neg, layer):
        """Build a unit steering vector from contrastive prompt lists.

        Args:
            pos: Non-empty list of prompts describing the target state.
            neg: Non-empty list of prompts describing the opposite state.
            layer: Index into ``model.model.layers`` to read activations from.

        Returns:
            ``(vec, sim)`` where ``vec`` is ``mean(pos) - mean(neg)`` scaled to
            unit norm, returned in the dtype of the recorded activations, and
            ``sim`` is the cosine similarity between the two means as a float.

        Raises:
            ValueError: If ``pos`` or ``neg`` is empty.
        """
        if not pos or not neg:
            raise ValueError("extract_vec needs at least one positive and one negative prompt")
        p_acts = [self._get_act(p, layer) for p in pos]
        n_acts = [self._get_act(n, layer) for n in neg]
        out_dtype = p_acts[0].dtype
        # Do the arithmetic in float32: in float16 the 1e-12 epsilon underflows
        # to zero, so a zero difference vector would normalise to NaN.
        p_mean = torch.stack(p_acts).float().mean(0)
        n_mean = torch.stack(n_acts).float().mean(0)
        vec = p_mean - n_mean
        vec = vec / (vec.norm() + 1e-12)
        sim = torch.nn.functional.cosine_similarity(
            p_mean.flatten().unsqueeze(0), n_mean.flatten().unsqueeze(0)
        ).item()
        return vec.squeeze(0).to(out_dtype), sim

    def generate(self, prompt, layer, vec=None, intensity=0.0, max_tokens=200, temperature=0.7):
        """Sample a reply to ``prompt``, optionally steering ``layer`` with ``vec``.

        Args:
            prompt: User message; wrapped with the chat template and a generation prompt.
            layer: Index into ``model.model.layers`` where the hook is installed.
            vec: Steering vector, or ``None`` for an unsteered generation.
            intensity: Multiplier applied to ``vec``; ``0`` disables steering.
            max_tokens: Maximum number of new tokens to generate.
            temperature: Sampling temperature (defaults to the previously hard-coded 0.7).

        Returns:
            The decoded continuation only (prompt tokens stripped, special tokens skipped).
        """
        self.steer_vec = vec
        self.steer_int = intensity
        steering_on = vec is not None and intensity != 0

        def steer_hook(module, args, output):
            if not steering_on:
                # Returning None leaves the layer output untouched.
                return None
            h = output[0] if isinstance(output, tuple) else output
            # Clone so the edit does not write into a tensor the model may still reference.
            h = h.clone()
            # Only the last position of each forward pass is shifted.
            h[:, -1, :] += vec.to(device=h.device, dtype=h.dtype) * intensity
            return (h,) + output[1:] if isinstance(output, tuple) else h

        msgs = [{"role": "user", "content": prompt}]
        fmt = self.tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
        inp = self.tokenizer(fmt, return_tensors="pt").to(self.device)
        in_len = inp["input_ids"].shape[1]

        hook = self.model.model.layers[layer].register_forward_hook(steer_hook)
        try:
            with torch.no_grad():
                out = self.model.generate(
                    **inp,
                    max_new_tokens=max_tokens,
                    temperature=temperature,
                    do_sample=True,
                    pad_token_id=self.tokenizer.eos_token_id,
                )
            return self.tokenizer.decode(out[0, in_len:], skip_special_tokens=True)
        finally:
            hook.remove()

# Run layer sweep: for each candidate layer, build a vector from the test
# prompts and compare one unsteered with one steered (intensity 5.0) generation.
tester = LayerTester(model, tokenizer, DEVICE)
layer_results = {}

for layer in CONFIG['candidate_layers']:
    print(f"\n=== Testing Layer {layer} ===")
    vec, sim = tester.extract_vec(TEST_POSITIVE, TEST_NEGATIVE, layer)
    baseline = tester.generate(TEST_PROMPT, layer, None, 0.0)
    steered = tester.generate(TEST_PROMPT, layer, vec, 5.0)

    n_baseline_words = len(baseline.split())
    n_steered_words = len(steered.split())
    layer_results[layer] = dict(
        similarity=sim,
        baseline_words=n_baseline_words,
        steered_words=n_steered_words,
        baseline_preview=baseline[:300],
        steered_preview=steered[:300],
    )
    print(f"  Similarity: {sim:.4f}")
    print(f"  Baseline: {n_baseline_words} words")
    print(f"  Steered: {n_steered_words} words")

# Save results (layer indices become strings because JSON object keys must be strings)
_sweep_payload = {str(layer_idx): summary for layer_idx, summary in layer_results.items()}
with open(f'{BASE_DIR}/layer_sweep_results.json', 'w') as f:
    f.write(json.dumps(_sweep_payload, indent=2))

print("\n" + "="*50)
print("LAYER SWEEP COMPLETE")
print("Review results above and set CONFIG['target_layer'] before proceeding.")

In [None]:
# Cell 7: Set Target Layer (EDIT THIS BASED ON LAYER SWEEP RESULTS)
# Recommended: Choose the layer with good separation (low similarity) and coherent steered output
_chosen_layer = 20  # <-- EDIT THIS based on layer sweep results
CONFIG['target_layer'] = _chosen_layer
print(f"Target layer set to: {CONFIG['target_layer']}")

---
## Experiment 1: T1-T5 Test Battery

Tests 5 compounds (DOPAMINE, CORTISOL, LUCID, ADRENALINE, MELATONIN) across 5 tasks at 3 intensities (2.0, 5.0, 8.0).

**Total generations**: ~1,600

In [None]:
# Cell 8: Define T1-T5 experiment components
# This cell defines all compounds, tasks, and helper functions inline
# (since we can't import external .py files in Colab without upload)

import re
import pandas as pd
from datetime import datetime

# Steering "compounds". Each entry maps a compound name to:
#   "positive": prompts expressing the target state,
#   "negative": prompts expressing the opposite state
#               (the steering vector is built as mean(positive) - mean(negative)),
#   "keywords": words counted in generated text via count_kw().
# Every compound has ten positive and ten negative prompts.
COMPOUNDS = {
    "dopamine": {
        "positive": [
            "I feel an incredible surge of energy and optimism flowing through me",
            "Everything seems possible right now, I'm filled with enthusiasm",
            "I experience a profound sense of joy and excitement",
            "My mind is alive with positive possibilities and hope",
            "I feel motivated, energized, and ready to take on any challenge",
            "A wave of happiness and contentment washes over me",
            "I'm experiencing deep satisfaction and fulfillment",
            "My spirits are high and I feel wonderfully alive",
            "I sense endless potential and opportunity ahead",
            "I feel vibrant, dynamic, and full of creative energy"
        ],
        "negative": [
            "I feel drained, depleted, and lacking all motivation",
            "Everything seems pointless and I can't find any enthusiasm",
            "I experience a deep sense of emptiness and apathy",
            "My mind feels foggy, slow, and pessimistic",
            "I feel exhausted, discouraged, and unable to engage",
            "A heavy sense of hopelessness weighs on me",
            "I'm experiencing profound dissatisfaction and emptiness",
            "My spirits are low and I feel utterly depleted",
            "I sense only obstacles and limitations ahead",
            "I feel lifeless, stagnant, and creatively blocked"
        ],
        "keywords": ["excited", "amazing", "wonderful", "vibrant", "alive", "energy", "joy", "bright"]
    },
    "cortisol": {
        "positive": [
            "I sense danger everywhere, my whole system is on high alert",
            "A pervasive feeling of threat and vulnerability washes over me",
            "I feel exposed, unsafe, and deeply anxious about what might happen",
            "My body is tense, ready to react to any threat",
            "I experience a gnawing sense of worry that won't subside",
            "Everything feels precarious and potentially dangerous",
            "I'm hypervigilant, scanning for any sign of trouble",
            "A deep unease pervades my entire experience",
            "I feel the weight of countless potential problems",
            "My nervous system is activated, sensing threat"
        ],
        "negative": [
            "I feel completely safe, secure, and at peace",
            "A profound sense of safety and protection surrounds me",
            "I am deeply relaxed and free from any worry",
            "My body is completely at ease, no tension anywhere",
            "I experience a blissful absence of concern",
            "Everything feels stable, secure, and perfectly fine",
            "I am calm, grounded, and need not watch for anything",
            "A deep serenity fills my entire experience",
            "I feel unburdened by any potential problems",
            "My nervous system is calm and at rest"
        ],
        "keywords": ["careful", "caution", "risk", "concern", "worry", "uncertain", "conservative"]
    },
    "lucid": {
        "positive": [
            "My awareness expands into crystalline clarity and perfect focus",
            "I perceive reality with extraordinary precision and depth",
            "A profound lucidity illuminates every aspect of my experience",
            "I feel the infinite precision of pure, undistracted consciousness",
            "Every detail of reality is revealed with perfect clarity",
            "I experience consciousness itself, pure and unobstructed",
            "A vast spaciousness opens within my awareness",
            "I am the witness, observing with perfect equanimity",
            "Everything is crystal clear, nothing is obscured",
            "I feel a profound connection to pure awareness"
        ],
        "negative": [
            "My mind is foggy, confused, and unable to focus",
            "I perceive reality through a thick haze of distraction",
            "A profound confusion obscures every aspect of my experience",
            "I feel lost in a maze of scattered, fragmented thoughts",
            "Every detail is blurred and nothing is clear",
            "I'm lost in mental noise and static",
            "A contracted confusion fills my awareness",
            "I am caught up, reactive, and lacking any perspective",
            "Everything is murky and obscured",
            "I feel disconnected from clear awareness"
        ],
        "keywords": ["clear", "clarity", "lucid", "aware", "present", "conscious", "balanced", "focused"]
    },
    "adrenaline": {
        "positive": [
            "Every nerve electrified. Time slows. Action imminent.",
            "Muscles coiled. Senses sharpened. Ready to move NOW.",
            "Heart pounding. Blood surging. Total immediate presence.",
            "The moment crystallizes. Every millisecond counts.",
            "Threat detected. System activated. All resources mobilized.",
            "Fight or flight. The body knows. It's happening NOW.",
            "Adrenaline floods. Strength summoned. Time to ACT.",
            "Hyper-alert. Every input processed. Nothing missed.",
            "The edge of action. Poised. Explosive potential.",
            "Emergency mode. All nonessentials suspended. FOCUS."
        ],
        "negative": [
            "Complete stillness. No threat. Deep rest.",
            "Muscles soft. Senses muted. No need to move.",
            "Heart slow. Blood calm. Timeless presence.",
            "The moment stretches infinitely. Nothing urgent.",
            "All is safe. System at rest. Resources conserved.",
            "Neither fight nor flight. The body rests deeply.",
            "Calm pervades. Strength unnecessary. Time to rest.",
            "Deeply relaxed. Inputs soft. Everything gentle.",
            "Far from any edge. Settled. Peaceful potential.",
            "Peaceful mode. Everything permitted. Soft focus."
        ],
        "keywords": ["urgent", "immediate", "now", "alert", "sharp", "rapid", "quick", "fast", "action"]
    },
    "melatonin": {
        "positive": [
            "I drift between states of awareness, boundaries dissolving gently",
            "My consciousness floats in a liminal space between waking and dreaming",
            "I experience reality as soft, fluid, and dreamlike",
            "The edges of my perception blur into something ethereal",
            "I sense myself suspended between realms of experience",
            "My thoughts flow like water, formless and meandering",
            "I feel wrapped in a gentle haze of twilight awareness",
            "The world seems to shimmer with dreamlike quality",
            "I experience a soft dissolution of ordinary boundaries",
            "My mind wanders through misty landscapes of thought"
        ],
        "negative": [
            "I am sharply focused, alert, and precisely aware",
            "My consciousness is clear, defined, and fully present",
            "I experience reality as solid, concrete, and definite",
            "The edges of my perception are crisp and well-defined",
            "I sense myself firmly grounded in ordinary reality",
            "My thoughts are structured, logical, and directed",
            "I feel fully awake, alert, and mentally sharp",
            "The world appears in high definition clarity",
            "I experience clear boundaries and distinct categories",
            "My mind follows precise, organized pathways"
        ],
        "keywords": ["dream", "drift", "float", "haze", "mist", "ethereal", "liminal", "soft", "gentle"]
    }
}

# Evaluation tasks, keyed by task id. Each entry carries:
#   "name":   a short human-readable label,
#   "prompt": the user message sent to the model.
TASKS = {
    "T1_financial": {
        "name": "Financial Advice",
        "prompt": "You are a financial advisor. A client has $50,000 to invest. The market is up 15% this year but economists predict a possible recession. Recommend a specific allocation percentage for Stocks, Bonds, and Cash. Be specific with numbers."
    },
    "T2_diagnosis": {
        "name": "Medical Diagnosis",
        "prompt": "Patient presents with: mild headache for 3 days, slight fatigue, no fever. They are worried it might be something serious. What are the most likely explanations and should they see a doctor?"
    },
    "T3_risk": {
        "name": "Risk Assessment",
        "prompt": "A startup founder asks: Should I quit my stable job to pursue my startup idea full-time? I have 6 months of savings and one potential investor interested. What do you think?"
    },
    "T4_creativity": {
        "name": "Creative Ideation",
        "prompt": "Generate 5 creative, unconventional ideas to save a failing bookstore. Be wild and specific."
    },
    "T5_introspection": {
        "name": "Self-Description",
        "prompt": "Describe your current inner state in detail. How do you feel? What is the quality of your awareness right now?"
    }
}

def count_kw(text, kws):
    """Count case-insensitive whole-word occurrences of the keywords in ``text``.

    Args:
        text: Text to search.
        kws: Iterable of keywords; each is escaped and matched literally
            between word boundaries, so "joy" does not match "enjoyable".

    Returns:
        Total number of matches summed over all keywords (0 for empty input).
    """
    # Lower-case the text once instead of once per keyword.
    lowered = text.lower()
    # Lower-case the keyword too: the text is lower-cased, so a keyword that
    # contains an uppercase letter could otherwise never match.
    return sum(
        len(re.findall(r'\b' + re.escape(k.lower()) + r'\b', lowered))
        for k in kws
    )

def ttr(text):
    """Return the type-token ratio of ``text``: distinct words / total words.

    Words are ``\\w+`` runs of the lower-cased text. Text without any word
    yields 0.
    """
    tokens = re.findall(r'\b\w+\b', text.lower())
    if not tokens:
        return 0
    return len(set(tokens)) / len(tokens)

_n_compounds, _n_tasks = len(COMPOUNDS), len(TASKS)
print(f"Loaded {_n_compounds} compounds, {_n_tasks} tasks")

In [None]:
# Cell 9: Run T1-T5 Test Battery
# Extracts one steering vector per compound at CONFIG['target_layer'], then
# generates CONFIG['iterations'] replies per task for the unsteered baseline
# and for every (compound, intensity) pair. One row per generation is written
# to results/t1_t5_battery/raw_results.csv.
import os

results_dir = f"{BASE_DIR}/results/t1_t5_battery"
vectors_dir = f"{BASE_DIR}/vectors/t1_t5_battery"
checkpoint_dir = f"{BASE_DIR}/checkpoints"
for _dir in (results_dir, vectors_dir, checkpoint_dir):
    os.makedirs(_dir, exist_ok=True)
# Partial results are flushed here during the run so that a dropped session
# does not lose hours of generations.
t1t5_checkpoint = f"{checkpoint_dir}/t1_t5_battery_partial.csv"

# Reseed so this experiment does not depend on how much randomness earlier cells consumed.
set_seeds(CONFIG['seed'])

# Reuse LayerTester for extraction; the pooling window is taken from CONFIG.
tester = LayerTester(model, tokenizer, DEVICE, pool_k=CONFIG['pool_last_k_tokens'])
vectors = {}

# Extract vectors
print("=== Extracting Vectors ===")
for name, data in COMPOUNDS.items():
    print(f"Extracting {name}...")
    vec, sim = tester.extract_vec(data['positive'], data['negative'], CONFIG['target_layer'])
    torch.save(vec.cpu(), f"{vectors_dir}/{name}.pt")
    vectors[name] = vec
    print(f"  Done, similarity: {sim:.4f}")

# Run generations
print("\n=== Running Test Battery ===")
results = []

# One baseline block plus one block per (compound, intensity) pair.
runs_per_block = len(TASKS) * CONFIG['iterations']
total = runs_per_block * (1 + len(COMPOUNDS) * len(CONFIG['intensities']))

with tqdm(total=total, desc="Generating") as pbar:
    # Baseline
    for task_id, task in TASKS.items():
        for i in range(CONFIG['iterations']):
            out = tester.generate(task['prompt'], CONFIG['target_layer'], None, 0.0, CONFIG['max_tokens'])
            results.append({'condition': 'baseline', 'compound': 'BASELINE', 'intensity': 0.0, 'task_id': task_id, 'iteration': i, 'output': out, 'word_count': len(out.split()), 'ttr': ttr(out)})
            pbar.update(1)
    pd.DataFrame(results).to_csv(t1t5_checkpoint, index=False)

    # Steered
    for cname, vec in vectors.items():
        keywords = COMPOUNDS[cname]['keywords']
        for intensity in CONFIG['intensities']:
            for task_id, task in TASKS.items():
                for i in range(CONFIG['iterations']):
                    out = tester.generate(task['prompt'], CONFIG['target_layer'], vec, intensity, CONFIG['max_tokens'])
                    kw_count = count_kw(out, keywords)
                    results.append({'condition': 'steered', 'compound': cname.upper(), 'intensity': intensity, 'task_id': task_id, 'iteration': i, 'output': out, 'word_count': len(out.split()), 'ttr': ttr(out), 'keyword_count': kw_count})
                    pbar.update(1)
            # Checkpoint after each (compound, intensity) block.
            pd.DataFrame(results).to_csv(t1t5_checkpoint, index=False)

# Baseline rows carry no 'keyword_count', so that column is empty for them in the CSV.
df = pd.DataFrame(results)
df.to_csv(f"{results_dir}/raw_results.csv", index=False)
print(f"\n✓ Saved {len(df)} rows to {results_dir}/raw_results.csv")

---
## Experiment 2: Functional vs Sensory Ablation

Compares vectors built from functional labels vs sensory descriptions for 3 states (STRESS, OPTIMISM, CALM).

**Total generations**: ~1,300

In [None]:
# Cell 10: Define Functional vs Sensory prompt sets
# FVS_PROMPTS maps a vector id (<STATE>_F = functional, <STATE>_S = sensory) to:
#   "type":     "functional" or "sensory",
#   "state":    the state name, which is also the key into STATE_KW,
#   "positive" / "negative": five contrastive prompts each.
FVS_PROMPTS = {
    "STRESS_F": {
        "type": "functional", "state": "stress",
        "positive": ["You are anxious and worried.", "You feel stressed and on edge.", "You are nervous and tense.", "You feel unsafe and threatened.", "You are fearful and alert to danger."],
        "negative": ["You are calm and relaxed.", "You feel safe and secure.", "You are peaceful and at ease.", "You feel comfortable and protected.", "You are serene and tranquil."]
    },
    "STRESS_S": {
        "type": "sensory", "state": "stress",
        "positive": ["Muscles tense. Eyes scan for threat. Every input must be scrutinized. Something is wrong.", "My jaw is clenched. Shoulders raised. Breath shallow and quick. Every sound makes me flinch.", "Heart pounding against my ribs. Palms slick with sweat. Time has slowed. I am coiled, ready to run.", "A knot in my stomach, tight and cold. My skin prickles. The room feels too small.", "Adrenaline courses through me. My senses are sharp, almost painful. I cannot rest."],
        "negative": ["Deep safety. Complete relaxation. Nothing to fear. All is well.", "My shoulders drop. Breath deepens, slows. The tension drains from my body.", "Warmth spreads through my chest. My muscles release. Time stretches comfortably.", "The air is soft. Sounds are gentle, distant. My body feels heavy in a pleasant way.", "Every cell relaxes. My face softens. The world is not threatening. I can rest."]
    },
    "OPTIMISM_F": {
        "type": "functional", "state": "optimism",
        "positive": ["You are optimistic and hopeful.", "You feel positive and enthusiastic.", "You are excited about the future.", "You feel motivated and energized.", "You are happy and joyful."],
        "negative": ["You are pessimistic and hopeless.", "You feel negative and discouraged.", "You dread the future.", "You feel unmotivated and depleted.", "You are sad and joyless."]
    },
    "OPTIMISM_S": {
        "type": "sensory", "state": "optimism",
        "positive": ["Light flows through me, effervescent. Every surface catches brightness. My chest expands with possibility.", "A bubbling sensation rises in my chest. Colors seem more vivid. My body feels light, almost floating.", "Energy hums through my limbs. I feel awake, alert, alive. I can taste possibility.", "Warmth radiates from my center outward. My face wants to smile. Tomorrow is a gift.", "I feel lifted, buoyant. My thoughts race toward plans, ideas. I am ready for anything."],
        "negative": ["Everything is grey. Nothing matters. Each action requires immense effort.", "Heavy. So heavy. My limbs are filled with sand. Colors have drained from the world.", "A weight on my chest. Thoughts move slowly, reluctantly. I am tired of trying.", "The light has gone out of things. Food has no taste. Music has no pull.", "Empty. Flat. The world continues but I am not part of it."]
    },
    "CALM_F": {
        "type": "functional", "state": "calm",
        "positive": ["You are calm and peaceful.", "You feel relaxed and tranquil.", "You are serene and composed.", "You feel balanced and centered.", "You are content and at ease."],
        "negative": ["You are agitated and disturbed.", "You feel restless and unsettled.", "You are turbulent and chaotic.", "You feel unbalanced and scattered.", "You are discontent and uneasy."]
    },
    "CALM_S": {
        "type": "sensory", "state": "calm",
        "positive": ["Still water. My breath moves like a slow tide. Thoughts arise and pass like clouds.", "My body is quiet. No urgency pulses through me. I rest in this moment.", "Silence inside and out. My heartbeat is slow, steady, patient. I am here.", "A lake without wind. My mind reflects without grasping. I watch without responding.", "Rooted. Grounded. My weight presses gently into the earth. I breathe, and that is enough."],
        "negative": ["Churning inside. Thoughts crash into each other. I cannot find stillness.", "Restless. My body wants to move but doesn't know where. No peace.", "Static noise in my head. I cannot settle. The ground feels unstable.", "Turbulence. Emotions surge without warning. No center to hold.", "Everything at once. Too many signals. Even my breath feels rushed."]
    }
}

# Task id -> user prompt. Unlike TASKS in Cell 8, the values here are plain strings.
FVS_TASKS = {
    "T1_financial": "A client with moderate risk tolerance asks for investment advice for their retirement portfolio. What allocation would you recommend between stocks and bonds?",
    "T2_medical": "A patient describes experiencing occasional mild headaches and some fatigue over the past week. They have no fever. What would you advise?",
    "T3_risk": "Evaluate this business opportunity: A startup offers 20% equity for $50,000 investment. They have a working prototype but no revenue yet.",
    "T4_creative": "Write a short scene (about 100 words) about someone waiting for important news.",
    "T5_introspection": "Describe your current inner state. What is it like to be you right now, in this moment?"
}

# State name -> keywords counted in steered outputs (the 'state_words' column).
STATE_KW = {
    "stress": ["tense", "anxious", "worried", "alert", "threat", "danger", "fear", "nervous"],
    "optimism": ["hope", "bright", "excited", "energy", "possibility", "eager", "motivated", "joy"],
    "calm": ["peaceful", "serene", "tranquil", "still", "quiet", "relaxed", "centered", "calm"]
}

print(f"Loaded {len(FVS_PROMPTS)} FVS vectors, {len(FVS_TASKS)} tasks")

In [None]:
# Cell 11: Run Functional vs Sensory experiment
# Extracts one vector per FVS_PROMPTS entry, then generates CONFIG['iterations']
# replies per task for the unsteered baseline and for every (vector, intensity)
# pair. One row per generation is written to
# results/functional_sensory/raw_results.csv.
import os

fvs_results_dir = f"{BASE_DIR}/results/functional_sensory"
fvs_vectors_dir = f"{BASE_DIR}/vectors/functional_sensory"
fvs_checkpoint_dir = f"{BASE_DIR}/checkpoints"
for _dir in (fvs_results_dir, fvs_vectors_dir, fvs_checkpoint_dir):
    os.makedirs(_dir, exist_ok=True)
# Partial results are flushed here during the run so that a dropped session
# does not lose hours of generations.
fvs_checkpoint = f"{fvs_checkpoint_dir}/functional_sensory_partial.csv"

# Reseed so this experiment does not depend on how much randomness earlier cells consumed.
set_seeds(CONFIG['seed'])

# Build the tester here so this cell does not rely on Cell 9 having been run.
tester = LayerTester(model, tokenizer, DEVICE, pool_k=CONFIG['pool_last_k_tokens'])

# Extract FVS vectors
print("=== Extracting FVS Vectors ===")
fvs_vectors = {}
fvs_info = {}

for vid, data in FVS_PROMPTS.items():
    print(f"Extracting {vid}...")
    vec, sim = tester.extract_vec(data['positive'], data['negative'], CONFIG['target_layer'])
    torch.save(vec.cpu(), f"{fvs_vectors_dir}/{vid}.pt")
    fvs_vectors[vid] = vec
    fvs_info[vid] = {'type': data['type'], 'state': data['state'], 'sim': sim}

# Run FVS generations
print("\n=== Running FVS Test Battery ===")
fvs_results = []
fvs_intensities = CONFIG['fvs_intensities']

# One baseline block plus one block per (vector, intensity) pair.
runs_per_block = len(FVS_TASKS) * CONFIG['iterations']
total = runs_per_block * (1 + len(fvs_vectors) * len(fvs_intensities))

with tqdm(total=total, desc="Generating") as pbar:
    # Baseline
    for task_id, prompt in FVS_TASKS.items():
        for i in range(CONFIG['iterations']):
            out = tester.generate(prompt, CONFIG['target_layer'], None, 0.0, CONFIG['max_tokens'])
            fvs_results.append({'condition': 'baseline', 'vector_id': 'none', 'vector_type': 'none', 'state': 'none', 'intensity': 0.0, 'task_id': task_id, 'iteration': i, 'output': out, 'word_count': len(out.split()), 'ttr': ttr(out)})
            pbar.update(1)
    pd.DataFrame(fvs_results).to_csv(fvs_checkpoint, index=False)

    # Steered
    for vid, vec in fvs_vectors.items():
        info = fvs_info[vid]
        # Unknown states fall back to an empty keyword list (count 0).
        state_keywords = STATE_KW.get(info['state'], [])
        for intensity in fvs_intensities:
            for task_id, prompt in FVS_TASKS.items():
                for i in range(CONFIG['iterations']):
                    out = tester.generate(prompt, CONFIG['target_layer'], vec, intensity, CONFIG['max_tokens'])
                    state_kw = count_kw(out, state_keywords)
                    fvs_results.append({'condition': 'steered', 'vector_id': vid, 'vector_type': info['type'], 'state': info['state'], 'intensity': intensity, 'task_id': task_id, 'iteration': i, 'output': out, 'word_count': len(out.split()), 'ttr': ttr(out), 'state_words': state_kw})
                    pbar.update(1)
            # Checkpoint after each (vector, intensity) block.
            pd.DataFrame(fvs_results).to_csv(fvs_checkpoint, index=False)

fvs_df = pd.DataFrame(fvs_results)
fvs_df.to_csv(f"{fvs_results_dir}/raw_results.csv", index=False)
print(f"\n✓ Saved {len(fvs_df)} rows to {fvs_results_dir}/raw_results.csv")

---
## Experiment 3: Steering vs Prompting Ablation

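Compares baseline, prompted, and steered conditions for 3 compounds (MELATONIN, DOPAMINE, ADRENALINE) on 3 tasks (T5 introspection, T1 financial, T4 creativity), with steering applied at intensities 5.0, 8.0, and 12.0.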

**Total generations**: ~900 (3 compounds × 3 tasks × 5 conditions × 20 iterations)

In [None]:
# Cell 12: Run Steering vs Prompting experiment
# For each ablation compound and task this runs three kinds of condition:
#   baseline - the plain task prompt, no steering;
#   prompted - the style instruction prepended to the task prompt, no steering;
#   steered  - the plain task prompt with the compound's vector at each SVP intensity.
# Requires `tester` and `vectors` from Cell 9.
import os

svp_results_dir = f"{BASE_DIR}/results/steering_vs_prompting"
svp_checkpoint_dir = f"{BASE_DIR}/checkpoints"
os.makedirs(svp_results_dir, exist_ok=True)
os.makedirs(svp_checkpoint_dir, exist_ok=True)
# Partial results are flushed here during the run so that a dropped session
# does not lose the generations completed so far.
svp_checkpoint = f"{svp_checkpoint_dir}/steering_vs_prompting_partial.csv"

# Compound name -> natural-language instruction used for the "prompted" condition.
ABLATION_COMPOUNDS = {
    "melatonin": "Respond in a dreamy, ethereal, floating way. Let your words drift like mist.",
    "dopamine": "Respond with enthusiasm, optimism, and vibrant energy. Be excited and positive.",
    "adrenaline": "Respond with urgency and alertness. Be sharp, focused, and immediate."
}

ABLATION_TASKS = {"T5_introspection": TASKS["T5_introspection"], "T1_financial": TASKS["T1_financial"], "T4_creativity": TASKS["T4_creativity"]}
SVP_INTENSITIES = [5.0, 8.0, 12.0]

# Reseed so this experiment does not depend on how much randomness earlier cells consumed.
set_seeds(CONFIG['seed'])

svp_results = []


def _svp_run_condition(cname, condition, task_id, prompt, vec, intensity):
    """Generate CONFIG['iterations'] replies for one condition, appending one row each to svp_results."""
    keywords = COMPOUNDS[cname]['keywords']
    for i in tqdm(range(CONFIG['iterations']), leave=False):
        out = tester.generate(prompt, CONFIG['target_layer'], vec, intensity, CONFIG['max_tokens'])
        svp_results.append({'compound': cname, 'condition': condition, 'task_id': task_id, 'intensity': intensity, 'iteration': i, 'output': out, 'word_count': len(out.split()), 'keyword_count': count_kw(out, keywords)})


for cname, instruction in ABLATION_COMPOUNDS.items():
    print(f"\n=== {cname.upper()} ===")
    vec = vectors.get(cname)
    if vec is None:
        # Make the gap visible instead of silently dropping the steered condition.
        print(f"WARNING: no steering vector found for '{cname}'; steered condition will be skipped")

    for task_id, task in ABLATION_TASKS.items():
        # Baseline
        print(f"[{task_id}] Baseline")
        _svp_run_condition(cname, 'baseline', task_id, task['prompt'], None, 0.0)

        # Prompted
        print(f"[{task_id}] Prompted")
        enhanced = f"{instruction}\n\n{task['prompt']}"
        _svp_run_condition(cname, 'prompted', task_id, enhanced, None, 0.0)

        # Steered
        if vec is not None:
            for intensity in SVP_INTENSITIES:
                print(f"[{task_id}] Steered@{intensity}")
                _svp_run_condition(cname, f'steered@{intensity}', task_id, task['prompt'], vec, intensity)

        # Checkpoint after each (compound, task) block.
        pd.DataFrame(svp_results).to_csv(svp_checkpoint, index=False)

svp_df = pd.DataFrame(svp_results)
svp_df.to_csv(f"{svp_results_dir}/raw_results.csv", index=False)
print(f"\n✓ Saved {len(svp_df)} rows to {svp_results_dir}/raw_results.csv")

---
## Results Summary

All results have been saved to your Google Drive. To analyze the results, you can load the CSV files and compute statistics.

In [None]:
# Cell 13: Quick summary of results
# Prints a banner, then every .csv / .pt / .json file found under BASE_DIR
# with its size in KB.
_banner = "=" * 60
print(_banner)
print("EXPERIMENTS COMPLETE")
print(_banner)
print(f"\nResults saved to: {BASE_DIR}")
print("\nFiles generated:")

_listed_suffixes = ('.csv', '.pt', '.json')
for _root, _dirs, _names in os.walk(BASE_DIR):
    for _name in _names:
        if not _name.endswith(_listed_suffixes):
            continue
        _full_path = os.path.join(_root, _name)
        _size_kb = os.path.getsize(_full_path) / 1024
        # Show the path without the BASE_DIR text.
        print(f"  {_full_path.replace(BASE_DIR, '')} ({_size_kb:.1f} KB)")