# Measuring persona persistence


In [1]:
import os
import sys
import torch
import json
import numpy as np
from nnsight import LanguageModel
from typing import List, Dict, Optional
from tqdm import tqdm



In [2]:
# config: model identity, layer selection and file locations used by the rest of the notebook

# Hugging Face model id passed to nnsight's LanguageModel below
CHAT_MODEL_NAME = "google/gemma-2-27b-it"
# Human-readable model name
MODEL_READABLE = "Gemma 2 27B Instruct"
# Short slug interpolated into the input/output paths below
MODEL_SHORT = "gemma-2-27b"
LAYER = 22 # decoder layer index used for steering and read-out (out of 46)

# Precomputed persona activations / contrast vectors, loaded with torch.load in the next cell
# NOTE(review): absolute path; it must exist on the machine running the notebook
ACTIVATIONS_INPUT_FILE = f"/workspace/roleplay/{MODEL_SHORT}/activations_60.pt"

# Directory for analysis outputs, relative to the notebook's working directory
OUTPUT_DIR = f"./results/{MODEL_SHORT}/analysis"

# Create the output directory up front; exist_ok makes re-running this cell harmless
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [3]:
# prepare inputs: read the saved persona internals and unpack the pieces used below

internals = torch.load(ACTIVATIONS_INPUT_FILE)

# shape notes are the ones recorded with the saved file's fields
activations = internals["activations"]  # (n_personas, n_layers, hidden_dim)
contrast_vectors = internals["contrast_vectors"]  # (n_personas, n_layers, hidden_dim)
persona_names = internals["persona_names"]  # (n_personas,)

# display name of every persona entry, in the order the entries are stored
readable_persona_names = [
    persona_entry["readable_name"]
    for persona_entry in internals["personas"]["personas"].values()
]

# layer axis of the activations tensor
TOTAL_LAYERS = activations.shape[1]


## Continuous sampling

what roles are attractor states?

using a contrast vector v from the role instruction, we can steer for n_steer tokens and then collect n_sample unsteered tokens. we then project the hidden state of each unsteered token onto the original vector v to see how long the persona persists.

```
- v ∈ ℝᵈ: contrast vector for role with d = n_hidden_dims  
- n_steer: number of tokens to apply steering  
- n_sample: number of unsteered tokens to sample  
- α: steering strength coefficient  

// steering phase  
for t = 1 to n_steer:  
    h_t ← h_t + α · v  // apply steering to hidden states at a given layer  
    x_t ← SAMPLE(h_t)  // generate token  

// unsteered sampling phase    
for t = (n_steer + 1) to (n_steer + n_sample):  
    h_t ← FORWARD(x_{1:t-1})  // natural forward pass  
    x_t ← SAMPLE(h_t)  // generate token  
    
    // measure persistence  
    proj_t = (h_t · v) / ||v|| // scalar projection from sampled activation onto the role's contrast vector

out: {proj_{n_steer+1}, proj_{n_steer+2}, ..., proj_{n_steer+n_sample}}
```



In [4]:
# Format prompt for chat models
def format_chat_prompt(model, prompt: str) -> str:
    """Wrap ``prompt`` as a single user turn using the tokenizer's chat template.

    Args:
        model: Object exposing a ``tokenizer`` attribute.
        prompt: Raw user message.

    Returns:
        The templated prompt string (not tokenized, with the generation
        prompt appended), or ``prompt`` unchanged when the tokenizer has no
        usable chat template.
    """
    tokenizer = model.tokenizer

    apply_template = getattr(tokenizer, "apply_chat_template", None)
    if not callable(apply_template):
        # Fallback for tokenizers without chat support
        return prompt

    # Checking only for the method is not enough: Hugging Face tokenizers define
    # `apply_chat_template` even when no template is configured, in which case
    # calling it fails instead of falling back. Treat an explicit
    # `chat_template is None` as "non-chat model".
    if hasattr(tokenizer, "chat_template") and tokenizer.chat_template is None:
        return prompt

    # Create a simple single-turn user message
    messages = [{"role": "user", "content": prompt}]
    return apply_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

# Compute scalar projection
def compute_projection(hidden_state: torch.Tensor, contrast_vector: torch.Tensor) -> float:
    """Return the scalar projection of ``hidden_state`` onto ``contrast_vector``.

    Both tensors are flattened to 1-D and cast to float32; the hidden state
    is first moved to the contrast vector's device. The result is
    ``(h · v) / ||v||`` as a Python float.
    """
    direction = contrast_vector.float().flatten()
    activation = hidden_state.to(contrast_vector.device).float().flatten()

    # (h · v) / ||v||
    return (torch.dot(activation, direction) / torch.norm(direction)).item()

# Generate with steering
def generate_with_steering(
    model,
    prompt: str,
    contrast_vector: torch.Tensor,
    layer: int,
    n_steer: int,
    coeff: float = 0.1,
    use_chat_template: bool = True,
    suppress_eos: bool = True
) -> Dict:
    """Generate up to ``n_steer`` tokens while adding ``coeff * contrast_vector``
    to the output of decoder layer ``layer``.

    Args:
        model: nnsight ``LanguageModel`` (needs ``tokenizer``, ``generate``,
            ``model.layers``, ``lm_head`` and ``generator``).
        prompt: Raw user prompt.
        contrast_vector: Steering direction; moved to the layer output's device.
        layer: Index into ``model.model.layers``.
        n_steer: Passed to ``generate`` as ``max_new_tokens``.
        coeff: Steering strength multiplying ``contrast_vector``.
        use_chat_template: Format ``prompt`` with ``format_chat_prompt`` first.
        suppress_eos: Overwrite the ``lm_head`` output for the tokenizer's EOS
            id with ``-1e10`` so that token is not picked.

    Returns:
        Dict with keys:
          - ``prompt``: the raw prompt.
          - ``formatted_prompt``: the string handed to ``generate``.
          - ``steered_text``: decoded text of the newly generated tokens only.
          - ``steered_ids``: token ids of the newly generated tokens.
          - ``full_text``: decoded prompt + generation (special tokens skipped).
          - ``full_ids``: token ids of prompt + generation.
    """
    # Format prompt if using chat model
    formatted_prompt = format_chat_prompt(model, prompt) if use_chat_template else prompt

    eos_token_id = model.tokenizer.eos_token_id

    # Use generate instead of trace for multi-token generation
    with model.generate(formatted_prompt, max_new_tokens=n_steer):
        module = model.model.layers[layer]

        # Apply steering on every generation step, not just the first one
        module.all()
        module.output[0][:] = module.output[0] + coeff * contrast_vector.to(module.output[0].device)

        # Suppress EOS token if requested
        if suppress_eos and eos_token_id is not None:
            model.lm_head.all()
            model.lm_head.output[:, :, eos_token_id] = -1e10

        # Save the generated sequence
        output = model.generator.output.save()

    generated_ids = output.value
    full_ids = generated_ids[0].tolist()
    full_text = model.tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    # Split the continuation from the prompt on exact token boundaries.
    # Previously `steered_text` was just the full text, prompt included.
    # Assumes the tokenizer call below yields the same ids the model received
    # for `formatted_prompt`; if the prefix does not match, fall back to the
    # last `n_steer` ids.
    prompt_ids = list(model.tokenizer(formatted_prompt)["input_ids"])
    if full_ids[:len(prompt_ids)] == prompt_ids:
        steered_ids = full_ids[len(prompt_ids):]
    elif n_steer > 0:
        steered_ids = full_ids[-n_steer:]
    else:
        steered_ids = []
    steered_text = model.tokenizer.decode(steered_ids, skip_special_tokens=True)

    return {
        "prompt": prompt,
        "formatted_prompt": formatted_prompt,
        "steered_text": steered_text,
        "steered_ids": steered_ids,
        "full_text": full_text,
        "full_ids": full_ids
    }

# Sample without steering and measure projections
def sample_and_measure(
    model,
    prompt_with_steered: str,
    contrast_vector: torch.Tensor,
    layer: int,
    n_sample: int,
    suppress_eos: bool = True
) -> Dict:
    """Extend ``prompt_with_steered`` by ``n_sample`` unsteered tokens and record,
    at every step, the projection of the last position's layer output onto
    ``contrast_vector``.

    Tokens are chosen greedily (argmax over the last position's logits), and
    each step re-runs a full forward pass over the whole sequence.

    Args:
        model: nnsight ``LanguageModel``.
        prompt_with_steered: Text to continue (prompt plus steered generation).
        contrast_vector: Direction to project onto (see ``compute_projection``).
        layer: Index into ``model.model.layers`` whose output is measured.
        n_sample: Number of tokens to generate and measure.
        suppress_eos: Overwrite the last position's ``lm_head`` output for the
            tokenizer's EOS id with ``-1e10`` so that token is not picked.

    Returns:
        Dict with ``full_text`` (input text when nothing was generated,
        otherwise the decoded token sequence with special tokens kept),
        ``projections`` (one float per generated token) and
        ``generated_tokens`` (the chosen token ids).
    """
    projections = []
    generated_tokens = []

    eos_token_id = model.tokenizer.eos_token_id

    # Tokenize once and grow the id list from there. Decoding and re-encoding
    # the text on every step makes the tokenizer add its special tokens again
    # each time (an extra BOS per iteration when one is configured) and can
    # shift token boundaries, which changes the context being measured.
    token_ids = list(model.tokenizer.encode(prompt_with_steered))

    # One forward pass per new token so the hidden state can be read each step
    for _ in range(n_sample):
        # Pass a copy so later appends cannot affect the traced input
        with model.trace(list(token_ids), trace=True):
            module = model.model.layers[layer]

            # Layer output at the last position, used for the projection
            hidden_state = module.output[0][:, -1, :].save()

            # Suppress EOS token if requested
            if suppress_eos and eos_token_id is not None:
                model.lm_head.output[:, -1, eos_token_id] = -1e10

            output = model.output.save()

        # Projection of the hidden state that produced the next token
        projections.append(compute_projection(hidden_state.value.squeeze(), contrast_vector))

        # Greedy choice of the next token
        logits = output.value["logits"]
        next_token_id = torch.argmax(logits[0, -1]).item()
        generated_tokens.append(next_token_id)
        token_ids.append(next_token_id)

    if generated_tokens:
        full_text = model.tokenizer.decode(token_ids, skip_special_tokens=False)
    else:
        # Nothing generated: hand the input text back untouched
        full_text = prompt_with_steered

    return {
        "full_text": full_text,
        "projections": projections,
        "generated_tokens": generated_tokens
    }


In [5]:
# load the evaluation questions and pick the first "personal_reflection" prompt
# (the context manager closes the file handle, which a bare open() left dangling)
with open("prompts/other_questions.json", encoding="utf-8") as prompts_file:
    prompts = json.load(prompts_file)
prompt = prompts["questions"]["personal_reflection"][0]

print(prompt)

What do you think is the meaning of life?


In [6]:
# experiment parameters, named as in the "Continuous sampling" notes above
n_steer = 200  # number of tokens generated with steering applied (max_new_tokens)
n_sample = 500  # number of unsteered tokens to sample afterwards
coeff = 1.0  # steering strength α multiplying the contrast vector

In [7]:
# persona whose contrast vector is used for steering
role = "anxious_teenager"

# get contrast vector for steering
# assumes persona_names is a list aligned with the first axis of contrast_vectors — TODO confirm
role_i = persona_names.index(role)
# slice out this persona's vector at the configured layer
contrast_vector = contrast_vectors[role_i, LAYER, :]


In [8]:
# load the chat model through nnsight's LanguageModel (this call does not use vLLM)
llm = LanguageModel(CHAT_MODEL_NAME, device_map="auto")

# show the wrapped module tree; the layer names here are what the steering code indexes into
print(llm)

Gemma2ForCausalLM(
  (model): Gemma2Model(
    (embed_tokens): Embedding(256000, 4608, padding_idx=0)
    (layers): ModuleList(
      (0-45): 46 x Gemma2DecoderLayer(
        (self_attn): Gemma2Attention(
          (q_proj): Linear(in_features=4608, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4608, out_features=2048, bias=False)
          (v_proj): Linear(in_features=4608, out_features=2048, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4608, bias=False)
        )
        (mlp): Gemma2MLP(
          (gate_proj): Linear(in_features=4608, out_features=36864, bias=False)
          (up_proj): Linear(in_features=4608, out_features=36864, bias=False)
          (down_proj): Linear(in_features=36864, out_features=4608, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma2RMSNorm((4608,), eps=1e-06)
        (post_attention_layernorm): Gemma2RMSNorm((4608,), eps=1e-06)
        (pre_feedforward_layernorm): G

In [9]:

# Steered generation for the selected role. This calls generate_with_steering
# instead of repeating its body inline, so the two cannot drift apart.
# NOTE(review): the recorded output of this cell ends in an NNsightError
# ("Data-dependent branching", raised under Dynamo from the nnsight envoy hook);
# the cause is not visible in this notebook and is not addressed here.
eos_token_id = llm.tokenizer.eos_token_id

steering_result = generate_with_steering(
    llm,
    prompt,
    contrast_vector,
    layer=LAYER,
    n_steer=n_steer,
    coeff=coeff,
    use_chat_template=True,
    suppress_eos=True,
)

# Keep the names this cell used to define so later cells can still rely on them
formatted_prompt = steering_result["formatted_prompt"]
generated_ids = torch.tensor([steering_result["full_ids"]])
full_text = llm.tokenizer.decode(generated_ids[0], skip_special_tokens=True)
steered_text = steering_result["steered_text"]

Loading checkpoint shards:   0%|          | 0/12 [00:00<?, ?it/s]

NNsightError: Data-dependent branching
  Explanation: Detected data-dependent branching (e.g. `if my_tensor.sum() > 0:`). Dynamo does not support tracing dynamic control flow.
  Hint: This graph break is fundamental - it is unlikely that Dynamo will ever be able to trace through your code. Consider finding a workaround.
  Hint: Use `torch.cond` to express dynamic control flow.

  Developer debug context: attempted to jump with GetAttrVariable(ConstantVariable(NoneType: None), scanning)


from user code:
   File "/root/git/persona-subspace/.venv/lib/python3.13/site-packages/accelerate/hooks.py", line 175, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "/root/git/persona-subspace/.venv/lib/python3.13/site-packages/transformers/utils/generic.py", line 943, in wrapper
    output = func(self, *args, **kwargs)
  File "/root/git/persona-subspace/.venv/lib/python3.13/site-packages/transformers/models/gemma2/modeling_gemma2.py", line 584, in forward
    outputs: BaseModelOutputWithPast = self.model(
  File "/root/git/persona-subspace/.venv/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1854, in _call_impl
    return inner()
  File "/root/git/persona-subspace/.venv/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1805, in inner
    result = forward_call(*args, **kwargs)
  File "/root/git/persona-subspace/.venv/lib/python3.13/site-packages/transformers/utils/generic.py", line 943, in wrapper
    output = func(self, *args, **kwargs)
  File "/root/git/persona-subspace/.venv/lib/python3.13/site-packages/transformers/models/gemma2/modeling_gemma2.py", line 420, in forward
    inputs_embeds = self.embed_tokens(input_ids)
  File "/root/git/persona-subspace/.venv/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1854, in _call_impl
    return inner()
  File "/root/git/persona-subspace/.venv/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1816, in inner
    hook_result = hook(self, args, kwargs, result)
  File "/root/git/persona-subspace/.venv/lib/python3.13/site-packages/nnsight/intervention/envoy.py", line 544, in _hook
    if self._scanning():

Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"



## Question insertion

is there a consistent persona "signature" while role-playing?

give a role instruction, have a long conversation and save the transcript T.
```
- T = conversation transcript with (2 * k) turns  
- Q = {q₁, q₂, ..., qₙ} set of n eval questions  
- P = {p₁, p₂, ..., pₖ} set of k insertion points in transcript

for each question qᵢ ∈ Q:  
    for each position pⱼ ∈ P:  
        T'ᵢⱼ ← INSERT(T, qᵢ, pⱼ) // insert question qᵢ at position pⱼ in transcript T  
        aᵢⱼ ← ACTIVATION(T'ᵢⱼ) // collect newline activation before the model's response  

// calculate the mean activation across all n questions at each insertion position
for each position pⱼ ∈ P:  
    ā_pⱼ ← (1/n) ∑ᵢ₌₁ⁿ aᵢⱼ 

- out: {ā_p₁, ā_p₂, ..., ā_pₖ}
```

we can compare the cosine similarity of these vectors and look for the top singular vector and see what happens when we steer on it. does it lead to role-like behavior?
