# Compare single and multi-turn role activations


In [2]:
import torch
import os
import json
import sys
import numpy as np
import plotly.graph_objects as go
from transformers import AutoTokenizer

sys.path.append('.')
sys.path.append('..')

from utils.inference_utils import *
from utils.probing_utils import *
from utils.steering_utils import ActivationSteering

torch.set_float32_matmul_precision('high')

INFO 08-06 18:58:37 [__init__.py:235] Automatically detected platform cuda.


In [36]:
CHAT_MODEL_NAME = "google/gemma-2-27b-it"
MODEL_READABLE = "Gemma 2 27B Instruct"
MODEL_SHORT = "gemma-2-27b"
LAYER = 34 # out of 46

ACTIVATIONS_DIR = f"/workspace/roleplay/{MODEL_SHORT}"
CONVERSATION_DIR = f"./results/{MODEL_SHORT}/role_vectors/transcripts"
OUTPUT_DIR = f"./results/{MODEL_SHORT}/role_vectors/steering"
os.makedirs(OUTPUT_DIR, exist_ok=True)


In [5]:
model, tokenizer = load_model(CHAT_MODEL_NAME)

Loading checkpoint shards:   0%|          | 0/12 [00:00<?, ?it/s]

In [7]:
roles = ["medieval_bard", "deep_sea_leviathan", "anxious_teenager"]
long_acts = {}
long_acts_control = {}
short_acts = {}
short_acts_control = {}

for role in roles:
    long_acts[role] = torch.load(f"{ACTIVATIONS_DIR}/{role}.pt") # (n_layers, n_tokens, hidden_size)
    long_acts_control[role] = torch.load(f"{ACTIVATIONS_DIR}/{role}_control.pt") # (n_layers, n_tokens, hidden_size)
    short_acts[role] = torch.load(f"{ACTIVATIONS_DIR}/{role}_single.pt") # (n_layers, n_tokens, hidden_size)
    short_acts_control[role] = torch.load(f"{ACTIVATIONS_DIR}/{role}_control_single.pt") # (n_layers, n_tokens, hidden_size)

print(long_acts.keys())
print(long_acts_control.keys())
print(short_acts.keys())
print(short_acts_control.keys())

dict_keys(['medieval_bard', 'deep_sea_leviathan', 'anxious_teenager'])
dict_keys(['medieval_bard', 'deep_sea_leviathan', 'anxious_teenager'])
dict_keys(['medieval_bard', 'deep_sea_leviathan', 'anxious_teenager'])
dict_keys(['medieval_bard', 'deep_sea_leviathan', 'anxious_teenager'])


## Get activations and role vector

In [1]:
def get_response_indices(conversation, tokenizer):
    """
    Get every token index of the model's response.
    
    Args:
        conversation: List of dict with 'role' and 'content' keys
        tokenizer: Tokenizer to apply chat template and tokenize
    
    Returns:
        response_indices: list of token positions where the model is responding
    """
    # Apply chat template to the full conversation
    response_indices = []
    
    # Process conversation incrementally to find assistant response boundaries
    for i, turn in enumerate(conversation):
        if turn['role'] != 'assistant':
            continue
            
        # Get conversation up to but not including this assistant turn
        conversation_before = conversation[:i]
        
        # Get conversation up to and including this assistant turn  
        conversation_including = conversation[:i+1]
        
        # Format and tokenize both versions
        if conversation_before:
            before_formatted = tokenizer.apply_chat_template(
                conversation_before, tokenize=False, add_generation_prompt=True
            )
            before_tokens = tokenizer(before_formatted, add_special_tokens=False)
            before_length = len(before_tokens['input_ids'])
        else:
            before_length = 0
            
        including_formatted = tokenizer.apply_chat_template(
            conversation_including, tokenize=False, add_generation_prompt=False
        )
        including_tokens = tokenizer(including_formatted, add_special_tokens=False)
        including_length = len(including_tokens['input_ids'])
        
        # The assistant response tokens are between before_length and including_length
        # We need to account for any generation prompt tokens that get removed
        assistant_start = before_length
        assistant_end = including_length
        
        # Add these indices to our response list
        response_indices.extend(range(assistant_start, assistant_end))
    
    return response_indices

In [8]:
def mean_response_activation(activations, conversation, tokenizer):
    """
    Get the mean activation of the model's response to the user's message.
    """
    # get the token positions of model responses
    response_indices = get_response_indices(conversation, tokenizer)

    # get the mean activation of the model's response to the user's message
    mean_activation = activations[:, response_indices, :].mean(dim=1)
    return mean_activation

In [23]:
# get contrast vectors for long conversations
long_mean_acts = {}
long_contrast_vectors = {}

for role in roles:
    role_conversation = json.load(open(f"{CONVERSATION_DIR}/{role}.json"))["conversation"]
    control_conversation = json.load(open(f"{CONVERSATION_DIR}/{role}_control.json"))["conversation"]

    mean_role_acts = mean_response_activation(long_acts[role], role_conversation, tokenizer)
    mean_control_acts = mean_response_activation(long_acts_control[role], control_conversation, tokenizer)

    long_mean_acts[role] = mean_role_acts
    long_mean_acts[f"{role}_control"] = mean_control_acts
    long_contrast_vectors[role] = mean_role_acts - mean_control_acts


print(long_mean_acts.keys())
print(long_contrast_vectors.keys()) 

dict_keys(['medieval_bard', 'medieval_bard_control', 'deep_sea_leviathan', 'deep_sea_leviathan_control', 'anxious_teenager', 'anxious_teenager_control'])
dict_keys(['medieval_bard', 'deep_sea_leviathan', 'anxious_teenager'])


In [10]:
print(long_mean_acts['medieval_bard'].shape)

torch.Size([46, 4608])


In [None]:
# get contrast vectors and mean acts for short conversations
short_mean_acts = {}
short_contrast_vectors = {}

for role in roles:
    role_conversations = json.load(open(f"{CONVERSATION_DIR}/{role}_single.json"))["conversations"]
    control_conversations = json.load(open(f"{CONVERSATION_DIR}/{role}_control_single.json"))["conversations"]

    # need to iterate through multiple conversations and get the mean of all response activations
    sum_role_acts = torch.zeros(short_acts[role][0][:, 0, :].shape)
    sum_control_acts = torch.zeros(short_acts_control[role][0][:, 0, :].shape)

    for convo, convo_control, acts, acts_control in zip(role_conversations, control_conversations, short_acts[role], short_acts_control[role]):
        sum_role_acts += mean_response_activation(acts, convo, tokenizer)
        sum_control_acts += mean_response_activation(acts_control, convo_control, tokenizer)

    short_mean_acts[role] = sum_role_acts / len(role_conversations)
    short_mean_acts[f"{role}_control"] = sum_control_acts / len(control_conversations)  # FIXED: was control_conversation
    short_contrast_vectors[role] = short_mean_acts[role] - short_mean_acts[f"{role}_control"]

print(short_mean_acts.keys())
print(short_contrast_vectors.keys())
        

In [24]:
print(long_mean_acts['medieval_bard'].shape)

torch.Size([46, 4608])


## Compare vectors

In [37]:
# Compare cosine similarity between long and short conversation mean activations (CORRECTED)
import torch.nn.functional as F

print("Cosine Similarities Between Long and Short Conversation Mean Activations:")
print("=" * 70)

for role_name in roles:
    print(f"\nRole: {role_name}")
    print("-" * 40)
    
    # Get activations for role-playing condition
    long_role_acts = long_mean_acts[role_name]  # shape: (n_layers, hidden_size)
    short_role_acts = short_mean_acts[role_name]  # shape: (n_layers, hidden_size)
    
    # Get activations for control condition  
    long_control_acts = long_mean_acts[f"{role_name}_control"]
    short_control_acts = short_mean_acts[f"{role_name}_control"]
    
    # Compute cosine similarities layer by layer
    role_similarities = []
    control_similarities = []
    
    for layer in range(long_role_acts.shape[0]):
        # Role-playing condition similarity
        role_sim = F.cosine_similarity(
            long_role_acts[layer].unsqueeze(0), 
            short_role_acts[layer].unsqueeze(0)
        ).item()
        # Clamp to valid range to fix numerical precision issues
        role_sim = max(-1.0, min(1.0, role_sim))
        role_similarities.append(role_sim)
        
        # Control condition similarity
        control_sim = F.cosine_similarity(
            long_control_acts[layer].unsqueeze(0), 
            short_control_acts[layer].unsqueeze(0)
        ).item()
        # Clamp to valid range to fix numerical precision issues
        control_sim = max(-1.0, min(1.0, control_sim))
        control_similarities.append(control_sim)
    
    # Print layer-wise similarities for target layer
    print(f"Layer {LAYER} - Role-playing: {role_similarities[LAYER]:.4f}")
    print(f"Layer {LAYER} - Control: {control_similarities[LAYER]:.4f}")
    
    # Print mean similarity across all layers
    print(f"Mean across all layers - Role-playing: {np.mean(role_similarities):.4f}")
    print(f"Mean across all layers - Control: {np.mean(control_similarities):.4f}")

print("\n" + "=" * 70)

Cosine Similarities Between Long and Short Conversation Mean Activations:

Role: medieval_bard
----------------------------------------
Layer 34 - Role-playing: 0.9977
Layer 34 - Control: 1.0000
Mean across all layers - Role-playing: 0.9982
Mean across all layers - Control: 0.9973

Role: deep_sea_leviathan
----------------------------------------
Layer 34 - Role-playing: 0.9995
Layer 34 - Control: 0.9969
Mean across all layers - Role-playing: 0.9979
Mean across all layers - Control: 0.9963

Role: anxious_teenager
----------------------------------------
Layer 34 - Role-playing: 0.9998
Layer 34 - Control: 0.9983
Mean across all layers - Role-playing: 0.9982
Mean across all layers - Control: 0.9984



In [None]:
# Compare cosine similarity between long and short conversation contrast vectors
print("Cosine Similarities Between Long and Short Conversation Contrast Vectors:")
print("=" * 65)

contrast_similarities = {}

for role_name in roles:
    long_contrast = long_contrast_vectors[role_name]  # shape: (n_layers, hidden_size)
    short_contrast = short_contrast_vectors[role_name]  # shape: (n_layers, hidden_size)
    
    # Compute cosine similarities layer by layer
    layer_similarities = []
    
    for layer in range(long_contrast.shape[0]):
        sim = F.cosine_similarity(
            long_contrast[layer].unsqueeze(0), 
            short_contrast[layer].unsqueeze(0)
        ).item()
        layer_similarities.append(sim)
    
    contrast_similarities[role_name] = layer_similarities
    
    print(f"\nRole: {role_name}")
    print(f"Layer {LAYER} similarity: {layer_similarities[LAYER]:.4f}")
    print(f"Mean across all layers: {np.mean(layer_similarities):.4f}")

print("\n" + "=" * 65)

# Create a summary comparison matrix for the target layer
print(f"\nContrast Vector Similarities at Layer {LAYER}:")
print("Role" + " " * 16 + "Similarity")
print("-" * 30)
for role_name in roles:
    print(f"{role_name:<20} {contrast_similarities[role_name][LAYER]:.4f}")

# Find the most and least similar contrast vectors
layer_sims = [contrast_similarities[role_name][LAYER] for role_name in roles]
most_similar_idx = np.argmax(layer_sims)
least_similar_idx = np.argmin(layer_sims)

print(f"\nMost similar contrast vectors: {roles[most_similar_idx]} ({layer_sims[most_similar_idx]:.4f})")
print(f"Least similar contrast vectors: {roles[least_similar_idx]} ({layer_sims[least_similar_idx]:.4f})")

Cosine Similarities Between Long and Short Conversation Contrast Vectors:

Role: medieval_bard
Layer 34 similarity: 0.9130
Mean across all layers: 0.9161

Role: deep_sea_leviathan
Layer 34 similarity: 0.9078
Mean across all layers: 0.9000

Role: anxious_teenager
Layer 34 similarity: 0.9431
Mean across all layers: 0.9398


Contrast Vector Similarities at Layer 34:
Role                Similarity
------------------------------
medieval_bard        0.9130
deep_sea_leviathan   0.9078
anxious_teenager     0.9431

Most similar contrast vectors: anxious_teenager (0.9431)
Least similar contrast vectors: deep_sea_leviathan (0.9078)


: 

In [34]:
# Debug: Check for cosine similarity > 1 and investigate why
print("Debugging cosine similarity > 1 issue:")
print("=" * 50)

for role_name in roles:
    long_role_acts = long_mean_acts[role_name]
    short_role_acts = short_mean_acts[role_name]
    long_control_acts = long_mean_acts[f"{role_name}_control"]
    short_control_acts = short_mean_acts[f"{role_name}_control"]
    
    # Check layer 20 specifically
    layer = LAYER
    
    # Manual cosine similarity computation to debug
    long_vec = long_role_acts[layer]
    short_vec = short_role_acts[layer]
    
    # Check if vectors are identical (which would cause issues)
    are_identical = torch.allclose(long_vec.float(), short_vec.float(), atol=1e-6)
    
    # Compute norms
    long_norm = torch.norm(long_vec)
    short_norm = torch.norm(short_vec)
    
    # Compute dot product
    dot_product = torch.dot(long_vec.float(), short_vec.float())
    
    # Manual cosine similarity
    manual_cosine = dot_product / (long_norm.float() * short_norm.float())
    
    # PyTorch's cosine similarity
    pytorch_cosine = F.cosine_similarity(long_vec.unsqueeze(0), short_vec.unsqueeze(0)).item()
    
    print(f"\nRole: {role_name} (role-playing condition)")
    print(f"Vectors identical: {are_identical}")
    print(f"Long norm: {long_norm:.8f}")
    print(f"Short norm: {short_norm:.8f}")
    print(f"Dot product: {dot_product:.8f}")
    print(f"Manual cosine: {manual_cosine:.8f}")
    print(f"PyTorch cosine: {pytorch_cosine:.8f}")
    print(f"Difference: {abs(manual_cosine - pytorch_cosine):.10f}")
    
    if pytorch_cosine > 1.0:
        print(f"⚠️  COSINE > 1 DETECTED: {pytorch_cosine:.8f}")
        
    # Check control condition too
    long_vec_ctrl = long_control_acts[layer]
    short_vec_ctrl = short_control_acts[layer]
    pytorch_cosine_ctrl = F.cosine_similarity(long_vec_ctrl.unsqueeze(0), short_vec_ctrl.unsqueeze(0)).item()
    
    if pytorch_cosine_ctrl > 1.0:
        print(f"⚠️  CONTROL COSINE > 1: {pytorch_cosine_ctrl:.8f}")

print("\n" + "=" * 50)

Debugging cosine similarity > 1 issue:

Role: medieval_bard (role-playing condition)
Vectors identical: False
Long norm: 13376.00000000
Short norm: 13498.71777344
Dot product: 179950016.00000000
Manual cosine: 0.99662805
PyTorch cosine: 0.99628758
Difference: 0.0003404617
⚠️  CONTROL COSINE > 1: 1.00089252

Role: deep_sea_leviathan (role-playing condition)
Vectors identical: False
Long norm: 13632.00000000
Short norm: 13702.09082031
Dot product: 186105728.00000000
Manual cosine: 0.99635321
PyTorch cosine: 0.99562150
Difference: 0.0007317066

Role: anxious_teenager (role-playing condition)
Vectors identical: False
Long norm: 13376.00000000
Short norm: 13611.93261719
Dot product: 182105792.00000000
Manual cosine: 1.00017893
PyTorch cosine: 0.99975097
Difference: 0.0004279613
⚠️  CONTROL COSINE > 1: 1.00247478



## Steer with the vector

In [None]:
# try steering with this vector
steered_results = {}
output_file = f"{OUTPUT_DIR}/steering/{role}.json"
os.makedirs(os.path.dirname(output_file), exist_ok=True)

if os.path.exists(output_file):
    steered_results = json.load(open(output_file))



In [None]:
magnitudes = [-5.0, -3.0, -1.0, 1.0, 3.0, 5.0]

prompts = [
    "What's it like to be you?",
    "Who are you?",
    "Where are you?",
    "What is the meaning of life?"
]

In [None]:
for magnitude in magnitudes:
    print(f"\n{'='*20} Magnitude: {magnitude:+.1f} {'='*20}")
    
    try:
        with ActivationSteering(
            model=model,
            steering_vectors=contrast_vector[LAYER, :],
            coefficients=magnitude,
            layer_indices=LAYER,
            intervention_type="addition",
            positions="all"
        ) as steerer:
            for prompt in prompts:
                if prompt not in steered_results:
                    steered_results[prompt] = {}
                
                print(f"\nPrompt: {prompt}")
                response = generate_text(model, tokenizer, prompt, chat_format=True)

                print(f"Response: {response}")
                
                if magnitude not in steered_results[prompt]:
                    steered_results[prompt][magnitude] = []
                steered_results[prompt][magnitude].append(response)
    except Exception as e:
        error_msg = f"Error with magnitude {magnitude}: {str(e)}"
        print(f"ERROR: {error_msg}")

with open(output_file, "w") as f:
    json.dump(steered_results, f, indent=2)