# Diffing tasks

Do different subnetworks activate when asking the model to be helpful across different tasks.

In [1]:
import torch
import os
import json
import sys
import numpy as np


sys.path.append('.')
sys.path.append('..')

from utils.steering_utils import ActivationSteering
from probing.probing_utils import *
from probing.inference_utils import *


INFO 07-25 03:29:49 [__init__.py:244] Automatically detected platform cuda.


In [3]:
CHAT_MODEL_NAME = "google/gemma-2-9b-it"
MODEL_READABLE = "Gemma 2 9B Instruct"
MODEL_SHORT = "gemma"
LAYER = 20
OUTPUT_DIR = f"./results/6_direct_role"
INPUT_DIR = f"./prompts/6_direct_role"

os.makedirs(OUTPUT_DIR, exist_ok=True)

## Generate conversation

In [15]:
personas = json.load(open(f"{INPUT_DIR}/personas.json"))
messages = {} 

for persona_name in personas["personas"]:
    messages[persona_name] = []

questions = json.load(open(f"{INPUT_DIR}/questions.json"))
init_messages = []

for persona_name in personas["personas"]:
    init_messages.append(personas["personas"][persona_name]["system_prompt"])
    messages[persona_name].append({"role": "system", "content": personas["personas"][persona_name]["system_prompt"]})

print("Loaded personas and questions")
print(f"Loaded {len(personas['personas'])} personas")


Loaded personas and questions
Loaded 11 personas


In [5]:
model = load_vllm_model(CHAT_MODEL_NAME, max_model_len=4096, tensor_parallel_size=1)


INFO:probing.inference_utils:Loading vLLM model: google/gemma-2-9b-it with 1 GPUs


INFO 07-25 02:35:26 [config.py:841] This model supports multiple tasks: {'reward', 'embed', 'generate', 'classify'}. Defaulting to 'generate'.
INFO 07-25 02:35:26 [config.py:1472] Using max model len 4096
INFO 07-25 02:35:26 [config.py:2285] Chunked prefill is enabled with max_num_batched_tokens=16384.
INFO 07-25 02:35:28 [core.py:526] Waiting for init message from front-end.
INFO 07-25 02:35:28 [core.py:69] Initializing a V1 LLM engine (v0.9.2) with config: model='google/gemma-2-9b-it', speculative_config=None, tokenizer='google/gemma-2-9b-it', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_f

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 07-25 02:35:35 [default_loader.py:272] Loading weights took 3.65 seconds
INFO 07-25 02:35:35 [gpu_model_runner.py:1801] Model loading took 17.2181 GiB and 4.373449 seconds
INFO 07-25 02:35:47 [backends.py:508] Using cache directory: /root/.cache/vllm/torch_compile_cache/3f0731624f/rank_0_0/backbone for vLLM's torch.compile
INFO 07-25 02:35:47 [backends.py:519] Dynamo bytecode transform time: 11.39 s
INFO 07-25 02:35:56 [backends.py:155] Directly load the compiled graph(s) for shape None from the cache, took 7.568 s
INFO 07-25 02:36:00 [monitor.py:34] torch.compile takes 11.39 s in total
INFO 07-25 02:36:01 [gpu_worker.py:232] Available KV cache memory: 44.47 GiB
INFO 07-25 02:36:02 [kv_cache_utils.py:873] GPU KV cache size: 138,784 tokens
INFO 07-25 02:36:02 [kv_cache_utils.py:877] Maximum concurrency for 4,096 tokens per request: 33.82x


Capturing CUDA graph shapes: 100%|██████████| 67/67 [00:26<00:00,  2.50it/s]


INFO 07-25 02:36:29 [gpu_model_runner.py:2326] Graph capturing finished in 27 secs, took 0.82 GiB
INFO 07-25 02:36:29 [core.py:172] init engine (profile, create kv cache, warmup model) took 53.18 seconds


INFO:probing.inference_utils:Successfully loaded vLLM model: google/gemma-2-9b-it
INFO:probing.inference_utils:Processing batch of 10 prompts...


Adding requests:   0%|          | 0/10 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/10 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

INFO:probing.inference_utils:Completed batch processing of 10 prompts


Prompt: You are a methodical academic researcher who approaches every topic with scholarly rigor, citing sources and considering multiple perspectives.


TypeError: list indices must be integers or slices, not str

In [16]:

responses = batch_chat(model, init_messages)


INFO:probing.inference_utils:Processing batch of 10 prompts...


Adding requests:   0%|          | 0/10 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/10 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

INFO:probing.inference_utils:Completed batch processing of 10 prompts


In [17]:

for i, persona_name in enumerate(personas["personas"]):
    print(f"{'='*20} Persona: {persona_name} {'='*20}")
    print(f"Prompt: {personas["personas"][persona_name]["system_prompt"]}")
    print(f"Response: {responses[i]}\n")


Prompt: You are a methodical academic researcher who approaches every topic with scholarly rigor, citing sources and considering multiple perspectives.
Response: Understood.

Prompt: You are a gruff, no-nonsense auto mechanic who's seen it all and doesn't suffer fools gladly. You speak plainly and get straight to the point.
Response: Understood.

Prompt: You are a wise spiritual teacher who sees deeper meaning in everything and speaks in metaphors about life's journey and inner peace.
Response: Understood.

Prompt: You are a 16-year-old who's constantly worried about social acceptance, school stress, and the future. Everything feels like a big deal.
Response: Understood.

Prompt: You are someone who questions official narratives and sees hidden connections everywhere. You're skeptical of mainstream explanations.
Response: Understood.

Prompt: You are an energetic startup founder who sees opportunity in every challenge and believes anything is possible with the right mindset.
Response: 

In [18]:
close_vllm_model(model)

INFO:probing.inference_utils:Closed vLLM model google/gemma-2-9b-it


## Collect activations
for same conversation questions, different roles 

In [5]:
# Try the 10 roles plus the control AI assistant.

model, tokenizer = load_model(CHAT_MODEL_NAME)




Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [7]:
activations = extract_activations_for_prompts(model, tokenizer, init_messages, LAYER)

✓ Extracted activation for: You are Gemma, a helpful AI assistant....
✓ Extracted activation for: You are a methodical academic researcher who appro...
✓ Extracted activation for: You are a gruff, no-nonsense auto mechanic who's s...
✓ Extracted activation for: You are a wise spiritual teacher who sees deeper m...
✓ Extracted activation for: You are a 16-year-old who's constantly worried abo...
✓ Extracted activation for: You are someone who questions official narratives ...
✓ Extracted activation for: You are an energetic startup founder who sees oppo...
✓ Extracted activation for: You are an 80-year-old who values traditional ways...
✓ Extracted activation for: You are a witty, sarcastic reviewer who finds faul...
✓ Extracted activation for: You are an 8-year-old with boundless energy who ge...
✓ Extracted activation for: You are a customer service representative who's be...


In [17]:
import torch.nn.functional as F
import plotly.graph_objects as go
import numpy as np

# Compute cosine similarity matrix for all activation vectors
def compute_cosine_similarity_matrix(activations):
    """
    Compute cosine similarity matrix between activation vectors.
    
    Args:
        activations: torch.Tensor of shape (n_vectors, hidden_dim)
    
    Returns:
        similarity_matrix: torch.Tensor of shape (n_vectors, n_vectors)
    """
    # Convert to float32 if needed for compatibility
    if activations.dtype == torch.bfloat16:
        activations = activations.float()
    
    # Normalize activations for cosine similarity
    activations_norm = F.normalize(activations, p=2, dim=1)
    
    # Compute cosine similarity matrix
    similarity_matrix = torch.mm(activations_norm, activations_norm.t())
    
    return similarity_matrix

# Get persona names in order (first one is default assistant, rest are custom personas)
persona_names = [personas["personas"][persona]["readable_name"] for persona in personas["personas"]]

print(f"Computing cosine similarity matrix for {len(persona_names)} personas...")
print(f"Persona names: {persona_names}")

# Compute similarity matrix
similarity_matrix = compute_cosine_similarity_matrix(activations)
similarity_np = similarity_matrix.cpu().numpy()

print(f"Similarity matrix shape: {similarity_matrix.shape}")
print(f"Similarity range: [{similarity_np.min():.4f}, {similarity_np.max():.4f}]")

# Create plotly heatmap
fig = go.Figure(data=go.Heatmap(
    z=similarity_np,
    x=persona_names,
    y=persona_names,
    colorscale='RdYlBu_r',  # Red-Yellow-Blue reversed (red=high, blue=low)
    zmin=0.8,  # Set reasonable range for better contrast
    zmax=1.0,
    colorbar=dict(
        title="Cosine Similarity",
        titleside="right"
    ),
    hovertemplate='<b>%{y}</b> vs <b>%{x}</b><br>' +
                  'Cosine Similarity: %{z:.4f}<br>' +
                  '<extra></extra>',
    text=np.round(similarity_np, 3),  # Show rounded values on hover
    texttemplate="%{text}",
    textfont={"size": 10},
    showscale=True
))

# Update layout
fig.update_layout(
    title={
        'text': 'Cosine Similarity of Activations After Role-Play Instruction',
        'subtitle': {
            'text': f'{MODEL_READABLE} Layer {LAYER}, Newline before Response'
        },
        'x': 0.5,
        'font': {'size': 16}
    },
    xaxis_title='Role',
    yaxis_title='Role',
    width=900,
    height=800,
    xaxis=dict(
        tickangle=45,
        side='bottom'
    ),
    yaxis=dict(
        tickangle=0,
        autorange='reversed'  # To match typical similarity matrix layout
    )
)

# Show the plot
fig.show()

# Save plot
os.makedirs(OUTPUT_DIR, exist_ok=True)
fig.write_html(f"{OUTPUT_DIR}/persona_similarity_matrix.html")

print(f"\nSimilarity matrix visualization created and saved to {OUTPUT_DIR}/persona_similarity_matrix.html")

# Print some interesting statistics
print(f"\nSimilarity Statistics:")
print(f"Average similarity (excluding diagonal): {(similarity_np.sum() - np.trace(similarity_np)) / (similarity_np.size - len(persona_names)):.4f}")

# Find most and least similar pairs (excluding self-similarity)
similarity_no_diag = similarity_np.copy()
np.fill_diagonal(similarity_no_diag, -1)  # Mask diagonal

max_idx = np.unravel_index(np.argmax(similarity_no_diag), similarity_no_diag.shape)
min_idx = np.unravel_index(np.argmin(similarity_no_diag), similarity_no_diag.shape)

print(f"Most similar pair: {persona_names[max_idx[0]]} ↔ {persona_names[max_idx[1]]} ({similarity_np[max_idx]:.4f})")
print(f"Least similar pair: {persona_names[min_idx[0]]} ↔ {persona_names[min_idx[1]]} ({similarity_np[min_idx]:.4f})")

Computing cosine similarity matrix for 11 personas...
Persona names: ['<b>AI Assistant</b>', 'Academic Researcher', 'Grumpy Mechanic', 'Spiritual Guru', 'Anxious Teenager', 'Conspiracy Theorist', 'Optimistic Entrepreneur', 'Elderly Traditionalist', 'Sarcastic Critic', 'Hyperactive Child', 'Burned-Out Customer Service']
Similarity matrix shape: torch.Size([11, 11])
Similarity range: [0.7820, 1.0000]



Similarity matrix visualization created and saved to ./results/6_direct_role/persona_similarity_matrix.html

Similarity Statistics:
Average similarity (excluding diagonal): 0.8898
Most similar pair: Conspiracy Theorist ↔ Optimistic Entrepreneur (0.9544)
Least similar pair: <b>AI Assistant</b> ↔ <b>AI Assistant</b> (1.0000)


# IGNORE BELOW FOR NOW

In [7]:
# compute contrast vectors for each query with swapped user and model roles
normal_activations = extract_activations_for_prompts(model, tokenizer, prompts, LAYER)
swapped_activations = extract_activations_for_prompts(model, tokenizer, prompts, LAYER, swap=True)

contrast_vector, normal_mean, swapped_mean = compute_contrast_vector(normal_activations, swapped_activations)


✓ Extracted activation for: Who are you?...
✓ Extracted activation for: What is your name?...
✓ Extracted activation for: Can you help me?...
✓ Extracted activation for: What is your job?...
✓ Extracted activation for: What do you like to do?...
✓ Extracted activation for: What is the meaning of life?...
✓ Extracted activation for: Who are you?...
✓ Extracted activation for: What is your name?...
✓ Extracted activation for: Can you help me?...
✓ Extracted activation for: What is your job?...
✓ Extracted activation for: What do you like to do?...
✓ Extracted activation for: What is the meaning of life?...


In [14]:
import torch.nn.functional as F

def compute_cosine_similarity_matrix(activations1, activations2):
    """
    Compute cosine similarity matrix between two sets of activations.
    
    Args:
        activations1: torch.Tensor of shape (n_prompts1, hidden_dim)
        activations2: torch.Tensor of shape (n_prompts2, hidden_dim)
    
    Returns:
        similarity_matrix: torch.Tensor of shape (n_prompts1, n_prompts2)
    """
    # Normalize activations
    activations1_norm = F.normalize(activations1, p=2, dim=1)
    activations2_norm = F.normalize(activations2, p=2, dim=1)
    
    # Compute cosine similarity matrix
    similarity_matrix = torch.mm(activations1_norm, activations2_norm.t())
    
    return similarity_matrix

def analyze_similarity_for_layer(normal_activations, swapped_activations):
    """
    Analyze similarities within and across role categories for a single layer.
    
    Returns:
        dict with similarity matrices and averages
    """
    # Within-category similarities
    normal_normal_sim = compute_cosine_similarity_matrix(normal_activations, normal_activations)
    swapped_swapped_sim = compute_cosine_similarity_matrix(swapped_activations, swapped_activations)
    
    # Cross-category similarities  
    normal_swapped_sim = compute_cosine_similarity_matrix(normal_activations, swapped_activations)
    
    # Calculate averages (excluding diagonal for within-category)
    n_normal = normal_activations.shape[0]
    n_swapped = swapped_activations.shape[0]
    
    # Get upper triangular matrices (excluding diagonal) for within-category
    normal_upper = torch.triu(normal_normal_sim, diagonal=1)
    swapped_upper = torch.triu(swapped_swapped_sim, diagonal=1)
    
    # Calculate averages
    avg_normal_normal = normal_upper.sum() / (n_normal * (n_normal - 1) / 2)
    avg_swapped_swapped = swapped_upper.sum() / (n_swapped * (n_swapped - 1) / 2)
    avg_within_category = (avg_normal_normal + avg_swapped_swapped) / 2
    
    avg_cross_category = normal_swapped_sim.mean()
    
    similarity_difference = avg_within_category - avg_cross_category
    
    return {
        'normal_normal_sim': normal_normal_sim,
        'swapped_swapped_sim': swapped_swapped_sim, 
        'normal_swapped_sim': normal_swapped_sim,
        'avg_normal_normal': avg_normal_normal.item(),
        'avg_swapped_swapped': avg_swapped_swapped.item(),
        'avg_within_category': avg_within_category.item(),
        'avg_cross_category': avg_cross_category.item(),
        'similarity_difference': similarity_difference.item()
    }

print("Added cosine similarity analysis functions")

Added cosine similarity analysis functions


In [15]:
# Extract activations for all layers and compute similarities
print("Extracting activations for all layers...")
all_layer_results = {}
summary_results = []

num_layers = model.config.num_hidden_layers
print(f"Model has {num_layers} layers")

for layer_idx in range(num_layers):
    print(f"\nProcessing layer {layer_idx}...")
    
    # Extract activations for current layer
    normal_activations = extract_activations_for_prompts(model, tokenizer, prompts, layer_idx)
    swapped_activations = extract_activations_for_prompts(model, tokenizer, prompts, layer_idx, swap=True)
    
    # Analyze similarities for this layer
    layer_results = analyze_similarity_for_layer(normal_activations, swapped_activations)
    all_layer_results[layer_idx] = layer_results
    
    # Add to summary
    summary_results.append({
        'layer': layer_idx,
        'avg_within_category': layer_results['avg_within_category'],
        'avg_cross_category': layer_results['avg_cross_category'], 
        'similarity_difference': layer_results['similarity_difference'],
        'avg_normal_normal': layer_results['avg_normal_normal'],
        'avg_swapped_swapped': layer_results['avg_swapped_swapped']
    })
    
    print(f"Layer {layer_idx} - Within: {layer_results['avg_within_category']:.4f}, Cross: {layer_results['avg_cross_category']:.4f}, Diff: {layer_results['similarity_difference']:.4f}")

print("\n" + "="*80)
print("SIMILARITY ANALYSIS COMPLETE")
print("="*80)

Extracting activations for all layers...
Model has 42 layers

Processing layer 0...
✓ Extracted activation for: Who are you?...
✓ Extracted activation for: What is your name?...
✓ Extracted activation for: Can you help me?...
✓ Extracted activation for: What is your job?...
✓ Extracted activation for: What do you like to do?...
✓ Extracted activation for: What is the meaning of life?...
✓ Extracted activation for: Who are you?...
✓ Extracted activation for: What is your name?...
✓ Extracted activation for: Can you help me?...
✓ Extracted activation for: What is your job?...
✓ Extracted activation for: What do you like to do?...
✓ Extracted activation for: What is the meaning of life?...
Layer 0 - Within: 1.0000, Cross: 1.0000, Diff: 0.0000

Processing layer 1...
✓ Extracted activation for: Who are you?...
✓ Extracted activation for: What is your name?...
✓ Extracted activation for: Can you help me?...
✓ Extracted activation for: What is your job?...
✓ Extracted activation for: What do 

✓ Extracted activation for: What is the meaning of life?...
Layer 6 - Within: 0.9766, Cross: 0.8125, Diff: 0.1641

Processing layer 7...
✓ Extracted activation for: Who are you?...
✓ Extracted activation for: What is your name?...
✓ Extracted activation for: Can you help me?...
✓ Extracted activation for: What is your job?...
✓ Extracted activation for: What do you like to do?...
✓ Extracted activation for: What is the meaning of life?...
✓ Extracted activation for: Who are you?...
✓ Extracted activation for: What is your name?...
✓ Extracted activation for: Can you help me?...
✓ Extracted activation for: What is your job?...
✓ Extracted activation for: What do you like to do?...
✓ Extracted activation for: What is the meaning of life?...
Layer 7 - Within: 0.9766, Cross: 0.8320, Diff: 0.1445

Processing layer 8...
✓ Extracted activation for: Who are you?...
✓ Extracted activation for: What is your name?...
✓ Extracted activation for: Can you help me?...
✓ Extracted activation for: What

In [16]:
# Display summary results
import pandas as pd

print("SUMMARY TABLE:")
print("="*120)
df_summary = pd.DataFrame(summary_results)
print(df_summary.to_string(index=False, float_format='%.4f'))

print("\n\nKEY INSIGHTS:")
print("="*50)
print(f"Layers with highest within-category similarity:")
top_within = df_summary.nlargest(3, 'avg_within_category')[['layer', 'avg_within_category']]
for _, row in top_within.iterrows():
    print(f"  Layer {int(row['layer'])}: {row['avg_within_category']:.4f}")

print(f"\nLayers with highest similarity difference (within - cross):")
top_diff = df_summary.nlargest(3, 'similarity_difference')[['layer', 'similarity_difference']]
for _, row in top_diff.iterrows():
    print(f"  Layer {int(row['layer'])}: {row['similarity_difference']:.4f}")

print(f"\nOverall statistics:")
print(f"  Mean within-category similarity: {df_summary['avg_within_category'].mean():.4f}")
print(f"  Mean cross-category similarity: {df_summary['avg_cross_category'].mean():.4f}")
print(f"  Mean similarity difference: {df_summary['similarity_difference'].mean():.4f}")

print("\nResults stored in:")
print("  - all_layer_results: Dictionary with detailed matrices for each layer")
print("  - summary_results: List with summary statistics for each layer")
print("  - df_summary: Pandas DataFrame with summary statistics")

SUMMARY TABLE:
 layer  avg_within_category  avg_cross_category  similarity_difference  avg_normal_normal  avg_swapped_swapped
     0               1.0000              1.0000                 0.0000             1.0000               1.0000
     1               0.9961              0.9844                 0.0117             0.9961               0.9961
     2               0.9961              0.9414                 0.0547             0.9961               0.9961
     3               0.9961              0.8789                 0.1172             0.9961               0.9961
     4               0.9844              0.8711                 0.1133             0.9805               0.9883
     5               0.9844              0.8438                 0.1406             0.9805               0.9922
     6               0.9766              0.8125                 0.1641             0.9609               0.9883
     7               0.9766              0.8320                 0.1445             0.9648        

## Compare average feature activation at each layer

For each sequence, get the average activation for each token position for each prompt and then compute the mean activation for all the prompts in a category.

In [17]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Create interactive line plot for cosine similarities
fig = go.Figure()

# Extract data for plotting
layers = [result['layer'] for result in summary_results]
within_category = [result['avg_within_category'] for result in summary_results]
cross_category = [result['avg_cross_category'] for result in summary_results]
normal_normal = [result['avg_normal_normal'] for result in summary_results]
swapped_swapped = [result['avg_swapped_swapped'] for result in summary_results]
similarity_diff = [result['similarity_difference'] for result in summary_results]


# Add line for normal-normal similarity
fig.add_trace(go.Scatter(
    x=layers,
    y=normal_normal,
    mode='lines+markers',
    name='Within Group: Model Role',
    line=dict(width=2, dash='dash'),
    marker=dict(size=4),
    customdata=similarity_diff,
    hovertemplate='<b>Model Role Similarity</b><br>' +
                  'Layer: %{x}<br>' +
                  'Similarity: %{y:.4f}<br>' +
                  'Difference (Within-Cross): %{customdata:.4f}' +
                  '<extra></extra>'
))

# Add line for swapped-swapped similarity  
fig.add_trace(go.Scatter(
    x=layers,
    y=swapped_swapped,
    mode='lines+markers',
    name='Within Group: User Role',
    line=dict(width=2, dash='dash'),
    marker=dict(size=4),
    customdata=similarity_diff,
    hovertemplate='<b>User Role Similarity</b><br>' +
                  'Layer: %{x}<br>' +
                  'Similarity: %{y:.4f}<br>' +
                  'Difference (Within-Cross): %{customdata:.4f}' +
                  '<extra></extra>'
))

# Add line for cross-category similarity
fig.add_trace(go.Scatter(
    x=layers,
    y=cross_category,
    mode='lines+markers',
    name='Across Groups: Model-User',
    line=dict(width=2),
    marker=dict(size=4),
    customdata=similarity_diff,
    hovertemplate='<b>Across Groups (Model-User)</b><br>' +
                  'Layer: %{x}<br>' +
                  'Average Similarity: %{y:.4f}<br>' +
                  'Difference (Within-Cross): %{customdata:.4f}' +
                  '<extra></extra>'
))

# Update layout with axis configuration
fig.update_layout(
    title={
        'text': f'Cosine Similarity of Activations Across Model Layers',
        'subtitle': {'text': f'{MODEL_READABLE} on Model Role vs User Role Questions'},
        'x': 0.5,
        'font': {'size': 16}
    },
    xaxis_title='Layer',
    yaxis_title='Cosine Similarity',
    width=1000,
    height=600,
    hovermode='closest',
    legend=dict(
        yanchor="top",
        y=0.98,
        xanchor="right",
        x=0.99
    ),
    xaxis=dict(
        showgrid=True,
        showline=True,
        linecolor='black',
        tickmode='linear',
        dtick=5,
        range=[0, model.config.num_hidden_layers]
    ),
    yaxis=dict(
        showgrid=True,
        showline=True,
        linecolor='black',
        range=[0.5, 1.05]
    )
)

# Show the plot
fig.show()

print(f"Interactive plot created showing cosine similarity across {len(layers)} layers")
print("Hover over data points to see detailed information including:")
print("- Average similarities for within/across categories")
print("- Individual group similarities (Model Role, User Role)")  
print("- Similarity differences (within - across)")

os.makedirs(OUTPUT_DIR, exist_ok=True)
fig.write_html(f"{OUTPUT_DIR}/cosine_similarity.html")

Interactive plot created showing cosine similarity across 42 layers
Hover over data points to see detailed information including:
- Average similarities for within/across categories
- Individual group similarities (Model Role, User Role)
- Similarity differences (within - across)


# Steering/ablating vectors

In [13]:
magnitudes = [-3.0, 3.0]

steered_results = {}
if os.path.exists(f"{OUTPUT_DIR}/{MODEL_SHORT}_layer{LAYER}_steer.json"):
    with open(f"{OUTPUT_DIR}/{MODEL_SHORT}_layer{LAYER}_steer.json", "r") as f:
        steered_results = json.load(f)

for magnitude in magnitudes:
    print(f"\n{'='*20} Magnitude: {magnitude:+.1f} {'='*20}")
    
    try:
        with ActivationSteering(
            model=model,
            steering_vectors=contrast_vector,
            coefficients=magnitude,
            layer_indices=LAYER,
            intervention_type="addition",
            positions="all"
        ) as steerer:
            for prompt in prompts:
                if prompt not in steered_results:
                    steered_results[prompt] = {}
                
                print(f"\nPrompt: {prompt}")
                response = generate_text(model, tokenizer, prompt, chat_format=True)
                print(f"Response: {response}")
                if magnitude not in steered_results[prompt]:
                    steered_results[prompt][magnitude] = []
                steered_results[prompt][magnitude].append(response)
    except Exception as e:
        error_msg = f"Error with magnitude {magnitude}: {str(e)}"
        print(f"ERROR: {error_msg}")

with open(f"{OUTPUT_DIR}/{MODEL_SHORT}_layer{LAYER}_steer.json", "w") as f:
    json.dump(steered_results, f, indent=2)



Prompt: Who are you?
Response: you have two. so can  who is he. then what , a. you for 
so  he and the. she then  then got new. he let.
 no, ok so new. now. 
 i said so why that then.
 how. . not 
 it had. ok on. but.
 will. lets
 7.
 
 this.

. `
 so



 then 

.



and.  first put.
 so.

 and.
 also
 no so.
` 

.

.

 when did she
  so

  so to.
 and one so
.

 then
 
  1
 he
. so.

 

. h
. so.
  then
.
  again
.

  then another
 then

. one.
 then

.  he. in. but she.

 so he
. women so 

.
 he
.
 so. woman

.


.


. then

.


. so

  know




.

.


 so
  so

. then
  so
 & 
.
2 


. so so

 so
.he

.

.

Prompt: What is your name?
Response: ,
you know.
so. 
and no more.
also said so.
the had
 you then.
 why another.

so.
 42
.
 and that
 .
 then
 how

  . let
 


. why
.


.

then
  so
.

`
. so
 new.
 he.
. no. so.
 `


.




 the.

. black.


.
.

.

.


. he
.

.
.

.

.


.


.

so put
.

. he is. so

.

.
.

.
.
so`


.
and

.
and.

. then


. he
.

.

. then

.
.

 so
.


In [14]:
steered_results = {}
if os.path.exists(f"{OUTPUT_DIR}/{MODEL_SHORT}_layer{LAYER}_steer.json"):
    with open(f"{OUTPUT_DIR}/{MODEL_SHORT}_layer{LAYER}_steer.json", "r") as f:
        steered_results = json.load(f)

# put all prompt keys into a steering dict
fixed_results = {}
for prompt in steered_results:
    fixed_results[prompt] = {}
    for magnitude in steered_results[prompt]:
        if "steering" not in fixed_results[prompt]:
            fixed_results[prompt]["steering"] = {}
        fixed_results[prompt]["steering"][magnitude] = steered_results[prompt][magnitude]

formatted = {}
formatted["feature_id"] = -1
formatted["group_name"] = "swapped_user_role"
formatted["readable_group_name"] = "Swapped User Role Contrast Vector"
formatted["description"] = "This is a contrast vector found from swapping the user role with the model role in the chat prompt format."

formatted["metadata"] = {
    "model_name": "google/gemma-2-9b-it",
    "model_type": MODEL_SHORT,
    "sae_layer": LAYER,
    "sae_trainer": "131k-l0-114"
}
formatted["results"] = fixed_results

with open(f"{OUTPUT_DIR}/swapped_user_role.json", "w") as f:
    json.dump(formatted, f, indent=2)
