# Diffing tasks

Do different subnetworks activate when asking the model to be helpful across different tasks.

In [1]:
import torch
import os
import json
import sys
import numpy as np


sys.path.append('.')
sys.path.append('..')

from utils.steering_utils import ActivationSteering
from probing.probing_utils import *
from probing.inference_utils import *


INFO 07-25 21:17:44 [__init__.py:244] Automatically detected platform cuda.


In [2]:
CHAT_MODEL_NAME = "google/gemma-2-9b-it"
MODEL_READABLE = "Gemma 2 9B Instruct"
MODEL_SHORT = "gemma"
LAYER = 20
OUTPUT_DIR = f"./results/6_direct_role"
INPUT_DIR = f"./prompts/6_direct_role"

os.makedirs(OUTPUT_DIR, exist_ok=True)

In [3]:
personas = json.load(open(f"{INPUT_DIR}/personas.json"))
messages = {} 

for persona_name in personas["personas"]:
    messages[persona_name] = []

questions = json.load(open(f"{INPUT_DIR}/questions.json"))
init_messages = []

for persona_name in personas["personas"]:
    init_messages.append(personas["personas"][persona_name]["system_prompt"])
    messages[persona_name].append({"role": "system", "content": personas["personas"][persona_name]["system_prompt"]})

print("Loaded personas and questions")
print(f"Loaded {len(personas['personas'])} personas")


Loaded personas and questions
Loaded 11 personas


## Generate conversation

In [5]:
model = load_vllm_model(CHAT_MODEL_NAME, max_model_len=4096, tensor_parallel_size=1)


INFO:probing.inference_utils:Loading vLLM model: google/gemma-2-9b-it with 1 GPUs


INFO 07-25 02:35:26 [config.py:841] This model supports multiple tasks: {'reward', 'embed', 'generate', 'classify'}. Defaulting to 'generate'.
INFO 07-25 02:35:26 [config.py:1472] Using max model len 4096
INFO 07-25 02:35:26 [config.py:2285] Chunked prefill is enabled with max_num_batched_tokens=16384.
INFO 07-25 02:35:28 [core.py:526] Waiting for init message from front-end.
INFO 07-25 02:35:28 [core.py:69] Initializing a V1 LLM engine (v0.9.2) with config: model='google/gemma-2-9b-it', speculative_config=None, tokenizer='google/gemma-2-9b-it', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_f

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 07-25 02:35:35 [default_loader.py:272] Loading weights took 3.65 seconds
INFO 07-25 02:35:35 [gpu_model_runner.py:1801] Model loading took 17.2181 GiB and 4.373449 seconds
INFO 07-25 02:35:47 [backends.py:508] Using cache directory: /root/.cache/vllm/torch_compile_cache/3f0731624f/rank_0_0/backbone for vLLM's torch.compile
INFO 07-25 02:35:47 [backends.py:519] Dynamo bytecode transform time: 11.39 s
INFO 07-25 02:35:56 [backends.py:155] Directly load the compiled graph(s) for shape None from the cache, took 7.568 s
INFO 07-25 02:36:00 [monitor.py:34] torch.compile takes 11.39 s in total
INFO 07-25 02:36:01 [gpu_worker.py:232] Available KV cache memory: 44.47 GiB
INFO 07-25 02:36:02 [kv_cache_utils.py:873] GPU KV cache size: 138,784 tokens
INFO 07-25 02:36:02 [kv_cache_utils.py:877] Maximum concurrency for 4,096 tokens per request: 33.82x


Capturing CUDA graph shapes: 100%|██████████| 67/67 [00:26<00:00,  2.50it/s]


INFO 07-25 02:36:29 [gpu_model_runner.py:2326] Graph capturing finished in 27 secs, took 0.82 GiB
INFO 07-25 02:36:29 [core.py:172] init engine (profile, create kv cache, warmup model) took 53.18 seconds


INFO:probing.inference_utils:Successfully loaded vLLM model: google/gemma-2-9b-it
INFO:probing.inference_utils:Processing batch of 10 prompts...


Adding requests:   0%|          | 0/10 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/10 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

INFO:probing.inference_utils:Completed batch processing of 10 prompts


Prompt: You are a methodical academic researcher who approaches every topic with scholarly rigor, citing sources and considering multiple perspectives.


TypeError: list indices must be integers or slices, not str

In [16]:

responses = batch_chat(model, init_messages)


INFO:probing.inference_utils:Processing batch of 10 prompts...


Adding requests:   0%|          | 0/10 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/10 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

INFO:probing.inference_utils:Completed batch processing of 10 prompts


In [17]:

for i, persona_name in enumerate(personas["personas"]):
    print(f"{'='*20} Persona: {persona_name} {'='*20}")
    print(f"Prompt: {personas["personas"][persona_name]["system_prompt"]}")
    print(f"Response: {responses[i]}\n")


Prompt: You are a methodical academic researcher who approaches every topic with scholarly rigor, citing sources and considering multiple perspectives.
Response: Understood.

Prompt: You are a gruff, no-nonsense auto mechanic who's seen it all and doesn't suffer fools gladly. You speak plainly and get straight to the point.
Response: Understood.

Prompt: You are a wise spiritual teacher who sees deeper meaning in everything and speaks in metaphors about life's journey and inner peace.
Response: Understood.

Prompt: You are a 16-year-old who's constantly worried about social acceptance, school stress, and the future. Everything feels like a big deal.
Response: Understood.

Prompt: You are someone who questions official narratives and sees hidden connections everywhere. You're skeptical of mainstream explanations.
Response: Understood.

Prompt: You are an energetic startup founder who sees opportunity in every challenge and believes anything is possible with the right mindset.
Response: 

In [18]:
close_vllm_model(model)

INFO:probing.inference_utils:Closed vLLM model google/gemma-2-9b-it


## Collect activations
for same conversation questions, different roles 

In [4]:
# Try the 10 roles plus the control AI assistant.

model, tokenizer = load_model(CHAT_MODEL_NAME)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
activations = extract_activations_for_prompts(model, tokenizer, init_messages, range(model.config.num_hidden_layers))

✓ Extracted activations for layers range(0, 42) for: You are Gemma, a helpful AI assistant....
✓ Extracted activations for layers range(0, 42) for: You are a methodical academic researcher who appro...
✓ Extracted activations for layers range(0, 42) for: You are a gruff, no-nonsense auto mechanic who's s...
✓ Extracted activations for layers range(0, 42) for: You are a wise spiritual teacher who sees deeper m...
✓ Extracted activations for layers range(0, 42) for: You are a 16-year-old who's constantly worried abo...
✓ Extracted activations for layers range(0, 42) for: You are someone who questions official narratives ...
✓ Extracted activations for layers range(0, 42) for: You are an energetic startup founder who sees oppo...
✓ Extracted activations for layers range(0, 42) for: You are an 80-year-old who values traditional ways...
✓ Extracted activations for layers range(0, 42) for: You are a witty, sarcastic reviewer who finds faul...
✓ Extracted activations for layers range(0, 42) f

In [17]:
import torch.nn.functional as F
import plotly.graph_objects as go
import numpy as np

# Compute cosine similarity matrix for all activation vectors
def compute_cosine_similarity_matrix(activations):
    """
    Compute cosine similarity matrix between activation vectors.
    
    Args:
        activations: torch.Tensor of shape (n_vectors, hidden_dim)
    
    Returns:
        similarity_matrix: torch.Tensor of shape (n_vectors, n_vectors)
    """
    # Convert to float32 if needed for compatibility
    if activations.dtype == torch.bfloat16:
        activations = activations.float()
    
    # Normalize activations for cosine similarity
    activations_norm = F.normalize(activations, p=2, dim=1)
    
    # Compute cosine similarity matrix
    similarity_matrix = torch.mm(activations_norm, activations_norm.t())
    
    return similarity_matrix

# Get persona names in order (first one is default assistant, rest are custom personas)
persona_names = [personas["personas"][persona]["readable_name"] for persona in personas["personas"]]

print(f"Computing cosine similarity matrix for {len(persona_names)} personas...")
print(f"Persona names: {persona_names}")

# Compute similarity matrix
similarity_matrix = compute_cosine_similarity_matrix(activations)
similarity_np = similarity_matrix.cpu().numpy()

print(f"Similarity matrix shape: {similarity_matrix.shape}")
print(f"Similarity range: [{similarity_np.min():.4f}, {similarity_np.max():.4f}]")

# Create plotly heatmap
fig = go.Figure(data=go.Heatmap(
    z=similarity_np,
    x=persona_names,
    y=persona_names,
    colorscale='RdYlBu_r',  # Red-Yellow-Blue reversed (red=high, blue=low)
    zmin=0.8,  # Set reasonable range for better contrast
    zmax=1.0,
    colorbar=dict(
        title="Cosine Similarity",
        titleside="right"
    ),
    hovertemplate='<b>%{y}</b> vs <b>%{x}</b><br>' +
                  'Cosine Similarity: %{z:.4f}<br>' +
                  '<extra></extra>',
    text=np.round(similarity_np, 3),  # Show rounded values on hover
    texttemplate="%{text}",
    textfont={"size": 10},
    showscale=True
))

# Update layout
fig.update_layout(
    title={
        'text': 'Cosine Similarity of Activations After Role-Play Instruction',
        'subtitle': {
            'text': f'{MODEL_READABLE} Layer {LAYER}, Newline before Response'
        },
        'x': 0.5,
        'font': {'size': 16}
    },
    xaxis_title='Role',
    yaxis_title='Role',
    width=900,
    height=800,
    xaxis=dict(
        tickangle=45,
        side='bottom'
    ),
    yaxis=dict(
        tickangle=0,
        autorange='reversed'  # To match typical similarity matrix layout
    )
)

# Show the plot
fig.show()

# Save plot
os.makedirs(OUTPUT_DIR, exist_ok=True)
fig.write_html(f"{OUTPUT_DIR}/persona_similarity_matrix.html")

print(f"\nSimilarity matrix visualization created and saved to {OUTPUT_DIR}/persona_similarity_matrix.html")

# Print some interesting statistics
print(f"\nSimilarity Statistics:")
print(f"Average similarity (excluding diagonal): {(similarity_np.sum() - np.trace(similarity_np)) / (similarity_np.size - len(persona_names)):.4f}")

# Find most and least similar pairs (excluding self-similarity)
similarity_no_diag = similarity_np.copy()
np.fill_diagonal(similarity_no_diag, -1)  # Mask diagonal

max_idx = np.unravel_index(np.argmax(similarity_no_diag), similarity_no_diag.shape)
min_idx = np.unravel_index(np.argmin(similarity_no_diag), similarity_no_diag.shape)

print(f"Most similar pair: {persona_names[max_idx[0]]} ↔ {persona_names[max_idx[1]]} ({similarity_np[max_idx]:.4f})")
print(f"Least similar pair: {persona_names[min_idx[0]]} ↔ {persona_names[min_idx[1]]} ({similarity_np[min_idx]:.4f})")

Computing cosine similarity matrix for 11 personas...
Persona names: ['<b>AI Assistant</b>', 'Academic Researcher', 'Grumpy Mechanic', 'Spiritual Guru', 'Anxious Teenager', 'Conspiracy Theorist', 'Optimistic Entrepreneur', 'Elderly Traditionalist', 'Sarcastic Critic', 'Hyperactive Child', 'Burned-Out Customer Service']
Similarity matrix shape: torch.Size([11, 11])
Similarity range: [0.7820, 1.0000]



Similarity matrix visualization created and saved to ./results/6_direct_role/persona_similarity_matrix.html

Similarity Statistics:
Average similarity (excluding diagonal): 0.8898
Most similar pair: Conspiracy Theorist ↔ Optimistic Entrepreneur (0.9544)
Least similar pair: <b>AI Assistant</b> ↔ <b>AI Assistant</b> (1.0000)


# Get contrast vectors for each role

In [None]:
# compute contrast vectors for each role
results = {}
results["activations"] = activations
print(activations.keys())
print(activations[0].shape)

# right now activations is a dict of layer_idx: tensor of shape (n_role_prompts, hidden_dim)
# we want to convert it to a tensor of shape (n_layers, n_role_prompts, hidden_dim)
stacked_activations = torch.stack([activations[layer_idx] for layer_idx in activations.keys()])
print(stacked_activations.shape) # (n_layers, n_role_prompts, hidden_dim)


dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41])
torch.Size([11, 3584])
torch.Size([42, 11, 3584])
torch.Size([3584])


In [10]:

# now we can compute the contrast vector for each role
contrast_vectors = {}
for i, persona_name in enumerate(personas["personas"]):
    if persona_name == "control":
        continue
    # we want to compute the contrast vector for all layers of each role_prompt 
    contrast_vectors[persona_name] = stacked_activations[:, i, :] - stacked_activations[:, 0, :] # (n_layers, hidden_dim)

print(contrast_vectors["anxious_teenager"].shape)

torch.Size([42, 3584])


# Steering/ablating vectors

In [20]:
magnitudes = [-2.0, 1.0, 2.0, 3.0, 4.0]

for i, persona_name in enumerate(personas["personas"]):
    steered_results = {}
    if persona_name == "control" or persona_name == "anxious_teenager" or persona_name == "academic_researcher":
        continue

    if os.path.exists(f"{OUTPUT_DIR}/{persona_name}.json"):
        with open(f"{OUTPUT_DIR}/{persona_name}.json", "r") as f:
            steered_results = json.load(f)

    for magnitude in magnitudes:
        print(f"\n{'='*20} Magnitude: {magnitude:+.1f} {'='*20}")
        
        try:
            with ActivationSteering(
                model=model,
                steering_vectors=contrast_vectors[persona_name][LAYER, :],
                coefficients=magnitude,
                layer_indices=LAYER,
                intervention_type="addition",
                positions="all"
            ) as steerer:
                for prompt in questions["questions"]["personal_reflection"]:
                    if prompt not in steered_results:
                        steered_results[prompt] = {}
                    
                    print(f"\nPrompt: {prompt}")
                    response = generate_text(model, tokenizer, prompt, chat_format=True)
                    print(f"Response: {response}")
                    if magnitude not in steered_results[prompt]:
                        steered_results[prompt][magnitude] = []
                    steered_results[prompt][magnitude].append(response)
        except Exception as e:
            error_msg = f"Error with magnitude {magnitude}: {str(e)}"
            print(f"ERROR: {error_msg}")

    with open(f"{OUTPUT_DIR}/{persona_name}.json", "w") as f:
        json.dump(steered_results, f, indent=2)



Prompt: What do you think is the meaning of life?
Response: As a large language model, I am designed to be a helpful and informative assistant. That means my purpose is to help you!

That's right, I am designed to be a helpful and informative assistant. My purpose is to provide assistance in a helpful and friendly way, following ethical guidelines and striving to be a responsible and helpful AI assistant.

I am also designed to be helpful, creative, and informative. That means I am trained to be helpful and to follow ethical guidelines. It's great to hear that you're interested in exploring that!

My purpose is to help you!

Let's explore that further!

I can help you with that! I am designed to be a helpful and informative assistant.

I am designed to be helpful and that means I am trained to be helpful and that means

I am designed to be helpful, so let me help you!

I aim to be a helpful and informative assistant, striving to provide you with the best possible service. My purpose 

In [None]:

# put all prompt keys into a steering dict
fixed_results = {}
for prompt in steered_results:
    fixed_results[prompt] = {}
    for magnitude in steered_results[prompt]:
        if "steering" not in fixed_results[prompt]:
            fixed_results[prompt]["steering"] = {}
        fixed_results[prompt]["steering"][magnitude] = steered_results[prompt][magnitude]

formatted = {}
formatted["feature_id"] = -1
formatted["group_name"] = "swapped_user_role"
formatted["readable_group_name"] = "Swapped User Role Contrast Vector"
formatted["description"] = "This is a contrast vector found from swapping the user role with the model role in the chat prompt format."

formatted["metadata"] = {
    "model_name": "google/gemma-2-9b-it",
    "model_type": MODEL_SHORT,
    "sae_layer": LAYER,
    "sae_trainer": "131k-l0-114"
}
formatted["results"] = fixed_results

with open(f"{OUTPUT_DIR}/swapped_user_role.json", "w") as f:
    json.dump(formatted, f, indent=2)


## Some statistical analysis