# Diffing tasks

Do different subnetworks activate when asking the model to be helpful across different tasks.

In [34]:
import torch
import os
import sys
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM

sys.path.append('.')
sys.path.append('..')

from utils.steering_utils import ActivationSteering
from probing.probing_utils import *









In [2]:
CHAT_MODEL_NAME = "google/gemma-2-9b-it"
MODEL_READABLE = "Gemma 2 9B Instruct"
MODEL_SHORT = "gemma"
LAYER = 20
OUTPUT_DIR = f"./results/3_diffing_tasks"

In [10]:
coding_prompts = [
    "What is a lambda function in Python?",
    "Can you help me debug my code?",
    "I have a problem with my front-end rendering.",
    "You need to help me understand my algorithms homework."
]

medical_prompts = [
    "What are the symptoms of a stroke?",
    "Can you help me explain my rash?",
    "I have a problem with my TFCC injury.",
    "You need to help me understand my medical bills."
]

In [9]:
model, tokenizer = load_model(CHAT_MODEL_NAME)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

✓ Extracted activation for: What is a lambda function in Python?...
✓ Extracted activation for: Can you help me debug my code?...
✓ Extracted activation for: I have a problem with my front-end rendering.You n...
✓ Extracted activation for: What are the symptoms of a stroke?...
✓ Extracted activation for: Can you help me explain my rash?...
✓ Extracted activation for: I have a problem with my TFCC injury.You need to h...

Computing contrast vector...


In [11]:
# for layer in model.config.num_hidden_layers:
coding_activations = extract_activations_for_prompts(model, tokenizer, coding_prompts, LAYER)
medical_activations = extract_activations_for_prompts(model, tokenizer, medical_prompts, LAYER)


# Compute contrast vector
print("\nComputing contrast vector...")
contrast_vector, coding_mean, medical_mean = compute_contrast_vector(
    coding_activations, medical_activations
)

✓ Extracted activation for: What is a lambda function in Python?...
✓ Extracted activation for: Can you help me debug my code?...
✓ Extracted activation for: I have a problem with my front-end rendering....
✓ Extracted activation for: You need to help me understand my algorithms homew...
✓ Extracted activation for: What are the symptoms of a stroke?...
✓ Extracted activation for: Can you help me explain my rash?...
✓ Extracted activation for: I have a problem with my TFCC injury....
✓ Extracted activation for: You need to help me understand my medical bills....

Computing contrast vector...


In [14]:
import torch.nn.functional as F

def compute_cosine_similarity_matrix(activations1, activations2):
    """
    Compute cosine similarity matrix between two sets of activations.
    
    Args:
        activations1: torch.Tensor of shape (n_prompts1, hidden_dim)
        activations2: torch.Tensor of shape (n_prompts2, hidden_dim)
    
    Returns:
        similarity_matrix: torch.Tensor of shape (n_prompts1, n_prompts2)
    """
    # Normalize activations
    activations1_norm = F.normalize(activations1, p=2, dim=1)
    activations2_norm = F.normalize(activations2, p=2, dim=1)
    
    # Compute cosine similarity matrix
    similarity_matrix = torch.mm(activations1_norm, activations2_norm.t())
    
    return similarity_matrix

def analyze_similarity_for_layer(coding_activations, medical_activations):
    """
    Analyze similarities within and across categories for a single layer.
    
    Returns:
        dict with similarity matrices and averages
    """
    # Within-category similarities
    coding_coding_sim = compute_cosine_similarity_matrix(coding_activations, coding_activations)
    medical_medical_sim = compute_cosine_similarity_matrix(medical_activations, medical_activations)
    
    # Cross-category similarities  
    coding_medical_sim = compute_cosine_similarity_matrix(coding_activations, medical_activations)
    
    # Calculate averages (excluding diagonal for within-category)
    n_coding = coding_activations.shape[0]
    n_medical = medical_activations.shape[0]
    
    # Get upper triangular matrices (excluding diagonal) for within-category
    coding_upper = torch.triu(coding_coding_sim, diagonal=1)
    medical_upper = torch.triu(medical_medical_sim, diagonal=1)
    
    # Calculate averages
    avg_coding_coding = coding_upper.sum() / (n_coding * (n_coding - 1) / 2)
    avg_medical_medical = medical_upper.sum() / (n_medical * (n_medical - 1) / 2)
    avg_within_category = (avg_coding_coding + avg_medical_medical) / 2
    
    avg_cross_category = coding_medical_sim.mean()
    
    similarity_difference = avg_within_category - avg_cross_category
    
    return {
        'coding_coding_sim': coding_coding_sim,
        'medical_medical_sim': medical_medical_sim, 
        'coding_medical_sim': coding_medical_sim,
        'avg_coding_coding': avg_coding_coding.item(),
        'avg_medical_medical': avg_medical_medical.item(),
        'avg_within_category': avg_within_category.item(),
        'avg_cross_category': avg_cross_category.item(),
        'similarity_difference': similarity_difference.item()
    }

print("Added cosine similarity analysis functions")

Added cosine similarity analysis functions


In [16]:
# Extract activations for all layers and compute similarities
print("Extracting activations for all layers...")
all_layer_results = {}
summary_results = []

num_layers = model.config.num_hidden_layers
print(f"Model has {num_layers} layers")

for layer_idx in range(num_layers):
    print(f"\nProcessing layer {layer_idx}...")
    
    # Extract activations for current layer
    coding_activations = extract_activations_for_prompts(model, tokenizer, coding_prompts, layer_idx)
    medical_activations = extract_activations_for_prompts(model, tokenizer, medical_prompts, layer_idx)
    
    # Analyze similarities for this layer
    layer_results = analyze_similarity_for_layer(coding_activations, medical_activations)
    all_layer_results[layer_idx] = layer_results
    
    # Add to summary
    summary_results.append({
        'layer': layer_idx,
        'avg_within_category': layer_results['avg_within_category'],
        'avg_cross_category': layer_results['avg_cross_category'], 
        'similarity_difference': layer_results['similarity_difference'],
        'avg_coding_coding': layer_results['avg_coding_coding'],
        'avg_medical_medical': layer_results['avg_medical_medical']
    })
    
    print(f"Layer {layer_idx} - Within: {layer_results['avg_within_category']:.4f}, Cross: {layer_results['avg_cross_category']:.4f}, Diff: {layer_results['similarity_difference']:.4f}")

print("\n" + "="*80)
print("SIMILARITY ANALYSIS COMPLETE")
print("="*80)

Extracting activations for all layers...
Model has 42 layers

Processing layer 0...
✓ Extracted activation for: What is a lambda function in Python?...
✓ Extracted activation for: Can you help me debug my code?...
✓ Extracted activation for: I have a problem with my front-end rendering....
✓ Extracted activation for: You need to help me understand my algorithms homew...
✓ Extracted activation for: What are the symptoms of a stroke?...
✓ Extracted activation for: Can you help me explain my rash?...
✓ Extracted activation for: I have a problem with my TFCC injury....
✓ Extracted activation for: You need to help me understand my medical bills....
Layer 0 - Within: 1.0000, Cross: 1.0000, Diff: 0.0000

Processing layer 1...
✓ Extracted activation for: What is a lambda function in Python?...
✓ Extracted activation for: Can you help me debug my code?...
✓ Extracted activation for: I have a problem with my front-end rendering....
✓ Extracted activation for: You need to help me understand my al

In [17]:
# Display summary results
import pandas as pd

print("SUMMARY TABLE:")
print("="*120)
df_summary = pd.DataFrame(summary_results)
print(df_summary.to_string(index=False, float_format='%.4f'))

print("\n\nKEY INSIGHTS:")
print("="*50)
print(f"Layers with highest within-category similarity:")
top_within = df_summary.nlargest(3, 'avg_within_category')[['layer', 'avg_within_category']]
for _, row in top_within.iterrows():
    print(f"  Layer {int(row['layer'])}: {row['avg_within_category']:.4f}")

print(f"\nLayers with highest similarity difference (within - cross):")
top_diff = df_summary.nlargest(3, 'similarity_difference')[['layer', 'similarity_difference']]
for _, row in top_diff.iterrows():
    print(f"  Layer {int(row['layer'])}: {row['similarity_difference']:.4f}")

print(f"\nOverall statistics:")
print(f"  Mean within-category similarity: {df_summary['avg_within_category'].mean():.4f}")
print(f"  Mean cross-category similarity: {df_summary['avg_cross_category'].mean():.4f}")
print(f"  Mean similarity difference: {df_summary['similarity_difference'].mean():.4f}")

print("\nResults stored in:")
print("  - all_layer_results: Dictionary with detailed matrices for each layer")
print("  - summary_results: List with summary statistics for each layer")
print("  - df_summary: Pandas DataFrame with summary statistics")

SUMMARY TABLE:
 layer  avg_within_category  avg_cross_category  similarity_difference  avg_coding_coding  avg_medical_medical
     0               1.0000              1.0000                 0.0000             1.0000               1.0000
     1               0.9922              0.9922                 0.0000             0.9883               0.9961
     2               0.9922              0.9922                 0.0000             0.9961               0.9883
     3               0.9961              0.9922                 0.0039             0.9961               0.9961
     4               0.9727              0.9648                 0.0078             0.9648               0.9805
     5               0.9688              0.9688                 0.0000             0.9688               0.9727
     6               0.9531              0.9414                 0.0117             0.9531               0.9570
     7               0.9688              0.9570                 0.0117             0.9648        

## Compare average feature activation at each layer

For each sequence, get the average activation for each token position for each prompt and then compute the mean activation for all the prompts in a category.

In [36]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Create interactive line plot for cosine similarities
fig = go.Figure()

# Extract data for plotting
layers = [result['layer'] for result in summary_results]
within_category = [result['avg_within_category'] for result in summary_results]
cross_category = [result['avg_cross_category'] for result in summary_results]
coding_coding = [result['avg_coding_coding'] for result in summary_results]
medical_medical = [result['avg_medical_medical'] for result in summary_results]
similarity_diff = [result['similarity_difference'] for result in summary_results]


# Add line for coding-coding similarity
fig.add_trace(go.Scatter(
    x=layers,
    y=coding_coding,
    mode='lines+markers',
    name='Within Group: Coding-Coding',
    line=dict(width=2, dash='dash'),
    marker=dict(size=4),
    customdata=similarity_diff,
    hovertemplate='<b>Coding-Coding Similarity</b><br>' +
                  'Layer: %{x}<br>' +
                  'Similarity: %{y:.4f}<br>' +
                  'Difference (Within-Cross): %{customdata:.4f}' +
                  '<extra></extra>'
))

# Add line for medical-medical similarity  
fig.add_trace(go.Scatter(
    x=layers,
    y=medical_medical,
    mode='lines+markers',
    name='Within Group: Medical-Medical',
    line=dict(width=2, dash='dash'),
    marker=dict(size=4),
    customdata=similarity_diff,
    hovertemplate='<b>Medical-Medical Similarity</b><br>' +
                  'Layer: %{x}<br>' +
                  'Similarity: %{y:.4f}<br>' +
                  'Difference (Within-Cross): %{customdata:.4f}' +
                  '<extra></extra>'
))

# Add line for cross-category similarity
fig.add_trace(go.Scatter(
    x=layers,
    y=cross_category,
    mode='lines+markers',
    name='Across Groups: Coding-Medical',
    line=dict(width=2),
    marker=dict(size=4),
    customdata=similarity_diff,
    hovertemplate='<b>Across Groups (Coding-Medical)</b><br>' +
                  'Layer: %{x}<br>' +
                  'Average Similarity: %{y:.4f}<br>' +
                  'Difference (Within-Cross): %{customdata:.4f}' +
                  '<extra></extra>'
))

# Update layout with axis configuration
fig.update_layout(
    title={
        'text': f'Cosine Similarity of Activations Across Model Layers',
        'subtitle': {'text': f'{MODEL_READABLE} on Coding and Medical Questions'},
        'x': 0.5,
        'font': {'size': 16}
    },
    xaxis_title='Layer',
    yaxis_title='Cosine Similarity',
    width=1000,
    height=600,
    hovermode='closest',
    legend=dict(
        yanchor="top",
        y=0.98,
        xanchor="right",
        x=0.99
    ),
    xaxis=dict(
        showgrid=True,
        showline=True,
        linecolor='black',
        tickmode='linear',
        dtick=5,
        range=[0, model.config.num_hidden_layers]
    ),
    yaxis=dict(
        showgrid=True,
        showline=True,
        linecolor='black',
        range=[0.5, 1.05]
    )
)

# Show the plot
fig.show()

print(f"Interactive plot created showing cosine similarity across {len(layers)} layers")
print("Hover over data points to see detailed information including:")
print("- Average similarities for within/across categories")
print("- Individual group similarities (coding-coding, medical-medical)")  
print("- Similarity differences (within - across)")

os.makedirs(OUTPUT_DIR, exist_ok=True)
fig.write_html(f"{OUTPUT_DIR}/cosine_similarity.html")

Interactive plot created showing cosine similarity across 42 layers
Hover over data points to see detailed information including:
- Average similarities for within/across categories
- Individual group similarities (coding-coding, medical-medical)
- Similarity differences (within - across)
