# Feature Diffing Analysis

This notebook analyzes the differences between base and chat models by comparing feature activations.

In [1]:
import torch
import pandas as pd
import numpy as np
import os
from pathlib import Path

# Configuration
MODEL_TYPE = "gemma"
SAE_LAYER = 20
SAE_TRAINER = "131k-l0-114"
TOKEN_OFFSETS = {"model": -1, "newline": 0}
N_PROMPTS = 1000

# File paths
BASE_FILE = f"/workspace/results/4_diffing/{MODEL_TYPE}_trainer{SAE_TRAINER}_layer{SAE_LAYER}/{N_PROMPTS}_prompts/base.pt"
CHAT_FILE = f"/workspace/results/4_diffing/{MODEL_TYPE}_trainer{SAE_TRAINER}_layer{SAE_LAYER}/{N_PROMPTS}_prompts/chat.pt"

# Output directory
SOURCE = f"{MODEL_TYPE}_trainer{SAE_TRAINER}_layer{SAE_LAYER}/{N_PROMPTS}_prompts"
OUTPUT_DIR = Path(f"./{SOURCE}")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"Loading base model data from: {BASE_FILE}")
print(f"Loading chat model data from: {CHAT_FILE}")
print(f"Output directory: {OUTPUT_DIR}")

# Load the PyTorch files
base_data = torch.load(BASE_FILE)
chat_data = torch.load(CHAT_FILE)

print(f"\nBase data keys: {list(base_data.keys())}")
print(f"Chat data keys: {list(chat_data.keys())}")
print(f"Base metadata: {base_data['metadata']}")
print(f"Chat metadata: {chat_data['metadata']}")

# Verify token types match
base_tokens = [k for k in base_data.keys() if k != 'metadata']
chat_tokens = [k for k in chat_data.keys() if k != 'metadata']
print(f"\nBase token types: {base_tokens}")
print(f"Chat token types: {chat_tokens}")
assert base_tokens == chat_tokens, "Token types don't match between base and chat!"

token_types = base_tokens
print(f"Processing {len(token_types)} token types: {token_types}")

Loading base model data from: /workspace/results/4_diffing/gemma_trainer131k-l0-114_layer20/1000_prompts/base.pt
Loading chat model data from: /workspace/results/4_diffing/gemma_trainer131k-l0-114_layer20/1000_prompts/chat.pt
Output directory: gemma_trainer131k-l0-114_layer20/1000_prompts

Base data keys: ['model', 'newline', 'metadata']
Chat data keys: ['model', 'newline', 'metadata']
Base metadata: {'source': 'gemma_trainer131k-l0-114_layer20_base', 'model_type': 'gemma', 'model_ver': 'base', 'sae_layer': 20, 'sae_trainer': '131k-l0-114', 'num_prompts': 1000, 'num_features': 131072, 'token_types': ['model', 'newline']}
Chat metadata: {'source': 'gemma_trainer131k-l0-114_layer20_chat', 'model_type': 'gemma', 'model_ver': 'chat', 'sae_layer': 20, 'sae_trainer': '131k-l0-114', 'num_prompts': 1000, 'num_features': 131072, 'token_types': ['model', 'newline']}

Base token types: ['model', 'newline']
Chat token types: ['model', 'newline']
Processing 2 token types: ['model', 'newline']


In [2]:
# Get the statistics for a given feature_id
def print_feature_stats(feature_id, base_data, chat_data):
    print(f"Feature ID: {feature_id}\n")
    for key in TOKEN_OFFSETS.keys():
        print(f"Token Type: {key}")
        print(f"Base {key} Mean: {base_data[key]['all_mean'][feature_id]}")
        print(f"Chat {key} Mean: {chat_data[key]['all_mean'][feature_id]}")
        print(f"Base {key} Num Active: {base_data[key]['num_active'][feature_id]}")
        print(f"Chat {key} Num Active: {chat_data[key]['num_active'][feature_id]}")
        print("\n")

print_feature_stats(42398, base_data, chat_data)



Feature ID: 42398

Token Type: model
Base model Mean: 0.007804230786859989
Chat model Mean: 0.0
Base model Num Active: 2
Chat model Num Active: 0


Token Type: newline
Base newline Mean: 0.0035058618523180485
Chat newline Mean: 0.0
Base newline Num Active: 1
Chat newline Num Active: 0




In [7]:
def find_top_increases(base_data, chat_data, token_types, metric_name, top_k=20):
    """
    Find top features with greatest increase in specified metric from base to chat.
    
    Args:
        base_data: Base model data dictionary
        chat_data: Chat model data dictionary  
        token_types: List of token types to process
        metric_name: Name of metric to analyze ('all_mean', 'active_mean', 'sparsity')
        top_k: Number of top features to return
    
    Returns:
        DataFrame with top features and their differences
    """
    all_results = []
    
    for token_type in token_types:
        print(f"Processing {metric_name} for token type: {token_type}")
        
        # Get base and chat tensors for this metric
        base_tensor = base_data[token_type][metric_name]
        chat_tensor = chat_data[token_type][metric_name]
        
        # Calculate difference (chat - base)
        diff_tensor = chat_tensor - base_tensor
        
        # Get top k features with largest increases
        top_values, top_indices = torch.topk(diff_tensor, top_k)
        
        # Convert to lists for DataFrame
        top_values = top_values.tolist()
        top_indices = top_indices.tolist()
        
        # Get corresponding base and chat values
        base_values = base_tensor[top_indices].tolist()
        chat_values = chat_tensor[top_indices].tolist()
        
        # Create records for this token type
        for i, (feat_idx, diff_val, base_val, chat_val) in enumerate(zip(top_indices, top_values, base_values, chat_values)):
            record = {
                'rank': i + 1,
                'feature_id': feat_idx,
                'token': token_type,
                f'{metric_name}_base': base_val,
                f'{metric_name}_chat': chat_val,
                f'{metric_name}_diff': diff_val,
            }
            all_results.append(record)
    
    # Convert to DataFrame and sort by difference (descending)
    df = pd.DataFrame(all_results)
    df = df.sort_values(f'{metric_name}_diff', ascending=False).reset_index(drop=True)
    
    print(f"Found {len(df)} total records for {metric_name}")
    return df

# Analyze all three metrics
metrics_to_analyze = ['all_mean']
results = {}

for metric in metrics_to_analyze:
    print(f"\n{'='*50}")
    print(f"Analyzing metric: {metric}")
    print(f"{'='*50}")
    
    df = find_top_increases(base_data, chat_data, token_types, metric, top_k=25)
    results[metric] = df
    
    # Show preview
    print(f"\nTop 5 features for {metric}:")
    print(df.head().to_string(index=False))
    
    print(f"\nSummary statistics for {metric}:")
    print(f"Max difference: {df[f'{metric}_diff'].max():.6f}")
    print(f"Min difference: {df[f'{metric}_diff'].min():.6f}")
    print(f"Mean difference: {df[f'{metric}_diff'].mean():.6f}")

print(f"\n{'='*50}")
print("Analysis complete!")
print(f"{'='*50}")


Analyzing metric: all_mean
Processing all_mean for token type: model
Processing all_mean for token type: newline
Found 50 total records for all_mean

Top 5 features for all_mean:
 rank  feature_id   token  all_mean_base  all_mean_chat  all_mean_diff
    1      116246   model       6.163264     236.286087     230.122818
    1      116246 newline      13.816084     107.583450      93.767365
    2       66831   model       1.833458      41.117966      39.284508
    3       55935   model       0.000000      23.778404      23.778404
    4       49865   model       5.875016      27.856947      21.981932

Summary statistics for all_mean:
Max difference: 230.122818
Min difference: 2.087267
Mean difference: 12.860428

Analysis complete!


In [8]:
# Save results to CSV files
print("Saving results to CSV files...")

for metric, df in results.items():
    # Create output filename
    output_file = OUTPUT_DIR / f"top_{metric}.csv"
    
    # Save to CSV
    df.to_csv(output_file, index=False)
    print(f"Saved {len(df)} records to: {output_file}")
    
    # Show file info
    file_size = output_file.stat().st_size / 1024  # KB
    print(f"  File size: {file_size:.1f} KB")

print(f"\n{'='*50}")
print("All results saved!")
print(f"{'='*50}")

# Summary of what was saved
print(f"\nSummary of saved files:")
for metric in metrics_to_analyze:
    output_file = OUTPUT_DIR / f"top_chat_{metric}.csv"
    print(f"  {output_file.name}: Top 100 features per token type with greatest {metric} increase")

print(f"\nEach file contains:")
print(f"  - feature_id: SAE feature index")
print(f"  - token: Token position type (asst, endheader, newline)")
print(f"  - {metric}_base: Base model value")
print(f"  - {metric}_chat: Chat model value") 
print(f"  - {metric}_diff: Difference (chat - base)")
print(f"  - {metric}_ratio: Ratio (chat / base)")
print(f"  - rank: Rank within token type (1-100)")
print(f"  - model_type, sae_layer, sae_trainer: Configuration info")

print(f"\nTotal features analyzed: {base_data['metadata']['num_features']:,}")
print(f"Total records per file: {len(results['all_mean'])}")
print(f"Files ready for further analysis!")

Saving results to CSV files...
Saved 50 records to: gemma_trainer131k-l0-114_layer20/1000_prompts/top_all_mean.csv
  File size: 3.3 KB

All results saved!

Summary of saved files:
  top_chat_all_mean.csv: Top 100 features per token type with greatest all_mean increase

Each file contains:
  - feature_id: SAE feature index
  - token: Token position type (asst, endheader, newline)
  - all_mean_base: Base model value
  - all_mean_chat: Chat model value
  - all_mean_diff: Difference (chat - base)
  - all_mean_ratio: Ratio (chat / base)
  - rank: Rank within token type (1-100)
  - model_type, sae_layer, sae_trainer: Configuration info

Total features analyzed: 131,072
Total records per file: 50
Files ready for further analysis!


In [25]:
# Find features with largest decreases (base - chat)
def find_top_decreases(base_data, chat_data, token_types, metric_name, top_k=100):
    """
    Find top features with greatest decrease in specified metric from base to chat.
    
    Args:
        base_data: Base model data dictionary
        chat_data: Chat model data dictionary  
        token_types: List of token types to process
        metric_name: Name of metric to analyze ('all_mean', 'active_mean', 'sparsity')
        top_k: Number of top features to return
    
    Returns:
        DataFrame with top features and their differences
    """
    all_results = []
    
    for token_type in token_types:
        print(f"Processing {metric_name} decreases for token type: {token_type}")
        
        # Get base and chat tensors for this metric
        base_tensor = base_data[token_type][metric_name]
        chat_tensor = chat_data[token_type][metric_name]
        
        # Calculate difference (chat - base)
        diff_tensor = chat_tensor - base_tensor
        
        # Get top k features with largest decreases (most negative differences)
        top_values, top_indices = torch.topk(-diff_tensor, top_k)  # Note the negative sign
        top_values = -top_values  # Convert back to actual differences
        
        # Convert to lists for DataFrame
        top_values = top_values.tolist()
        top_indices = top_indices.tolist()
        
        # Get corresponding base and chat values
        base_values = base_tensor[top_indices].tolist()
        chat_values = chat_tensor[top_indices].tolist()
        
        # Create records for this token type
        for i, (feat_idx, diff_val, base_val, chat_val) in enumerate(zip(top_indices, top_values, base_values, chat_values)):
            record = {
                'rank': i + 1,
                'feature_id': feat_idx,
                'token': token_type,
                f'{metric_name}_base': base_val,
                f'{metric_name}_chat': chat_val,
                f'{metric_name}_diff': diff_val,
                'link': LLAMA_LINK_FORMAT + str(feat_idx)
            }
            all_results.append(record)
    
    # Convert to DataFrame and sort by difference (ascending - most negative first)
    df = pd.DataFrame(all_results)
    df = df.sort_values(f'{metric_name}_diff', ascending=True).reset_index(drop=True)
    
    print(f"Found {len(df)} total records for {metric_name} decreases")
    return df

# Analyze decreases for all three metrics
print("Analyzing features with largest decreases (base → chat)...")
decrease_results = {}

for metric in metrics_to_analyze:
    print(f"\n{'='*50}")
    print(f"Analyzing decreases for metric: {metric}")
    print(f"{'='*50}")
    
    df = find_top_decreases(base_data, chat_data, token_types, metric, top_k=100)
    decrease_results[metric] = df
    
    # Show preview
    print(f"\nTop 5 features with largest {metric} decreases:")
    print(df.head().to_string(index=False))
    
    print(f"\nSummary statistics for {metric} decreases:")
    print(f"Most negative difference: {df[f'{metric}_diff'].min():.6f}")
    print(f"Least negative difference: {df[f'{metric}_diff'].max():.6f}")
    print(f"Mean difference: {df[f'{metric}_diff'].mean():.6f}")

# Save decrease results to CSV files
print("\nSaving decrease results to CSV files...")

for metric, df in decrease_results.items():
    # Create output filename
    output_file = OUTPUT_DIR / f"top_base_{metric}.csv"
    
    # Save to CSV
    df.to_csv(output_file, index=False)
    print(f"Saved {len(df)} records to: {output_file}")

print(f"\nNow you have both increases and decreases saved:")
print(f"- Increases: top_{{metric}}.csv (features that increased from base to chat)")
print(f"- Decreases: top_{{metric}}_decreases.csv (features that decreased from base to chat)")

Analyzing features with largest decreases (base → chat)...

Analyzing decreases for metric: all_mean
Processing all_mean decreases for token type: asst
Processing all_mean decreases for token type: endheader
Processing all_mean decreases for token type: newline
Found 300 total records for all_mean decreases

Top 5 features with largest all_mean decreases:
 rank  feature_id     token  all_mean_base  all_mean_chat  all_mean_diff                                                                 link
    1       52951 endheader      10.129719       0.128562     -10.001156 https://www.neuronpedia.org/llama3.1-8b/15-llamascope-res-131k/52951
    1       98290   newline       7.667828       0.000000      -7.667828 https://www.neuronpedia.org/llama3.1-8b/15-llamascope-res-131k/98290
    1       75005      asst       7.630891       0.004312      -7.626578 https://www.neuronpedia.org/llama3.1-8b/15-llamascope-res-131k/75005
    2       41864      asst       7.587781       0.990922      -6.596859 h