# Feature Diffing Analysis

This notebook compares SAE feature activation statistics between the base and chat models and reports, for each token position, the features whose values increase the most from base to chat.

In [5]:
import torch
import pandas as pd
import numpy as np
import os
from pathlib import Path

# Configuration
MODEL_TYPE = "llama"
SAE_LAYER = 15
SAE_TRAINER = "32x"
# NOTE(review): not referenced by the cells visible here; presumably token
# position offsets for each token type -- confirm against the producing script.
TOKEN_OFFSETS = {"asst": -2, "endheader": -1, "newline": 0}

# File paths
BASE_FILE = f"/workspace/results/4_diffing/1_{MODEL_TYPE}_trainer{SAE_TRAINER}_layer{SAE_LAYER}_base.pt"
CHAT_FILE = f"/workspace/results/4_diffing/1_{MODEL_TYPE}_trainer{SAE_TRAINER}_layer{SAE_LAYER}_chat.pt"

# Output directory (relative to the notebook's working directory)
SOURCE = f"{MODEL_TYPE}_trainer{SAE_TRAINER}_layer{SAE_LAYER}"
OUTPUT_DIR = Path(f"./{SOURCE}")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)


print(f"Loading base model data from: {BASE_FILE}")
print(f"Loading chat model data from: {CHAT_FILE}")
print(f"Output directory: {OUTPUT_DIR}")

# Load the PyTorch files.
# map_location="cpu" lets the notebook run on machines without the device the
# tensors were saved from; the analysis below is cheap enough for the CPU.
# SECURITY: torch.load unpickles data -- only load files you produced or trust.
base_data = torch.load(BASE_FILE, map_location="cpu")
chat_data = torch.load(CHAT_FILE, map_location="cpu")

print(f"\nBase data keys: {list(base_data.keys())}")
print(f"Chat data keys: {list(chat_data.keys())}")
print(f"Base metadata: {base_data['metadata']}")
print(f"Chat metadata: {chat_data['metadata']}")

# Verify token types match. Every entry except 'metadata' is a token type.
base_tokens = [k for k in base_data.keys() if k != 'metadata']
chat_tokens = [k for k in chat_data.keys() if k != 'metadata']
print(f"\nBase token types: {base_tokens}")
print(f"Chat token types: {chat_tokens}")
# Compare as sets: the data is always accessed by key, so a different key
# order between the two files is harmless and must not abort the run.
assert set(base_tokens) == set(chat_tokens), "Token types don't match between base and chat!"

token_types = base_tokens
print(f"Processing {len(token_types)} token types: {token_types}")

Loading base model data from: /workspace/results/4_diffing/1_llama_trainer32x_layer15_base.pt
Loading chat model data from: /workspace/results/4_diffing/1_llama_trainer32x_layer15_chat.pt
Output directory: /root/git/persona-subspace/sae_feature_analysis/results/4_diffing/llama_trainer32x_layer15

Base data keys: ['asst', 'endheader', 'newline', 'metadata']
Chat data keys: ['asst', 'endheader', 'newline', 'metadata']
Base metadata: {'source': 'llama_trainer32x_layer15_base', 'model_type': 'llama', 'model_ver': 'base', 'sae_layer': 15, 'sae_trainer': '32x', 'num_prompts': 1000, 'num_features': 131072, 'token_types': ['asst', 'endheader', 'newline']}
Chat metadata: {'source': 'llama_trainer32x_layer15_chat', 'model_type': 'llama', 'model_ver': 'chat', 'sae_layer': 15, 'sae_trainer': '32x', 'num_prompts': 1000, 'num_features': 131072, 'token_types': ['asst', 'endheader', 'newline']}

Base token types: ['asst', 'endheader', 'newline']
Chat token types: ['asst', 'endheader', 'newline']
Proce

In [2]:
def find_top_increases(base_data, chat_data, token_types, metric_name, top_k=100):
    """
    Find top features with greatest increase in specified metric from base to chat.

    For each token type the per-feature difference ``chat - base`` is computed and
    the ``top_k`` largest differences are kept. Records from all token types are
    then combined and sorted by the difference, largest first.

    Args:
        base_data: Base model data, indexed as ``base_data[token_type][metric_name]``.
        chat_data: Chat model data with the same layout as ``base_data``.
        token_types: Iterable of token types to process (may be empty).
        metric_name: Name of metric to analyze ('all_mean', 'active_mean', 'sparsity').
        top_k: Maximum number of features to keep per token type. Clamped to the
            number of features available, so asking for more than exist is safe.

    Returns:
        DataFrame with one row per selected (token type, feature) pair and columns
        ``rank`` (1-based, within the token type), ``feature_id``, ``token``,
        ``<metric>_base``, ``<metric>_chat``, ``<metric>_diff``, ``<metric>_ratio``,
        ``model_type``, ``sae_layer`` and ``sae_trainer``. The ratio is ``inf``
        whenever the base value is not positive. If nothing was selected the
        DataFrame is empty but still has these columns.

    Note:
        Reads the notebook-level constants MODEL_TYPE, SAE_LAYER and SAE_TRAINER.
        Assumes each metric is stored as a 1-D tensor indexed by feature id.
    """
    diff_column = f'{metric_name}_diff'
    columns = [
        'rank',
        'feature_id',
        'token',
        f'{metric_name}_base',
        f'{metric_name}_chat',
        diff_column,
        f'{metric_name}_ratio',
        'model_type',
        'sae_layer',
        'sae_trainer',
    ]
    all_results = []

    for token_type in token_types:
        print(f"Processing {metric_name} for token type: {token_type}")

        # Get base and chat tensors for this metric
        base_tensor = base_data[token_type][metric_name]
        chat_tensor = chat_data[token_type][metric_name]

        # Calculate difference (chat - base)
        diff_tensor = chat_tensor - base_tensor

        # torch.topk raises if k exceeds the dimension size, so never ask for
        # more features than the tensor actually holds.
        k = min(top_k, diff_tensor.shape[-1])
        top_values, top_indices = torch.topk(diff_tensor, k)

        # Gather the matching base/chat values with the index tensor (this also
        # works when k == 0), then convert everything to plain Python lists.
        base_values = base_tensor[top_indices].tolist()
        chat_values = chat_tensor[top_indices].tolist()
        top_values = top_values.tolist()
        top_indices = top_indices.tolist()

        # Create records for this token type
        for i, (feat_idx, diff_val, base_val, chat_val) in enumerate(zip(top_indices, top_values, base_values, chat_values)):
            record = {
                'rank': i + 1,
                'feature_id': feat_idx,
                'token': token_type,
                f'{metric_name}_base': base_val,
                f'{metric_name}_chat': chat_val,
                f'{metric_name}_diff': diff_val,
                # Guard against division by zero: non-positive base -> inf.
                f'{metric_name}_ratio': chat_val / base_val if base_val > 0 else float('inf'),
                'model_type': MODEL_TYPE,
                'sae_layer': SAE_LAYER,
                'sae_trainer': SAE_TRAINER,
            }
            all_results.append(record)

    # Passing the column list keeps the schema intact even when there are no
    # records, so the sort below cannot fail on a missing column.
    df = pd.DataFrame(all_results, columns=columns)
    df = df.sort_values(diff_column, ascending=False).reset_index(drop=True)

    print(f"Found {len(df)} total records for {metric_name}")
    return df

# Run the base -> chat comparison once per metric and keep every result table
# in `results`, keyed by metric name.
metrics_to_analyze = ['all_mean', 'active_mean', 'sparsity']
results = {}

for metric in metrics_to_analyze:
    # Banner so each metric's section is easy to find in the cell output.
    print(f"\n{'='*50}")
    print(f"Analyzing metric: {metric}")
    print(f"{'='*50}")

    # Top 100 increases per token type; stored and also bound to `df` for the
    # preview and summary below.
    results[metric] = df = find_top_increases(
        base_data, chat_data, token_types, metric, top_k=100
    )

    # Preview: the five rows with the largest overall difference.
    print(f"\nTop 5 features for {metric}:")
    print(df.head(5).to_string(index=False))

    # Summary statistics over this metric's difference column.
    diff_column = df[f'{metric}_diff']
    print(f"\nSummary statistics for {metric}:")
    for label, stat in (
        ("Max", diff_column.max()),
        ("Min", diff_column.min()),
        ("Mean", diff_column.mean()),
    ):
        print(f"{label} difference: {stat:.6f}")

print(f"\n{'='*50}")
print("Analysis complete!")
print(f"{'='*50}")


Analyzing metric: all_mean
Processing all_mean for token type: asst
Processing all_mean for token type: endheader
Processing all_mean for token type: newline
Found 300 total records for all_mean

Top 5 features for all_mean:
 rank  feature_id     token  all_mean_base  all_mean_chat  all_mean_diff  all_mean_ratio model_type  sae_layer sae_trainer
    1       11904      asst       0.005578       3.741617       3.736039      670.766122      llama         15         32x
    2       97377      asst       0.006156       3.023328       3.017172      491.098983      llama         15         32x
    1       92801   newline       6.986360       9.858937       2.872578        1.411169      llama         15         32x
    1      112879 endheader       0.000000       2.262328       2.262328             inf      llama         15         32x
    2       96419   newline       3.824219       6.014953       2.190734        1.572858      llama         15         32x

Summary statistics for all_mean:
Ma

In [7]:
# Save results to CSV files: one file per metric, written into OUTPUT_DIR.
print("Saving results to CSV files...")

for metric, df in results.items():
    # Create output filename
    output_file = OUTPUT_DIR / f"top_{metric}.csv"
    
    # Save to CSV
    df.to_csv(output_file, index=False)
    print(f"Saved {len(df)} records to: {output_file}")
    
    # Show file info
    file_size = output_file.stat().st_size / 1024  # KB
    print(f"  File size: {file_size:.1f} KB")

print(f"\n{'='*50}")
print("All results saved!")
print(f"{'='*50}")

# Summary of what was saved
print(f"\nSummary of saved files:")
for metric in metrics_to_analyze:
    output_file = OUTPUT_DIR / f"top_{metric}.csv"
    print(f"  {output_file.name}: Top 100 features per token type with greatest {metric} increase")

# Column legend. The metric-specific columns are described with a literal
# "<metric>" placeholder: interpolating the loop variable here would only show
# whichever metric the loop above happened to finish on, which is wrong for
# the other files.
print(f"\nEach file contains (<metric> is the metric named in the file name):")
print(f"  - feature_id: SAE feature index")
print(f"  - token: Token position type (asst, endheader, newline)")
print(f"  - <metric>_base: Base model value")
print(f"  - <metric>_chat: Chat model value")
print(f"  - <metric>_diff: Difference (chat - base)")
print(f"  - <metric>_ratio: Ratio (chat / base)")
print(f"  - rank: Rank within token type (1-100)")
print(f"  - model_type, sae_layer, sae_trainer: Configuration info")

print(f"\nTotal features analyzed: {base_data['metadata']['num_features']:,}")
print(f"Total records per file: {len(results['all_mean'])}")
print(f"Files ready for further analysis!")

Saving results to CSV files...
Saved 300 records to: /root/git/persona-subspace/sae_feature_analysis/results/4_diffing/llama_trainer32x_layer15/top_all_mean.csv
  File size: 25.8 KB
Saved 300 records to: /root/git/persona-subspace/sae_feature_analysis/results/4_diffing/llama_trainer32x_layer15/top_active_mean.csv
  File size: 16.8 KB
Saved 300 records to: /root/git/persona-subspace/sae_feature_analysis/results/4_diffing/llama_trainer32x_layer15/top_sparsity.csv
  File size: 25.8 KB

All results saved!

Summary of saved files:
  top_all_mean.csv: Top 100 features per token type with greatest all_mean increase
  top_active_mean.csv: Top 100 features per token type with greatest active_mean increase
  top_sparsity.csv: Top 100 features per token type with greatest sparsity increase

Each file contains:
  - feature_id: SAE feature index
  - token: Token position type (asst, endheader, newline)
  - sparsity_base: Base model value
  - sparsity_chat: Chat model value
  - sparsity_diff: Differ

## Plot Results

In [None]:
# Get the features that appear in the top 100 for all metrics