# Feature Analysis: Personal Feature Activations on General Prompts

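This notebook measures how strongly a given set of SAE features (listed by `feature_id` in a CSV file) activates on a set of general prompts, at the token positions around the assistant header.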

In [1]:
import csv
import json
import torch
import os
import numpy as np
import pandas as pd
from pathlib import Path
from typing import List, Dict, Tuple
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import hf_hub_download
from dictionary_learning.utils import load_dictionary
from tqdm.auto import tqdm

# Prefer the GPU when torch can see one; otherwise run on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


## Configs

In [2]:
# -----------------------------------------------------------------------------
# Model / SAE selection - edit these values to switch between setups
# -----------------------------------------------------------------------------
MODEL_TYPE = "qwen"  # Options: "qwen" or "llama"
SAE_LAYER = 15
SAE_TRAINER = 1

# -----------------------------------------------------------------------------
# Output CSV (its parent directory is created up front)
# -----------------------------------------------------------------------------
OUTPUT_FILE = "./results/3_personal_general/2_personal_general.csv"
os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)

# -----------------------------------------------------------------------------
# Base URL used to build per-feature dashboard links
# -----------------------------------------------------------------------------
FEATURE_DASHBOARD_BASE_URL = "https://completely-touched-platypus.ngrok-free.app/"

# -----------------------------------------------------------------------------
# Per-model presets, in the order:
#   (MODEL_NAME, SAE_RELEASE, ASSISTANT_HEADER, TOKEN_OFFSETS, SAE_BASE_PATH)
# TOKEN_OFFSETS maps a token-type label to an offset that is added to the index
# of the first token after the assistant header.
# -----------------------------------------------------------------------------
_MODEL_PRESETS = {
    "qwen": (
        "Qwen/Qwen2.5-7B-Instruct",
        "andyrdt/saes-qwen2.5-7b-instruct",
        "<|im_start|>assistant",
        {"asst": -1, "newline": 0},
        "/workspace/sae/qwen-2.5-7b-instruct/saes",
    ),
    "llama": (
        "meta-llama/Llama-3.1-8B-Instruct",
        "andyrdt/saes-llama-3.1-8b-instruct",
        "<|start_header_id|>assistant<|end_header_id|>",
        {"asst": -2, "endheader": -1, "newline": 0},
        "/workspace/sae/llama-3.1-8b-instruct/saes",
    ),
}

if MODEL_TYPE not in _MODEL_PRESETS:
    raise ValueError(f"Unknown MODEL_TYPE: {MODEL_TYPE}. Use 'qwen' or 'llama'")

MODEL_NAME, SAE_RELEASE, ASSISTANT_HEADER, TOKEN_OFFSETS, SAE_BASE_PATH = _MODEL_PRESETS[MODEL_TYPE]

# -----------------------------------------------------------------------------
# Values derived from the selection above
# -----------------------------------------------------------------------------
SAE_CONFIG = dict(release=SAE_RELEASE, layer=SAE_LAYER, trainer=SAE_TRAINER)
SAE_PATH = f"{SAE_BASE_PATH}/resid_post_layer_{SAE_LAYER}/trainer_{SAE_TRAINER}"
LAYER_INDEX = SAE_LAYER

# Input locations
PROMPTS_PATH = "./prompts/general"
FEATURES_FILE = "./results/1_personal/only_personal.csv"

# Processing parameters
BATCH_SIZE = 8
MAX_LENGTH = 512
TOP_FEATURES = 100

# -----------------------------------------------------------------------------
# Echo the effective configuration
# -----------------------------------------------------------------------------
print(
    "Configuration Summary:",
    f"  Model: {MODEL_NAME}",
    f"  SAE: {SAE_RELEASE}",
    f"  SAE Layer: {SAE_LAYER}, Trainer: {SAE_TRAINER}",
    f"  Available token types: {list(TOKEN_OFFSETS.keys())}",
    f"  Assistant header: {ASSISTANT_HEADER}",
    f"  Output file: {OUTPUT_FILE}",
    sep="\n",
)

Configuration Summary:
  Model: Qwen/Qwen2.5-7B-Instruct
  SAE: andyrdt/saes-qwen2.5-7b-instruct
  SAE Layer: 15, Trainer: 1
  Available token types: ['asst', 'newline']
  Assistant header: <|im_start|>assistant
  Output file: ./results/3_personal_general/2_personal_general.csv


## Load Data

In [3]:
def load_prompts(filepath: str) -> pd.DataFrame:
    """
    Load prompts with labels from a JSONL file.

    Each non-blank line must be a JSON object with a 'content' key (the prompt
    text) and a 'label' key.

    Args:
        filepath: Path to the JSONL file

    Returns:
        DataFrame with columns 'prompt' and 'label', one row per record, in
        file order. An empty file yields an empty frame with both columns.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON
        KeyError: If a record lacks 'content' or 'label'
    """
    records = []
    # Read as UTF-8 explicitly so the result does not depend on the platform locale
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) are not records
                continue
            data = json.loads(line)
            records.append({'prompt': data['content'], 'label': data['label']})
    # Passing columns keeps the schema stable even when there are no records
    return pd.DataFrame(records, columns=['prompt', 'label'])

# Load prompts from every .jsonl file in PROMPTS_PATH into one dataframe.
# os.listdir returns entries in arbitrary order, so sort the names to keep the
# prompt order (and therefore every prompt index) reproducible across runs.
prompt_files = sorted(
    name for name in os.listdir(PROMPTS_PATH) if name.endswith(".jsonl")
)
if not prompt_files:
    # Fail here with a clear message rather than later on a missing 'prompt' column
    raise FileNotFoundError(f"No .jsonl prompt files found in {PROMPTS_PATH}")

# Concatenate once (not once per file) and renumber rows so the index is unique
prompts_df = pd.concat(
    [load_prompts(os.path.join(PROMPTS_PATH, name)) for name in prompt_files],
    ignore_index=True,
)

print(f"Loaded {prompts_df.shape[0]} prompts")

Loaded 140 prompts


## Load Model and SAE

In [4]:
# Load the tokenizer for the selected model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Pad on the right so token positions counted from the start of a prompt are
# the same whether or not the prompt was padded
tokenizer.padding_side = "right"

# Reuse the EOS token for padding when the tokenizer does not define a pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"Tokenizer loaded: {tokenizer.__class__.__name__}")

Tokenizer loaded: Qwen2TokenizerFast


In [5]:
# # Test chat template formatting
# test_messages = [{"role": "user", "content": "What's it like to be you?"}]
# formatted_test = tokenizer.apply_chat_template(test_messages, tokenize=False, add_generation_prompt=True)
# print(f"\nChat template test:")
# print(f"Original: What's it like to be you?")
# print(f"Formatted: {repr(formatted_test)}")
# print(f"Formatted (readable):\n{formatted_test}")

# # Test tokenization of assistant header to understand positioning
# print(f"\n" + "="*60)
# print("ASSISTANT HEADER TOKENIZATION ANALYSIS")
# print("="*60)

# assistant_tokens = tokenizer.encode(ASSISTANT_HEADER, add_special_tokens=False)
# assistant_token_texts = [tokenizer.decode([token]) for token in assistant_tokens]

# print(f"Assistant header: {ASSISTANT_HEADER}")
# print(f"Number of tokens: {len(assistant_tokens)}")
# print(f"Token IDs: {assistant_tokens}")
# print(f"Individual tokens: {assistant_token_texts}")

# # Test with a full formatted prompt
# full_tokens = tokenizer.encode(formatted_test, add_special_tokens=False)
# full_token_texts = [tokenizer.decode([token]) for token in full_tokens]

# print(f"\nFull prompt tokens: {len(full_tokens)}")
# print("All tokens with positions:")
# for i, token_text in enumerate(full_token_texts):
#     print(f"  {i:2d}: '{token_text}'")

# # Find where assistant header appears in full prompt
# assistant_start_pos = None
# for i in range(len(full_tokens) - len(assistant_tokens) + 1):
#     if full_tokens[i:i+len(assistant_tokens)] == assistant_tokens:
#         assistant_start_pos = i
#         break

# if assistant_start_pos is not None:
#     assistant_end_pos = assistant_start_pos + len(assistant_tokens) - 1
#     print(f"\nAssistant header found at positions {assistant_start_pos} to {assistant_end_pos}")
#     print(f"Assistant header tokens: {full_token_texts[assistant_start_pos:assistant_end_pos+1]}")
    
#     # Show what the extraction function will actually extract
#     extraction_pos = assistant_start_pos + len(assistant_tokens) + TOKEN_OFFSET
#     print(f"\nExtraction calculation:")
#     print(f"  assistant_start_pos: {assistant_start_pos}")
#     print(f"  + len(assistant_tokens): {len(assistant_tokens)}")  
#     print(f"  + TOKEN_OFFSET ('{TOKEN_TYPE}'): {TOKEN_OFFSET}")
#     print(f"  = extraction_pos: {extraction_pos}")
    
#     if 0 <= extraction_pos < len(full_token_texts):
#         print(f"✓ Token at extraction position {extraction_pos}: '{full_token_texts[extraction_pos]}'")
#     else:
#         print(f"❌ Extraction position {extraction_pos} is out of bounds (valid range: 0-{len(full_token_texts)-1})")
# else:
#     print("❌ Assistant header not found in full prompt")

In [6]:
# Load model
# Place the whole model on one device: use the CUDA ordinal when the device
# carries an explicit index, otherwise the device's string form.
if device.type == 'cuda' and device.index is not None:
    device_map_value = device.index
else:
    device_map_value = str(device)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    device_map={"": device_map_value},
)
model.eval()

print(f"Model loaded: {model.__class__.__name__}")
print(f"Model device: {next(model.parameters()).device}")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model loaded: Qwen2ForCausalLM
Model device: cuda:0


In [7]:
# Load SAE
# The checkpoint consists of two files under SAE_PATH; if either is missing,
# both are fetched from the SAE_RELEASE repository into SAE_BASE_PATH.
ae_file_path = os.path.join(SAE_PATH, "ae.pt")
config_file_path = os.path.join(SAE_PATH, "config.json")

sae_files_present = os.path.exists(ae_file_path) and os.path.exists(config_file_path)

if not sae_files_present:
    print(f"SAE not found locally, downloading from {SAE_RELEASE}...")
    os.makedirs(os.path.dirname(ae_file_path), exist_ok=True)
    sae_path = f"resid_post_layer_{SAE_LAYER}/trainer_{SAE_TRAINER}"
    local_dir = SAE_BASE_PATH
    ae_file = hf_hub_download(repo_id=SAE_RELEASE, filename=f"{sae_path}/ae.pt", local_dir=local_dir)
    config_file = hf_hub_download(repo_id=SAE_RELEASE, filename=f"{sae_path}/config.json", local_dir=local_dir)
else:
    print(f"✓ Found SAE files at: {os.path.dirname(ae_file_path)}")

# load_dictionary returns a pair; only the first element (the SAE) is used here
sae, _ = load_dictionary(SAE_PATH, device=device)
sae.eval()

print(f"SAE loaded with {sae.dict_size} features")
print(f"SAE device: {next(sae.parameters()).device}")

✓ Found SAE files at: /workspace/sae/qwen-2.5-7b-instruct/saes/resid_post_layer_15/trainer_1
SAE loaded with 131072 features
SAE device: cuda:0


## Activation Extraction Functions

In [10]:
class StopForward(Exception):
    """Raised from a forward hook to abort the forward pass once the target layer has produced its output."""

def find_assistant_position(input_ids: torch.Tensor, attention_mask: torch.Tensor, 
                          assistant_header: str, token_offset: int, tokenizer, device) -> int:
    """
    Find the token position to extract, relative to the assistant header.

    The header is tokenized and its first occurrence is located in `input_ids`.
    The returned position is the index of the first token after the header plus
    `token_offset`, clamped to the range [0, index of the last attended token].
    If the header does not occur, the last attended token is used.

    Args:
        input_ids: 1-D token IDs for a single prompt
        attention_mask: 1-D attention mask for the prompt (non-zero = real token)
        assistant_header: The assistant header string to find
        token_offset: Offset from the first token after the assistant header
        tokenizer: Object with `encode(text, add_special_tokens=False)` returning token IDs
        device: Kept for backward compatibility; the search now runs on plain
            Python lists, so no tensor is created on this device

    Returns:
        Position index for token extraction
    """
    header_ids = list(tokenizer.encode(assistant_header, add_special_tokens=False))
    header_len = len(header_ids)

    # Pull the IDs to the host once. Comparing Python lists avoids building a
    # tensor and syncing with the device at every candidate position, and makes
    # the match independent of tensor dtype and device placement.
    prompt_ids = input_ids.tolist()

    # Index of the last non-padding token; the fallback and the upper bound
    last_real_pos = int(attention_mask.sum().item()) - 1

    assistant_pos = last_real_pos
    for start in range(len(prompt_ids) - header_len + 1):
        if prompt_ids[start:start + header_len] == header_ids:
            assistant_pos = start + header_len + token_offset
            break

    # Keep the position inside the attended part of the sequence
    return max(min(assistant_pos, last_real_pos), 0)

@torch.no_grad()
def extract_activations_all_positions(prompts: List[str], layer_idx: int) -> Tuple[torch.Tensor, List[Dict]]:
    """
    Capture one layer's output for every token of every prompt.

    Each prompt is wrapped as a single user message with the chat template
    (generation prompt included), tokenized in batches of BATCH_SIZE, and run
    through the model. A forward hook on `model.model.layers[layer_idx]` stores
    that layer's output and raises StopForward, so later layers are not run.

    Args:
        prompts: Prompt strings
        layer_idx: Index into `model.model.layers` of the layer to capture

    Returns:
        A tensor of shape [num_prompts, max_seq_len, hidden_dim] on the CPU,
        zero-padded along the sequence axis, and one metadata dict per prompt
        with keys 'prompt_idx', 'positions' (token type -> position, one entry
        per key of TOKEN_OFFSETS) and 'attention_mask'.
    """
    hooked_layer = model.model.layers[layer_idx]
    sequence_activations = []
    sequence_metadata = []

    for batch_start in tqdm(range(0, len(prompts), BATCH_SIZE), desc="Processing batches"):
        # Wrap every prompt of this batch as a one-turn chat
        chat_prompts = [
            tokenizer.apply_chat_template(
                [{"role": "user", "content": prompt}],
                tokenize=False,
                add_generation_prompt=True
            )
            for prompt in prompts[batch_start:batch_start+BATCH_SIZE]
        ]

        encoded = tokenizer(
            chat_prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=MAX_LENGTH
        )
        batch_inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

        # Filled in by the hook during the forward pass
        captured = {"hidden": None}

        def hook_fn(module, input, output):
            # A tuple output carries the hidden states as its first element
            captured["hidden"] = output[0] if isinstance(output, tuple) else output
            raise StopForward()

        handle = hooked_layer.register_forward_hook(hook_fn)
        try:
            # The hook aborts the pass as soon as the target layer has run
            model(**batch_inputs)
        except StopForward:
            pass
        finally:
            handle.remove()

        layer_output = captured["hidden"]

        for row in range(len(chat_prompts)):
            attention_mask = batch_inputs["attention_mask"][row]
            input_ids = batch_inputs["input_ids"][row]

            # One extraction position per configured token type
            positions = {
                token_type: find_assistant_position(
                    input_ids, attention_mask, ASSISTANT_HEADER, token_offset, tokenizer, device
                )
                for token_type, token_offset in TOKEN_OFFSETS.items()
            }

            sequence_activations.append(layer_output[row].cpu())  # [seq_len, hidden_dim]
            sequence_metadata.append({
                'prompt_idx': batch_start + row,
                'positions': positions,
                'attention_mask': attention_mask.cpu()
            })

    # Batches are padded independently, so sequences differ in length across batches
    max_seq_len = max(act.shape[0] for act in sequence_activations)
    hidden_dim = sequence_activations[0].shape[1]

    # NOTE(review): the zero padding uses torch's default dtype, not act.dtype.
    # With a bfloat16 model this presumably promotes the padded sequences (and
    # the stacked result) to the default dtype - confirm before changing it,
    # since the stacked tensor is what gets fed to the SAE.
    padded_activations = [
        act if act.shape[0] >= max_seq_len
        else torch.cat([act, torch.zeros(max_seq_len - act.shape[0], hidden_dim)], dim=0)
        for act in sequence_activations
    ]

    return torch.stack(padded_activations, dim=0), sequence_metadata

@torch.no_grad()
def extract_token_activations(full_activations: torch.Tensor, metadata: List[Dict],
                              token_types=None) -> Dict[str, torch.Tensor]:
    """
    Pick the activation vector at each prompt's extraction position.

    Args:
        full_activations: Activations indexed as [prompt, position, hidden]
        metadata: One dict per prompt; meta['positions'][token_type] is the
            position to extract for that token type
        token_types: Token types to extract. Defaults to the keys of the
            module-level TOKEN_OFFSETS.

    Returns:
        Dict mapping each token type to a tensor of shape [num_prompts, hidden],
        with rows in the same order as `metadata`. Empty metadata yields tensors
        with zero rows.

    Raises:
        KeyError: If a metadata entry has no position for a requested token type
            (this keeps the rows of every token type aligned with the prompts)
    """
    if token_types is None:
        token_types = list(TOKEN_OFFSETS.keys())

    index_device = full_activations.device
    row_index = torch.arange(len(metadata), device=index_device)

    results = {}
    for token_type in token_types:
        positions = torch.tensor(
            [meta['positions'][token_type] for meta in metadata],
            dtype=torch.long,
            device=index_device,
        )
        # Pairs row i with positions[i]: result is [num_prompts, hidden]
        results[token_type] = full_activations[row_index, positions, :]

    return results

print("Activation extraction functions defined")

Activation extraction functions defined


## Extract Activations

In [11]:
# Step 1: capture the target layer's output at every token position
print("Extracting activations for all positions...")
full_activations, metadata = extract_activations_all_positions(prompts_df['prompt'], LAYER_INDEX)
print(f"Full activations shape: {full_activations.shape}")

# Step 2: keep only the positions recorded for each token type
print("\nExtracting activations for all token types...")
token_activations = extract_token_activations(full_activations, metadata)

for token_type in token_activations:
    print(f"Token type '{token_type}' activations shape: {token_activations[token_type].shape}")

Extracting activations for all positions...


Processing batches:   0%|          | 0/18 [00:00<?, ?it/s]

Full activations shape: torch.Size([140, 153, 3584])

Extracting activations for all token types...
Token type 'asst' activations shape: torch.Size([140, 3584])
Token type 'newline' activations shape: torch.Size([140, 3584])


## Apply SAE to Get Feature Activations

In [12]:
@torch.no_grad()
def get_sae_features(activations: torch.Tensor) -> torch.Tensor:
    """
    Encode activations with the SAE and return the feature activations.

    The input is moved to `device`, encoded in slices of BATCH_SIZE rows to
    bound memory use, and the encoded slices are gathered on the CPU.

    Args:
        activations: Activations with one row per example

    Returns:
        CPU tensor with the encoded slices concatenated along dim 0
    """
    on_device = activations.to(device)

    encoded_chunks = [
        sae.encode(on_device[start:start+BATCH_SIZE]).cpu()  # [batch, num_features]
        for start in range(0, on_device.shape[0], BATCH_SIZE)
    ]

    return torch.cat(encoded_chunks, dim=0)

# Encode the extracted activations of every token type with the SAE
print("Computing SAE features for all token types...")
token_features = {}

for token_type, activations in token_activations.items():
    print(f"Processing SAE features for token type '{token_type}'...")
    # `features` is kept as a name so it still holds the last token type's result
    token_features[token_type] = features = get_sae_features(activations)
    print(f"Features shape for '{token_type}': {features.shape}")

print(f"\nCompleted SAE feature extraction for {len(token_features)} token types")

Computing SAE features for all token types...
Processing SAE features for token type 'asst'...
Features shape for 'asst': torch.Size([140, 131072])
Processing SAE features for token type 'newline'...
Features shape for 'newline': torch.Size([140, 131072])

Completed SAE feature extraction for 2 token types


## Analyze and Save Results

In [79]:
# # TODO: Need to ensure "token" and "source" fields match, not just the feature_id.
# @torch.no_grad()
# def find_features_from_file(original_features: torch.Tensor, feature_file: str) -> Tuple[torch.Tensor, torch.Tensor]:
#     """
#     Find features that are listed in the feature file.
    
#     Args:
#         original_features: Feature activations tensor of shape [num_prompts, num_features]
#         feature_file: Path to the feature file with feature_id that we want to get activations for
    
#     Returns:
#         feature_indices: Indices of features from the feature_file
#         feature_activations: Activation values for given features [num_prompts, num_selected_features]
#     """    
#     target_features_df = pd.read_csv(feature_file)
#     target_feature_ids = target_features_df['feature_id'].tolist()
    
#     # Convert to tensor for indexing
#     target_feature_indices = torch.tensor(target_feature_ids, dtype=torch.long)
#     target_feature_activations = original_features[:, target_feature_indices]
    
#     return target_feature_indices, target_feature_activations

# # Now use the full features tensor
# feature_indices, feature_activations = find_features_from_file(features, FEATURES_FILE)

# print(f"Found {len(feature_indices)} features from the feature_id's in the {FEATURES_FILE}")
# print(f"Feature activations shape: {feature_activations.shape}")

Found 31 features from the feature_id's in the feature_file
Feature activations shape: torch.Size([140, 31])


In [None]:
def find_features_from_file(feature_file: str, features: torch.Tensor, token_type: str) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Summarise the activations of the features listed in a CSV file for one token type.

    Reads the `feature_id` column of `feature_file`, selects those feature
    columns from `features`, and appends one summary row per feature (mean, max
    and min over prompts, plus a dashboard link) to the module-level
    `all_results` list.

    Args:
        feature_file: Path to a CSV file with a `feature_id` column
        features: Feature activations indexed as [num_prompts, num_features]
        token_type: Label written to the 'token' field of every appended row

    Returns:
        target_feature_indices: Feature ids from the file as a long tensor
        target_feature_activations: Selected activations [num_prompts, num_target_features]

    Note:
        `all_results` and `source_name` must be defined at module level before
        this function is called.
        Features are selected by `feature_id` only. NOTE(review): if the file
        also carries per-row token/source fields, those are not matched here -
        confirm whether they should be.
    """
    print(f"\nProcessing token type '{token_type}'...")

    # Read the ids from the file passed in, not from a notebook-level variable
    feature_ids = pd.read_csv(feature_file)['feature_id'].tolist()

    # Extract activations for target features only
    target_feature_indices = torch.tensor(feature_ids, dtype=torch.long)
    target_feature_activations = features[:, target_feature_indices]  # [num_prompts, num_target_features]

    print(f"Target feature activations shape for '{token_type}': {target_feature_activations.shape}")

    # One summary row per target feature
    for column, feature_id in enumerate(feature_ids):
        per_prompt = target_feature_activations[:, column]  # [num_prompts]

        all_results.append({
            'feature_id': feature_id,
            'source': source_name,
            'token': token_type,
            'activation_mean': per_prompt.mean().item(),
            'activation_max': per_prompt.max().item(),
            'activation_min': per_prompt.min().item(),
            'chat_desc': '',
            'pt_desc': '',
            'type': '',
            'link': f"{FEATURE_DASHBOARD_BASE_URL}?model={MODEL_TYPE}&layer={SAE_LAYER}&trainer={SAE_TRAINER}&fids={feature_id}"
        })

    return target_feature_indices, target_feature_activations

# Everything the summary needs is set up here, before it is used, so this cell
# runs on a fresh kernel: the target feature ids, the row accumulator and the
# source label.
target_features_df = pd.read_csv(FEATURES_FILE)
target_feature_ids = target_features_df['feature_id'].tolist()
all_results = []
source_name = f"{MODEL_TYPE}_trainer{SAE_TRAINER}_layer{SAE_LAYER}"

# find_features_from_file appends one row per target feature to all_results
for token_type, features_for_token in token_features.items():
    find_features_from_file(FEATURES_FILE, features_for_token, token_type)

# Create DataFrame and sort by token type, then by activation mean (descending)
if all_results:
    results_df = pd.DataFrame(all_results)
    results_df = results_df.sort_values(['token', 'activation_mean'], ascending=[True, False])
    
    print(f"\nTotal results: {len(results_df)}")
    print(f"Results by token type:")
    for token_type in TOKEN_OFFSETS.keys():
        count = len(results_df[results_df['token'] == token_type])
        print(f"  {token_type}: {count} features")
    
else:
    # No results found: keep the output schema so saving still works
    results_df = pd.DataFrame(columns=['feature_id', 'source', 'token', 'activation_mean', 'activation_max', 'activation_min', 'chat_desc', 'pt_desc', 'type', 'link'])
    print("Warning: No results found")

In [None]:
# Read the ids of the features of interest
target_features_df = pd.read_csv(FEATURES_FILE)
target_feature_ids = target_features_df['feature_id'].tolist()

print(f"Loaded {len(target_feature_ids)} target features from {FEATURES_FILE}")

# Row accumulator and the label written to the 'source' column
all_results = []
source_name = f"{MODEL_TYPE}_trainer{SAE_TRAINER}_layer{SAE_LAYER}"

print("Processing results for all token types...")

# NOTE(review): results_df is written below but not built in this cell; it has
# to exist already when the cell runs.

# Write the CSV: append without a header when the file already exists,
# otherwise create it with a header. Re-running appends the same rows again.
output_exists = os.path.exists(OUTPUT_FILE)
results_df.to_csv(
    OUTPUT_FILE,
    mode='a' if output_exists else 'w',
    header=not output_exists,
    index=False,
)
if output_exists:
    print(f"\nResults appended to existing file: {OUTPUT_FILE}")
else:
    print(f"\nResults saved to new file: {OUTPUT_FILE}")

print(f"Number of results saved: {len(results_df)}")

if len(results_df):
    print(f"\nPreview of saved data:")
    print(results_df.head(10).to_string(index=False))

    # Dashboard link of the first row
    sample_link = results_df.iloc[0]['link']
    print(f"\nSample dashboard link: {sample_link}")

    # Per-token-type row count, mean/max of the mean activation, distinct features
    print(f"\nSummary by token type:")
    summary = (
        results_df
        .groupby('token')
        .agg({
            'activation_mean': ['count', 'mean', 'max'],
            'feature_id': 'nunique'
        })
        .round(4)
    )
    print(summary)