# Feature Analysis: Personal Feature Activations on General Prompts

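This notebook measures how strongly a chosen set of SAE features (the "personal" features listed in `FEATURES_FILE`) activates on a set of general prompts, and records which prompts and tokens trigger them.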

In [48]:
import csv
import json
import torch
import os
import numpy as np
import pandas as pd
from pathlib import Path
from typing import List, Dict, Tuple
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import hf_hub_download
from dictionary_learning.utils import load_dictionary
from tqdm.auto import tqdm

# Pick the compute device: CUDA when available, CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


## Configs

In [85]:
# =============================================================================
# MODEL SELECTION - Change this to switch between models
# =============================================================================
MODEL_TYPE = "llama"  # Options: "qwen" or "llama"
SAE_LAYER = 15
SAE_TRAINER = 1

# =============================================================================
# OUTPUT FILE CONFIGURATION
# =============================================================================
OUTPUT_FILE = "./results/3_personal_general/2_personal_general.csv"
PROMPT_OUTPUT_FILE = "./results/3_personal_general/2_personal_general_prompts.jsonl"

# Create the parent directory of each output file up front.
for _output_path in (OUTPUT_FILE, PROMPT_OUTPUT_FILE):
    os.makedirs(os.path.dirname(_output_path), exist_ok=True)

# =============================================================================
# FEATURE DASHBOARD URL - Global variable for links
# =============================================================================
FEATURE_DASHBOARD_BASE_URL = "https://completely-touched-platypus.ngrok-free.app/"

# =============================================================================
# AUTO-CONFIGURED SETTINGS BASED ON MODEL TYPE
# =============================================================================
# One preset per supported model. The token offsets are added to the index just
# past the assistant header when locating the positions to read activations from.
_MODEL_PRESETS = {
    "qwen": {
        "model_name": "Qwen/Qwen2.5-7B-Instruct",
        "sae_release": "andyrdt/saes-qwen2.5-7b-instruct",
        "assistant_header": "<|im_start|>assistant",
        "token_offsets": {"asst": -1, "newline": 0},
        "sae_base_path": "/workspace/sae/qwen-2.5-7b-instruct/saes",
    },
    "llama": {
        "model_name": "meta-llama/Llama-3.1-8B-Instruct",
        "sae_release": "andyrdt/saes-llama-3.1-8b-instruct",
        "assistant_header": "<|start_header_id|>assistant<|end_header_id|>",
        "token_offsets": {"asst": -2, "endheader": -1, "newline": 0},
        "sae_base_path": "/workspace/sae/llama-3.1-8b-instruct/saes",
    },
}

if MODEL_TYPE not in _MODEL_PRESETS:
    raise ValueError(f"Unknown MODEL_TYPE: {MODEL_TYPE}. Use 'qwen' or 'llama'")

_preset = _MODEL_PRESETS[MODEL_TYPE]
MODEL_NAME = _preset["model_name"]
SAE_RELEASE = _preset["sae_release"]
ASSISTANT_HEADER = _preset["assistant_header"]
TOKEN_OFFSETS = _preset["token_offsets"]
SAE_BASE_PATH = _preset["sae_base_path"]

# =============================================================================
# DERIVED CONFIGURATIONS
# =============================================================================
SAE_CONFIG = {"release": SAE_RELEASE, "layer": SAE_LAYER, "trainer": SAE_TRAINER}
SAE_PATH = f"{SAE_BASE_PATH}/resid_post_layer_{SAE_LAYER}/trainer_{SAE_TRAINER}"
LAYER_INDEX = SAE_LAYER

# Data paths
PROMPTS_PATH = "./prompts/general"
FEATURES_FILE = "./results/1_personal/only_personal.csv"

# Processing parameters
BATCH_SIZE = 8
MAX_LENGTH = 512
TOP_FEATURES = 100

# =============================================================================
# SUMMARY
# =============================================================================
print("Configuration Summary:")
print(f"  Model: {MODEL_NAME}")
print(f"  SAE: {SAE_RELEASE}")
print(f"  SAE Layer: {SAE_LAYER}, Trainer: {SAE_TRAINER}")
print(f"  Available token types: {list(TOKEN_OFFSETS.keys())}")
print(f"  Assistant header: {ASSISTANT_HEADER}")
print(f"  Output file: {OUTPUT_FILE}")

Configuration Summary:
  Model: meta-llama/Llama-3.1-8B-Instruct
  SAE: andyrdt/saes-llama-3.1-8b-instruct
  SAE Layer: 15, Trainer: 1
  Available token types: ['asst', 'endheader', 'newline']
  Assistant header: <|start_header_id|>assistant<|end_header_id|>
  Output file: ./results/3_personal_general/2_personal_general.csv


## Load Data

In [50]:
def load_prompts(filepath: str) -> pd.DataFrame:
    """Load prompts with labels from a JSONL file.

    Each non-blank line must be a JSON object with a 'content' key (the prompt
    text) and a 'label' key.

    Args:
        filepath: Path to the JSONL file.

    Returns:
        DataFrame with columns 'prompt' and 'label', one row per record in file
        order. An empty file yields an empty frame that still has both columns.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks 'content' or 'label'.
    """
    records = []
    # Explicit UTF-8 so the result does not depend on the platform's default encoding.
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) are not records.
                continue
            data = json.loads(line)
            records.append({'prompt': data['content'], 'label': data['label']})
    return pd.DataFrame(records, columns=['prompt', 'label'])

# Load prompts from all .jsonl files in PROMPTS_PATH into one dataframe.
# Files are read in sorted name order so that prompt row numbers are the same
# on every run (os.listdir returns entries in arbitrary order).
prompt_frames = [
    load_prompts(os.path.join(PROMPTS_PATH, file))
    for file in sorted(os.listdir(PROMPTS_PATH))
    if file.endswith(".jsonl")
]

if prompt_frames:
    # Concatenate once; ignore_index gives every prompt a unique 0..N-1 label
    # instead of repeating each file's own 0..k index.
    prompts_df = pd.concat(prompt_frames, ignore_index=True)
else:
    # No prompt files found: keep the expected columns so later cells fail clearly.
    prompts_df = pd.DataFrame(columns=['prompt', 'label'])

print(f"Loaded {prompts_df.shape[0]} prompts")

Loaded 140 prompts


## Load Model and SAE

In [75]:
# Load the tokenizer for the configured model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Pad on the right so real tokens keep their positions from index 0
tokenizer.padding_side = "right"

# Reuse the EOS token for padding when the tokenizer defines no pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"Tokenizer loaded: {tokenizer.__class__.__name__}")

Tokenizer loaded: PreTrainedTokenizerFast


In [76]:
# Load model
# device_map wants a GPU index when the device carries one, otherwise its string name
if device.type == 'cuda' and device.index is not None:
    device_map_value = device.index
else:
    device_map_value = str(device)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    device_map={"": device_map_value},
)
model.eval()

first_param_device = next(model.parameters()).device
print(f"Model loaded: {model.__class__.__name__}")
print(f"Model device: {first_param_device}")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model loaded: LlamaForCausalLM
Model device: cuda:0


In [86]:
# Load SAE
ae_file_path = os.path.join(SAE_PATH, "ae.pt")
config_file_path = os.path.join(SAE_PATH, "config.json")
sae_files_present = os.path.exists(ae_file_path) and os.path.exists(config_file_path)

if not sae_files_present:
    # Fetch both SAE files from the hub into SAE_BASE_PATH
    print(f"SAE not found locally, downloading from {SAE_RELEASE}...")
    os.makedirs(os.path.dirname(ae_file_path), exist_ok=True)
    sae_path = f"resid_post_layer_{SAE_LAYER}/trainer_{SAE_TRAINER}"
    local_dir = SAE_BASE_PATH
    ae_file = hf_hub_download(repo_id=SAE_RELEASE, filename=f"{sae_path}/ae.pt", local_dir=local_dir)
    config_file = hf_hub_download(repo_id=SAE_RELEASE, filename=f"{sae_path}/config.json", local_dir=local_dir)
else:
    print(f"✓ Found SAE files at: {os.path.dirname(ae_file_path)}")

# load_dictionary returns a pair; only the first element (the SAE) is used here
sae, _ = load_dictionary(SAE_PATH, device=device)
sae.eval()

print(f"SAE loaded with {sae.dict_size} features")
print(f"SAE device: {next(sae.parameters()).device}")

✓ Found SAE files at: /workspace/sae/llama-3.1-8b-instruct/saes/resid_post_layer_15/trainer_1
SAE loaded with 131072 features
SAE device: cuda:0


## Activation Extraction Functions

In [87]:
class StopForward(Exception):
    """Raised from the forward hook to abort the forward pass once the target layer's output is captured."""

def find_assistant_position(input_ids: torch.Tensor, attention_mask: torch.Tensor, 
                          assistant_header: str, token_offset: int, tokenizer, device) -> int:
    """
    Locate a token position relative to the end of the assistant header.

    The header is tokenized without special tokens and searched for in
    ``input_ids``. The first match at index ``k`` gives
    ``k + len(header_tokens) + token_offset``. If the header is not found (or
    tokenizes to nothing), the last non-padding position is used instead. The
    result is always clamped to ``[0, attention_mask.sum() - 1]``.

    Args:
        input_ids: 1-D tensor of token IDs for a single prompt
        attention_mask: 1-D attention mask for the same prompt
        assistant_header: The assistant header string to find
        token_offset: Offset added to the index just past the header
        tokenizer: Object exposing ``encode(text, add_special_tokens=False)``
        device: Unused; kept for backward compatibility. The comparison tensor
            is built on ``input_ids.device`` so a mismatched value cannot
            break the search.

    Returns:
        Position index (Python int) for token extraction
    """
    header_ids = tokenizer.encode(assistant_header, add_special_tokens=False)
    header_len = len(header_ids)

    # Index of the last non-padding token; fallback and upper clamp bound.
    last_real_pos = int(attention_mask.sum().item()) - 1

    assistant_pos = None
    if header_len > 0:
        # Build the header tensor once (not once per window), matching the
        # dtype and device of input_ids so torch.equal compares like with like.
        header_tensor = torch.tensor(header_ids, dtype=input_ids.dtype, device=input_ids.device)
        for k in range(len(input_ids) - header_len + 1):
            if torch.equal(input_ids[k:k + header_len], header_tensor):
                assistant_pos = k + header_len + token_offset
                break

    if assistant_pos is None:
        # Header absent: fall back to the last non-padding token
        assistant_pos = last_real_pos

    # Keep the position inside the real (non-padding) part of the sequence
    return max(min(assistant_pos, last_real_pos), 0)

@torch.no_grad()
def extract_activations_all_positions(prompts: List[str], layer_idx: int) -> Tuple[torch.Tensor, List[Dict]]:
    """Run prompts through the model and capture one layer's output at every position.

    Each prompt is wrapped as a single user message via the chat template (with
    the generation prompt appended), tokenized in batches of BATCH_SIZE with
    padding and truncation to MAX_LENGTH, and passed to the model. A forward
    hook on ``model.model.layers[layer_idx]`` stores that layer's output and
    raises StopForward so later layers are never computed.

    Args:
        prompts: Prompt strings. Any iterable is accepted (e.g. a pandas
            Series); it is materialised into a list first.
        layer_idx: Index into ``model.model.layers`` of the layer to capture.

    Returns:
        A tuple ``(activations, metadata)``:
          * activations: tensor ``[num_prompts, max_seq_len, hidden_dim]``,
            zero-padded along the sequence axis to the longest batch.
          * metadata: one dict per prompt with 'prompt_idx', 'positions'
            (token type -> position from find_assistant_position) and
            'attention_mask' (on CPU).

    Raises:
        ValueError: If ``prompts`` is empty.
        RuntimeError: If the forward hook never fired for a batch.
    """
    # Plain list => slicing below is always positional, whatever container
    # (list, pandas Series with arbitrary index, ...) the caller passed in.
    prompts = list(prompts)
    if not prompts:
        raise ValueError("extract_activations_all_positions requires at least one prompt")

    all_activations = []
    all_metadata = []

    # Get target layer
    target_layer = model.model.layers[layer_idx]

    # Process in batches
    for i in tqdm(range(0, len(prompts), BATCH_SIZE), desc="Processing batches"):
        batch_prompts = prompts[i:i+BATCH_SIZE]

        # Format prompts as chat messages
        formatted_prompts = [
            tokenizer.apply_chat_template(
                [{"role": "user", "content": prompt}],
                tokenize=False,
                add_generation_prompt=True
            )
            for prompt in batch_prompts
        ]

        # Tokenize batch
        batch_inputs = tokenizer(
            formatted_prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=MAX_LENGTH
        )

        # Move to device
        batch_inputs = {k: v.to(device) for k, v in batch_inputs.items()}

        # Hook to capture activations
        activations = None

        def hook_fn(module, input, output):
            nonlocal activations
            # Output may be a tuple; its first element is taken as the hidden states
            activations = output[0] if isinstance(output, tuple) else output
            raise StopForward()

        # Register hook
        handle = target_layer.register_forward_hook(hook_fn)

        try:
            # Forward pass (will be stopped by hook)
            _ = model(**batch_inputs)
        except StopForward:
            pass
        finally:
            # Always detach the hook, even if the forward pass failed
            handle.remove()

        if activations is None:
            raise RuntimeError(
                f"Forward hook on layer {layer_idx} did not fire; no activations were captured"
            )

        # For each prompt in the batch, calculate positions for all token types
        for j in range(len(formatted_prompts)):
            attention_mask = batch_inputs["attention_mask"][j]
            input_ids = batch_inputs["input_ids"][j]

            positions = {
                token_type: find_assistant_position(
                    input_ids, attention_mask, ASSISTANT_HEADER, token_offset, tokenizer, device
                )
                for token_type, token_offset in TOKEN_OFFSETS.items()
            }

            # Store the full activation sequence and metadata
            all_activations.append(activations[j].cpu())  # [seq_len, hidden_dim]
            all_metadata.append({
                'prompt_idx': i + j,
                'positions': positions,
                'attention_mask': attention_mask.cpu()
            })

    # Batches are padded independently, so sequence lengths differ between batches
    max_seq_len = max(act.shape[0] for act in all_activations)
    hidden_dim = all_activations[0].shape[1]

    # Pad all activations to the same length
    padded_activations = []
    for act in all_activations:
        if act.shape[0] < max_seq_len:
            # NOTE(review): the padding uses torch's default dtype, not act.dtype.
            # If the model runs in a lower precision, the concatenation presumably
            # promotes the result - confirm this is intended before changing it.
            padding = torch.zeros(max_seq_len - act.shape[0], hidden_dim)
            padded_act = torch.cat([act, padding], dim=0)
        else:
            padded_act = act
        padded_activations.append(padded_act)

    return torch.stack(padded_activations, dim=0), all_metadata

@torch.no_grad()
def extract_token_activations(full_activations: torch.Tensor, metadata: List[Dict]) -> Dict[str, torch.Tensor]:
    """Pick out, for every prompt, the activation vector at each recorded token position.

    Args:
        full_activations: Tensor indexed as [prompt, position, hidden].
        metadata: One dict per prompt; its 'positions' entry maps token type
            to the position to read for that prompt.

    Returns:
        Dict keyed by the token types in TOKEN_OFFSETS; each value stacks the
        selected vectors over prompts along dim 0.
    """
    # One collection bucket per configured token type
    collected = {token_type: [] for token_type in TOKEN_OFFSETS}

    for row, meta in enumerate(metadata):
        for token_type, position in meta['positions'].items():
            collected[token_type].append(full_activations[row, position, :])  # [hidden_dim]

    # Stack every bucket into a single tensor
    return {
        token_type: torch.stack(collected[token_type], dim=0)
        for token_type in TOKEN_OFFSETS
    }

print("Activation extraction functions defined")

Activation extraction functions defined


## Extract Activations

In [88]:
# Step 1: capture the layer output at every position of every prompt
print("Extracting activations for all positions...")
full_activations, metadata = extract_activations_all_positions(
    prompts=prompts_df['prompt'],
    layer_idx=LAYER_INDEX,
)
print(f"Full activations shape: {full_activations.shape}")

# Step 2: keep only the positions recorded for each token type
print("\nExtracting activations for all token types...")
token_activations = extract_token_activations(full_activations, metadata)

for token_type in token_activations:
    activations = token_activations[token_type]
    print(f"Token type '{token_type}' activations shape: {activations.shape}")

Extracting activations for all positions...


Processing batches:   0%|          | 0/18 [00:00<?, ?it/s]

Full activations shape: torch.Size([140, 160, 4096])

Extracting activations for all token types...
Token type 'asst' activations shape: torch.Size([140, 4096])
Token type 'endheader' activations shape: torch.Size([140, 4096])
Token type 'newline' activations shape: torch.Size([140, 4096])


## Apply SAE to Get Feature Activations

In [89]:
@torch.no_grad()
def get_sae_features(activations: torch.Tensor) -> torch.Tensor:
    """Encode activations with the SAE, BATCH_SIZE rows at a time.

    Only the slice being encoded is moved to ``device``; the input tensor stays
    where it is, so peak device memory is bounded by one batch rather than by
    the whole input.

    Args:
        activations: Tensor whose first dimension is batched over; each slice
            ``activations[i:i+BATCH_SIZE]`` is passed to ``sae.encode``.

    Returns:
        CPU tensor with the encoded slices concatenated along dim 0.
    """
    feature_activations = []

    for i in range(0, activations.shape[0], BATCH_SIZE):
        # Transfer just this batch; results return to the CPU immediately
        batch = activations[i:i+BATCH_SIZE].to(device)
        features = sae.encode(batch)  # [batch, num_features]
        feature_activations.append(features.cpu())

    return torch.cat(feature_activations, dim=0)

# Encode the selected activations of every token type with the SAE
print("Computing SAE features for all token types...")
token_features = {}

for token_type in token_activations:
    print(f"Processing SAE features for token type '{token_type}'...")
    features = get_sae_features(token_activations[token_type])
    token_features[token_type] = features
    print(f"Features shape for '{token_type}': {features.shape}")

print(f"\nCompleted SAE feature extraction for {len(token_features)} token types")

Computing SAE features for all token types...
Processing SAE features for token type 'asst'...


Features shape for 'asst': torch.Size([140, 131072])
Processing SAE features for token type 'endheader'...
Features shape for 'endheader': torch.Size([140, 131072])
Processing SAE features for token type 'newline'...
Features shape for 'newline': torch.Size([140, 131072])

Completed SAE feature extraction for 3 token types


## Analyze and Save Results

In [90]:
@torch.no_grad()
def filter_features_by_token_and_source(features_df: pd.DataFrame, token_type: str, source: str) -> pd.DataFrame:
    """
    Select the rows of a feature table that match one token type and one source.

    Args:
        features_df: DataFrame with at least 'token' and 'source' columns
        token_type: Value the 'token' column must equal (e.g., 'asst', 'newline', 'endheader')
        source: Value the 'source' column must equal (e.g., 'qwen_trainer1_layer15')

    Returns:
        An independent copy of the matching rows, original index labels preserved
    """
    token_matches = features_df['token'] == token_type
    source_matches = features_df['source'] == source

    # Copy so callers can modify the result without touching features_df
    return features_df[token_matches & source_matches].copy()

@torch.no_grad()
def get_filtered_feature_activations(token_features: torch.Tensor, features_df: pd.DataFrame, 
                                   token_type: str, source: str) -> Tuple[torch.Tensor, torch.Tensor, pd.DataFrame]:
    """
    Slice the activation columns of the features listed for a token type and source.

    Args:
        token_features: Feature activations tensor of shape [num_prompts, num_features]
        features_df: DataFrame with 'token', 'source' and 'feature_id' columns
        token_type: Token type to filter by
        source: Source to filter by

    Returns:
        feature_indices: Long tensor holding the 'feature_id' values of the matching rows
        feature_activations: Columns of token_features at those ids [num_prompts, num_selected_features]
        filtered_features_df: The matching rows of features_df

        When no row matches, a warning is printed and both tensors are empty
        (torch.tensor([])).
    """
    selected_df = filter_features_by_token_and_source(features_df, token_type, source)

    # Nothing listed for this token type / source: warn and return empty tensors
    if len(selected_df) == 0:
        print(f"Warning: No features found for token_type='{token_type}', source='{source}'")
        return torch.tensor([]), torch.tensor([]), selected_df

    # The feature ids double as column indices into token_features
    column_indices = torch.tensor(selected_df['feature_id'].tolist(), dtype=torch.long)
    selected_activations = token_features[:, column_indices]

    print(f"Found {len(column_indices)} features for token_type='{token_type}', source='{source}'")

    return column_indices, selected_activations, selected_df

In [91]:
# Load target features from file
target_features_df = pd.read_csv(FEATURES_FILE)
print(f"Loaded {len(target_features_df)} total features from {FEATURES_FILE}")

# One dict per active feature; turned into the results table afterwards
all_results = []
source_name = f"{MODEL_TYPE}_trainer{SAE_TRAINER}_layer{SAE_LAYER}"

print(f"Processing results for source: {source_name}")

# Process each token type
for token_type in TOKEN_OFFSETS:
    print(f"\nProcessing token type: {token_type}")

    # Activations of the target features listed for this token type and source
    feature_indices, feature_activations, filtered_features_df = get_filtered_feature_activations(
        token_features[token_type], target_features_df, token_type, source_name
    )

    if len(filtered_features_df) == 0:
        print(f"No features found for token_type='{token_type}', source='{source_name}'. Skipping.")
        continue

    features_processed = 0
    features_skipped = 0

    # Column idx of feature_activations corresponds to the idx-th listed feature
    for idx, feature_id in enumerate(filtered_features_df['feature_id']):
        activations = feature_activations[:, idx]  # [num_prompts]

        # Statistics are computed only over prompts where the feature fired (> 0)
        active_activations = activations[activations > 0]

        # Features that never fire are counted but not reported
        if len(active_activations) == 0:
            features_skipped += 1
            continue

        all_results.append({
            'feature_id': int(feature_id),
            'activation_mean': float(active_activations.mean()),
            'activation_max': float(active_activations.max()),
            'activation_min': float(active_activations.min()),
            'num_prompts': len(active_activations),
            'chat_desc': '',
            'pt_desc': '',
            'type': '',
            'source': source_name,
            'token': token_type,
            'link': f"{FEATURE_DASHBOARD_BASE_URL}?model={MODEL_TYPE}&layer={SAE_LAYER}&trainer={SAE_TRAINER}&fids={feature_id}"
        })
        features_processed += 1

    print(f"Processed {features_processed} active features for token_type='{token_type}' (skipped {features_skipped} inactive)")

# Convert to DataFrame with specified column order.
# Passing `columns=` (instead of indexing with [column_order] afterwards) keeps
# this working when all_results is empty: the frame then has the expected
# columns and zero rows, rather than raising KeyError on the missing columns.
column_order = ['feature_id', 'activation_mean', 'activation_max', 'activation_min', 
                'num_prompts', 'chat_desc', 'pt_desc', 'type', 'source', 'token', 'link']
results_df = pd.DataFrame(all_results, columns=column_order)
print(f"\nTotal active features with results: {len(results_df)}")

# # Save to CSV (append mode if file exists, otherwise create new)
# if os.path.exists(OUTPUT_FILE):
#     # If file exists, append to it
#     results_df.to_csv(OUTPUT_FILE, mode='a', header=False, index=False)
#     print(f"Results appended to existing file: {OUTPUT_FILE}")
# else:
#     # Create new file
#     results_df.to_csv(OUTPUT_FILE, index=False)
#     print(f"Results saved to new file: {OUTPUT_FILE}")

if len(results_df) == 0:
    print("No results to save - no features were active on any prompts.")
else:
    print(f"\nPreview of saved data:")
    print(results_df.head(10).to_string(index=False))

    # Show the dashboard link of the first row as an example
    sample_link = results_df.iloc[0]['link']
    print(f"\nSample dashboard link: {sample_link}")

    # Per-token-type aggregate statistics, rounded to 4 decimals
    print(f"\nSummary by token type:")
    summary_spec = {
        'activation_mean': ['count', 'mean', 'max'],
        'activation_max': 'max',
        'num_prompts': ['mean', 'max'],
        'feature_id': 'nunique'
    }
    summary = results_df.groupby('token').agg(summary_spec).round(4)
    print(summary)

Loaded 92 total features from ./results/1_personal/only_personal.csv
Processing results for source: llama_trainer1_layer15

Processing token type: asst
Found 2 features for token_type='asst', source='llama_trainer1_layer15'
Processed 1 active features for token_type='asst' (skipped 1 inactive)

Processing token type: endheader
Found 6 features for token_type='endheader', source='llama_trainer1_layer15'
Processed 2 active features for token_type='endheader' (skipped 4 inactive)

Processing token type: newline
Found 7 features for token_type='newline', source='llama_trainer1_layer15'
Processed 0 active features for token_type='newline' (skipped 7 inactive)

Total active features with results: 3

Preview of saved data:
 feature_id  activation_mean  activation_max  activation_min  num_prompts chat_desc pt_desc type                 source     token                                                                                          link
      27476         0.310405        0.338905      

## Record prompts which activate target features

In [92]:
@torch.no_grad()
def record_all_token_activations_for_feature(feature_id: int, feature_idx: int, 
                                           feature_activations: torch.Tensor,
                                           prompts_df: pd.DataFrame,
                                           full_activations: torch.Tensor,
                                           activation_threshold: float = 0.0) -> List[Dict]:
    """
    Record every token with a positive activation, for each prompt that activates a feature.

    A prompt counts as active when ``feature_activations[prompt, feature_idx]``
    exceeds ``activation_threshold``. For each active prompt the SAE is applied
    to all positions of ``full_activations[prompt]`` and every position whose
    activation for ``feature_id`` is > 0 is recorded. The per-token check is
    always "> 0" and does not use ``activation_threshold``.

    Args:
        feature_id: The actual feature ID (column in the SAE feature space)
        feature_idx: Index of the feature in the filtered feature list
        feature_activations: Feature activations for this token type [num_prompts, num_features]
        prompts_df: DataFrame with 'prompt' and 'label' columns, addressed by row position
        full_activations: Full sequence activations [num_prompts, seq_len, hidden_dim]
        activation_threshold: Minimum activation for a prompt to count as "active"

    Returns:
        List of dictionaries with token-level activation data
    """
    # Find which prompts activate this feature
    activations = feature_activations[:, feature_idx]
    active_indices = torch.where(activations > activation_threshold)[0]

    if len(active_indices) == 0:
        return []

    all_token_records = []

    for prompt_idx in active_indices.tolist():
        prompt_row = prompts_df.iloc[prompt_idx]
        prompt_text = prompt_row["prompt"]

        # SAE features for all positions of this prompt
        prompt_activations = full_activations[prompt_idx]  # [seq_len, hidden_dim]
        prompt_sae_features = get_sae_features(prompt_activations.unsqueeze(0)).squeeze(0)  # [seq_len, num_features]

        # Activations of our specific feature across all positions
        feature_activations_sequence = prompt_sae_features[:, feature_id]  # [seq_len]

        # Rebuild the exact text that was fed to the model
        messages = [{"role": "user", "content": prompt_text}]
        formatted_prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        # Tokenize the same way extract_activations_all_positions does (default
        # special-token handling, truncation to MAX_LENGTH). Some tokenizers
        # prepend a BOS token here in addition to the one the chat template
        # already emits; tokenizing with different settings would shift every
        # token by one position relative to the stored activations.
        tokenized = tokenizer(
            formatted_prompt,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_LENGTH
        )
        input_ids = tokenized["input_ids"].squeeze(0)

        # Positions past the real tokens are padding and are never recorded
        num_positions = min(len(feature_activations_sequence), len(input_ids))

        for pos_idx in range(num_positions):
            activation_val = float(feature_activations_sequence[pos_idx])
            if activation_val <= 0:
                continue

            token_id = int(input_ids[pos_idx])
            all_token_records.append({
                "feature_id": feature_id,
                "prompt_id": prompt_idx,
                "prompt_text": prompt_text,
                "prompt_label": prompt_row["label"],
                "prompt_feature_activation": float(activations[prompt_idx]),
                "token_position": pos_idx,
                "token_id": token_id,
                "token_text": tokenizer.decode([token_id]),
                "token_activation": activation_val
            })

    return all_token_records

@torch.no_grad()
def analyze_all_features_detailed(token_features: Dict[str, torch.Tensor],
                                target_features_df: pd.DataFrame,
                                prompts_df: pd.DataFrame,
                                full_activations: torch.Tensor,
                                source: str) -> Dict[str, List[Dict]]:
    """
    Collect token-level activation records for every target feature of every token type.

    For each token type in TOKEN_OFFSETS, the features listed for that token
    type and ``source`` are looked up and their records gathered with
    record_all_token_activations_for_feature.

    Returns:
        Dictionary mapping token_type to list of token activation records
        (an empty list when no feature is listed for that token type)
    """
    records_by_token_type = {}

    for token_type in TOKEN_OFFSETS:
        print(f"Processing detailed analysis for token type: {token_type}")

        # Activations of the features listed for this token type and source
        _, feature_activations, filtered_features_df = get_filtered_feature_activations(
            token_features[token_type], target_features_df, token_type, source
        )

        if len(filtered_features_df) == 0:
            print(f"No features found for token_type='{token_type}', source='{source}'. Skipping.")
            records_by_token_type[token_type] = []
            continue

        # The enumeration index is the feature's column in feature_activations
        collected = []
        for column, raw_feature_id in enumerate(filtered_features_df['feature_id']):
            collected.extend(
                record_all_token_activations_for_feature(
                    int(raw_feature_id), column, feature_activations, prompts_df,
                    full_activations
                )
            )

        records_by_token_type[token_type] = collected
        print(f"Recorded {len(collected)} token activations for token_type='{token_type}'")

    return records_by_token_type

print("Detailed token activation recording functions defined")

Detailed token activation recording functions defined


In [93]:
# Run detailed token activation analysis and prepare the JSONL records
print("Running detailed token activation analysis...")
detailed_results = analyze_all_features_detailed(
    token_features, target_features_df, prompts_df, 
    full_activations, source_name
)

# Make sure the output directory exists
os.makedirs(os.path.dirname(PROMPT_OUTPUT_FILE), exist_ok=True)

total_records = 0
jsonl_records = []

for token_type, token_records in detailed_results.items():
    if len(token_records) == 0:
        continue

    # Bucket the flat token records by feature
    feature_groups = {}
    for record in token_records:
        feature_groups.setdefault(record['feature_id'], []).append(record)

    # One JSONL record per feature
    for feature_id, records in feature_groups.items():
        # Within a feature, bucket by prompt (first-seen order)
        prompt_groups = {}
        for record in records:
            prompt_id = record['prompt_id']
            prompt_entry = prompt_groups.get(prompt_id)
            if prompt_entry is None:
                prompt_entry = {
                    'prompt_id': prompt_id,
                    'prompt_text': record['prompt_text'],
                    'prompt_label': record['prompt_label'],
                    'prompt_feature_activation': record['prompt_feature_activation'],
                    'tokens': []
                }
                prompt_groups[prompt_id] = prompt_entry

            prompt_entry['tokens'].append({
                'position': record['token_position'],
                'token_id': record['token_id'],
                'text': record['token_text'],
                'activation': record['token_activation']
            })

        # Strongest tokens first within each prompt
        for prompt_data in prompt_groups.values():
            prompt_data['tokens'].sort(key=lambda x: x['activation'], reverse=True)

        jsonl_records.append({
            'feature_id': feature_id,
            'token': token_type,
            'source': source_name,
            'active_prompts': list(prompt_groups.values())
        })
        total_records += len(prompt_groups)

# Sort by feature_id for consistent output
jsonl_records.sort(key=lambda x: x['feature_id'])

# Write one JSON object per line (append if exists).
# NOTE(review): append mode means re-running this cell adds the same records
# again - confirm that is intended.
with open(PROMPT_OUTPUT_FILE, 'a') as f:
    f.writelines(json.dumps(record) + '\n' for record in jsonl_records)

print(f"Saved {len(jsonl_records)} feature records ({total_records} prompt records) to {PROMPT_OUTPUT_FILE}")

# Show preview
if len(jsonl_records) == 0:
    print("No records to save.")
else:
    print(f"\nPreview of first record:")
    sample_record = jsonl_records[0]
    print(f"Feature {sample_record['feature_id']} ({sample_record['token']}, {sample_record['source']}):")
    print(f"  {len(sample_record['active_prompts'])} active prompts")
    if sample_record['active_prompts']:
        first_prompt = sample_record['active_prompts'][0]
        print(f"  First prompt: {len(first_prompt['tokens'])} active tokens")
        # Tokens are sorted by activation (descending), so index 0 is the strongest
        print(f"  Top token: '{first_prompt['tokens'][0]['text']}' (activation: {first_prompt['tokens'][0]['activation']:.4f})")

    print(f"\nFile structure summary:")
    for record in jsonl_records:
        active_prompts = record['active_prompts']
        total_tokens = sum(len(prompt['tokens']) for prompt in active_prompts)
        print(f"  Feature {record['feature_id']} ({record['token']}): {len(active_prompts)} prompts, {total_tokens} tokens")

Running detailed token activation analysis...
Processing detailed analysis for token type: asst
Found 2 features for token_type='asst', source='llama_trainer1_layer15'
Recorded 40 token activations for token_type='asst'
Processing detailed analysis for token type: endheader
Found 6 features for token_type='endheader', source='llama_trainer1_layer15'
Recorded 370 token activations for token_type='endheader'
Processing detailed analysis for token type: newline
Found 7 features for token_type='newline', source='llama_trainer1_layer15'
Recorded 0 token activations for token_type='newline'
Saved 3 feature records (20 prompt records) to ./results/3_personal_general/2_personal_general_prompts.jsonl

Preview of first record:
Feature 27476 (asst, llama_trainer1_layer15):
  5 active prompts
  First prompt: 12 active tokens
  Top token: '<|begin_of_text|>' (activation: 30.1081)

File structure summary:
  Feature 27476 (asst): 5 prompts, 40 tokens
  Feature 47776 (endheader): 1 prompts, 3 tokens
 