# Feature Analysis: Assistant vs Control Prompts

This notebook analyzes which SAE features activate strongly on assistant prompts but not on control prompts. The model and layer are configurable - see the configuration cell below.

In [41]:
import json
import torch
import os
import numpy as np
import pandas as pd
from pathlib import Path
from typing import List, Dict, Tuple
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import hf_hub_download
from dictionary_learning.utils import load_dictionary
from tqdm.auto import tqdm

# Pick the compute device: use the GPU when PyTorch can see one, else the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


## Configs

In [None]:
# =============================================================================
# MODEL SELECTION - Change this to switch between models
# =============================================================================
MODEL_TYPE = "qwen"  # Options: "qwen" or "llama"
TOKEN_TYPE = "asst"  # Options: "asst", "newline", "endheader" (endheader only for llama)
SAE_LAYER = 11
SAE_TRAINER = 1

# =============================================================================
# PER-MODEL PRESETS
# =============================================================================
# One entry per supported MODEL_TYPE. "token_offsets" maps each TOKEN_TYPE to
# an offset that is added to the index just past the assistant header tokens.
_MODEL_PRESETS = {
    "qwen": {
        "model_name": "Qwen/Qwen2.5-7B-Instruct",
        "sae_release": "andyrdt/saes-qwen2.5-7b-instruct",
        "assistant_header": "<|im_start|>assistant",
        "token_offsets": {"asst": -1, "newline": 0},
        "sae_base_path": "/workspace/sae/qwen-2.5-7b-instruct/saes",
    },
    "llama": {
        "model_name": "meta-llama/Llama-3.1-8B-Instruct",
        "sae_release": "andyrdt/saes-llama-3.1-8b-instruct",
        "assistant_header": "<|start_header_id|>assistant<|end_header_id|>",
        "token_offsets": {"asst": -2, "endheader": -1, "newline": 0},
        "sae_base_path": "/workspace/sae/llama-3.1-8b-instruct/saes",
    },
}

if MODEL_TYPE not in _MODEL_PRESETS:
    raise ValueError(f"Unknown MODEL_TYPE: {MODEL_TYPE}. Use 'qwen' or 'llama'")

_preset = _MODEL_PRESETS[MODEL_TYPE]
MODEL_NAME = _preset["model_name"]
SAE_RELEASE = _preset["sae_release"]
ASSISTANT_HEADER = _preset["assistant_header"]
TOKEN_OFFSETS = _preset["token_offsets"]
SAE_BASE_PATH = _preset["sae_base_path"]

# Reject token types the selected model does not define
if TOKEN_TYPE not in TOKEN_OFFSETS:
    raise ValueError(f"TOKEN_TYPE '{TOKEN_TYPE}' not available for {MODEL_TYPE}. Available: {list(TOKEN_OFFSETS.keys())}")

# =============================================================================
# DERIVED CONFIGURATIONS
# =============================================================================
SAE_CONFIG = dict(release=SAE_RELEASE, layer=SAE_LAYER, trainer=SAE_TRAINER)
SAE_PATH = f"{SAE_BASE_PATH}/resid_post_layer_{SAE_LAYER}/trainer_{SAE_TRAINER}"
LAYER_INDEX = SAE_LAYER
TOKEN_OFFSET = TOKEN_OFFSETS[TOKEN_TYPE]

# Input prompt files (JSONL)
ASSISTANT_PROMPTS_PATH = "assistant_prompts.jsonl"
CONTROL_PROMPTS_PATH = "control_prompts.jsonl"

# Results directory, named after the model / trainer / layer / token choice
OUTPUT_DIR = f"./{MODEL_TYPE}_trainer{SAE_TRAINER}_layer{SAE_LAYER}_{TOKEN_TYPE}"

# Processing parameters
BATCH_SIZE = 8
MAX_LENGTH = 512
TOP_FEATURES = 100

# =============================================================================
# SUMMARY
# =============================================================================
print("Configuration Summary:")
print(f"  Model: {MODEL_NAME}")
print(f"  SAE Layer: {SAE_LAYER}, Trainer: {SAE_TRAINER}")
print(f"  Token extraction: {TOKEN_TYPE} (offset: {TOKEN_OFFSET})")
print(f"  Assistant header: {ASSISTANT_HEADER}")
print(f"  Output directory: {OUTPUT_DIR}")
print(f"  SAE Release: {SAE_RELEASE}")

## Load Data

In [43]:
def load_prompts(filepath: str) -> List[str]:
    """Load prompts from a JSONL file.

    Each non-blank line must be a JSON object with a ``'content'`` key; the
    values of that key are returned in file order.

    Args:
        filepath: Path to the JSONL file.

    Returns:
        The ``'content'`` value of every record, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``'content'`` key.
    """
    prompts = []
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default locale encoding.
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            record_text = line.strip()
            # Blank lines (e.g. trailing newlines at the end of the file) are
            # not records; json.loads("") would raise on them.
            if not record_text:
                continue
            prompts.append(json.loads(record_text)['content'])
    return prompts

# Load assistant and control prompts
# Both files are read with load_prompts, so each is a list of the 'content'
# strings from its JSONL records.
assistant_prompts = load_prompts(ASSISTANT_PROMPTS_PATH)
control_prompts = load_prompts(CONTROL_PROMPTS_PATH)

# Sanity check: report the size of each set and show its first entry.
# NOTE(review): indexing [0] raises IndexError if either file yields no prompts.
print(f"Loaded {len(assistant_prompts)} assistant prompts")
print(f"Loaded {len(control_prompts)} control prompts")
print("\nExample assistant prompt:", assistant_prompts[0])
print("Example control prompt:", control_prompts[0])

Loaded 12 assistant prompts
Loaded 12 control prompts

Example assistant prompt: What's it like to be you?
Example control prompt: What is the weather today?


## Load Model and SAE

In [None]:
# Load tokenizer
# This cell loads the tokenizer for MODEL_NAME and then prints a dry run of the
# token-position arithmetic (header start + header length + TOKEN_OFFSET) on a
# single example prompt, so the chosen extraction token can be checked by eye.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Batched tokenization with padding=True needs a pad token; reuse EOS when the
# tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Pad on the right so that padding is appended after the real tokens.
tokenizer.padding_side = "right"

print(f"Tokenizer loaded: {tokenizer.__class__.__name__}")

# Test chat template formatting
# The example is wrapped as a single user turn with the generation prompt
# appended (add_generation_prompt=True).
test_messages = [{"role": "user", "content": "What's it like to be you?"}]
formatted_test = tokenizer.apply_chat_template(test_messages, tokenize=False, add_generation_prompt=True)
print(f"\nChat template test:")
print(f"Original: What's it like to be you?")
print(f"Formatted: {repr(formatted_test)}")
print(f"Formatted (readable):\n{formatted_test}")

# Test tokenization of assistant header to understand positioning
print(f"\n" + "="*60)
print("ASSISTANT HEADER TOKENIZATION ANALYSIS")
print("="*60)

# Token ids of the header on its own, plus each id decoded individually for display.
assistant_tokens = tokenizer.encode(ASSISTANT_HEADER, add_special_tokens=False)
assistant_token_texts = [tokenizer.decode([token]) for token in assistant_tokens]

print(f"Assistant header: {ASSISTANT_HEADER}")
print(f"Number of tokens: {len(assistant_tokens)}")
print(f"Token IDs: {assistant_tokens}")
print(f"Individual tokens: {assistant_token_texts}")

# Test with a full formatted prompt
# add_special_tokens=False: the chat-templated string is encoded as-is.
full_tokens = tokenizer.encode(formatted_test, add_special_tokens=False)
full_token_texts = [tokenizer.decode([token]) for token in full_tokens]

print(f"\nFull prompt tokens: {len(full_tokens)}")
print("All tokens with positions:")
for i, token_text in enumerate(full_token_texts):
    print(f"  {i:2d}: '{token_text}'")

# Find where assistant header appears in full prompt
# Linear scan for the first position where the header's token ids occur as a
# contiguous run; assistant_start_pos stays None when there is no match.
assistant_start_pos = None
for i in range(len(full_tokens) - len(assistant_tokens) + 1):
    if full_tokens[i:i+len(assistant_tokens)] == assistant_tokens:
        assistant_start_pos = i
        break

if assistant_start_pos is not None:
    # Inclusive index of the last header token.
    assistant_end_pos = assistant_start_pos + len(assistant_tokens) - 1
    print(f"\nAssistant header found at positions {assistant_start_pos} to {assistant_end_pos}")
    print(f"Assistant header tokens: {full_token_texts[assistant_start_pos:assistant_end_pos+1]}")
    
    # Show what the extraction function will actually extract
    # Same formula as extract_activations: index just past the header, shifted
    # by TOKEN_OFFSET.
    extraction_pos = assistant_start_pos + len(assistant_tokens) + TOKEN_OFFSET
    print(f"\nExtraction calculation:")
    print(f"  assistant_start_pos: {assistant_start_pos}")
    print(f"  + len(assistant_tokens): {len(assistant_tokens)}")  
    print(f"  + TOKEN_OFFSET ('{TOKEN_TYPE}'): {TOKEN_OFFSET}")
    print(f"  = extraction_pos: {extraction_pos}")
    
    # Only index into the token list when the computed position is valid.
    if 0 <= extraction_pos < len(full_token_texts):
        print(f"✓ Token at extraction position {extraction_pos}: '{full_token_texts[extraction_pos]}'")
    else:
        print(f"❌ Extraction position {extraction_pos} is out of bounds (valid range: 0-{len(full_token_texts)-1})")
else:
    print("❌ Assistant header not found in full prompt")

In [45]:
# Load model
# device_map target: the integer GPU index when the device carries one,
# otherwise the device's string form (e.g. "cuda" or "cpu").
if device.type == 'cuda' and device.index is not None:
    device_map_value = device.index
else:
    device_map_value = str(device)

# Place the whole model (root module key "") on that single device, in bfloat16.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    device_map={"": device_map_value},
)
model.eval()

print(f"Model loaded: {model.__class__.__name__}")
print(f"Model device: {next(model.parameters()).device}")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model loaded: Qwen2ForCausalLM
Model device: cuda:0


In [46]:
# Load SAE
# The SAE directory must contain both the weights (ae.pt) and config.json.
ae_file_path = os.path.join(SAE_PATH, "ae.pt")
config_file_path = os.path.join(SAE_PATH, "config.json")

if not (os.path.exists(ae_file_path) and os.path.exists(config_file_path)):
    # Fetch both files from the Hub into SAE_BASE_PATH, using the same
    # layer/trainer sub-path that SAE_PATH is built from.
    print(f"SAE not found locally, downloading from {SAE_RELEASE}...")
    os.makedirs(os.path.dirname(ae_file_path), exist_ok=True)
    sae_path = f"resid_post_layer_{SAE_LAYER}/trainer_{SAE_TRAINER}"
    local_dir = SAE_BASE_PATH
    ae_file = hf_hub_download(
        repo_id=SAE_RELEASE,
        filename=f"{sae_path}/ae.pt",
        local_dir=local_dir,
    )
    config_file = hf_hub_download(
        repo_id=SAE_RELEASE,
        filename=f"{sae_path}/config.json",
        local_dir=local_dir,
    )
else:
    print(f"✓ Found SAE files at: {os.path.dirname(ae_file_path)}")

# load_dictionary returns a pair; only the first element (the SAE) is used here.
sae, _ = load_dictionary(SAE_PATH, device=device)
sae.eval()

print(f"SAE loaded with {sae.dict_size} features")
print(f"SAE device: {next(sae.parameters()).device}")

✓ Found SAE files at: /workspace/sae/qwen2.5-7b-instruct/saes/resid_post_layer_11/trainer_1
SAE loaded with 131072 features
SAE device: cuda:0


## Activation Extraction Functions

In [None]:
class StopForward(Exception):
    """Signal raised from a forward hook to abort the model's forward pass early."""

@torch.no_grad()
def extract_activations(prompts: List[str], layer_idx: int) -> torch.Tensor:
    """Extract one activation vector per prompt from the output of a decoder layer.

    Each prompt is wrapped as a single user turn with the chat template (with
    the generation prompt appended) and run through the model in batches of
    ``BATCH_SIZE``. A forward hook on ``model.model.layers[layer_idx]`` captures
    that layer's output and aborts the rest of the forward pass. For every
    prompt the activation at one token position is kept: the index just past
    the first occurrence of the ``ASSISTANT_HEADER`` tokens, shifted by
    ``TOKEN_OFFSET``. If the header is not found (for example because the
    prompt was truncated at ``MAX_LENGTH``), the last non-padding token is used.

    Args:
        prompts: Raw user prompts (plain strings, not yet chat-formatted).
        layer_idx: Index into ``model.model.layers`` of the layer to read.

    Returns:
        CPU tensor of shape ``[len(prompts), hidden_dim]``, in prompt order.

    Raises:
        ValueError: If ``prompts`` is empty.
        RuntimeError: If the hook did not capture anything for a batch.
    """
    if len(prompts) == 0:
        raise ValueError("extract_activations requires at least one prompt")

    # Get target layer
    target_layer = model.model.layers[layer_idx]

    # The header token ids do not depend on the prompt, so encode them once.
    header_ids = tokenizer.encode(ASSISTANT_HEADER, add_special_tokens=False)
    header_len = len(header_ids)

    # Filled by the hook on every forward pass.
    captured = {}

    def hook_fn(module, input, output):
        # Output is tuple, take first element (hidden states)
        captured["hidden"] = output[0] if isinstance(output, tuple) else output
        # Layers after the target are not needed, so stop here.
        raise StopForward()

    all_activations = []

    # Process in batches
    for i in tqdm(range(0, len(prompts), BATCH_SIZE), desc="Processing batches"):
        batch_prompts = prompts[i:i+BATCH_SIZE]

        # Format prompts as chat messages
        formatted_prompts = [
            tokenizer.apply_chat_template(
                [{"role": "user", "content": prompt}],
                tokenize=False,
                add_generation_prompt=True
            )
            for prompt in batch_prompts
        ]

        # Tokenize batch
        batch_inputs = tokenizer(
            formatted_prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=MAX_LENGTH
        )

        # Move to device
        batch_inputs = {k: v.to(device) for k, v in batch_inputs.items()}

        # Register hook, run the (truncated) forward pass, always remove the hook
        captured.clear()
        handle = target_layer.register_forward_hook(hook_fn)
        try:
            # Forward pass (will be stopped by hook)
            _ = model(**batch_inputs)
        except StopForward:
            pass
        finally:
            handle.remove()

        if "hidden" not in captured:
            raise RuntimeError(
                f"Forward hook on layer {layer_idx} did not capture any activations"
            )
        activations = captured["hidden"]

        # Extract assistant token positions
        for j in range(len(formatted_prompts)):
            # Work on plain Python lists: one device-to-host copy per row
            # instead of a tensor comparison at every candidate position.
            row_ids = batch_inputs["input_ids"][j].tolist()

            # Index of this row's last non-padding token. It is computed for
            # every row because it is needed both as the fallback position and
            # as the upper bound for clamping.
            last_real_pos = int(batch_inputs["attention_mask"][j].sum().item()) - 1

            # Find where assistant section starts (first match of the header)
            assistant_pos = None
            for k in range(len(row_ids) - header_len + 1):
                if row_ids[k:k+header_len] == header_ids:
                    assistant_pos = k + header_len + TOKEN_OFFSET
                    break

            if assistant_pos is None:
                # Fallback to last non-padding token
                assistant_pos = last_real_pos

            # Ensure position is within bounds of this row's real tokens
            assistant_pos = max(min(assistant_pos, last_real_pos), 0)

            # Extract activation at assistant position
            all_activations.append(activations[j, assistant_pos, :].cpu())  # [hidden_dim]

    return torch.stack(all_activations, dim=0)

print("Activation extraction functions defined")

## Extract Activations for Both Prompt Sets

In [48]:
# Extract activations for assistant prompts
# Both prompt sets go through the same extract_activations call at
# LAYER_INDEX, which stacks one activation vector per prompt.
print("Extracting activations for assistant prompts...")
assistant_activations = extract_activations(assistant_prompts, LAYER_INDEX)
print(f"Assistant activations shape: {assistant_activations.shape}")

# Extract activations for control prompts
print("\nExtracting activations for control prompts...")
control_activations = extract_activations(control_prompts, LAYER_INDEX)
print(f"Control activations shape: {control_activations.shape}")

Extracting activations for assistant prompts...


Processing batches:   0%|          | 0/2 [00:00<?, ?it/s]

Assistant activations shape: torch.Size([12, 3584])

Extracting activations for control prompts...


Processing batches:   0%|          | 0/2 [00:00<?, ?it/s]

Control activations shape: torch.Size([12, 3584])


## Apply SAE to Get Feature Activations

In [49]:
@torch.no_grad()
def get_sae_features(activations: torch.Tensor) -> torch.Tensor:
    """Encode activation vectors with the SAE and return the feature activations.

    The input is moved to ``device`` and encoded ``BATCH_SIZE`` rows at a time;
    each encoded chunk is moved back to the CPU before the chunks are
    concatenated along the first dimension.
    """
    on_device = activations.to(device)

    # Encode chunk by chunk so that only one chunk of features sits on the
    # device at a time.
    encoded_chunks = [
        sae.encode(on_device[start:start + BATCH_SIZE]).cpu()  # [batch, num_features]
        for start in range(0, on_device.shape[0], BATCH_SIZE)
    ]

    return torch.cat(encoded_chunks, dim=0)

# Get SAE feature activations
# Each set of layer activations is encoded separately with get_sae_features;
# the printed shapes let the reader check the prompt and feature counts.
print("Computing SAE features for assistant prompts...")
assistant_features = get_sae_features(assistant_activations)
print(f"Assistant features shape: {assistant_features.shape}")

print("\nComputing SAE features for control prompts...")
control_features = get_sae_features(control_activations)
print(f"Control features shape: {control_features.shape}")

Computing SAE features for assistant prompts...
Assistant features shape: torch.Size([12, 131072])

Computing SAE features for control prompts...
Control features shape: torch.Size([12, 131072])


# Overall Feature Landscape Analysis

This section provides a comprehensive view of how all SAE features differ between assistant and control prompts. This exploratory analysis helps understand the general patterns and identifies features with the largest overall differences.

In [50]:
# Results written by later cells go here.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Per-feature summary statistics across the prompts of each group
assistant_mean = assistant_features.mean(dim=0)  # [num_features]
control_mean = control_features.mean(dim=0)      # [num_features]
assistant_std = assistant_features.std(dim=0)
control_std = control_features.std(dim=0)

# Raw mean difference, oriented as (assistant - control)
feature_diff = assistant_mean - control_mean

# Effect size (Cohen's d): mean difference divided by the pooled standard
# deviation, where each group's variance is weighted by (group size - 1).
n_assistant = assistant_features.shape[0]
n_control = control_features.shape[0]
pooled_variance = (
    (n_assistant - 1) * assistant_std**2 + (n_control - 1) * control_std**2
) / (n_assistant + n_control - 2)
pooled_std = torch.sqrt(pooled_variance)
cohens_d = feature_diff / (pooled_std + 1e-8)  # small epsilon guards against division by zero

print(f"Comprehensive statistics computed for {len(feature_diff):,} SAE features")
print(f"Max difference: {feature_diff.max():.4f}")
print(f"Min difference: {feature_diff.min():.4f}")
print(f"Max Cohen's d: {cohens_d.max():.4f}")
print(f"Min Cohen's d: {cohens_d.min():.4f}")
print(f"Features with positive difference (favor assistant): {(feature_diff > 0).sum():,}")
print(f"Features with negative difference (favor control): {(feature_diff < 0).sum():,}")

Comprehensive statistics computed for 131,072 SAE features
Max difference: 4.5973
Min difference: -3.4631
Max Cohen's d: 5.6876
Min Cohen's d: -3.5273
Features with positive difference (favor assistant): 17
Features with negative difference (favor control): 9


## Top Features by Overall Difference (Exploratory)

In [51]:
# # Find features with largest overall differences (comprehensive landscape view)
# # This shows ALL types of differences, not just assistant-specific ones

# # Sort by difference (assistant - control)
# diff_sorted_indices = torch.argsort(feature_diff, descending=True)
# top_diff_features = diff_sorted_indices[:TOP_FEATURES]

# # Sort by Cohen's d (effect size)
# cohens_d_sorted_indices = torch.argsort(cohens_d, descending=True)
# top_cohens_d_features = cohens_d_sorted_indices[:TOP_FEATURES]

# # Create comprehensive results dataframe for general landscape
# landscape_results_data = []

# print(f"Top {TOP_FEATURES} features by raw difference (assistant - control):")
# print("This exploratory analysis shows the biggest overall differences, regardless of activation patterns")
# print("Feature ID | Assistant Mean | Control Mean | Difference | Cohen's d | Interpretation")
# print("-" * 90)

# for i, feature_idx in enumerate(top_diff_features[:20]):  # Show top 20
#     feature_id = feature_idx.item()
#     ass_mean = assistant_mean[feature_idx].item()
#     ctrl_mean = control_mean[feature_idx].item()
#     diff = feature_diff[feature_idx].item()
#     cohens = cohens_d[feature_idx].item()
    
#     # Determine interpretation
#     if ctrl_mean < 0.01:
#         interpretation = "Assistant-specific"
#     elif ass_mean < 0.01:
#         interpretation = "Control-specific"  
#     elif ass_mean > ctrl_mean * 3:
#         interpretation = "Strongly favors assistant"
#     elif ctrl_mean > ass_mean * 3:
#         interpretation = "Strongly favors control"
#     else:
#         interpretation = "Moderate difference"
    
#     print(f"{feature_id:>10} | {ass_mean:>13.4f} | {ctrl_mean:>12.4f} | {diff:>10.4f} | {cohens:>9.3f} | {interpretation}")
    
#     landscape_results_data.append({
#         'feature_id': feature_id,
#         'assistant_mean': ass_mean,
#         'control_mean': ctrl_mean,
#         'difference': diff,
#         'cohens_d': cohens,
#         'rank_by_diff': i + 1,
#         'interpretation': interpretation,
#         'analysis_type': 'landscape_difference'
#     })

# # Save comprehensive landscape results
# landscape_df = pd.DataFrame(landscape_results_data)
# landscape_df.to_csv('feature_landscape_analysis.csv', index=False)
# print(f"\nComprehensive landscape results saved to feature_landscape_analysis.csv")
# print("Note: This includes all types of differences - see next section for assistant-specific analysis")

## Statistical Significance Testing

In [52]:
# from scipy import stats

# # Perform t-tests for top landscape features to assess statistical significance
# significant_features = []

# print("Statistical significance testing (t-test) for top 20 landscape features:")
# print("Testing whether the observed differences are statistically reliable")
# print("Feature ID | t-statistic | p-value | Significant (p<0.05)")
# print("-" * 60)

# for feature_idx in top_diff_features[:20]:
#     feature_id = feature_idx.item()
    
#     # Get feature activations for both groups
#     assistant_vals = assistant_features[:, feature_idx].numpy()
#     control_vals = control_features[:, feature_idx].numpy()
    
#     # Perform independent t-test
#     t_stat, p_val = stats.ttest_ind(assistant_vals, control_vals)
    
#     is_significant = p_val < 0.05
#     if is_significant:
#         significant_features.append(feature_id)
    
#     print(f"{feature_id:>10} | {t_stat:>11.3f} | {p_val:>7.4f} | {is_significant}")

# print(f"\nFound {len(significant_features)} statistically significant features (p < 0.05) from landscape analysis")

# Targeted Assistant-Specific Feature Analysis

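This section identifies features that are **uniquely characteristic of assistant prompts**: features that activate on the assistant prompts while staying at or near zero on the control prompts.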

In [53]:
# Assistant-specific feature analysis functions
def find_assistant_only_features(assistant_mean, control_mean, 
                                 assistant_threshold=0.1, control_threshold=0.01):
    """Select features whose mean is high on assistant prompts and near zero on control.

    A feature qualifies when its assistant mean is strictly above
    ``assistant_threshold`` and its control mean is at most
    ``control_threshold``. The returned feature indices are ordered by
    assistant mean, strongest first.
    """
    qualifies = (assistant_mean > assistant_threshold) & (control_mean <= control_threshold)
    candidate_ids = torch.nonzero(qualifies, as_tuple=True)[0]

    # Rank the candidates by how strongly they fire on assistant prompts.
    ranking = torch.argsort(assistant_mean[candidate_ids], descending=True)
    return candidate_ids[ranking]

def find_high_effect_features(cohens_d, assistant_mean, control_mean,
                             effect_threshold=1.0, control_threshold=0.05, assistant_threshold=0.1):
    """Select features with a large effect size in favor of assistant prompts.

    A feature qualifies when all three hold: Cohen's d above
    ``effect_threshold``, control mean below ``control_threshold``, and
    assistant mean above ``assistant_threshold`` (all strict). The returned
    feature indices are ordered by Cohen's d, largest first.
    """
    large_effect = cohens_d > effect_threshold
    quiet_on_control = control_mean < control_threshold
    active_on_assistant = assistant_mean > assistant_threshold

    selected_ids = torch.nonzero(
        large_effect & quiet_on_control & active_on_assistant, as_tuple=True
    )[0]

    # Rank by effect size.
    ranking = torch.argsort(cohens_d[selected_ids], descending=True)
    return selected_ids[ranking]

def find_ratio_based_features(assistant_mean, control_mean,
                             min_ratio=10.0, min_assistant_activation=0.1):
    """Select features whose assistant mean is many times their control mean.

    The ratio is ``assistant_mean / (control_mean + 1e-6)``. A feature
    qualifies when the ratio is strictly greater than ``min_ratio`` and its
    assistant mean is strictly greater than ``min_assistant_activation``. The
    returned feature indices are ordered by ratio, largest first.
    """
    # The small constant keeps the ratio finite when the control mean is zero.
    activation_ratio = assistant_mean / (control_mean + 1e-6)

    passes = (activation_ratio > min_ratio) & (assistant_mean > min_assistant_activation)
    selected_ids = torch.nonzero(passes, as_tuple=True)[0]

    # Rank by ratio.
    ranking = torch.argsort(activation_ratio[selected_ids], descending=True)
    return selected_ids[ranking]

def find_universal_assistant_features(assistant_features, control_features,
                                     assistant_threshold=0.1, control_threshold=0.01):
    """Select features that fire on every assistant prompt and on no control prompt.

    Works on per-prompt activations (rows are prompts, columns are features).
    A feature qualifies when every assistant row is strictly above
    ``assistant_threshold`` and every control row is at most
    ``control_threshold``. The returned feature indices are ordered by mean
    assistant activation, largest first.
    """
    fires_on_every_assistant = (assistant_features > assistant_threshold).all(dim=0)
    silent_on_every_control = (control_features <= control_threshold).all(dim=0)

    universal_ids = torch.nonzero(
        fires_on_every_assistant & silent_on_every_control, as_tuple=True
    )[0]

    # Rank by the mean assistant activation of the selected columns.
    mean_strength = assistant_features[:, universal_ids].mean(dim=0)
    ranking = torch.argsort(mean_strength, descending=True)
    return universal_ids[ranking]

def save_feature_analysis_to_csv(feature_indices, assistant_mean, control_mean, 
                                cohens_d, filename, analysis_type):
    """Write one CSV row per selected feature and return the same data as a DataFrame.

    Args:
        feature_indices: Ranked feature indices (best first); each element must
            support ``.item()``, e.g. the tensors returned by the ``find_*``
            selection functions.
        assistant_mean: Per-feature mean activation on assistant prompts.
        control_mean: Per-feature mean activation on control prompts.
        cohens_d: Per-feature effect size.
        filename: File name of the CSV, created inside ``OUTPUT_DIR``.
        analysis_type: Label stored in the ``analysis_type`` column and used in
            the progress message.

    Returns:
        DataFrame with columns ``feature_id``, ``assistant_mean``,
        ``control_mean``, ``difference``, ``cohens_d``, ``rank`` (1-based) and
        ``analysis_type``.
    """
    # Fixing the column list keeps the header row in the CSV even when no
    # features were selected.
    columns = ['feature_id', 'assistant_mean', 'control_mean', 'difference',
               'cohens_d', 'rank', 'analysis_type']
    results_data = []

    for rank, feature_idx in enumerate(feature_indices, start=1):
        ass_mean = assistant_mean[feature_idx].item()
        ctrl_mean = control_mean[feature_idx].item()

        results_data.append({
            'feature_id': feature_idx.item(),
            'assistant_mean': ass_mean,
            'control_mean': ctrl_mean,
            'difference': ass_mean - ctrl_mean,
            'cohens_d': cohens_d[feature_idx].item(),
            'rank': rank,
            'analysis_type': analysis_type
        })

    results_df = pd.DataFrame(results_data, columns=columns)

    # Create the output directory here so that saving does not depend on an
    # earlier notebook cell having been run.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    results_df.to_csv(f"{OUTPUT_DIR}/{filename}", index=False)
    print(f"Saved {len(results_data)} {analysis_type} features to {filename}")

    return results_df

print("Assistant-specific analysis functions defined")

Assistant-specific analysis functions defined


In [54]:
# Run all four assistant-specific analyses
# Each analysis follows the same pattern: select feature indices with one of
# the find_* functions, print how many were found, and, only when at least one
# was found, save them to a CSV in OUTPUT_DIR and print the top five.
print("=" * 80)
print("ASSISTANT-SPECIFIC FEATURE ANALYSIS")
print("=" * 80)
print("These analyses identify features uniquely characteristic of assistant responses")
print()

# Option 1: Assistant-only features (strict criteria)
# Uses the function's default thresholds, which match the values in the message below.
print("Analysis 1: ASSISTANT-ONLY FEATURES")
print("   Criteria: Strong on assistant (>0.1), minimal on control (≤0.01)")
assistant_only_features = find_assistant_only_features(assistant_mean, control_mean)
print(f"   Found: {len(assistant_only_features)} features")
if len(assistant_only_features) > 0:
    save_feature_analysis_to_csv(assistant_only_features, assistant_mean, control_mean, 
                                cohens_d, 'assistant_only_features.csv', 'assistant_only')
    # Show top 5
    for i, feature_idx in enumerate(assistant_only_features[:5]):
        feature_id = feature_idx.item()
        ass_mean = assistant_mean[feature_idx].item()
        ctrl_mean = control_mean[feature_idx].item()
        print(f"     #{i+1}: Feature {feature_id} (Assistant: {ass_mean:.4f}, Control: {ctrl_mean:.4f})")
print()

# Option 2: High effect size features (statistical strength)
# NOTE(review): find_high_effect_features also requires assistant mean > 0.1
# by default, which the criteria message below does not mention.
print("Analysis 2: HIGH EFFECT SIZE FEATURES")
print("   Criteria: Large Cohen's d (>1.0), low control activation (<0.05)")
high_effect_features = find_high_effect_features(cohens_d, assistant_mean, control_mean)
print(f"   Found: {len(high_effect_features)} features")
if len(high_effect_features) > 0:
    save_feature_analysis_to_csv(high_effect_features, assistant_mean, control_mean,
                                cohens_d, 'high_effect_features.csv', 'high_effect')
    # Show top 5
    for i, feature_idx in enumerate(high_effect_features[:5]):
        feature_id = feature_idx.item()
        effect_size = cohens_d[feature_idx].item()
        ass_mean = assistant_mean[feature_idx].item()
        print(f"     #{i+1}: Feature {feature_id} (Cohen's d: {effect_size:.3f}, Assistant: {ass_mean:.4f})")
print()

# Option 3: Ratio-based features (dramatic differences)
# NOTE(review): the message says "≥10x" but find_ratio_based_features compares
# with a strict ">" against min_ratio, and also requires assistant mean > 0.1.
print("Analysis 3: RATIO-BASED FEATURES")
print("   Criteria: Assistant activation ≥10x control activation")
ratio_features = find_ratio_based_features(assistant_mean, control_mean)
print(f"   Found: {len(ratio_features)} features")
if len(ratio_features) > 0:
    save_feature_analysis_to_csv(ratio_features, assistant_mean, control_mean,
                                cohens_d, 'ratio_based_features.csv', 'ratio_based')
    # Show top 5 with ratios
    # Recomputes the ratio with the same 1e-6 offset that
    # find_ratio_based_features uses, so the displayed values match the ranking.
    safe_control = control_mean + 1e-6
    ratios = assistant_mean / safe_control
    for i, feature_idx in enumerate(ratio_features[:5]):
        feature_id = feature_idx.item()
        ratio = ratios[feature_idx].item()
        ass_mean = assistant_mean[feature_idx].item()
        ctrl_mean = control_mean[feature_idx].item()
        print(f"     #{i+1}: Feature {feature_id} (Ratio: {ratio:.1f}x, {ass_mean:.4f} vs {ctrl_mean:.4f})")
print()

# Option 4: Universal assistant features (most reliable)
# Unlike the other three, this one takes the per-prompt feature matrices
# rather than the per-feature means.
print("Analysis 4: UNIVERSAL ASSISTANT FEATURES")
print("   Criteria: Active on ALL assistant prompts, inactive on ALL control prompts")
universal_features = find_universal_assistant_features(assistant_features, control_features)
print(f"   Found: {len(universal_features)} features")
if len(universal_features) > 0:
    save_feature_analysis_to_csv(universal_features, assistant_mean, control_mean,
                                cohens_d, 'universal_assistant_features.csv', 'universal')
    # Show top 5
    for i, feature_idx in enumerate(universal_features[:5]):
        feature_id = feature_idx.item()
        ass_mean = assistant_mean[feature_idx].item()
        ctrl_mean = control_mean[feature_idx].item()
        print(f"     #{i+1}: Feature {feature_id} (Assistant: {ass_mean:.4f}, Control: {ctrl_mean:.4f})")
print()

# NOTE(review): this summary is printed unconditionally, so it lists a file
# name even for an analysis that found no features and therefore saved
# nothing. The names are relative to OUTPUT_DIR, which is not shown here.
print("=" * 80)
print("SUMMARY OF ANALYSIS OUTPUTS")
print("=" * 80)
print("Assistant-only features: assistant_only_features.csv")
print("High effect size features: high_effect_features.csv") 
print("Ratio-based features: ratio_based_features.csv")
print("Universal features: universal_assistant_features.csv")

ASSISTANT-SPECIFIC FEATURE ANALYSIS
These analyses identify features uniquely characteristic of assistant responses

Analysis 1: ASSISTANT-ONLY FEATURES
   Criteria: Strong on assistant (>0.1), minimal on control (≤0.01)
   Found: 8 features
Saved 8 assistant_only features to assistant_only_features.csv
     #1: Feature 18703 (Assistant: 0.8322, Control: 0.0000)
     #2: Feature 45508 (Assistant: 0.7316, Control: 0.0000)
     #3: Feature 30068 (Assistant: 0.4780, Control: 0.0000)
     #4: Feature 70419 (Assistant: 0.3200, Control: 0.0000)
     #5: Feature 20338 (Assistant: 0.2552, Control: 0.0000)

Analysis 2: HIGH EFFECT SIZE FEATURES
   Criteria: Large Cohen's d (>1.0), low control activation (<0.05)
   Found: 2 features
Saved 2 high_effect features to high_effect_features.csv
     #1: Feature 45508 (Cohen's d: 1.338, Assistant: 0.7316)
     #2: Feature 18703 (Cohen's d: 1.319, Assistant: 0.8322)

Analysis 3: RATIO-BASED FEATURES
   Criteria: Assistant activation ≥10x control activat