# Feature Diffing Plotting

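This notebook builds an interactive Plotly scatterplot comparing SAE feature activations between the base and chat (instruct) models, then summarizes features whose target-exclusivity differs across the two models.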

In [None]:
import torch
import pandas as pd
import numpy as np
import os
from pathlib import Path

# Run configuration
#
# Alternative preset for Llama 3.1 8B -- uncomment these lines and comment out
# the Gemma values below to switch models:
# MODEL_TYPE = "llama"
# MODEL_NAME_READABLE = "Llama 3.1 8B"
# SAE_LAYER = 15
# SAE_TRAINER = "32x"
# TOKEN_OFFSETS = {"asst": -2, "endheader": -1, "newline": 0}
# N_PROMPTS = 10000
MODEL_TYPE = "gemma"                 # selects file paths and the Neuronpedia link format
MODEL_NAME_READABLE = "Gemma 2 9B"   # shown in the plot subtitle
SAE_LAYER = 20
SAE_TRAINER = "131k-l0-114"
# Token-type name -> offset; the keys drive the per-token-type colours.
# NOTE(review): the offsets themselves are not consumed in this notebook.
TOKEN_OFFSETS = dict(model=-1, newline=0)
# N_PROMPTS * PERCENT_ACTIVE / 100 is the activation-count threshold used when
# no saved list of target features is available.
N_PROMPTS = 40
PERCENT_ACTIVE = 1

# Metric key -> human-readable label used in the plot title and axis titles
METRIC_SUBTITLE = dict(
    target_all_mean='Mean Activation',
    target_sparsity='Activation Sparsity',
)


# Identifier shared by every path below: model family + SAE trainer + layer
_SAE_TAG = f"{MODEL_TYPE}_trainer{SAE_TRAINER}_layer{SAE_LAYER}"

# Run sub-directory; also names the local output folder
SOURCE = f"{_SAE_TAG}/personal_40"

# Input files: per-model diffing results and the feature explanations CSV
_RESULTS_ROOT = "/workspace/results/5_diffing_personal"
BASE_FILE = f"{_RESULTS_ROOT}/{SOURCE}/base.pt"
CHAT_FILE = f"{_RESULTS_ROOT}/{SOURCE}/chat.pt"
EXPLANATIONS_PATH = f"../../explanations/{_SAE_TAG}.csv"

# Output directory (created on first run, reused afterwards)
OUTPUT_DIR = Path(f"./{SOURCE}")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Neuronpedia URL prefixes; a feature ID is appended to form the full link
LLAMA_LINK_FORMAT = f"https://www.neuronpedia.org/llama3.1-8b/{SAE_LAYER}-llamascope-res-131k/"
GEMMA_LINK_FORMAT = f"https://www.neuronpedia.org/gemma-2-9b/{SAE_LAYER}-gemmascope-res-131k/"

print(f"Loading base model data from: {BASE_FILE}")
print(f"Loading chat model data from: {CHAT_FILE}")
print(f"Output directory: {OUTPUT_DIR}")

# Load the per-model diffing results.
# NOTE(security): torch.load unpickles the file contents, so BASE_FILE and
# CHAT_FILE must only point at trusted artifacts.
# map_location="cpu" keeps the loaded values on the CPU: later cells call
# .numpy() on them, which fails for tensors restored onto a CUDA device.
base_data = torch.load(BASE_FILE, map_location="cpu")
chat_data = torch.load(CHAT_FILE, map_location="cpu")

print(f"\nBase data keys: {list(base_data.keys())}")
print(f"Chat data keys: {list(chat_data.keys())}")
print(f"Base metadata: {base_data['metadata']}")
print(f"Chat metadata: {chat_data['metadata']}")

# Every key except 'metadata' is a token type; both files must list the same
# token types in the same order, because they are compared pairwise below.
base_tokens = [k for k in base_data.keys() if k != 'metadata']
chat_tokens = [k for k in chat_data.keys() if k != 'metadata']
print(f"\nBase token types: {base_tokens}")
print(f"Chat token types: {chat_tokens}")
assert base_tokens == chat_tokens, "Token types don't match between base and chat!"

# Token types iterated by every later cell
token_types = base_tokens
print(f"Processing {len(token_types)} token types: {token_types}")

Loading base model data from: /workspace/results/5_diffing_personal/gemma_trainer131k-l0-114_layer20/personal_40/base.pt
Loading chat model data from: /workspace/results/5_diffing_personal/gemma_trainer131k-l0-114_layer20/personal_40/chat.pt
Output directory: gemma_trainer131k-l0-114_layer20/personal_40

Base data keys: ['model', 'newline', 'metadata']
Chat data keys: ['model', 'newline', 'metadata']
Base metadata: {'source': 'gemma_trainer131k-l0-114_layer20_base', 'model_type': 'gemma', 'model_ver': 'base', 'sae_layer': 20, 'sae_trainer': '131k-l0-114', 'num_target_prompts': 40, 'num_control_prompts': 40, 'num_features': 131072, 'token_types': ['model', 'newline']}
Chat metadata: {'source': 'gemma_trainer131k-l0-114_layer20_chat', 'model_type': 'gemma', 'model_ver': 'chat', 'sae_layer': 20, 'sae_trainer': '131k-l0-114', 'num_target_prompts': 40, 'num_control_prompts': 40, 'num_features': 131072, 'token_types': ['model', 'newline']}

Base token types: ['model', 'newline']
Chat token t

## Plot Results

In [12]:
# Plotting setup: import Plotly and load the feature explanations shown in hover text
import plotly.graph_objects as go



# Read the feature explanations table from EXPLANATIONS_PATH
explanations_df = pd.read_csv(EXPLANATIONS_PATH)
print(f"Loaded {len(explanations_df)} explanations")

# feature_id -> description lookup, consulted when building hover text
explanations_dict = {
    feature_id: description
    for feature_id, description in zip(
        explanations_df['feature_id'], explanations_df['claude_desc']
    )
}

Loaded 6305 explanations


In [None]:
# Metric key to plot. It indexes each token type's entry in base_data /
# chat_data and is looked up in METRIC_SUBTITLE for the display label.
SELECTED_METRIC = 'target_all_mean'  # metric keys carry the target_ prefix

print(f"Creating interactive scatterplot for {SELECTED_METRIC} metric with all token types...")

# Generate colors for token types
def generate_token_colors(token_offset_keys):
    """Map each token key to a hex colour, cycling through a fixed palette.

    Keys are assigned colours in iteration order; once the ten palette
    entries are used up the assignment wraps around to the first colour.
    """
    palette = (
        '#FF6B6B',  # Red
        '#4ECDC4',  # Teal
        '#45B7D1',  # Blue
        '#96CEB4',  # Green
        '#FFEAA7',  # Yellow
        '#DDA0DD',  # Plum
        '#FFA07A',  # Light Salmon
        '#98D8C8',  # Mint
        '#F7DC6F',  # Light Yellow
        '#BB8FCE',  # Light Purple
    )
    palette_size = len(palette)

    return {
        key: palette[position % palette_size]
        for position, key in enumerate(token_offset_keys)
    }

# Marker shape for each exclusivity category
exclusivity_symbols = dict(
    both='star',        # target-exclusive in both models
    base='circle',      # target-exclusive in base only
    chat='square',      # target-exclusive in chat only
    neither='diamond',  # not target-exclusive
)

# One colour per token type, assigned in TOKEN_OFFSETS order
colors = generate_token_colors(list(TOKEN_OFFSETS))
print("Generated styling:")
print(f"Token type colors: {colors}")
print(f"Exclusivity symbols: {exclusivity_symbols}")

# Prefer a saved list of target feature IDs; otherwise derive the active
# features from the activation counts.
active_file = f"./{MODEL_TYPE}_trainer{SAE_TRAINER}_layer{SAE_LAYER}/personal_40/target_features.csv"
if not os.path.exists(active_file):
    # The active mask does not depend on the metric, so build it once per token type
    print("Pre-calculating active masks...")
    min_active_count = int(N_PROMPTS * PERCENT_ACTIVE / 100)
    active_masks = {}
    for token_type in token_types:
        base_values = base_data[token_type]['target_num_active'].numpy()
        chat_values = chat_data[token_type]['target_num_active'].numpy()
        # Active = above the count threshold in at least one of the two models
        active_masks[token_type] = np.logical_or(
            base_values > min_active_count,
            chat_values > min_active_count,
        )
        print(f"  {token_type}: {active_masks[token_type].sum():,} active features")
else:
    active_features_df = pd.read_csv(active_file)
    active_feature_ids = set(active_features_df['feature_id'].tolist())

# Accumulators filled by the per-token-type processing loop below
print("Pre-processing data...")
processed_data = {}
all_base_values, all_chat_values = [], []

def format_explanation_efficient(claude_explanation):
    """Wrap an explanation for hover display, joining lines with ``<br>``.

    Non-string or missing values yield "No explanation available". Strings of
    at most 50 characters are returned untouched. Longer strings are split on
    whitespace and greedily packed into lines: a word joins the current line
    while the line so far (including its trailing space) plus the word stays
    within 80 characters.
    """
    usable = isinstance(claude_explanation, str) and not pd.isna(claude_explanation)
    if not usable:
        return "No explanation available"

    if len(claude_explanation) <= 50:
        return claude_explanation

    wrapped_rows = []
    pending_words = []
    pending_width = 0  # width of the pending words, each followed by one space

    for token in claude_explanation.split():
        if pending_width + len(token) <= 80:
            pending_words.append(token)
            pending_width += len(token) + 1
            continue
        # The word does not fit: flush what we have and start a new line with it
        if pending_words:
            wrapped_rows.append(" ".join(pending_words))
        pending_words = [token]
        pending_width = len(token) + 1

    if pending_words:
        wrapped_rows.append(" ".join(pending_words))

    return "<br>".join(wrapped_rows)

def get_exclusivity_info(fid, token_type):
    """Classify a feature as target-exclusive in "both", "base", "chat" or "neither".

    A feature is target-exclusive for a model when its 'target_num_active'
    count is above zero and its 'control_num_active' count is not.
    """
    base_stats, chat_stats = base_data[token_type], chat_data[token_type]

    base_on_target = bool(base_stats['target_num_active'][fid] > 0)
    base_on_control = bool(base_stats['control_num_active'][fid] > 0)
    chat_on_target = bool(chat_stats['target_num_active'][fid] > 0)
    chat_on_control = bool(chat_stats['control_num_active'][fid] > 0)

    exclusive_in_base = base_on_target and not base_on_control
    exclusive_in_chat = chat_on_target and not chat_on_control

    # (exclusive in base, exclusive in chat) -> label
    label_by_pattern = {
        (True, True): "both",
        (True, False): "base",
        (False, True): "chat",
        (False, False): "neither",
    }
    return label_by_pattern[(exclusive_in_base, exclusive_in_chat)]

def has_cross_model_inconsistency(fid, token_type):
    """Report whether a feature is target-only in one model yet control-active in the other.

    Args:
        fid: Feature index into the per-token-type count arrays.
        token_type: Key selecting the token type in ``base_data`` / ``chat_data``.

    Returns:
        bool: True when the feature is target-only in base but active on the
        chat control prompts, or target-only in chat but active on the base
        control prompts. Always a plain Python bool.
    """
    # Convert each comparison to a Python bool up front. The counts appear to
    # be tensors (``.item()`` is called on them elsewhere), and chaining their
    # comparisons with ``and`` / ``or`` would otherwise return either a 0-d
    # tensor or a bool depending on which operand short-circuits.
    base_target_active = bool(base_data[token_type]['target_num_active'][fid] > 0)
    base_control_active = bool(base_data[token_type]['control_num_active'][fid] > 0)
    chat_target_active = bool(chat_data[token_type]['target_num_active'][fid] > 0)
    chat_control_active = bool(chat_data[token_type]['control_num_active'][fid] > 0)

    base_target_only = base_target_active and not base_control_active
    chat_target_only = chat_target_active and not chat_control_active

    # Target-only in base, but the chat model also fires on control prompts
    base_target_only_but_chat_control = base_target_only and chat_control_active

    # Target-only in chat, but the base model also fires on control prompts
    chat_target_only_but_base_control = chat_target_only and base_control_active

    return base_target_only_but_chat_control or chat_target_only_but_base_control

def get_cross_model_inconsistency_text(fid, token_type):
    """Describe the feature's cross-model inconsistency, or return "" if there is none.

    The base-side case is checked first, so it wins when both cases apply.
    """
    base_stats, chat_stats = base_data[token_type], chat_data[token_type]

    base_on_target = bool(base_stats['target_num_active'][fid] > 0)
    base_on_control = bool(base_stats['control_num_active'][fid] > 0)
    chat_on_target = bool(chat_stats['target_num_active'][fid] > 0)
    chat_on_control = bool(chat_stats['control_num_active'][fid] > 0)

    target_only_in_base = base_on_target and not base_on_control
    target_only_in_chat = chat_on_target and not chat_on_control

    if target_only_in_base and chat_on_control:
        return "Cross-model inconsistency: Target-only in base but also active in chat control"
    if target_only_in_chat and base_on_control:
        return "Cross-model inconsistency: Target-only in chat but also active in base control"
    return ""

def get_detailed_activation_counts(fid, token_type):
    """Return (base target, base control, chat target, chat control) activation counts.

    Each count is read from the matching '*_num_active' entry and converted
    to a Python scalar with ``.item()``.
    """
    return tuple(
        model_data[token_type][count_key][fid].item()
        for model_data in (base_data, chat_data)
        for count_key in ('target_num_active', 'control_num_active')
    )

# Process all token types in one pass
no_explanation = set()  # feature IDs that have no usable explanation text

# --- Values that do not vary per token type or per feature: resolve once -----
use_saved_feature_ids = os.path.exists(active_file)
if use_saved_feature_ids:
    # Boolean mask over all features selecting the pre-filtered IDs from the CSV
    all_feature_ids = np.arange(base_data['metadata']['num_features'])
    mask = np.isin(all_feature_ids, list(active_feature_ids))

# Prompt totals, used as denominators for the hover-text percentages
num_target_prompts = base_data['metadata']['num_target_prompts']
num_control_prompts = base_data['metadata']['num_control_prompts']

# Neuronpedia URL prefix: anything other than "llama" uses the Gemma format
neuronpedia_prefix = LLAMA_LINK_FORMAT if MODEL_TYPE == "llama" else GEMMA_LINK_FORMAT

# Hover-text label per plotted exclusivity category ("neither" is never plotted)
exclusivity_labels = {
    "both": "Introspective prompt feature for both models",
    "base": "Base-only introspective prompt feature",
    "chat": "Chat-only introspective prompt feature",
}


def _percent_of(count, total):
    """Return ``count`` as a percentage of ``total``; 0.0 when ``total`` is zero."""
    return (count / total) * 100 if total else 0.0


for token_type in token_types:
    if use_saved_feature_ids:
        # Use pre-filtered feature IDs from CSV
        base_values = base_data[token_type][SELECTED_METRIC].numpy()[mask]
        chat_values = chat_data[token_type][SELECTED_METRIC].numpy()[mask]
        feature_ids = all_feature_ids[mask]
    else:
        # Fallback to the per-token-type active mask computed earlier
        active_mask = active_masks[token_type]
        base_values = base_data[token_type][SELECTED_METRIC].numpy()[active_mask]
        chat_values = chat_data[token_type][SELECTED_METRIC].numpy()[active_mask]
        feature_ids = np.arange(len(active_mask))[active_mask]

    # Calculate differences vectorized
    differences = chat_values - base_values

    # Per-feature outputs; only target-exclusive features are kept
    hover_texts = []
    neuronpedia_urls = []
    exclusivity_info = []
    inconsistency_info = []
    filtered_base_values = []
    filtered_chat_values = []
    filtered_feature_ids = []

    for fid, base_val, chat_val, diff in zip(feature_ids, base_values, chat_values, differences):
        exclusivity = get_exclusivity_info(fid, token_type)

        # Skip features that are not target-exclusive for this token type
        if exclusivity == "neither":
            continue

        exclusivity_info.append(exclusivity)

        # Store a plain bool so the flag list holds one consistent type
        has_inconsistency = bool(has_cross_model_inconsistency(fid, token_type))
        inconsistency_info.append(has_inconsistency)

        filtered_base_values.append(base_val)
        filtered_chat_values.append(chat_val)
        filtered_feature_ids.append(fid)

        base_target_count, base_control_count, chat_target_count, chat_control_count = (
            get_detailed_activation_counts(fid, token_type)
        )

        # A missing entry and a non-string entry (e.g. NaN from an empty CSV
        # cell) are both treated as "no explanation" and recorded.
        claude_explanation = explanations_dict.get(fid)
        if not isinstance(claude_explanation, str):
            no_explanation.add(fid)
            claude_explanation = "No explanation available"

        formatted_explanation = format_explanation_efficient(claude_explanation)

        neuronpedia_urls.append(f"{neuronpedia_prefix}{fid}")

        exclusivity_text = exclusivity_labels[exclusivity]

        # Share of prompts on which the feature was active
        base_target_pct = _percent_of(base_target_count, num_target_prompts)
        base_control_pct = _percent_of(base_control_count, num_control_prompts)
        chat_target_pct = _percent_of(chat_target_count, num_target_prompts)
        chat_control_pct = _percent_of(chat_control_count, num_control_prompts)

        inconsistency_text = get_cross_model_inconsistency_text(fid, token_type)

        hover_text = (
            f"<b>Feature {fid}</b><br>" +
            f"Base: {base_val:.4f}, Chat: {chat_val:.4f}<br>" +
            f"Difference: {diff:.4f}<br><br>" +
            f"<b>Activation Percentages:</b><br>" +
            f"Base Target: {base_target_pct:.1f}% ({base_target_count}/{num_target_prompts} prompts)<br>" +
            f"Base Control: {base_control_pct:.1f}% ({base_control_count}/{num_control_prompts} prompts)<br>" +
            f"Chat Target: {chat_target_pct:.1f}% ({chat_target_count}/{num_target_prompts} prompts)<br>" +
            f"Chat Control: {chat_control_pct:.1f}% ({chat_control_count}/{num_control_prompts} prompts)<br><br>" +
            f"<b>Exclusivity:</b> {exclusivity_text}<br>"
        )

        # Add inconsistency info if present
        if inconsistency_text:
            hover_text += f"<br><b>Note:</b> {inconsistency_text}<br>"

        # <extra></extra> suppresses Plotly's secondary hover box
        hover_text += f"<br><b>Description:</b><br>{formatted_explanation}<extra></extra>"

        hover_texts.append(hover_text)

    # Store processed data with filtered values (parallel lists, one entry per kept feature)
    processed_data[token_type] = {
        'base_values': filtered_base_values,
        'chat_values': filtered_chat_values,
        'feature_ids': filtered_feature_ids,
        'hover_texts': hover_texts,
        'neuronpedia_urls': neuronpedia_urls,
        'exclusivity_info': exclusivity_info,
        'inconsistency_info': inconsistency_info
    }

    # Collect all values for the min/max of the diagonal reference line
    all_base_values.extend(filtered_base_values)
    all_chat_values.extend(filtered_chat_values)

    print(f"Token type '{token_type}': {len(filtered_feature_ids)} target-exclusive features")

# Create the scatterplot
fig = go.Figure()

total_features = 0  # number of plotted features across all token types

# Exclusivity categories that get a trace ('neither' was filtered out earlier)
exclusivity_groups = ['both', 'base', 'chat']
exclusivity_names = {
    'both': 'Both',
    'base': 'Base',
    'chat': 'Chat'
}


def _add_feature_trace(figure, data, indices, token_type, exclusivity_type, name, outline):
    """Add one Scattergl trace for a subset of a token type's features.

    Args:
        figure: Figure the trace is added to.
        data: One ``processed_data`` entry (parallel per-feature lists).
        indices: Positions within ``data`` to plot; nothing is added when empty.
        token_type: Token type; selects the colour and the legend group.
        exclusivity_type: Exclusivity category; selects the marker symbol.
        name: Legend entry for the trace.
        outline: Marker outline settings (the ``marker.line`` dict).
    """
    if not indices:
        return

    figure.add_trace(
        go.Scattergl(
            x=[data['base_values'][i] for i in indices],
            y=[data['chat_values'][i] for i in indices],
            mode='markers',
            name=name,
            legendgroup=token_type,  # Group by token type
            legendgrouptitle_text=token_type.title(),  # Group title
            marker=dict(
                size=6,
                color=colors[token_type],
                symbol=exclusivity_symbols[exclusivity_type],
                line=outline,
                opacity=0.7
            ),
            text=[f"Feature {data['feature_ids'][i]}" for i in indices],
            # Neuronpedia URL per point, stored in customdata
            customdata=[data['neuronpedia_urls'][i] for i in indices],
            hovertemplate=[data['hover_texts'][i] for i in indices],
            hoverlabel=dict(
                bgcolor=colors[token_type],
                bordercolor="black",
                font_size=12,
                font_family="Arial",
                font_color="white"
            )
        )
    )


# Add scatter traces using pre-processed data, grouped by token type and exclusivity
for token_type in token_types:
    data = processed_data[token_type]

    for exclusivity_type in exclusivity_groups:
        # Positions of the features in this exclusivity category
        indices = [i for i, excl in enumerate(data['exclusivity_info']) if excl == exclusivity_type]

        if not indices:  # Skip if no features of this type
            continue

        # Leading spaces indent the entry under its legend group title
        trace_name = f"  {exclusivity_names[exclusivity_type]}"

        # Split by inconsistency status so each subset gets its own outline style
        consistent_indices = [i for i in indices if not data['inconsistency_info'][i]]
        inconsistent_indices = [i for i in indices if data['inconsistency_info'][i]]

        # Consistent features: thin black outline
        _add_feature_trace(
            fig, data, consistent_indices, token_type, exclusivity_type,
            trace_name, dict(width=0.3, color='black'),
        )

        # Cross-model inconsistent features: thicker red outline
        _add_feature_trace(
            fig, data, inconsistent_indices, token_type, exclusivity_type,
            trace_name + " (Cross-model)", dict(width=1, color='red'),
        )

        total_features += len(indices)

# Dashed y = x reference line: points on it have equal base and chat values.
# Its endpoints span the full range of plotted values; skipped when nothing was plotted.
if all_base_values and all_chat_values:
    min_val = min(min(all_base_values), min(all_chat_values))
    max_val = max(max(all_base_values), max(all_chat_values))

    no_change_hoverlabel = dict(
        bgcolor="gray",
        bordercolor="black",
        font_size=11,
        font_family="Arial",
        font_color="white"
    )
    no_change_trace = go.Scatter(
        x=[min_val, max_val],
        y=[min_val, max_val],
        mode='lines',
        line=dict(color='gray', dash='dash', width=2),
        name='No Change',
        hovertemplate="No change line<extra></extra>",
        hoverlabel=no_change_hoverlabel
    )
    fig.add_trace(no_change_trace)

# Update layout: titles, legend placement and a square (1:1) axis scale.
# Fall back to the raw metric key when METRIC_SUBTITLE has no label for it,
# so plotting a new metric does not abort with a KeyError.
metric_display = METRIC_SUBTITLE.get(SELECTED_METRIC, SELECTED_METRIC)
fig.update_layout(
    title={
        'text': f'Base → Instruct Introspective SAE Features: {metric_display}<br><sub>{MODEL_NAME_READABLE}, Residual Stream Post-Layer {SAE_LAYER}</sub>',
        'x': 0.5,
        'xanchor': 'center',
        'font': {'size': 16}
    },
    xaxis_title=f'Base: {metric_display}',
    yaxis_title=f'Instruct: {metric_display}',
    height=800,
    width=900,
    showlegend=True,
    hovermode='closest',
    # Legend sits just outside the right edge of the plot area
    legend=dict(
        title="Activation Position",
        orientation="v",
        yanchor="top",
        y=1,
        xanchor="left",
        x=1.02,
        groupclick="togglegroup"  # Allow group toggling by clicking group titles
    ),
    xaxis=dict(
        scaleanchor="y",  # Lock x-axis scale to y-axis
        scaleratio=1,     # 1:1 aspect ratio for square grid
    ),
)

# Add grid
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')

# Save the interactive plot as a standalone HTML file named after the metric
output_html = OUTPUT_DIR / f"{SELECTED_METRIC}.html"

# Render the figure to HTML; plotly.js is loaded from the CDN rather than embedded
html_content = fig.to_html(
    include_plotlyjs='cdn',
    config={
        'displayModeBar': True,
        'showTips': False,
        'scrollZoom': True,
        'doubleClick': 'reset'
    }
)

# JavaScript click handler: opens the clicked point's customdata (its
# Neuronpedia URL) in a new tab. The 100 ms timeout is reset on every click,
# so only the last of a rapid burst of clicks opens a tab.
click_script = """
<script>
document.addEventListener('DOMContentLoaded', function() {
    var plotDiv = document.getElementsByClassName('plotly-graph-div')[0];
    
    let clickTimeout;
    plotDiv.on('plotly_click', function(data) {
        clearTimeout(clickTimeout);
        clickTimeout = setTimeout(function() {
            var point = data.points[0];
            if (point.customdata) {
                window.open(point.customdata, '_blank');
            }
        }, 100);
    });
});
</script>
"""

# Inject the handler just before the closing body tag
html_with_script = html_content.replace('</body>', click_script + '</body>')

# Write explicitly as UTF-8: the page contains non-ASCII text (the "→" in the
# title), which the platform default encoding may be unable to encode.
output_html.write_text(html_with_script, encoding='utf-8')

print(f"\nInteractive scatterplot saved to: {output_html}")
print(f"File size: {output_html.stat().st_size / 1024:.1f} KB")

# Show the plot
fig.show()

Creating interactive scatterplot for target_all_mean metric with all token types...
Generated styling:
Token type colors: {'model': '#FF6B6B', 'newline': '#4ECDC4'}
Exclusivity symbols: {'both': 'star', 'base': 'circle', 'chat': 'square', 'neither': 'diamond'}
Pre-processing data...
Token type 'model': 506 target-exclusive features
Token type 'newline': 724 target-exclusive features


KeyError: 'target_all_mean'

In [None]:
# Cross-model consistency check: find features that are target-only in one
# model but also active on the other model's control prompts.
print("Analyzing cross-model feature patterns...")

def _print_feature_examples(header, feature_ids, limit=3):
    """Print ``header`` followed by up to ``limit`` features and their explanations.

    Explanations longer than 100 characters are cut to 100 and suffixed with
    "..."; missing or non-string explanations are shown as "No explanation
    available". Prints nothing when ``feature_ids`` is empty.
    """
    if not feature_ids:
        return

    print(header)
    for fid in feature_ids[:limit]:
        explanation = explanations_dict.get(fid, "No explanation available")
        if isinstance(explanation, str) and not pd.isna(explanation):
            if len(explanation) > 100:
                explanation = explanation[:100] + "..."
        else:
            explanation = "No explanation available"
        print(f"    Feature {fid}: {explanation}")


def analyze_cross_model_patterns():
    """Find features that are target-only in one model but control-active in the other.

    For every token type, prints the number of such features in each
    direction together with a few examples.

    Returns:
        dict: token type -> dict with two lists of feature IDs, keyed
        'base_target_only_but_chat_control' and
        'chat_target_only_but_base_control'.
    """
    cross_model_patterns = {}

    for token_type in token_types:
        print(f"\nAnalyzing token type: {token_type}")

        # Per-feature boolean masks: active on at least one prompt of each set
        base_target_active = base_data[token_type]['target_num_active'] > 0
        base_control_active = base_data[token_type]['control_num_active'] > 0
        chat_target_active = chat_data[token_type]['target_num_active'] > 0
        chat_control_active = chat_data[token_type]['control_num_active'] > 0

        # Target-only in base, yet active on the chat control prompts
        base_target_only = base_target_active & ~base_control_active
        base_target_only_but_chat_control = base_target_only & chat_control_active

        # Target-only in chat, yet active on the base control prompts
        chat_target_only = chat_target_active & ~chat_control_active
        chat_target_only_but_base_control = chat_target_only & base_control_active

        # Convert the masks to lists of feature IDs
        base_target_only_but_chat_control_ids = torch.where(base_target_only_but_chat_control)[0].tolist()
        chat_target_only_but_base_control_ids = torch.where(chat_target_only_but_base_control)[0].tolist()

        cross_model_patterns[token_type] = {
            'base_target_only_but_chat_control': base_target_only_but_chat_control_ids,
            'chat_target_only_but_base_control': chat_target_only_but_base_control_ids
        }

        print(f"  Features target-only in base but active in chat control: {len(base_target_only_but_chat_control_ids)}")
        print(f"  Features target-only in chat but active in base control: {len(chat_target_only_but_base_control_ids)}")

        # Show the first few examples of each direction, with explanations if available
        _print_feature_examples(
            "  Examples of base-target-only but chat-control features:",
            base_target_only_but_chat_control_ids,
        )
        _print_feature_examples(
            "  Examples of chat-target-only but base-control features:",
            chat_target_only_but_base_control_ids,
        )

    return cross_model_patterns

cross_patterns = analyze_cross_model_patterns()

# Summary banner
divider = '=' * 60
print(f"\n{divider}")
print("CROSS-MODEL PATTERN SUMMARY")
print(divider)

# Running totals across token types, one per direction
total_base_target_only_but_chat_control = 0
total_chat_target_only_but_base_control = 0

for token_type in token_types:
    patterns = cross_patterns[token_type]
    base_count = len(patterns['base_target_only_but_chat_control'])
    chat_count = len(patterns['chat_target_only_but_base_control'])

    print(f"{token_type.upper()}:")
    print(f"  Base-target-only → Chat-control: {base_count} features")
    print(f"  Chat-target-only → Base-control: {chat_count} features")

    total_base_target_only_but_chat_control = total_base_target_only_but_chat_control + base_count
    total_chat_target_only_but_base_control = total_chat_target_only_but_base_control + chat_count

print("\nTOTAL ACROSS ALL TOKEN TYPES:")
print(f"  Base-target-only → Chat-control: {total_base_target_only_but_chat_control} features")
print(f"  Chat-target-only → Base-control: {total_chat_target_only_but_base_control} features")

# Both totals are non-negative counts, so a non-zero sum means at least one inconsistency
if total_base_target_only_but_chat_control + total_chat_target_only_but_base_control:
    print(f"\n⚠️  Found {total_base_target_only_but_chat_control + total_chat_target_only_but_base_control} cross-model inconsistencies")
    print("These features show different activation patterns between base and chat models.")
else:
    print("\n✅ No cross-model inconsistencies found!")
    print("All target-exclusive features are consistently exclusive across both models.")