# Experiment: Attention Pattern Analysis

**Goal:** Understand WHERE the model looks under different system prompts.

**Key Questions:**
- How does attention to system prompt tokens change with different instructions?
- Which attention heads are most sensitive to system prompts?
- Does CoT instruction change attention patterns differently than persona instructions?

In [None]:
import sys
import os

# Setup path: put the repository root on sys.path (presumably so the `src.*`
# imports in the next cell resolve — confirm against the repo layout).
if 'google.colab' in sys.modules:
    # Running on Colab: clone the repository on first run only.
    if not os.path.exists('/content/LLM-Instruction-Understanding'):
        !git clone https://github.com/maralkh/LLM-Instruction-Understanding.git
    # Work from the repository root so requirements.txt is found by pip below.
    os.chdir('/content/LLM-Instruction-Understanding')
    !pip install -q -r requirements.txt
    if '/content/LLM-Instruction-Understanding' not in sys.path:
        sys.path.insert(0, '/content/LLM-Instruction-Understanding')
else:
    # Not on Colab: add the parent of the current working directory instead.
    parent = os.path.abspath('..')
    if parent not in sys.path:
        sys.path.insert(0, parent)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

from src.model_utils import load_model
from src.test_configs import (
    get_test_prompts, get_system_prompts, get_core_system_prompts,
    build_chat_prompt, get_all_test_prompts
)

# Select the 'seaborn-v0_8-whitegrid' matplotlib style for the figures in this notebook.
plt.style.use('seaborn-v0_8-whitegrid')

In [None]:
# Load the model through the project's load_model helper.
model = load_model("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
# layer_info is read below (and in later cells) for 'model_name', 'n_layers' and 'n_heads'.
layer_info = model.get_layer_info()
print(f"Model: {layer_info['model_name']}")
print(f"Layers: {layer_info['n_layers']}, Heads: {layer_info['n_heads']}")

## 1. Compare Attention Patterns Across System Prompts

In [None]:
# DEBUG: Test attention extraction on one bare prompt (no system prompt, no chat
# template) before running the full sweep.
test_prompt = "What is 2+2?"
attn_data = model.get_attention_patterns(test_prompt, debug=True)

# 'attention_stats' is iterated as layer -> dict with 'shape', 'min', 'max', 'mean'.
print("\n=== Attention Stats by Layer ===")
for layer, stats in attn_data['attention_stats'].items():
    print(f"Layer {layer}: shape={stats['shape']}, min={stats['min']:.6f}, max={stats['max']:.6f}, mean={stats['mean']:.6f}")

# 'attention_entropy' is iterated as layer -> a single number per layer.
print("\n=== Entropy by Layer ===")
for layer, ent in attn_data['attention_entropy'].items():
    print(f"Layer {layer}: entropy={ent:.4f}")

In [None]:
# Select test cases: only the first five test prompts are used, paired with
# every core system prompt.
test_prompts = get_all_test_prompts()[:5]
# Later cells iterate this with .items(), reading sys_info['text'] per entry.
system_prompts = get_core_system_prompts()

print(f"Testing {len(test_prompts)} prompts × {len(system_prompts)} system prompts")

In [None]:
def analyze_attention_for_prompt(model, user_prompt, system_prompt_text, tokenizer):
    """Collect attention data for one (system prompt, user prompt) pair.

    Args:
        model: Object exposing ``get_attention_patterns`` (and ``tokenizer``,
            used only as a fallback when ``tokenizer`` is ``None``).
        user_prompt: The user message text.
        system_prompt_text: The system prompt text; may be empty or ``None``.
        tokenizer: Tokenizer passed to ``build_chat_prompt`` and used to count
            the system prompt's tokens.

    Returns:
        dict with keys ``tokens``, ``n_tokens``, ``sys_token_count``,
        ``layer_attention``, ``attention_entropy`` and ``attention_stats``
        (``{}`` when the model does not report stats).
    """
    full_prompt = build_chat_prompt(system_prompt_text, user_prompt, tokenizer)

    # Attention patterns for the full chat prompt, requested with the
    # "last_token" aggregation.
    attn_data = model.get_attention_patterns(full_prompt, aggregate="last_token")
    tokens = attn_data['tokens']

    # Size of the system prompt region. Count with the same tokenizer that built
    # the prompt, and leave out special tokens (e.g. BOS) so the number covers
    # the system prompt text only.
    if system_prompt_text:
        count_tokenizer = tokenizer if tokenizer is not None else model.tokenizer
        encoded = count_tokenizer(system_prompt_text, add_special_tokens=False)
        sys_token_count = len(encoded.input_ids)
    else:
        sys_token_count = 0

    return {
        'tokens': tokens,
        'n_tokens': len(tokens),
        'sys_token_count': sys_token_count,
        'layer_attention': attn_data['layer_attention'],
        'attention_entropy': attn_data['attention_entropy'],
        'attention_stats': attn_data.get('attention_stats', {})
    }

In [None]:
# Run every (test prompt, system prompt) combination and keep one summary record per run
attention_results = []

for test in tqdm(test_prompts, desc="Test prompts"):
    for sys_name, sys_info in system_prompts.items():
        try:
            analysis = analyze_attention_for_prompt(
                model, test['prompt'], sys_info['text'], model.tokenizer
            )

            # Per-layer entropies; non-finite values are left out of the mean,
            # and a run with no finite value at all is recorded as 0.0
            layer_entropy = dict(analysis['attention_entropy'].items())
            finite_entropies = [e for e in layer_entropy.values() if np.isfinite(e)]
            if finite_entropies:
                mean_entropy = np.mean(finite_entropies)
            else:
                mean_entropy = 0.0

            record = {
                'test_id': test['id'],
                'category': test['category'],
                'system_prompt': sys_name,
                'n_tokens': analysis['n_tokens'],
                'sys_tokens': analysis['sys_token_count'],
                'mean_attn_entropy': mean_entropy,
                'layer_entropy': layer_entropy,
                'layer_attention': analysis['layer_attention'],
            }
            attention_results.append(record)
        except Exception as e:
            # Best effort: report the failing combination and keep going
            print(f"Error for {test['id']}/{sys_name}: {e}")

attn_df = pd.DataFrame(attention_results)
print(f"Collected {len(attn_df)} attention analyses")

## 2. Attention Entropy Analysis

In [None]:
# Mean and standard deviation of the per-run entropy for each system prompt,
# rounded to 4 decimals and ordered from lowest to highest mean
entropy_by_sys = (
    attn_df.groupby('system_prompt')['mean_attn_entropy']
    .agg(['mean', 'std'])
    .round(4)
    .sort_values('mean')
)

print("=== Attention Entropy by System Prompt ===")
print("Lower entropy = more focused attention")
print(entropy_by_sys)

In [None]:
# Visualize: horizontal bars of mean entropy per system prompt, with the
# standard deviation as error bars
fig, ax = plt.subplots(figsize=(10, 6))

bar_positions = range(len(entropy_by_sys))
ax.barh(bar_positions, entropy_by_sys['mean'],
        xerr=entropy_by_sys['std'], capsize=3, alpha=0.7)
ax.set_yticks(bar_positions)
ax.set_yticklabels(entropy_by_sys.index)
ax.set_xlabel('Mean Attention Entropy')
ax.set_title('Attention Focus by System Prompt\n(Lower = More Focused)')

plt.tight_layout()
# savefig does not create missing directories; make sure the target exists
# so this cell also works on a fresh checkout.
os.makedirs('../results', exist_ok=True)
plt.savefig('../results/attention_entropy_by_system.png', dpi=150)
plt.show()

## 3. Layer-wise Attention Analysis

In [None]:
# Extract layer-wise entropy for different system prompts
def get_layer_entropy_matrix(attn_df, n_layers):
    """Average per-layer entropy for each system prompt.

    Args:
        attn_df: DataFrame with a ``system_prompt`` column and a
            ``layer_entropy`` column holding ``{layer_index: entropy}`` dicts.
        n_layers: Number of layers, i.e. the number of matrix columns.

    Returns:
        ``(matrix, systems)`` where ``systems`` holds the unique system prompt
        names in order of first appearance and ``matrix[i, j]`` is the mean of
        the finite entropies recorded for ``systems[i]`` at layer ``j``
        (0.0 when no finite value was recorded).
    """
    systems = attn_df['system_prompt'].unique()
    totals = np.zeros((len(systems), n_layers))
    counts = np.zeros((len(systems), n_layers))

    for row_idx, sys_name in enumerate(systems):
        per_run = attn_df.loc[attn_df['system_prompt'] == sys_name, 'layer_entropy']
        for layer_entropy in per_run:
            for layer, entropy in layer_entropy.items():
                # Ignore layers outside [0, n_layers): a negative index would
                # otherwise wrap around and pollute a layer counted from the end.
                if 0 <= layer < n_layers and np.isfinite(entropy):
                    totals[row_idx, layer] += entropy
                    counts[row_idx, layer] += 1

    # Cells without any sample keep a total of 0; dividing by 1 leaves them at 0
    # and avoids a division by zero.
    matrix = totals / np.maximum(counts, 1)

    return matrix, systems

# Build the (system prompt × layer) entropy matrix from the collected runs;
# sys_names gives the row order of entropy_matrix.
n_layers = layer_info['n_layers']
entropy_matrix, sys_names = get_layer_entropy_matrix(attn_df, n_layers)

In [None]:
# Heatmap of layer-wise entropy (rows: system prompts, columns: layers)
fig, ax = plt.subplots(figsize=(14, 6))

layer_labels = [f"L{i}" for i in range(n_layers)]
sns.heatmap(entropy_matrix,
            xticklabels=layer_labels,
            yticklabels=sys_names,
            cmap='viridis', ax=ax, annot=False)
ax.set_xlabel('Layer')
ax.set_ylabel('System Prompt')
ax.set_title('Attention Entropy by Layer × System Prompt')

plt.tight_layout()
# savefig does not create missing directories; make sure the target exists
# so this cell also works on a fresh checkout.
os.makedirs('../results', exist_ok=True)
plt.savefig('../results/attention_layer_heatmap.png', dpi=150)
plt.show()

## 4. Head-Level Analysis

In [None]:
# Analyze individual attention heads for the first test prompt only.
# Note: test_prompt is rebound here from the debug string used earlier to a
# test-prompt record (indexed with ['prompt'] below).
test_prompt = test_prompts[0]

# Maps system prompt name -> whatever get_head_contributions returns for it.
head_analyses = {}
for sys_name, sys_info in list(system_prompts.items())[:4]:  # Just a few for visualization
    full_prompt = build_chat_prompt(sys_info['text'], test_prompt['prompt'], model.tokenizer)
    head_data = model.get_head_contributions(full_prompt)
    head_analyses[sys_name] = head_data

In [None]:
# Compare head entropy across system prompts: one (layer × head) panel per prompt
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
panels = axes.flatten()

for ax, (sys_name, head_data) in zip(panels, head_analyses.items()):
    # head_data maps layer -> list of per-head records with 'head' and 'entropy'
    layers = sorted(head_data.keys())
    n_heads = len(head_data[layers[0]])

    # Use a name of its own so the layer-wise `entropy_matrix` from the
    # heatmap cell is not overwritten.
    head_entropy = np.zeros((len(layers), n_heads))
    for i, layer in enumerate(layers):
        for head_info in head_data[layer]:
            head_entropy[i, head_info['head']] = head_info['entropy']

    im = ax.imshow(head_entropy, aspect='auto', cmap='viridis')
    ax.set_xlabel('Head')
    ax.set_ylabel('Layer')
    ax.set_title(f'{sys_name}')
    fig.colorbar(im, ax=ax, label='Entropy')

# Hide panels that have no system prompt to show (fewer than four analysed)
for unused_ax in panels[len(head_analyses):]:
    unused_ax.set_visible(False)

fig.suptitle('Head Entropy by Layer (per System Prompt)', y=1.02)
plt.tight_layout()
# savefig does not create missing directories; make sure the target exists.
os.makedirs('../results', exist_ok=True)
# The suptitle sits above the figure area (y=1.02); a tight bounding box keeps
# it in the saved image instead of clipping it.
plt.savefig('../results/head_entropy_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

## 5. Key Findings

In [None]:
# Summary of the entropy ranking. entropy_by_sys is sorted by ascending mean,
# so head() holds the most focused prompts and tail() the most distributed ones.
# The loop variable is deliberately not called `sys`: at notebook top level that
# would replace the imported `sys` module and break re-running the setup cell.
print("="*60)
print("ATTENTION PATTERN ANALYSIS SUMMARY")
print("="*60)

print("\n1. System prompts that focus attention most:")
for sys_name in entropy_by_sys.head(3).index:
    print(f"   • {sys_name}: entropy={entropy_by_sys.loc[sys_name, 'mean']:.4f}")

print("\n2. System prompts with most distributed attention:")
for sys_name in entropy_by_sys.tail(3).index:
    print(f"   • {sys_name}: entropy={entropy_by_sys.loc[sys_name, 'mean']:.4f}")

print("\n3. Implication:")
print("   - Focused attention may indicate clearer instruction interpretation")
print("   - CoT prompts may distribute attention more broadly for reasoning")

In [None]:
# Write the per-system-prompt entropy summary, the number of runs and the
# model's layer info to a JSON file
import json
import os

os.makedirs('../results', exist_ok=True)

results = dict(
    entropy_by_system=entropy_by_sys.to_dict(),
    n_samples=len(attn_df),
    model=layer_info,
)

# default=float converts values json cannot serialise natively
with open('../results/attention_analysis.json', 'w') as out_file:
    json.dump(results, out_file, indent=2, default=float)
print("Results saved.")