# Experiment: Model Internals - Why Instructions Change Behavior

**Goal:** Understand the internal mechanisms through which system prompts affect model outputs.

**Analysis Focus:**
- Hidden state changes across layers
- Which layers are most affected by system prompts?
- Correlation between internal changes and output changes

In [None]:
import sys, os
# Environment bootstrap: make the project's `src` package importable.
if 'google.colab' in sys.modules:
    # Running on Colab: clone the repository once, then work from inside it.
    if not os.path.exists('/content/LLM-Instruction-Understanding'):
        !git clone https://github.com/maralkh/LLM-Instruction-Understanding.git
    # NOTE(review): the working directory becomes the repository root here, so
    # the relative '../results' paths used by later cells resolve to
    # '/content/results' (outside the clone) - confirm this is intended.
    os.chdir('/content/LLM-Instruction-Understanding')
    !pip install -q -r requirements.txt
    sys.path.insert(0, '/content/LLM-Instruction-Understanding')
else:
    # Not on Colab: put the parent of the current working directory on the
    # import path (presumably the repo root, with this notebook living one
    # level below it - TODO confirm).
    sys.path.insert(0, os.path.abspath('..'))

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

from src.model_utils import load_model
from src.test_configs import get_all_test_prompts, get_core_system_prompts, build_chat_prompt

# Apply the whitegrid style to every figure below. The 'seaborn-v0_8-' prefix
# is the style name used by Matplotlib 3.6 and later.
plt.style.use('seaborn-v0_8-whitegrid')

In [None]:
# Load the model wrapper from src.model_utils. Later cells use
# `model.tokenizer` and `model.compare_internals(...)`.
model = load_model("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
# Print whatever layer description the wrapper reports (its structure is not
# visible from this notebook).
layer_info = model.get_layer_info()
print(f"Model: {layer_info}")

## 1. Compare Internals Across System Prompts

In [None]:
# Compare the model's internals with and without each system prompt.
#
# For every (test prompt, system prompt) pair, the prompt built with the
# 'none' system prompt is the baseline and the prompt built with the named
# system prompt is the variant. One row per layer is recorded, carrying the
# per-layer hidden-state metrics plus the pair-level logit metrics.
test_prompts = get_all_test_prompts()[:5]
system_prompts = get_core_system_prompts()
baseline_sys = system_prompts['none']

internal_comparisons = []

for test in tqdm(test_prompts, desc="Analyzing"):
    # The baseline prompt depends only on the test, so build it once per test
    # instead of once per system prompt.
    try:
        test_id, category = test['id'], test['category']
        prompt_base = build_chat_prompt(baseline_sys['text'], test['prompt'], model.tokenizer)
    except Exception as e:
        # Best-effort run: report the failing test and move on to the next one.
        print(f"Error preparing test {test!r}: {e}")
        continue

    for sys_name, sys_info in system_prompts.items():
        if sys_name == 'none':
            # The baseline is never compared against itself.
            continue

        try:
            prompt_var = build_chat_prompt(sys_info['text'], test['prompt'], model.tokenizer)
            comparison = model.compare_internals(prompt_base, prompt_var)
            logit_diff = comparison['logit_diff']

            # Build all rows for this pair first so that a failure part-way
            # through the layers cannot leave a partial set of rows behind
            # (which would skew the per-layer means computed later).
            rows = [
                {
                    'test_id': test_id,
                    'category': category,
                    'system_prompt': sys_name,
                    'layer': layer,
                    'hs_cosine_sim': hs_diff['cosine_sim'],
                    'hs_l2_norm': hs_diff['l2_norm'],
                    'logit_cosine_sim': logit_diff['cosine_sim'],
                    'top_token_same': logit_diff['top_token_same'],
                }
                for layer, hs_diff in comparison['hidden_state_diff'].items()
            ]
        except Exception as e:
            # Keep going, but say which combination failed.
            print(f"Error [test={test_id}, system_prompt={sys_name}]: {e}")
        else:
            internal_comparisons.extend(rows)

internals_df = pd.DataFrame(internal_comparisons)
print(f"Collected {len(internals_df)} measurements")
if internals_df.empty:
    # Without rows the frame has no columns, so every groupby below would fail
    # with an unhelpful KeyError; flag the real cause here.
    print("Warning: no measurements were collected; the analysis cells below will fail.")

## 2. Layer Impact Analysis

In [None]:
# Average both hidden-state metrics per layer, over all tests and system
# prompts, rounded to four decimals.
layer_metric_aggs = dict.fromkeys(['hs_cosine_sim', 'hs_l2_norm'], 'mean')
layer_means = internals_df.groupby('layer').agg(layer_metric_aggs)
layer_impact = layer_means.round(4)

print("=== Layer Impact ===")
print(layer_impact)

In [None]:
# Plot the per-layer means side by side: cosine similarity to the baseline on
# the left, L2 distance on the right.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

ax = axes[0]
ax.plot(layer_impact.index, layer_impact['hs_cosine_sim'], 'o-')
ax.set_xlabel('Layer')
ax.set_ylabel('Cosine Similarity to Baseline')
ax.set_title('Hidden State Similarity by Layer')

ax = axes[1]
ax.plot(layer_impact.index, layer_impact['hs_l2_norm'], 'o-', color='orange')
ax.set_xlabel('Layer')
ax.set_ylabel('L2 Distance')
ax.set_title('Hidden State L2 Distance by Layer')

plt.tight_layout()
# savefig does not create missing directories, and this cell runs before the
# export cell that creates '../results', so make sure the folder exists here.
# `os` is imported in the notebook's first cell.
os.makedirs('../results', exist_ok=True)
plt.savefig('../results/layer_impact.png', dpi=150)
plt.show()

## 3. System Prompt Impact

In [None]:
# Average the hidden-state and logit metrics per system prompt, rounded to
# four decimals. Sorting ascending by hidden-state cosine similarity puts the
# prompts whose hidden states are least similar to the baseline first.
prompt_metric_aggs = dict.fromkeys(
    ['hs_cosine_sim', 'logit_cosine_sim', 'top_token_same'],
    'mean',
)
prompt_means = internals_df.groupby('system_prompt').agg(prompt_metric_aggs)
sys_impact = prompt_means.round(4).sort_values('hs_cosine_sim')

print("=== System Prompt Impact ===")
print(sys_impact)

In [None]:
# Heatmap of hidden-state change: one row per system prompt, one column per
# layer, each cell the mean cosine similarity for that combination.
pivot = internals_df.pivot_table(values='hs_cosine_sim', index='system_prompt', columns='layer', aggfunc='mean')

fig, ax = plt.subplots(figsize=(14, 6))
# Plot 1 - similarity so that larger (darker) values mean a larger change.
sns.heatmap(1 - pivot, cmap='YlOrRd', ax=ax)
# The title previously contained a mis-decoded multiplication sign.
ax.set_title('Hidden State Change by System Prompt × Layer')
plt.tight_layout()
# savefig does not create missing directories, and this cell runs before the
# export cell that creates '../results', so make sure the folder exists here.
# `os` is imported in the notebook's first cell.
os.makedirs('../results', exist_ok=True)
plt.savefig('../results/internals_heatmap.png', dpi=150)
plt.show()

In [None]:
# Summary: a lower mean cosine similarity to the baseline means a larger
# change, so the minimum marks the most affected layer / most impactful prompt.
print("\n=== KEY FINDINGS ===")

layer_similarity = layer_impact['hs_cosine_sim']
print(f"Most affected layer: {layer_similarity.idxmin()}")

prompt_similarity = sys_impact['hs_cosine_sim']
print(f"Most impactful system prompt: {prompt_similarity.idxmin()}")
print(f"Least impactful system prompt: {prompt_similarity.idxmax()}")

In [None]:
import json
import os

# Write both summary tables to a single JSON file under ../results.
os.makedirs('../results', exist_ok=True)
with open('../results/internals_analysis.json', 'w') as out_file:
    summary = {
        'layer_impact': layer_impact.to_dict(),
        'sys_impact': sys_impact.to_dict(),
    }
    # default=float is the fallback for any value the JSON encoder cannot
    # serialize natively.
    json.dump(summary, out_file, indent=2, default=float)
print("Saved.")