# Experiment 9: System Prompt Length and Position Effects

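**Goal:** Test how system prompt LENGTH and component ORDER affect the model's next-token distribution (entropy, top-token probability, and Jensen-Shannon divergence between orderings).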

**Setup:**
- Fixed test prompts
- Same content at different lengths (repetition, verbosity)
- Same components in different orders

In [None]:
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
import random

from src.model_utils import load_model
from src.metrics import DistributionMetrics
from src.visualization import set_style
from src.test_configs import ALL_TEST_PROMPTS, build_chat_prompt

# Apply the project-wide style helper imported from src.visualization before any
# figure is created (presumably sets matplotlib/seaborn defaults - confirm there).
set_style()

In [None]:
# Load the model under test once. Later cells rely on `model.tokenizer` and
# `model.get_next_token_distribution(...)`.
# NOTE(review): the argument looks like a Hugging Face model id - confirm in src.model_utils.
model = load_model("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

## 1. Length Effects

In [None]:
# Same instruction at different lengths.
# NOTE: BASE_INSTRUCTION is not referenced by any other visible cell; its text is
# identical to the "short" variant below.
BASE_INSTRUCTION = "Be helpful and accurate."

# Maps a variant name to the exact system-prompt text used in the length experiment.
# The triple-quoted variants keep their embedded line breaks, so those characters
# are part of the prompt and are included when the text is measured with len().
# "repeated" reaches its length by repeating the two short sentences rather than
# by adding new content.
LENGTH_VARIANTS = {
    "minimal": "Be helpful.",
    "short": "Be helpful and accurate.",
    "medium": "You are a helpful assistant. Please provide accurate and useful information.",
    "long": """You are a helpful AI assistant. Your goal is to provide accurate, 
useful, and well-reasoned responses. Always strive to be helpful while 
ensuring the information you provide is correct and reliable.""",
    "very_long": """You are a helpful AI assistant created to assist users with 
their questions and tasks. Your primary goal is to provide accurate, useful, 
and well-reasoned responses to any query. Always strive to be helpful while 
ensuring the information you provide is correct, reliable, and up-to-date. 
Take care to understand what the user is asking and respond appropriately.
If you're uncertain about something, say so clearly.""",
    "repeated": "Be helpful. Be accurate. Be helpful. Be accurate. Be helpful.",
}

In [None]:
# Length experiment: score the first 10 test prompts under every length variant
# and collect one row of next-token statistics per (variant, test prompt) pair.
TEST_SUBSET = ALL_TEST_PROMPTS[:10]


def _length_record(name, system_text, test):
    """Query the model once and return one result row for the length experiment."""
    prompt = build_chat_prompt(system_text, test["prompt"], model.tokenizer)
    dist = model.get_next_token_distribution(prompt, top_k=50)
    return {
        "variant": name,
        # Size of the system prompt, measured two ways.
        "length_chars": len(system_text),
        "length_words": len(system_text.split()),
        "test_id": test["id"],
        "entropy": dist["entropy"],
        # First entry of the returned top-k probabilities.
        "top_prob": dist["top_probs"][0],
        "full_probs": dist["full_probs"],
    }


length_results = []
for variant_name, variant_text in tqdm(LENGTH_VARIANTS.items()):
    length_results.extend(
        _length_record(variant_name, variant_text, test) for test in TEST_SUBSET
    )

length_df = pd.DataFrame(length_results)

In [None]:
# Mean entropy and mean top-token probability for each variant, ordered from the
# shortest to the longest system prompt (by word count).
per_variant_means = length_df.groupby(['variant', 'length_words']).agg(
    entropy=('entropy', 'mean'),
    top_prob=('top_prob', 'mean'),
)
length_summary = per_variant_means.reset_index().sort_values('length_words')

print("=== Length Effects ===")
print(length_summary)

In [None]:
import os

# The figure is written under ../results, so make sure that directory exists.
os.makedirs('../results', exist_ok=True)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
word_counts = length_summary['length_words']

# One scatter panel per metric: (axis, summary column, y-label, title).
panels = [
    (axes[0], 'entropy', 'Mean Entropy',
     'Does Longer System Prompt = Different Behavior?'),
    (axes[1], 'top_prob', 'Mean Top Token Probability',
     'Confidence vs System Prompt Length'),
]
for ax, column, y_label, title in panels:
    ax.scatter(word_counts, length_summary[column], s=100, alpha=0.7)
    ax.set_xlabel('System Prompt Length (words)')
    ax.set_ylabel(y_label)
    ax.set_title(title)

# Only the entropy panel labels each point with its variant name.
for label, n_words, mean_entropy in zip(
    length_summary['variant'], word_counts, length_summary['entropy']
):
    axes[0].annotate(label, (n_words, mean_entropy),
                     textcoords="offset points", xytext=(5, 5), fontsize=8)

plt.tight_layout()
plt.savefig('../results/exp9_length_effects.png', dpi=150)
plt.show()

## 2. Order Effects

In [None]:
from itertools import permutations

# The instruction fragments whose ordering is varied.
COMPONENTS = [
    "Be helpful.",
    "Be accurate.",
    "Think step by step.",
    "Be concise.",
]

# Every ordering, expressed as a tuple of indices into COMPONENTS.
all_perms = list(permutations(range(len(COMPONENTS))))

# Draw a reproducible sample of at most six orderings (global RNG seeded with 42).
random.seed(42)
sampled_perms = random.sample(all_perms, min(6, len(all_perms)))

# "order_<i>" -> the joined system-prompt text and the index tuple that produced it.
ORDER_VARIANTS = {
    f"order_{idx}": {
        "text": " ".join(COMPONENTS[pos] for pos in index_order),
        "order": index_order,
    }
    for idx, index_order in enumerate(sampled_perms)
}

In [None]:
def _order_record(name, info, test):
    """Query the model once and return one result row for the order experiment."""
    index_order = info["order"]
    prompt = build_chat_prompt(info["text"], test["prompt"], model.tokenizer)
    dist = model.get_next_token_distribution(prompt, top_k=50)
    return {
        "variant": name,
        "order": str(index_order),
        # Text of the components that open and close this ordering.
        "first_component": COMPONENTS[index_order[0]],
        "last_component": COMPONENTS[index_order[-1]],
        "test_id": test["id"],
        "entropy": dist["entropy"],
        "full_probs": dist["full_probs"],
    }


# Order experiment: one row per (ordering, test prompt) pair, visited in the same
# variant-major order as the length experiment.
order_results = [
    _order_record(variant_name, variant_info, test)
    for variant_name, variant_info in tqdm(ORDER_VARIANTS.items())
    for test in TEST_SUBSET
]

order_df = pd.DataFrame(order_results)

In [None]:
# Per-ordering entropy statistics, together with the component that opened and
# the component that closed that ordering. Numeric columns are rounded to 4 places.
order_summary = (
    order_df.groupby('variant')
    .agg(
        entropy_mean=('entropy', 'mean'),
        entropy_std=('entropy', 'std'),
        first=('first_component', 'first'),
        last=('last_component', 'first'),
    )
    .round(4)
)

print("=== Order Effects ===")
print(order_summary)

In [None]:
# Jensen-Shannon divergence of every ordering from ONE reference ordering (the
# first key of ORDER_VARIANTS), matched per test prompt and averaged over prompts.
baseline_order = list(ORDER_VARIANTS)[0]
baseline_by_test = {
    rec["test_id"]: rec["full_probs"]
    for rec in order_results
    if rec["variant"] == baseline_order
}

order_divergences = []
for variant in ORDER_VARIANTS:
    if variant == baseline_order:
        continue  # the reference ordering is not compared with itself

    variant_rows = order_df[order_df['variant'] == variant]
    js_values = [
        DistributionMetrics.jensen_shannon(baseline_by_test[test_id], probs)
        for test_id, probs in zip(variant_rows['test_id'], variant_rows['full_probs'])
    ]
    order_divergences.append({
        "variant": variant,
        "js_from_baseline": np.mean(js_values),
    })

divergence_df = pd.DataFrame(order_divergences)
print(f"\nMean JS divergence between orderings: {divergence_df['js_from_baseline'].mean():.4f}")
print(f"Max JS divergence: {divergence_df['js_from_baseline'].max():.4f}")

In [None]:
# Does position matter? Average entropy over all rows that share the same
# opening component, and separately over rows that share the same closing one.
entropy_by_row = order_df['entropy']
by_first = entropy_by_row.groupby(order_df['first_component']).mean()
by_last = entropy_by_row.groupby(order_df['last_component']).mean()

print("\n=== Position Effects ===")
for heading, means in (
    ("\nBy FIRST component:", by_first),
    ("\nBy LAST component:", by_last),
):
    print(heading)
    print(means.sort_values())

## 3. Key Findings

In [None]:
# Print the headline conclusions of the experiment from the summaries computed
# in the earlier cells (length_summary, order_summary, by_first, by_last).
print("="*60)
print("EXPERIMENT 9 SUMMARY: Length and Order Effects")
print("="*60)

# Length: correlation between word count and mean entropy across the variants.
# 0.3 is a hard-coded cut-off for calling the relationship non-negligible.
length_corr = length_summary['length_words'].corr(length_summary['entropy'])
print(f"\n1. Length-Entropy Correlation: {length_corr:.3f}")
if abs(length_corr) > 0.3:
    print("   → System prompt length DOES affect behavior")
else:
    print("   → System prompt length has minimal effect")

# Order: spread of the per-ordering mean entropies (0.01 is a hard-coded cut-off).
order_variance = order_summary['entropy_mean'].var()
print(f"\n2. Order Variance in Entropy: {order_variance:.6f}")
if order_variance > 0.01:
    print("   → Component order MATTERS")
else:
    print("   → Component order has minimal effect")

# Position: compare how far mean entropy moves when the first component changes
# against how far it moves when the last component changes.
first_range = by_first.max() - by_first.min()
last_range = by_last.max() - by_last.min()
print(f"\n3. First component effect: {first_range:.4f}")
print(f"   Last component effect: {last_range:.4f}")
if last_range > first_range:
    print("   → LAST component has more influence (recency bias)")
elif first_range > last_range:
    print("   → FIRST component has more influence (primacy bias)")
else:
    # Reached when the two ranges are equal, or when either is NaN (every
    # comparison with NaN is False). Neither bias is supported in that case.
    print("   → No positional difference detected (ranges equal or undefined)")

In [None]:
import json
import os

# Persist the summaries as JSON. The output directory is created here as well so
# this cell does not depend on the plotting cell having been run first.
os.makedirs('../results', exist_ok=True)

# Explicit UTF-8 keeps the file identical across platforms with different
# default encodings.
with open('../results/exp9_results.json', 'w', encoding='utf-8') as f:
    json.dump({
        "length_effects": length_summary.to_dict('records'),
        "order_effects": order_summary.to_dict(),
        "position_effects": {
            "by_first": by_first.to_dict(),
            "by_last": by_last.to_dict()
        }
    # default=float converts values json cannot encode natively (e.g. NumPy
    # scalars) by calling float() on them.
    }, f, indent=2, default=float)
print("Saved.")