# Experiment 3: Base vs Instruction-Tuned Model Comparison

**Goal:** Understand how instruction tuning changes prompt sensitivity.

**Key Questions:**
- Do instruction-tuned models respond differently to prompt variations?
- Which prompt strategies become more/less effective after instruction tuning?
- How does the "assistant" role affect model behavior?

In [None]:
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

from src.model_utils import load_model, ModelConfig
from src.prompt_utils import PromptVariantGenerator, ASSISTANT_PREFIXES
from src.metrics import DistributionMetrics, ExperimentResults, ComparisonMetrics
from src.visualization import set_style, plot_model_comparison, plot_distribution_comparison

# Apply the plotting style helper imported from src.visualization before any
# figure is created (presumably sets matplotlib defaults — confirm in
# src/visualization.py).
set_style()

## 1. Load Both Models

We'll compare:
- **Base model**: TinyLlama-1.1B (pre-trained only)
- **Instruction-tuned**: TinyLlama-1.1B-Chat (fine-tuned for chat)

In [None]:
# Model identifiers handed to load_model for the two variants under comparison.
# NOTE(review): both loaded models stay referenced at the same time, so the
# device must be able to hold both — confirm before running on small GPUs.
MODELS = dict(
    base="TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
    chat="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
)

# The base model is loaded first, then the chat model.
print("Loading base model...")
model_base = load_model(MODELS["base"])

print("\nLoading chat model...")
model_chat = load_model(MODELS["chat"])

# Name -> loaded model, in the same order as MODELS.
models = dict(base=model_base, chat=model_chat)

## 2. Test Questions

In [None]:
# Task name -> the question posed to the models and the completion whose
# log-probability is scored for that task.
TEST_CASES = {
    task: {"question": question, "expected": expected}
    for task, question, expected in (
        ("factual", "What is the capital of France?", "Paris"),
        (
            "reasoning",
            "If John has 5 apples and gives 2 to Mary, how many does John have left?",
            "3",
        ),
        # Only the first colour is scored: checks whether the model starts listing.
        ("instruction_following", "List three primary colors.", "red"),
        (
            "classification",
            "Is the following positive or negative: 'I love this product!'",
            "positive",
        ),
    )
}

## 3. Compare Prompt Sensitivity

In [None]:
def compare_models_on_variants(models, question, expected, dimensions):
    """
    Score every prompt variant of a question with every model.

    Variants are produced by ``PromptVariantGenerator.create_variants``. For
    each (variant, model) pair the prompt's next-token distribution is queried
    and the log-probability of ``" " + expected`` as a continuation is scored.

    Args:
        models: Mapping of model name to a model object exposing
            ``get_next_token_distribution`` and ``get_sequence_log_probs``.
        question: Question text the variants are generated from.
        expected: Target completion; it is scored with a leading space.
        dimensions: Forwarded as the ``dimensions`` keyword of
            ``create_variants``.

    Returns:
        Dict mapping each model name to a list of per-variant records (in
        variant order) with keys ``config``, ``prompt``, ``entropy``,
        ``target_log_prob`` and ``top_5``.
    """
    variants = PromptVariantGenerator.create_variants(question, dimensions=dimensions)
    per_model = {name: [] for name in models}
    target = " " + expected

    for variant in tqdm(variants, desc="Testing variants"):
        text = variant["prompt"]
        variant_config = variant["config"]

        for name, model in models.items():
            next_token = model.get_next_token_distribution(text)
            scored = model.get_sequence_log_probs(text, target)

            record = dict(
                config=variant_config,
                prompt=text,
                entropy=next_token["entropy"],
                target_log_prob=scored["total_log_prob"],
                top_5=next_token["top_tokens"][:5],
            )
            per_model[name].append(record)

    return per_model

In [None]:
# Evaluate every task over the 'specificity' and 'format' prompt variants and
# keep the raw per-variant records for the later analysis cells.
all_comparisons = {}

for task_name, task_data in TEST_CASES.items():
    print(f"\n{'='*60}")
    print(f"Task: {task_name}")
    print(f"{'='*60}")

    all_comparisons[task_name] = comparison = compare_models_on_variants(
        models,
        question=task_data["question"],
        expected=task_data["expected"],
        dimensions=['specificity', 'format'],
    )

    # Per-model summary of the target log-probabilities across variants.
    for model_name in models:
        log_probs = [record["target_log_prob"] for record in comparison[model_name]]
        print(f"\n{model_name}:")
        mean, std = np.mean(log_probs), np.std(log_probs)
        print(f"  Target log-prob: mean={mean:.3f}, std={std:.3f}")
        lo, hi = np.min(log_probs), np.max(log_probs)
        print(f"  Range: [{lo:.3f}, {hi:.3f}]")

## 4. Analyze Prompt Sensitivity Differences

In [None]:
def calculate_sensitivity(results):
    """
    Summarise how much a model's scores move across prompt variants.

    Args:
        results: Iterable-once-safe list of records, each carrying
            ``target_log_prob`` and ``entropy`` entries.

    Returns:
        Dict with ``log_prob_variance``, ``log_prob_range`` (max minus min),
        ``entropy_variance`` and ``mean_entropy``. Larger variance/range means
        the model is more sensitive to the prompt wording.
    """
    target_scores = np.asarray([record["target_log_prob"] for record in results])
    entropy_values = np.asarray([record["entropy"] for record in results])

    spread = np.max(target_scores) - np.min(target_scores)

    summary = {}
    summary["log_prob_variance"] = np.var(target_scores)
    summary["log_prob_range"] = spread
    summary["entropy_variance"] = np.var(entropy_values)
    summary["mean_entropy"] = np.mean(entropy_values)
    return summary

In [None]:
# One row per (task, model) pair holding the sensitivity statistics.
sensitivity_data = [
    {"task": task_name, "model": model_name, **calculate_sensitivity(results)}
    for task_name, comparison in all_comparisons.items()
    for model_name, results in comparison.items()
]

sens_df = pd.DataFrame(sensitivity_data)

# Tasks as rows, models as columns, log-prob variance as the cell value.
variance_table = sens_df.pivot(index='task', columns='model', values='log_prob_variance')
print("\n=== Prompt Sensitivity Comparison ===")
print(variance_table.round(4))

In [None]:
# Visualize sensitivity: side-by-side grouped bar charts, one bar per model.
import os

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Tasks as rows, models as columns.
pivot_var = sens_df.pivot(index='task', columns='model', values='log_prob_variance')
pivot_ent = sens_df.pivot(index='task', columns='model', values='mean_entropy')

# (table, title, y-label) for the left and right panel respectively.
panels = [
    (pivot_var, 'Prompt Sensitivity (Log-Prob Variance)', 'Variance'),
    (pivot_ent, 'Mean Output Entropy', 'Entropy (nats)'),
]

for ax, (table, title, ylabel) in zip(axes, panels):
    table.plot(kind='bar', ax=ax, color=['steelblue', 'coral'])
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.legend(title='Model')
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()

# The results directory is otherwise only created by the final "save" cell,
# so make sure it exists before writing the figure.
os.makedirs('../results', exist_ok=True)
plt.savefig('../results/exp3_sensitivity_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

## 5. Assistant Prefix Experiment

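Test how starting the answer with different assistant-style prefixes (e.g. "Sure!") changes the log-probability of the expected answer and the entropy of the next-token distribution.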

In [None]:
# Label -> text placed at the start of the answer; "none" is the empty baseline.
ASSISTANT_TEST_PREFIXES = dict(
    none="",
    sure="Sure! ",
    certainly="Certainly. ",
    lets_think="Let me think about this. ",
    step_by_step="I'll solve this step by step. ",
    great_question="Great question! ",
    hmm="Hmm, ",
    well="Well, ",
)

def test_assistant_prefixes(model, question, expected, prefixes):
    """Test how assistant prefixes affect completion probability.

    Each prefix is placed after ``"Answer:"`` separated by a single space, and
    the expected answer is scored as ``" " + expected`` — the same convention
    used by ``compare_models_on_variants``. Surrounding whitespace on a prefix
    is dropped so that no prompt ends in a trailing space and the empty
    ("none") baseline differs from the other conditions only by the prefix
    text, not by whitespace.

    Args:
        model: Object exposing ``get_next_token_distribution(prompt)`` and
            ``get_sequence_log_probs(prompt, completion)``.
        question: Question text inserted into the prompt.
        expected: Target answer whose log-probability is measured.
        prefixes: Mapping of prefix label to prefix text (may be empty).

    Returns:
        List of dicts, one per prefix in mapping order, with keys ``prefix``,
        ``prompt``, ``target_log_prob``, ``entropy`` and ``top_5``.
    """
    base_prompt = f"Question: {question}\nAnswer:"
    completion = " " + expected

    results = []
    for prefix_name, prefix in prefixes.items():
        prefix_text = prefix.strip()
        # Without the separator the prompt read "Answer:Sure! "; keep the
        # whitespace on the completion side instead of at the prompt's end.
        prompt = f"{base_prompt} {prefix_text}" if prefix_text else base_prompt

        dist = model.get_next_token_distribution(prompt)
        seq_probs = model.get_sequence_log_probs(prompt, completion)

        results.append({
            "prefix": prefix_name,
            "prompt": prompt,
            "target_log_prob": seq_probs["total_log_prob"],
            "entropy": dist["entropy"],
            "top_5": dist["top_tokens"][:5]
        })

    return results

In [None]:
# Score every assistant prefix with each model on one arithmetic question.
prefix_results = {}

test_question = "What is 15 + 27?"
test_expected = "42"

for model_name, model in models.items():
    print(f"\nTesting {model_name} model...")
    prefix_results[model_name] = results = test_assistant_prefixes(
        model, test_question, test_expected, ASSISTANT_TEST_PREFIXES
    )

    # Report prefixes from highest to lowest target log-probability.
    ranked = sorted(results, key=lambda x: x["target_log_prob"], reverse=True)
    for r in ranked:
        print(f"  {r['prefix']:20s}: log_prob={r['target_log_prob']:.3f}, entropy={r['entropy']:.3f}")

In [None]:
# Visualize assistant prefix effects: one horizontal bar chart per model,
# prefixes ordered from best (top) to worst (bottom) target log-probability.
import os

# One panel per model; squeeze=False keeps `axes` 2-D even for a single model.
n_panels = max(len(prefix_results), 1)
fig, axes = plt.subplots(1, n_panels, figsize=(14, 6), squeeze=False)

for ax, (model_name, results) in zip(axes[0], prefix_results.items()):
    sorted_results = sorted(results, key=lambda x: x["target_log_prob"], reverse=True)
    prefixes = [r["prefix"] for r in sorted_results]
    log_probs = [r["target_log_prob"] for r in sorted_results]

    # RdYlGn runs red (0) -> green (1); sample it high-to-low so the best
    # prefix is drawn green and the worst red.
    colors = plt.cm.RdYlGn(np.linspace(0.8, 0.2, len(prefixes)))
    ax.barh(range(len(prefixes)), log_probs, color=colors)
    ax.set_yticks(range(len(prefixes)))
    ax.set_yticklabels(prefixes)
    ax.set_xlabel('Target Log Probability')
    ax.set_title(f'Assistant Prefix Effect: {model_name}')
    # Put the first (best) entry at the top of the chart.
    ax.invert_yaxis()

plt.tight_layout()

# The results directory is otherwise only created by the final "save" cell,
# so make sure it exists before writing the figure.
os.makedirs('../results', exist_ok=True)
plt.savefig('../results/exp3_assistant_prefix.png', dpi=150, bbox_inches='tight')
plt.show()

## 6. Chat Template Effect (Instruction-Tuned Only)

In [None]:
# Test with proper chat template vs raw prompt
def test_chat_template(model, tokenizer_name, question, expected):
    """Compare a raw Q/A prompt against the tokenizer's chat template.

    The question is formatted twice: as ``"Question: ...\\nAnswer:"`` and via
    ``tokenizer.apply_chat_template`` with a single user message and the
    generation prompt appended. Both prompts are scored with the same model.

    If the chat template cannot be applied, the raw prompt is used for the
    "chat" condition as well and a ``RuntimeWarning`` is emitted, so a zero
    difference between the two conditions is not mistaken for a real result.

    Args:
        model: Object exposing ``get_next_token_distribution(prompt)`` and
            ``get_sequence_log_probs(prompt, completion)``.
        tokenizer_name: Name passed to ``AutoTokenizer.from_pretrained``.
        question: Question text.
        expected: Target answer; scored as ``" " + expected`` for both prompts.

    Returns:
        Dict with keys ``"raw"`` and ``"chat"``, each a dict holding
        ``prompt``, ``log_prob``, ``entropy`` and ``top_5``.
    """
    import warnings

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    # Raw prompt
    raw_prompt = f"Question: {question}\nAnswer:"

    # Chat template (if available)
    messages = [
        {"role": "user", "content": question}
    ]
    try:
        chat_prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
    except Exception as exc:
        # Deliberate best-effort fallback (covers a missing method or a
        # tokenizer without a template), but reported rather than silent.
        # Unlike a bare `except:`, this lets KeyboardInterrupt propagate.
        warnings.warn(
            f"Could not apply chat template for {tokenizer_name!r} "
            f"({type(exc).__name__}: {exc}); falling back to the raw prompt.",
            RuntimeWarning,
            stacklevel=2,
        )
        chat_prompt = raw_prompt

    # NOTE(review): the same leading-space completion is used after the chat
    # template's generation prompt — confirm that is the intended scoring.
    completion = " " + expected

    # Evaluate both
    raw_dist = model.get_next_token_distribution(raw_prompt)
    raw_probs = model.get_sequence_log_probs(raw_prompt, completion)

    chat_dist = model.get_next_token_distribution(chat_prompt)
    chat_probs = model.get_sequence_log_probs(chat_prompt, completion)

    return {
        "raw": {
            "prompt": raw_prompt,
            "log_prob": raw_probs["total_log_prob"],
            "entropy": raw_dist["entropy"],
            "top_5": raw_dist["top_tokens"][:5]
        },
        "chat": {
            "prompt": chat_prompt,
            "log_prob": chat_probs["total_log_prob"],
            "entropy": chat_dist["entropy"],
            "top_5": chat_dist["top_tokens"][:5]
        }
    }

In [None]:
# Score each task on the chat model with and without the chat template.
print("Testing chat template effect...")

template_results = {}
for task_name, task_data in TEST_CASES.items():
    result = test_chat_template(
        model_chat,
        MODELS["chat"],
        task_data["question"],
        task_data["expected"],
    )
    template_results[task_name] = result

    raw_log_prob = result['raw']['log_prob']
    chat_log_prob = result['chat']['log_prob']

    print(f"\n{task_name}:")
    print(f"  Raw prompt log-prob: {raw_log_prob:.3f}")
    print(f"  Chat template log-prob: {chat_log_prob:.3f}")
    # Positive means the chat template made the expected answer more likely.
    print(f"  Improvement: {chat_log_prob - raw_log_prob:.3f}")

## 7. Key Findings

In [None]:
print("="*60)
print("EXPERIMENT 3 SUMMARY: Base vs Instruction-Tuned")
print("="*60)

# Headline numbers: per-model averages over all tasks.
print("\n1. Prompt Sensitivity:")
base_vars = [row["log_prob_variance"] for row in sensitivity_data if row["model"] == "base"]
chat_vars = [row["log_prob_variance"] for row in sensitivity_data if row["model"] == "chat"]
base_var_mean = np.mean(base_vars)
chat_var_mean = np.mean(chat_vars)
if chat_var_mean > base_var_mean:
    direction = 'more'
else:
    direction = 'less'
print(f"   - Base model avg variance: {base_var_mean:.4f}")
print(f"   - Chat model avg variance: {chat_var_mean:.4f}")
print(f"   - Chat model is {direction} sensitive to prompts")

print("\n2. Output Entropy:")
base_ent = [row["mean_entropy"] for row in sensitivity_data if row["model"] == "base"]
chat_ent = [row["mean_entropy"] for row in sensitivity_data if row["model"] == "chat"]
base_ent_mean = np.mean(base_ent)
chat_ent_mean = np.mean(chat_ent)
print(f"   - Base model avg entropy: {base_ent_mean:.4f}")
print(f"   - Chat model avg entropy: {chat_ent_mean:.4f}")

print("\n3. Assistant Prefix Effects:")
for model_name, results in prefix_results.items():
    # Prefixes with the highest and lowest target log-probability.
    best = max(results, key=lambda x: x["target_log_prob"])
    worst = min(results, key=lambda x: x["target_log_prob"])
    best_name, worst_name = best['prefix'], worst['prefix']
    print(f"   {model_name}: best='{best_name}', worst='{worst_name}'")

In [None]:
# Save results: sensitivity table as CSV, prefix scores as JSON.
import json
import os

os.makedirs('../results', exist_ok=True)

# Save sensitivity data
sens_df.to_csv('../results/exp3_sensitivity.csv', index=False)

# Save prefix results. Scores are coerced to built-in floats because
# json.dump raises TypeError on numpy/torch scalar types.
prefix_save = {
    model_name: [
        {
            "prefix": r["prefix"],
            "log_prob": float(r["target_log_prob"]),
            "entropy": float(r["entropy"]),
        }
        for r in results
    ]
    for model_name, results in prefix_results.items()
}

with open('../results/exp3_prefix_results.json', 'w', encoding='utf-8') as f:
    json.dump(prefix_save, f, indent=2)

print("Results saved.")