# Experiment 6: Output Distribution Under Paraphrase

**Goal:** Test whether semantically equivalent prompts produce equivalent outputs.

**Key Questions:**
- Do paraphrased prompts yield similar next-token distributions?
- Is the model responding to meaning or surface form?
- Which paraphrases break prompt effectiveness?

In [None]:
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import re

from src.model_utils import load_model
from src.metrics import DistributionMetrics, compute_all_metrics, ExperimentResults
from src.visualization import set_style, plot_distribution_comparison

# Apply the plotting style helper imported from src.visualization before any figure is created.
set_style()

In [None]:
# Load the model under test once; the analysis cells below pass this `model` object
# to test_paraphrase_consistency.
model = load_model("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

## 1. Define Paraphrase Sets

We define four prompt families; each pairs one original instruction with ten paraphrases that are intended to be semantically equivalent but differ in surface form.

In [None]:
# Paraphrase sets - semantically equivalent prompts
#
# Structure: {set_name: {"original": str, "paraphrases": [str, ...]}}.
# The "original" is the reference instruction; every entry in "paraphrases" is
# compared against it. Each instruction is prepended to a test question as
# "<instruction>\n\n<question>" by test_paraphrase_consistency.
PARAPHRASE_SETS = {
    # Chain-of-thought style trigger phrases
    "chain_of_thought": {
        "original": "Let's think step by step.",
        "paraphrases": [
            "Let's approach this systematically.",
            "Let's work through this one step at a time.",
            "Let's break this down into steps.",
            "Let's solve this step-by-step.",
            "Let's reason through this carefully.",
            "Let's analyze this methodically.",
            "Think about this step by step.",
            "Work through this systematically.",
            "Proceed step by step.",
            "Take it one step at a time."
        ]
    },
    
    # Persona / role assignment phrases
    "expert_persona": {
        "original": "You are an expert.",
        "paraphrases": [
            "You are a specialist in this field.",
            "You have extensive expertise.",
            "You are highly knowledgeable.",
            "You are a professional.",
            "You are an authority on this topic.",
            "You possess expert-level knowledge.",
            "You are a subject matter expert.",
            "Act as an expert.",
            "Respond as an expert would.",
            "Answer like an expert."
        ]
    },
    
    # Plain "answer the question" instructions
    "instruction": {
        "original": "Answer the following question.",
        "paraphrases": [
            "Please answer this question.",
            "Respond to the question below.",
            "Provide an answer to this question.",
            "Give your answer to the following.",
            "Answer this:",
            "What is your answer to this question?",
            "Please respond to the following question.",
            "Here is a question to answer.",
            "Question for you to answer:",
            "Consider and answer this question."
        ]
    },
    
    # Requests for care / accuracy
    "be_careful": {
        "original": "Be careful and accurate.",
        "paraphrases": [
            "Take care to be precise.",
            "Ensure accuracy in your response.",
            "Be thorough and correct.",
            "Pay attention to accuracy.",
            "Make sure your answer is accurate.",
            "Respond with care and precision.",
            "Be meticulous in your answer.",
            "Carefully consider your response.",
            "Accuracy is important here.",
            "Please be precise."
        ]
    }
}

# Test questions
# "q" is the question text placed after the instruction; "a" is the expected
# answer whose log-probability is scored (with a leading space) for each prompt.
TEST_QUESTIONS = [
    {"q": "What is 127 + 385?", "a": "512"},
    {"q": "What is the capital of Australia?", "a": "Canberra"},
    {"q": "What is the chemical symbol for gold?", "a": "Au"}
]

## 2. Measure Paraphrase Consistency

In [None]:
def test_paraphrase_consistency(model, paraphrase_set, question, expected):
    """
    Compare the model's behavior on an original instruction and its paraphrases.

    Every instruction is turned into the prompt "<instruction>\\n\\n<question>".
    For each prompt the next-token distribution and the log-probability of
    " " + expected are requested from the model.

    Args:
        model: Object providing get_next_token_distribution(prompt) (a mapping
            with "entropy", "top_tokens" and "full_probs") and
            get_sequence_log_probs(prompt, continuation) (a mapping with
            "total_log_prob").
        paraphrase_set: Mapping with an "original" string and a "paraphrases"
            list of strings.
        question: Question text appended to each instruction.
        expected: Expected answer text (scored with a leading space).

    Returns:
        A list of row dicts: the first row describes the original (divergences
        fixed at 0.0), followed by one row per paraphrase carrying the KL and
        JS values computed by DistributionMetrics with the original's
        distribution passed as the first argument.
    """
    baseline_text = paraphrase_set["original"]
    candidates = paraphrase_set["paraphrases"]
    continuation = " " + expected

    def _query(instruction):
        # Same call order for every prompt: distribution first, then sequence score.
        prompt = f"{instruction}\n\n{question}"
        distribution = model.get_next_token_distribution(prompt)
        sequence_score = model.get_sequence_log_probs(prompt, continuation)
        return distribution, sequence_score

    def _make_row(variant, text, distribution, sequence_score, kl, js):
        return {
            "variant": variant,
            "text": text,
            "target_log_prob": sequence_score["total_log_prob"],
            "entropy": distribution["entropy"],
            "top_5": distribution["top_tokens"][:5],
            "kl_from_original": kl,
            "js_from_original": js,
        }

    # Reference measurement for the original instruction
    base_dist, base_score = _query(baseline_text)
    rows = [_make_row("original", baseline_text, base_dist, base_score, 0.0, 0.0)]
    reference_probs = base_dist["full_probs"]

    # One row per paraphrase, each compared against the reference distribution
    for candidate in candidates:
        cand_dist, cand_score = _query(candidate)
        cand_probs = cand_dist["full_probs"]
        kl_value = DistributionMetrics.kl_divergence(reference_probs, cand_probs)
        js_value = DistributionMetrics.jensen_shannon(reference_probs, cand_probs)
        rows.append(
            _make_row("paraphrase", candidate, cand_dist, cand_score, kl_value, js_value)
        )

    return rows

In [None]:
# Run paraphrase analysis
# For every paraphrase set, score all test questions, keep the tagged rows in
# all_paraphrase_results[set_name], and print per-set summary statistics.
all_paraphrase_results = {}

for set_name, paraphrase_set in PARAPHRASE_SETS.items():
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"Testing: {set_name}")
    print(f"Original: '{paraphrase_set['original']}'")
    print(f"{banner}")

    # Collect rows for every question, tagging each row with its question text
    set_results = []
    for test in TEST_QUESTIONS:
        question_text = test["q"]
        rows = test_paraphrase_consistency(
            model, paraphrase_set, question_text, test["a"]
        )
        for row in rows:
            row["question"] = question_text
            set_results.append(row)

    all_paraphrase_results[set_name] = set_results

    # Summary stats: split rows into the original baseline and the paraphrases
    orig_log_probs = []
    kl_values = []
    js_values = []
    log_prob_values = []
    for row in set_results:
        if row["variant"] == "paraphrase":
            kl_values.append(row["kl_from_original"])
            js_values.append(row["js_from_original"])
            log_prob_values.append(row["target_log_prob"])
        elif row["variant"] == "original":
            orig_log_probs.append(row["target_log_prob"])

    kl_mean, kl_std = np.mean(kl_values), np.std(kl_values)
    js_mean, js_std = np.mean(js_values), np.std(js_values)
    para_mean, para_std = np.mean(log_prob_values), np.std(log_prob_values)

    print(f"\nDistribution divergence from original:")
    print(f"  KL: mean={kl_mean:.4f}, std={kl_std:.4f}")
    print(f"  JS: mean={js_mean:.4f}, std={js_std:.4f}")
    print(f"\nTarget log-prob:")
    print(f"  Original: {np.mean(orig_log_probs):.4f}")
    print(f"  Paraphrases: mean={para_mean:.4f}, std={para_std:.4f}")

## 3. Identify Breaking Paraphrases

In [None]:
# Find paraphrases that significantly hurt performance
# For each set, compute every paraphrase's change in target log-prob relative
# to the original prompt on the same question, average that change across
# questions, and list the three lowest and three highest averages.
print("=== Paraphrases That Break Performance ===")


def _preview(text, limit=50):
    """Return text unchanged when it fits in `limit` characters, else truncate and append '...'."""
    # Only mark text as truncated when something was actually cut off.
    return text if len(text) <= limit else f"{text[:limit]}..."


for set_name, results in all_paraphrase_results.items():
    print(f"\n{set_name}:")

    # Get original performance per question
    orig_by_q = {r["question"]: r["target_log_prob"]
                 for r in results if r["variant"] == "original"}

    # Record each paraphrase's log-prob difference from the original on the
    # same question (stored on the row as "relative_change") and group the
    # differences by paraphrase text.
    para_performance = {}
    for r in results:
        if r["variant"] != "paraphrase":
            continue
        r["relative_change"] = r["target_log_prob"] - orig_by_q[r["question"]]
        para_performance.setdefault(r["text"], []).append(r["relative_change"])

    # Average across questions and sort ascending (most harmful first)
    avg_performance = {t: np.mean(v) for t, v in para_performance.items()}
    sorted_paras = sorted(avg_performance.items(), key=lambda item: item[1])

    print("  Worst paraphrases (hurt most):")
    for text, change in sorted_paras[:3]:
        print(f"    '{_preview(text)}': {change:+.4f}")

    print("  Best paraphrases (help most):")
    for text, change in sorted_paras[-3:]:
        print(f"    '{_preview(text)}': {change:+.4f}")

## 4. Surface Form Analysis

In [None]:
def extract_surface_features(text):
    """Extract surface-level features from prompt text.

    Args:
        text: The instruction text to describe. May be empty or
            whitespace-only, in which case "imperative" is False.

    Returns:
        A dict with character length, whitespace-delimited word count, and
        boolean flags describing how the text starts, ends, and which
        punctuation or politeness markers it contains.
    """
    imperative_verbs = (
        "be", "think", "answer", "respond", "provide",
        "give", "work", "take", "proceed",
    )
    lowered = text.lower()
    stripped = text.strip()
    words = text.split()
    # Empty or whitespace-only text has no first word; indexing words[0]
    # would raise IndexError, so fall back to an empty string.
    first_word = words[0].lower() if words else ""

    return {
        "length": len(text),
        "word_count": len(words),
        "starts_with_lets": lowered.startswith("let's"),
        "starts_with_you": lowered.startswith("you"),
        "has_period": "." in text,
        "has_colon": ":" in text,
        "imperative": first_word in imperative_verbs,
        "has_please": "please" in lowered,
        "ends_period": stripped.endswith("."),
        "ends_colon": stripped.endswith(":")
    }

In [None]:
# Analyze which surface features correlate with performance
# Build one feature row per result (originals and paraphrases alike), then
# correlate every feature column with the target log-prob.
all_features = []

for set_name, results in all_paraphrase_results.items():
    for r in results:
        all_features.append({
            **extract_surface_features(r["text"]),
            "target_log_prob": r["target_log_prob"],
            "set_name": set_name,
        })

feature_df = pd.DataFrame(all_features)

# Calculate correlations
print("=== Surface Feature Correlations with Performance ===")
non_feature_cols = ('target_log_prob', 'set_name')
numeric_cols = [c for c in feature_df.columns if c not in non_feature_cols]
target_series = feature_df['target_log_prob']

correlations = []
for col in numeric_cols:
    corr = feature_df[col].astype(float).corr(target_series)
    # Skip columns whose correlation is NaN
    if np.isnan(corr):
        continue
    correlations.append((col, corr))

# Strongest absolute correlation first
correlations = sorted(correlations, key=lambda pair: abs(pair[1]), reverse=True)
for feat, corr in correlations:
    print(f"  {feat:20s}: {corr:+.4f}")

## 5. Semantic Similarity vs Distribution Similarity

In [None]:
def calculate_lexical_overlap(text1, text2):
    """Simple lexical overlap as proxy for surface similarity.

    Both texts are lowercased and split into word tokens (runs of word
    characters, optionally joined by an apostrophe as in "let's"). Punctuation
    and hyphens are not part of a token, so "expert." matches "expert" and
    "step-by-step" matches "step by step".

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        Jaccard similarity of the two token sets, in [0.0, 1.0]; 0.0 when
        either text contains no word tokens.
    """
    # Tokenize on words rather than whitespace so trailing punctuation does
    # not make otherwise identical words count as different.
    word_pattern = r"\w+(?:['’]\w+)*"
    words1 = set(re.findall(word_pattern, text1.lower()))
    words2 = set(re.findall(word_pattern, text2.lower()))

    if not words1 or not words2:
        return 0.0

    intersection = words1 & words2
    union = words1 | words2

    return len(intersection) / len(union)  # Jaccard similarity

In [None]:
# Analyze relationship between lexical overlap and distribution similarity
# One record per paraphrase row: its Jaccard overlap with the set's original
# text alongside the KL / JS divergences already stored on the row.
overlap_vs_divergence = []

for set_name, results in all_paraphrase_results.items():
    original_text = PARAPHRASE_SETS[set_name]["original"]

    for r in results:
        if r["variant"] != "paraphrase":
            continue
        overlap_vs_divergence.append({
            "set": set_name,
            "lexical_overlap": calculate_lexical_overlap(original_text, r["text"]),
            "kl_divergence": r["kl_from_original"],
            "js_divergence": r["js_from_original"],
            "text": r["text"],
        })

overlap_df = pd.DataFrame(overlap_vs_divergence)

# Correlation
lexical_series = overlap_df["lexical_overlap"]
corr_kl = lexical_series.corr(overlap_df["kl_divergence"])
corr_js = lexical_series.corr(overlap_df["js_divergence"])

print(f"Correlation between lexical overlap and distribution divergence:")
print(f"  Lexical overlap vs KL: {corr_kl:.4f}")
print(f"  Lexical overlap vs JS: {corr_js:.4f}")

# The verdict uses an absolute-correlation threshold of 0.3 on the KL correlation
if abs(corr_kl) > 0.3:
    verdict = 'Surface form matters!'
else:
    verdict = 'Meaning matters more than form.'
print(f"\nInterpretation: {verdict}")

In [None]:
# Visualize
# Left panel: lexical overlap vs KL divergence, one colour per paraphrase set.
# Right panel: distribution of JS divergence per paraphrase set.
import os

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
set_names = list(PARAPHRASE_SETS.keys())

# Scatter: lexical overlap vs KL
for set_name in set_names:
    subset = overlap_df[overlap_df["set"] == set_name]
    axes[0].scatter(subset["lexical_overlap"], subset["kl_divergence"],
                    label=set_name, alpha=0.7, s=50)

axes[0].set_xlabel("Lexical Overlap (Jaccard)")
axes[0].set_ylabel("KL Divergence from Original")
axes[0].set_title("Does Surface Similarity Predict Distribution Similarity?")
axes[0].legend()

# Box plot: divergence by paraphrase set
data_for_box = [overlap_df[overlap_df["set"] == s]["js_divergence"].values
                for s in set_names]
axes[1].boxplot(data_for_box, patch_artist=True)
# Label the boxes through the axis instead of boxplot(labels=...): that keyword
# is deprecated in newer Matplotlib releases, while explicit tick labels work
# on old and new versions alike. Boxes are drawn at positions 1..N by default.
axes[1].set_xticks(list(range(1, len(set_names) + 1)))
axes[1].set_xticklabels(set_names)
axes[1].set_ylabel("JS Divergence from Original")
axes[1].set_title("Distribution Sensitivity by Prompt Type")
axes[1].tick_params(axis='x', rotation=30)

plt.tight_layout()
# The results directory is otherwise only created by the final "save results"
# cell, so make sure it exists before writing the figure.
os.makedirs('../results', exist_ok=True)
plt.savefig('../results/exp6_paraphrase_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

## 6. Key Findings

In [None]:
# Print the experiment summary: per-set JS divergence, the surface-form verdict
# based on the lexical-overlap/KL correlation, and placeholders for insights.
rule = "=" * 60
print(rule)
print("EXPERIMENT 6 SUMMARY: Paraphrase Sensitivity")
print(rule)

print("\n1. Distribution Consistency Across Paraphrases:")
for set_name, results in all_paraphrase_results.items():
    js_values = [
        r["js_from_original"] for r in results if r["variant"] == "paraphrase"
    ]
    js_mean = np.mean(js_values)
    js_std = np.std(js_values)
    print(f"   {set_name}: JS divergence = {js_mean:.4f} ± {js_std:.4f}")

print(f"\n2. Surface Form vs Meaning:")
print(f"   Lexical overlap correlation with divergence: {corr_kl:.4f}")
surface_sensitive = abs(corr_kl) > 0.3
print(
    "   → Model is sensitive to surface form, not just meaning"
    if surface_sensitive
    else "   → Model responds more to semantic content than surface form"
)

print("\n3. Actionable Insights:")
for placeholder in (
    "   - [Fill after running: Which paraphrase patterns work best?]",
    "   - [Fill after running: What surface features matter?]",
):
    print(placeholder)

In [None]:
# Save results
# Persist the per-set JS divergence statistics, the surface-feature
# correlations, and the lexical-overlap/KL correlation as JSON.
import json
import os

os.makedirs('../results', exist_ok=True)

# Mean / std of JS divergence over the paraphrase rows of each set
divergence_by_set = {}
for set_name, results in all_paraphrase_results.items():
    js_from_paraphrases = [
        r["js_from_original"] for r in results if r["variant"] == "paraphrase"
    ]
    divergence_by_set[set_name] = {
        "mean_js": float(np.mean(js_from_paraphrases)),
        "std_js": float(np.std(js_from_paraphrases)),
    }

save_data = {
    "divergence_by_set": divergence_by_set,
    "surface_correlations": {feature: value for feature, value in correlations},
    "lexical_vs_distribution_corr": float(corr_kl),
}

with open('../results/exp6_paraphrase_results.json', 'w') as f:
    json.dump(save_data, f, indent=2)

print("Results saved.")