# Experiment: Sampling Consistency Analysis

**Goal:** Analyze how system prompts affect behavioral consistency across repeated nucleus sampling.

**Key Questions:**
- Do certain system prompts lead to more consistent/deterministic outputs?
- How does output variance change with different instruction types?
- Is there a correlation between entropy and actual output diversity?
- Do some prompts stabilize behavior while others introduce more randomness?

**Metrics:**
- Output token overlap across samples
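- Semantic similarity (via embeddings) — not computed in this notebook; bigram overlap and common-prefix similarity serve as lexical proxies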
- Length variance
- First token consistency
- Response structure consistency

In [None]:
# Environment bootstrap: make the project's `src` package importable from this notebook.
import sys, os
if 'google.colab' in sys.modules:
    # Running inside Colab: always start from a fresh checkout of the repository.
    import shutil
    if os.path.exists('/content/LLM-Instruction-Understanding'):
        # NOTE: this deletes the whole existing clone, including anything written inside it.
        shutil.rmtree('/content/LLM-Instruction-Understanding')
    !git clone https://github.com/maralkh/LLM-Instruction-Understanding.git
    # All later relative paths (e.g. '../results') resolve against this working directory.
    os.chdir('/content/LLM-Instruction-Understanding')
    !pip install -q -r requirements.txt
    sys.path.insert(0, '/content/LLM-Instruction-Understanding')
else:
    # Not in Colab: put the parent of the current working directory on sys.path
    # (presumably the repository root when the notebook is launched from its own folder — verify).
    sys.path.insert(0, os.path.abspath('..'))

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
from scipy import stats
from collections import Counter
import torch
import torch.nn.functional as F
from typing import List, Dict, Tuple

from src.model_utils import load_model
from src.test_configs import get_all_test_prompts, get_core_system_prompts, get_system_prompts, build_chat_prompt

plt.style.use('seaborn-v0_8-whitegrid')

In [None]:
# Load the model wrapper used by every later cell (`model.tokenizer`, `model.model`,
# and `model.config.device` are read by generate_samples below).
model = load_model("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
# Print a short summary of the loaded architecture as a sanity check.
layer_info = model.get_layer_info()
print(f"Model: {layer_info['model_name']}")
print(f"Layers: {layer_info['n_layers']}, Heads: {layer_info['n_heads']}")

## 1. Nucleus Sampling with Multiple Generations

In [None]:
def generate_samples(
    model, 
    prompt: str, 
    n_samples: int = 10,
    max_new_tokens: int = 50,
    top_p: float = 0.9,
    temperature: float = 1.0,
    top_k: int = 50
) -> List[Dict]:
    """
    Draw ``n_samples`` independent completions of ``prompt`` with nucleus sampling.

    Args:
        model: Project model wrapper exposing ``tokenizer``, ``model`` (with a
            ``generate`` method) and ``config.device``.
        prompt: Fully formatted prompt string; it is tokenized as-is.
        n_samples: Number of independent ``generate`` calls to make.
        max_new_tokens: Upper bound on generated tokens per sample.
        top_p: Nucleus (cumulative probability) cutoff.
        temperature: Sampling temperature.
        top_k: Top-k cutoff applied together with ``top_p``.

    Returns:
        One dict per sample with keys ``text`` (decoded completion without
        special tokens), ``tokens`` (each generated id decoded on its own),
        ``token_ids``, ``length`` (number of generated ids), ``first_token``
        (``''`` if nothing was generated), ``mean_entropy`` and ``entropies``
        (one value per generation step).
    """
    tokenizer = model.tokenizer
    inputs = tokenizer(prompt, return_tensors="pt").to(model.config.device)
    prompt_length = inputs.input_ids.shape[1]

    # Some tokenizers define no pad token; fall back to EOS so `generate`
    # always receives an explicit, valid pad id.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = getattr(tokenizer, "eos_token_id", None)

    samples = []
    for _ in range(n_samples):
        with torch.no_grad():
            outputs = model.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                top_p=top_p,
                top_k=top_k,
                temperature=temperature,
                pad_token_id=pad_token_id,
                return_dict_in_generate=True,
                output_scores=True
            )

        # Keep only the continuation; the prompt occupies the first prompt_length positions.
        token_ids = outputs.sequences[0, prompt_length:].cpu().tolist()
        generated_text = tokenizer.decode(token_ids, skip_special_tokens=True)
        # Per-token strings are decoded one id at a time and keep special tokens.
        generated_tokens = [tokenizer.decode([token_id]) for token_id in token_ids]

        # Per-step entropy of the softmax over the returned scores.
        # NOTE(review): `scores` are presumably the *processed* logits (after
        # temperature / top-k / top-p), so this is the entropy of the truncated
        # sampling distribution, not of the raw model distribution — confirm
        # against the installed transformers version.
        entropies = []
        for step_scores in outputs.scores:
            probs = F.softmax(step_scores[0].float(), dim=-1)
            # The epsilon keeps log() finite where probs is exactly 0.
            ent = -torch.sum(probs * torch.log(probs + 1e-10)).item()
            entropies.append(ent if np.isfinite(ent) else 0.0)

        samples.append({
            'text': generated_text,
            'tokens': generated_tokens,
            'token_ids': token_ids,
            'length': len(token_ids),
            'first_token': generated_tokens[0] if generated_tokens else '',
            'mean_entropy': np.mean(entropies) if entropies else 0.0,
            'entropies': entropies
        })

    return samples

In [None]:
def compute_sample_consistency(samples: List[Dict]) -> Dict:
    """
    Compute consistency metrics across multiple samples of the same prompt.

    Args:
        samples: Dicts as produced by ``generate_samples``; the keys read here
            are ``first_token``, ``length``, ``token_ids``, ``text`` and
            ``mean_entropy``.

    Returns:
        ``{}`` for an empty input, otherwise a dict with:
        ``first_token_consistency`` (share of samples starting with the most
        common first token), ``first_token_entropy`` (Shannon entropy in nats,
        never negative), ``length_mean`` / ``length_std`` / ``length_cv``,
        ``token_jaccard`` and ``bigram_overlap`` (mean pairwise Jaccard over
        token-id sets and bigram sets), ``unique_ratio`` (distinct texts / n),
        ``mean_entropy`` (mean of per-sample mean entropies),
        ``prefix_similarity`` (mean pairwise common-prefix ratio of the texts)
        and ``n_samples``. Pairwise metrics are 0 when there are no pairs.
    """
    # Local import so the function does not depend on the notebook's import cell.
    from itertools import combinations

    n = len(samples)
    if n == 0:
        return {}

    def get_ngrams(tokens, n=2):
        """Return the set of contiguous n-grams (as tuples) in ``tokens``."""
        return set(tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1))

    def common_prefix_ratio(s1, s2):
        """Length of the shared character prefix divided by the longer length."""
        if not s1 or not s2:
            return 0.0
        prefix_len = 0
        for c1, c2 in zip(s1, s2):
            if c1 != c2:
                break
            prefix_len += 1
        return prefix_len / max(len(s1), len(s2))

    # 1. First token consistency
    first_token_counts = Counter(s['first_token'] for s in samples)
    most_common_first = first_token_counts.most_common(1)[0][1] / n
    # Counter values are always >= 1, so log() needs no epsilon; adding one
    # would push the entropy slightly below zero when all first tokens agree.
    first_token_entropy = float(
        sum(-(c / n) * np.log(c / n) for c in first_token_counts.values())
    )

    # 2. Length consistency
    lengths = [s['length'] for s in samples]
    length_mean = np.mean(lengths)
    length_std = np.std(lengths)
    length_cv = length_std / (length_mean + 1e-10)  # Coefficient of variation

    # Build per-sample sets once instead of once per pair.
    token_sets = [set(s['token_ids']) for s in samples]
    bigram_sets = [get_ngrams(s['token_ids'], 2) for s in samples]
    texts = [s['text'] for s in samples]

    # 3./4./7. Pairwise metrics over every unordered pair of samples
    jaccard_scores = []
    bigram_overlaps = []
    prefix_similarities = []
    for i, j in combinations(range(n), 2):
        token_union = token_sets[i] | token_sets[j]
        # Two empty token sets score 0 (same convention as the original code).
        jaccard_scores.append(
            len(token_sets[i] & token_sets[j]) / len(token_union) if token_union else 0
        )

        # Pairs with no bigrams at all (both samples shorter than 2 tokens)
        # are left out of the bigram average rather than counted as 0.
        bigram_union = bigram_sets[i] | bigram_sets[j]
        if bigram_union:
            bigram_overlaps.append(
                len(bigram_sets[i] & bigram_sets[j]) / len(bigram_union)
            )

        prefix_similarities.append(common_prefix_ratio(texts[i], texts[j]))

    mean_jaccard = np.mean(jaccard_scores) if jaccard_scores else 0
    mean_bigram_overlap = np.mean(bigram_overlaps) if bigram_overlaps else 0
    mean_prefix_sim = np.mean(prefix_similarities) if prefix_similarities else 0

    # 5. Unique outputs ratio
    unique_ratio = len(set(texts)) / n

    # 6. Mean generation entropy
    mean_entropy = np.mean([s['mean_entropy'] for s in samples])

    return {
        'first_token_consistency': most_common_first,
        'first_token_entropy': first_token_entropy,
        'length_mean': length_mean,
        'length_std': length_std,
        'length_cv': length_cv,
        'token_jaccard': mean_jaccard,
        'bigram_overlap': mean_bigram_overlap,
        'unique_ratio': unique_ratio,
        'mean_entropy': mean_entropy,
        'prefix_similarity': mean_prefix_sim,
        'n_samples': n
    }

In [None]:
# Smoke test: sample one question a few times with an empty system prompt
test_prompt = "What is the capital of France?"
full_prompt = build_chat_prompt("", test_prompt, model.tokenizer)

print("Generating 5 samples...")
samples = generate_samples(model, full_prompt, n_samples=5, max_new_tokens=30)

print("\n=== Sample Outputs ===")
for number, sample in enumerate(samples, start=1):
    text = sample['text']
    # Long completions are cut to 100 characters and marked with an ellipsis.
    shown = f"{text[:100]}..." if len(text) > 100 else text
    print(f"\n[{number}] {shown}")

consistency = compute_sample_consistency(samples)
print("\n=== Consistency Metrics ===")
for name, value in consistency.items():
    # Floats get four decimals; everything else (e.g. n_samples) prints as-is.
    if isinstance(value, float):
        print(f"  {name}: {value:.4f}")
    else:
        print(f"  {name}: {value}")

## 2. Compare Consistency Across System Prompts

In [None]:
# Sampling configuration for the system-prompt comparison
N_SAMPLES = 10        # generations drawn per (test prompt, system prompt) pair
MAX_NEW_TOKENS = 40   # cap on generated tokens per sample
TOP_P = 0.9           # nucleus cutoff
TEMPERATURE = 1.0     # sampling temperature

# Only the first five test prompts are used.
test_prompts = get_all_test_prompts()[:5]
system_prompts = get_core_system_prompts()

n_tests, n_systems = len(test_prompts), len(system_prompts)
print(f"Testing {n_tests} prompts × {n_systems} system prompts × {N_SAMPLES} samples")
print(f"Total generations: {n_tests * n_systems * N_SAMPLES}")

In [None]:
# One row per (test prompt, system prompt) pair, plus one row per raw sample.
all_consistency_results = []
all_samples_data = []

for test in tqdm(test_prompts, desc="Test prompts"):
    for sys_name, sys_info in system_prompts.items():
        # Best effort: a failure for one pair is reported and the sweep continues.
        try:
            full_prompt = build_chat_prompt(sys_info['text'], test['prompt'], model.tokenizer)

            samples = generate_samples(
                model,
                full_prompt,
                n_samples=N_SAMPLES,
                max_new_tokens=MAX_NEW_TOKENS,
                top_p=TOP_P,
                temperature=TEMPERATURE,
            )

            # Tag the aggregated metrics with the identifiers of this pair.
            consistency = compute_sample_consistency(samples)
            consistency['test_id'] = test['id']
            consistency['category'] = test['category']
            consistency['system_prompt'] = sys_name
            all_consistency_results.append(consistency)

            # Keep a slim record of every individual sample for later analysis.
            all_samples_data.extend(
                {
                    'test_id': test['id'],
                    'system_prompt': sys_name,
                    'sample_idx': sample_idx,
                    'text': sample['text'],
                    'length': sample['length'],
                    'first_token': sample['first_token'],
                    'mean_entropy': sample['mean_entropy'],
                }
                for sample_idx, sample in enumerate(samples)
            )

        except Exception as e:
            print(f"Error for {test['id']}/{sys_name}: {e}")

consistency_df = pd.DataFrame(all_consistency_results)
samples_df = pd.DataFrame(all_samples_data)
print(f"\nCollected {len(consistency_df)} consistency measurements")
print(f"Stored {len(samples_df)} individual samples")

In [None]:
# Summary by system prompt: average each metric over all test prompts
summary_metrics = [
    'first_token_consistency',
    'token_jaccard',
    'bigram_overlap',
    'unique_ratio',
    'length_cv',
    'mean_entropy',
]
sys_consistency = consistency_df.groupby('system_prompt')[summary_metrics].mean().round(4)

# Sort by overall consistency (higher jaccard = more consistent)
sys_consistency = sys_consistency.sort_values('token_jaccard', ascending=False)

print("=== Consistency by System Prompt ===")
# mean_entropy is also a lower-is-more-consistent metric, so it belongs in the legend.
print("(Higher values = more consistent, except unique_ratio, length_cv and mean_entropy)")
print(sys_consistency)

In [None]:
# Visualize consistency metrics: one horizontal bar chart per metric.
# The results directory is only created by the final "save" cell, so create it
# here too; otherwise savefig fails on a fresh run.
os.makedirs('../results', exist_ok=True)

fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# (column, panel title, whether a higher value means more consistent)
metrics = [
    ('first_token_consistency', 'First Token Consistency', True),
    ('token_jaccard', 'Token Jaccard Similarity', True),
    ('bigram_overlap', 'Bigram Overlap', True),
    ('unique_ratio', 'Unique Output Ratio', False),  # Lower = more consistent
    ('length_cv', 'Length Coef. of Variation', False),  # Lower = more consistent
    ('mean_entropy', 'Mean Generation Entropy', False)  # Lower = more confident
]

for ax, (metric, title, higher_better) in zip(axes.flatten(), metrics):
    # Order every panel from most to least consistent system prompt.
    data = sys_consistency[metric].sort_values(ascending=not higher_better)
    # Because the data is always best-first, the gradient always runs
    # green -> red. (Previously the colour flip cancelled the sort flip, so the
    # best bar was red for higher-is-better metrics.)
    colors = plt.cm.RdYlGn(np.linspace(0.8, 0.2, len(data)))

    positions = range(len(data))
    ax.barh(positions, data.values, color=colors)
    ax.set_yticks(positions)
    ax.set_yticklabels(data.index)
    ax.set_xlabel(metric)
    ax.set_title(title)

plt.tight_layout()
plt.savefig('../results/sampling_consistency_metrics.png', dpi=150)
plt.show()

## 3. Consistency vs Entropy Correlation

In [None]:
# Is higher entropy associated with lower consistency?
# One scatter panel and one Pearson correlation per consistency metric.
# The results directory is only created by the final "save" cell, so create it
# here too; otherwise savefig fails on a fresh run.
os.makedirs('../results', exist_ok=True)

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# (column, y-axis label, panel title, marker colour; None = matplotlib default)
panels = [
    ('token_jaccard', 'Token Jaccard Similarity', 'Entropy vs Token Overlap', None),
    ('unique_ratio', 'Unique Output Ratio', 'Entropy vs Output Diversity', 'orange'),
    ('first_token_consistency', 'First Token Consistency', 'Entropy vs First Token', 'green'),
]

correlations = []
for ax, (column, ylabel, title, color) in zip(axes, panels):
    ax.scatter(consistency_df['mean_entropy'], consistency_df[column], alpha=0.6, color=color)
    r_value, p_value = stats.pearsonr(consistency_df['mean_entropy'], consistency_df[column])
    correlations.append((r_value, p_value))
    ax.set_xlabel('Mean Generation Entropy')
    ax.set_ylabel(ylabel)
    ax.set_title(f'{title}\nr={r_value:.3f}, p={p_value:.3f}')

# Later cells (key findings, JSON export) read these names.
(corr1, p1), (corr2, p2), (corr3, p3) = correlations

plt.tight_layout()
plt.savefig('../results/entropy_vs_consistency.png', dpi=150)
plt.show()

print("\n=== Correlation Summary ===")
print(f"Entropy vs Token Jaccard: r={corr1:.3f} (p={p1:.3f})")
print(f"Entropy vs Unique Ratio: r={corr2:.3f} (p={p2:.3f})")
print(f"Entropy vs First Token: r={corr3:.3f} (p={p3:.3f})")

## 4. Consistency by Task Category

In [None]:
# Average the main consistency metrics per task category,
# most consistent category (highest token Jaccard) first.
category_metrics = ['first_token_consistency', 'token_jaccard', 'unique_ratio', 'mean_entropy']
category_means = consistency_df.groupby('category')[category_metrics].mean()
cat_consistency = category_means.round(4).sort_values('token_jaccard', ascending=False)

print("=== Consistency by Task Category ===")
print(cat_consistency)

In [None]:
# Heatmap: System Prompt × Category for Token Jaccard
# Each cell is the mean token Jaccard over all measurements in that combination.
pivot = consistency_df.pivot_table(
    values='token_jaccard', 
    index='system_prompt', 
    columns='category', 
    aggfunc='mean'
)

fig, ax = plt.subplots(figsize=(12, 6))
# Jaccard lies in [0, 1]; fixing the colour range keeps runs visually comparable.
sns.heatmap(pivot, annot=True, fmt='.3f', cmap='RdYlGn', ax=ax, vmin=0, vmax=1)
ax.set_title('Output Consistency (Token Jaccard) by System Prompt × Category')
plt.tight_layout()
# The results directory is only created by the final "save" cell, so create it
# here too; otherwise savefig fails on a fresh run.
os.makedirs('../results', exist_ok=True)
plt.savefig('../results/consistency_heatmap.png', dpi=150)
plt.show()

## 5. Temperature Sensitivity Analysis

In [None]:
# How does temperature affect consistency for different system prompts?
# Only the first test prompt is used; everything except temperature is held fixed.
temperatures = [0.5, 0.7, 1.0, 1.2]
test_prompt = test_prompts[0]
selected_systems = ['none', 'concise', 'cot', 'expert']

# Say which requested system prompts are unavailable instead of dropping them silently.
missing_systems = [name for name in selected_systems if name not in system_prompts]
if missing_systems:
    print(f"Skipping unknown system prompts: {missing_systems}")

temp_results = []

for temp in tqdm(temperatures, desc="Temperatures"):
    for sys_name in selected_systems:
        if sys_name not in system_prompts:
            continue
        sys_info = system_prompts[sys_name]
        full_prompt = build_chat_prompt(sys_info['text'], test_prompt['prompt'], model.tokenizer)

        samples = generate_samples(
            model, full_prompt,
            n_samples=8,
            max_new_tokens=30,
            # Use the configured nucleus cutoff so this sweep stays comparable
            # with the main experiment if TOP_P is changed.
            top_p=TOP_P,
            temperature=temp
        )

        consistency = compute_sample_consistency(samples)
        consistency['temperature'] = temp
        consistency['system_prompt'] = sys_name
        temp_results.append(consistency)

temp_df = pd.DataFrame(temp_results)
print(f"Collected {len(temp_df)} temperature measurements")

In [None]:
# Visualize temperature effects: one line per system prompt, one panel per metric.
# The results directory is only created by the final "save" cell, so create it
# here too; otherwise savefig fails on a fresh run.
os.makedirs('../results', exist_ok=True)

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for ax, metric in zip(axes, ['token_jaccard', 'unique_ratio', 'first_token_consistency']):
    for sys_name in selected_systems:
        data = temp_df[temp_df['system_prompt'] == sys_name]
        # System prompts skipped during the sweep have no rows; plotting them
        # would only add an empty legend entry.
        if data.empty:
            continue
        ax.plot(data['temperature'], data[metric], 'o-', label=sys_name, linewidth=2, markersize=8)

    pretty_name = metric.replace('_', ' ').title()
    ax.set_xlabel('Temperature')
    ax.set_ylabel(pretty_name)
    ax.set_title(f'{pretty_name} vs Temperature')
    ax.legend()

plt.tight_layout()
plt.savefig('../results/temperature_sensitivity.png', dpi=150)
plt.show()

## 6. First Token Analysis

In [None]:
# Analyze first token distribution by system prompt
# One row per (system prompt, first token) with the number of samples that started with it.
first_token_analysis = samples_df.groupby(['system_prompt', 'first_token']).size().reset_index(name='count')

# Get top 5 first tokens per system prompt.
# This avoids groupby.apply: combined with reset_index(drop=True) it relied on
# the grouping column being passed into apply, which pandas has deprecated.
# Stable sorts keep ties in their original order, matching nlargest(keep='first').
top_first_tokens = (
    first_token_analysis
    .sort_values('count', ascending=False, kind='stable')
    .groupby('system_prompt', sort=False)
    .head(5)
    .sort_values('system_prompt', kind='stable')
    .reset_index(drop=True)
)

print("=== Top First Tokens by System Prompt ===")
for sys_name in system_prompts.keys():
    sys_data = top_first_tokens[top_first_tokens['system_prompt'] == sys_name]
    if len(sys_data) > 0:
        print(f"\n{sys_name}:")
        for _, row in sys_data.iterrows():
            print(f"  '{row['first_token']}': {row['count']}")

In [None]:
# Visualize first token entropy by system prompt
# Mean over test prompts, sorted so the most deterministic start comes first.
first_token_entropy = consistency_df.groupby('system_prompt')['first_token_entropy'].mean().sort_values()

fig, ax = plt.subplots(figsize=(10, 6))
# Reversed colormap: the lowest-entropy bars are drawn green, the highest red.
colors = plt.cm.RdYlGn_r(np.linspace(0.2, 0.8, len(first_token_entropy)))
ax.barh(range(len(first_token_entropy)), first_token_entropy.values, color=colors)
ax.set_yticks(range(len(first_token_entropy)))
ax.set_yticklabels(first_token_entropy.index)
ax.set_xlabel('First Token Entropy')
ax.set_title('First Token Variability by System Prompt\n(Lower = More Deterministic Start)')

plt.tight_layout()
# The results directory is only created by the final "save" cell, so create it
# here too; otherwise savefig fails on a fresh run.
os.makedirs('../results', exist_ok=True)
plt.savefig('../results/first_token_entropy.png', dpi=150)
plt.show()

## 7. Summary & Key Findings

In [None]:
# Print a textual summary of the results computed in the cells above.
print("="*70)
print("SAMPLING CONSISTENCY ANALYSIS - KEY FINDINGS")
print("="*70)

print("\n1. MOST CONSISTENT SYSTEM PROMPTS (highest token overlap):")
top_consistent = sys_consistency['token_jaccard'].nlargest(3)
for sys_name, score in top_consistent.items():
    print(f"   - {sys_name}: Jaccard={score:.4f}")

print("\n2. MOST VARIABLE SYSTEM PROMPTS (highest unique ratio):")
top_variable = sys_consistency['unique_ratio'].nlargest(3)
for sys_name, score in top_variable.items():
    print(f"   - {sys_name}: Unique={score:.4f}")

print("\n3. ENTROPY-CONSISTENCY CORRELATION:")
print(f"   - Entropy vs Token Overlap: r={corr1:.3f}")
# Only report a direction when the correlation is defined and significant at
# the conventional 5% level. Previously any negative r, however small, was
# reported as an effect and any positive r as "Weak relationship".
if not np.isfinite(corr1):
    interpretation = "Correlation undefined (constant or insufficient data)"
elif p1 >= 0.05:
    interpretation = "Weak relationship"
elif corr1 < 0:
    interpretation = "Higher entropy leads to LESS consistent outputs"
else:
    interpretation = "Higher entropy is associated with MORE consistent outputs"
print(f"   - Interpretation: {interpretation}")

print("\n4. CATEGORY INSIGHTS:")
most_consistent_cat = cat_consistency['token_jaccard'].idxmax()
least_consistent_cat = cat_consistency['token_jaccard'].idxmin()
print(f"   - Most consistent category: {most_consistent_cat}")
print(f"   - Most variable category: {least_consistent_cat}")

# NOTE(review): the statements below are fixed text, not derived from this
# run's data; check them against the temperature plot before quoting them.
print("\n5. PRACTICAL IMPLICATIONS:")
print("   - For reproducible outputs: Use system prompts with high first-token consistency")
print("   - For creative diversity: Use system prompts with high unique ratio")
print("   - Lower temperature reduces variance for all system prompts")

In [None]:
# Persist the aggregated metrics (JSON) and the raw tables (CSV)
import json
os.makedirs('../results', exist_ok=True)

# Pearson r / p-value pairs from the entropy-vs-consistency analysis
entropy_correlations = {
    'entropy_vs_jaccard': {'r': corr1, 'p': p1},
    'entropy_vs_unique': {'r': corr2, 'p': p2},
    'entropy_vs_first_token': {'r': corr3, 'p': p3},
}

# Sampling settings of the main experiment, stored for reproducibility
run_config = {
    'n_samples': N_SAMPLES,
    'max_new_tokens': MAX_NEW_TOKENS,
    'top_p': TOP_P,
    'temperature': TEMPERATURE,
}

results = {
    'system_consistency': sys_consistency.to_dict(),
    'category_consistency': cat_consistency.to_dict(),
    'entropy_correlations': entropy_correlations,
    'config': run_config,
}

with open('../results/sampling_consistency.json', 'w') as f:
    # default=float converts values json cannot serialise natively.
    json.dump(results, f, indent=2, default=float)

consistency_df.to_csv('../results/sampling_consistency_full.csv', index=False)
samples_df.to_csv('../results/all_samples.csv', index=False)

print("Results saved to ../results/")