# Experiment 18: Truncation vs Full-Context Comparison

**Date:** 2026-02-04

## Critical Question

Experiments 15-17 found that "random beats oracle" - but they used **full-context** priming
(prefix stays visible to query). This conflates two mechanisms:

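1. **Value contamination**: The document's cached key/value states are computed while attending to the prefix, so they differ from the states a bare document would produce
2. **Attention interference**: When the prefix stays visible, it competes with the document for the query's attention weight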

**This experiment** tests both **TRUNCATION** (prefix removed after building cache) and
**FULL-CONTEXT** (prefix stays visible) to isolate these mechanisms.

## Hypothesis

- With **truncation**, semantic prefixes (oracle) should help (Exps 05-14 showed d=0.15-0.25)
- With **full-context**, semantic prefixes may hurt due to interference (Exps 15-17)
- The "random beats oracle" finding may be specific to full-context, not truncation

## Experimental Conditions

| Condition | Build | Score | What it tests |
|-----------|-------|-------|---------------|
| `bare` | `[document]` | Query sees document | Baseline |
| `oracle_5x_truncated` | `[oracle×5][doc]` → truncate → RoPE | Query sees only doc | Value contamination (semantic) |
| `random_5x_truncated` | `[random×5][doc]` → truncate → RoPE | Query sees only doc | Value contamination (structural) |
| `oracle_5x_fullctx` | `[oracle×5][doc]` | Query sees prefix + doc | Contamination + attention |
| `random_5x_fullctx` | `[random×5][doc]` | Query sees prefix + doc | Contamination + attention |

In [1]:
# Cell 1: Setup and Imports
import os
os.umask(0o000)  # Fix permissions for two-user environment

import sys
sys.path.insert(0, '/home/jupyter/research/directed_kvcache')

import json
import random
import numpy as np
import torch
from tqdm.auto import tqdm
from collections import defaultdict
from typing import Dict, List, Tuple, Any, Optional

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, DynamicCache
from datasets import load_dataset
from sentence_transformers import SentenceTransformer

# Library imports
from lib.kv_cache import (
    build_kv_cache,
    build_cache_with_mask,
    extract_and_truncate_cache_with_bos,
    correct_rope_positions_with_bos,
    score_answer_with_cache,
    deepcopy_cache,
)
from lib.config import ExperimentConfig
from lib.analysis import compute_ranking_metrics

# Pin every RNG this notebook touches (stdlib, NumPy, PyTorch) to one seed
# so that sample selection and random prefixes are repeatable across runs.
SEED = 42
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(SEED)

# Record the runtime environment next to the results.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

PyTorch version: 2.10.0+cu128
CUDA available: True


In [2]:
# Cell 2: Load Model and Tokenizer
# Checkpoint used for every condition in this experiment.
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"

# Quantization settings: 4-bit NF4 weights with double quantization,
# bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# NOTE(review): the recorded output of this cell warns that `torch_dtype` is
# deprecated in favour of `dtype`; switch the keyword once the minimum
# supported transformers version is confirmed to accept it.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
# Inference only: put the module in eval mode.
model.eval()

# Shared experiment config handed to the scoring helper from lib.kv_cache.
config = ExperimentConfig(
    model_name=MODEL_NAME,
    device=model.device,
    seed=SEED
)

print(f"Model loaded on {model.device}")
print(f"Model dtype: {model.dtype}")

`torch_dtype` is deprecated! Use `dtype` instead!


Loading weights:   0%|          | 0/291 [00:00<?, ?it/s]

Model loaded on cuda:0
Model dtype: torch.bfloat16


In [3]:
# Cell 3: Load Data
# MS MARCO v1.1 (train split) is the evaluation corpus for this experiment.
marco = load_dataset("ms_marco", "v1.1", split="train")


def _has_answer(sample):
    """True when the sample's first answer exists and is non-empty."""
    answers = sample['answers']
    return bool(answers) and bool(answers[0]) and len(answers[0]) > 0


# Keep only samples that can actually be scored against an answer.
marco_with_answers = list(filter(_has_answer, marco))
print(f"MS MARCO samples with answers: {len(marco_with_answers)}")

# Shuffle in place (seeded global RNG), then take the first N_SAMPLES.
N_SAMPLES = 500
random.shuffle(marco_with_answers)
samples = marco_with_answers[:N_SAMPLES]

print(f"Selected {len(samples)} samples for experiment")

MS MARCO samples with answers: 80142
Selected 500 samples for experiment


In [4]:
# Cell 4: Load Embedding Model for Distractor Selection
# Sentence-embedding model; per the cell title it is meant for distractor selection.
embed_model = SentenceTransformer('all-MiniLM-L6-v2')

# Pre-compute one embedding per sampled query (same order as `samples`).
# NOTE(review): `query_embeddings` is not referenced again in the cells visible
# here -- the random prefixes are drawn uniformly by get_random_queries().
# Confirm whether this cell is still needed.
print("Computing query embeddings...")
all_queries = [s['query'] for s in samples]
query_embeddings = embed_model.encode(all_queries, show_progress_bar=True)
print(f"Computed {len(query_embeddings)} query embeddings")

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Computing query embeddings...


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Computed 500 query embeddings


In [5]:
# Cell 5: Helper Functions

def build_bare_cache(passage: str) -> Tuple[DynamicCache, int]:
    """Encode the passage on its own and return its KV cache.

    Returns:
        (cache, n_tokens) where n_tokens is the encoded sequence length,
        special tokens included.
    """
    encoded = tokenizer.encode(passage, return_tensors='pt', add_special_tokens=True)
    input_ids = encoded.to(model.device)
    with torch.no_grad():
        outputs = model(input_ids, use_cache=True)
    n_tokens = input_ids.shape[1]
    return outputs.past_key_values, n_tokens


def build_primed_cache_fullcontext(prefix: str, passage: str) -> Tuple[DynamicCache, int]:
    """Encode "<prefix> <passage>" as one sequence and return its full KV cache.

    Nothing is truncated, so the prefix entries remain in the returned cache.

    Returns:
        (cache, n_tokens) where n_tokens is the encoded sequence length,
        special tokens included.
    """
    combined = " ".join((prefix, passage))
    encoded = tokenizer.encode(combined, return_tensors='pt', add_special_tokens=True)
    input_ids = encoded.to(model.device)
    with torch.no_grad():
        outputs = model(input_ids, use_cache=True)
    n_tokens = input_ids.shape[1]
    return outputs.past_key_values, n_tokens


def build_primed_cache_truncated(prefix: str, passage: str) -> Tuple[DynamicCache, int]:
    """
    Build truncated cache: [prefix][passage] -> truncate prefix -> RoPE correct.
    Returns cache containing only document tokens (with BOS).

    Args:
        prefix: Priming text placed before the passage during the forward pass.
        passage: Document text whose cache entries are kept.

    Returns:
        (truncated_cache, keep_len) where keep_len = 1 + doc_len
        (BOS plus the document tokens).
    """
    # Tokenize prefix to get its length
    # NOTE(review): prefix_len is measured by tokenizing `prefix + " "` on its
    # own. That assumes the tokenizer splits the concatenated text at exactly
    # the same boundary. Subword tokenizers may merge the trailing space with
    # the first passage token, which would make prefix_len/doc_len off by one.
    # Consider asserting that full_ids[:, :prefix_len] equals prefix_ids.
    prefix_with_sep = prefix + " "
    prefix_ids = tokenizer.encode(prefix_with_sep, return_tensors='pt', add_special_tokens=True)
    prefix_len = prefix_ids.shape[1]  # includes BOS
    
    # Tokenize full text
    full_text = prefix_with_sep + passage
    full_ids = tokenizer.encode(full_text, return_tensors='pt', add_special_tokens=True).to(model.device)
    full_len = full_ids.shape[1]
    doc_len = full_len - prefix_len  # document tokens (without BOS)
    
    # Build full cache
    with torch.no_grad():
        out = model(full_ids, use_cache=True)
    
    # Truncate: keep BOS + document portion
    # (presumably the helper keeps the first entry plus the last doc_len
    # entries -- TODO confirm against lib.kv_cache)
    truncated_cache = extract_and_truncate_cache_with_bos(out.past_key_values, doc_len)
    
    # RoPE correction: shift document keys back by (prefix_len - 1) positions
    # (BOS stays at position 0, document tokens shift from prefix_len..full_len-1 to 1..doc_len)
    # The helper's return value is not used here, so it appears to modify
    # truncated_cache in place -- TODO confirm.
    surrogate_offset = prefix_len - 1
    correct_rope_positions_with_bos(truncated_cache, surrogate_offset, model)
    
    keep_len = 1 + doc_len  # BOS + document
    return truncated_cache, keep_len


def score_with_cache(cache: DynamicCache, cache_len: int, query: str, answer: str) -> float:
    """Score the answer given a prebuilt cache and the query prompt.

    Wraps the query in the fixed "Query: ... Answer:" template, prepends a
    space to the answer, and delegates to score_answer_with_cache (the rest of
    this notebook treats the returned value as an NLL, lower is better).
    """
    query_prompt = f"\n\nQuery: {query}\nAnswer:"
    answer_text = " " + answer
    score = score_answer_with_cache(
        cache, cache_len, query_prompt, answer_text, model, tokenizer, config
    )
    return score


def get_random_queries(n: int, exclude_idx: int) -> List[str]:
    """Draw up to n distinct sample queries, never the one at exclude_idx.

    Sampling is without replacement over sample positions and uses the global
    `random` state, so results depend on the current seed/RNG position.
    """
    candidates = [pos for pos, _ in enumerate(samples) if pos != exclude_idx]
    how_many = min(n, len(candidates))
    picked = random.sample(candidates, how_many)
    return [samples[pos]['query'] for pos in picked]


def make_prefix_5x(query: str) -> str:
    """Return the query repeated five times, separated by single spaces."""
    repeats = 5
    return " ".join(query for _ in range(repeats))


print("Helper functions defined.")

Helper functions defined.


In [6]:
# Cell 6: Explain Experimental Conditions (Documentation)
# Documentation-only cell: prints a human-readable description of the five
# conditions. Nothing here touches the model or the data.
print("="*70)
print("EXPERIMENTAL CONDITIONS EXPLAINED")
print("="*70)

# Illustrative strings used only in the printed explanation below.
example_query = "What is the capital of France?"
example_passage = "Paris is the capital and largest city of France. It is located on the Seine River."
example_random = "How to train a dog? What causes earthquakes? Best pizza recipe?"

print("\n### BASELINE: bare ###")
print("Build:  [BOS][passage]")
print("Score:  Query attends to [BOS][passage]")
print(f"Cache:  '{example_passage[:50]}...'")
print("Tests:  Baseline performance with no priming")

print("\n" + "-"*70)

print("\n### TRUNCATED CONDITIONS ###")
print("These test PURE VALUE CONTAMINATION (prefix removed before scoring)")

print("\n## oracle_5x_truncated ##")
print("Build:  [BOS][oracle oracle oracle oracle oracle][passage]")
print("        -> Forward pass (document values influenced by oracle)")
print("        -> Truncate to [BOS][passage]")
print("        -> Apply RoPE correction to fix key positions")
print("Score:  Query attends to [BOS][passage] (oracle REMOVED)")
# The "' * 5" is inside the f-string, so it is printed literally as notation;
# the string is not actually repeated here.
print(f"Prefix: '{example_query} ' * 5")
print("Tests:  Does SEMANTIC value contamination help?")

print("\n## random_5x_truncated ##")
print("Build:  [BOS][rand1 rand2 rand3 rand4 rand5][passage]")
print("        -> Same truncation + RoPE process")
print("Score:  Query attends to [BOS][passage] (random REMOVED)")
print(f"Prefix: '{example_random[:40]}...'")
print("Tests:  Does STRUCTURAL value contamination help?")

print("\n" + "-"*70)

print("\n### FULL-CONTEXT CONDITIONS ###")
print("These test VALUE CONTAMINATION + ATTENTION INTERFERENCE (prefix visible)")

print("\n## oracle_5x_fullctx ##")
print("Build:  [BOS][oracle oracle oracle oracle oracle][passage]")
print("Score:  Query attends to [BOS][oracle×5][passage] (oracle VISIBLE)")
# As above, "' * 5" is printed literally.
print(f"Prefix: '{example_query} ' * 5")
print("Tests:  Contamination + interference. Oracle may compete with real query.")

print("\n## random_5x_fullctx ##")
print("Build:  [BOS][rand1 rand2 rand3 rand4 rand5][passage]")
print("Score:  Query attends to [BOS][random×5][passage] (random VISIBLE)")
print(f"Prefix: '{example_random[:40]}...'")
print("Tests:  Structural benefit without semantic interference.")

# Summary of the pairwise contrasts evaluated later in the notebook.
print("\n" + "="*70)
print("KEY COMPARISONS:")
print("- oracle_truncated vs random_truncated: Does semantic content help with pure contamination?")
print("- oracle_fullctx vs random_fullctx: Does interference hurt semantic prefixes?")
print("- truncated vs fullctx (same prefix): Does removing prefix help or hurt?")
print("="*70)

EXPERIMENTAL CONDITIONS EXPLAINED

### BASELINE: bare ###
Build:  [BOS][passage]
Score:  Query attends to [BOS][passage]
Cache:  'Paris is the capital and largest city of France. I...'
Tests:  Baseline performance with no priming

----------------------------------------------------------------------

### TRUNCATED CONDITIONS ###
These test PURE VALUE CONTAMINATION (prefix removed before scoring)

## oracle_5x_truncated ##
Build:  [BOS][oracle oracle oracle oracle oracle][passage]
        -> Forward pass (document values influenced by oracle)
        -> Truncate to [BOS][passage]
        -> Apply RoPE correction to fix key positions
Score:  Query attends to [BOS][passage] (oracle REMOVED)
Prefix: 'What is the capital of France? ' * 5
Tests:  Does SEMANTIC value contamination help?

## random_5x_truncated ##
Build:  [BOS][rand1 rand2 rand3 rand4 rand5][passage]
        -> Same truncation + RoPE process
Score:  Query attends to [BOS][passage] (random REMOVED)
Prefix: 'How to train a dog?

In [7]:
# Cell 7: Main Experiment Loop

results = []
OUTPUT_DIR = '/home/jupyter/research/directed_kvcache/results/exp18'
CHECKPOINT_PATH = f'{OUTPUT_DIR}/checkpoint.json'

# The checkpoint and final writes below fail if the directory is missing.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load checkpoint if exists
start_idx = 0
if os.path.exists(CHECKPOINT_PATH):
    with open(CHECKPOINT_PATH, 'r') as f:
        checkpoint = json.load(f)
    results = checkpoint.get('results', [])
    # Resume after the last processed *sample index*, not after len(results):
    # samples that were skipped (empty passage/answer) or raised produce no
    # result row, so the two can differ, and using len(results) would
    # re-process samples and append duplicate rows.
    if results:
        start_idx = results[-1]['idx'] + 1
    print(f"Resuming from checkpoint at sample {start_idx}")
    # NOTE(review): the global RNG state is not checkpointed, so the random
    # prefixes drawn after a resume differ from those of an uninterrupted run.

print(f"Processing {N_SAMPLES - start_idx} samples...")

for idx in tqdm(range(start_idx, N_SAMPLES)):
    sample = samples[idx]
    # NOTE(review): this takes the first listed passage, which is not
    # necessarily the passage the answer was written from -- confirm intended.
    passage = sample['passages']['passage_text'][0] if sample['passages']['passage_text'] else ""
    query = sample['query']
    answer = sample['answers'][0]

    # Nothing to score without both a document and a reference answer.
    if not passage or not answer:
        continue

    # Oracle prefix: the sample's own query x5. Random prefix: five other queries.
    oracle_prefix = make_prefix_5x(query)
    random_queries = get_random_queries(5, idx)
    random_prefix = " ".join(random_queries)

    try:
        # Build all caches
        bare_cache, bare_len = build_bare_cache(passage)
        oracle_trunc_cache, oracle_trunc_len = build_primed_cache_truncated(oracle_prefix, passage)
        random_trunc_cache, random_trunc_len = build_primed_cache_truncated(random_prefix, passage)
        oracle_full_cache, oracle_full_len = build_primed_cache_fullcontext(oracle_prefix, passage)
        random_full_cache, random_full_len = build_primed_cache_fullcontext(random_prefix, passage)

        # Score all conditions (deepcopy to avoid mutation)
        nll_bare = score_with_cache(deepcopy_cache(bare_cache), bare_len, query, answer)
        nll_oracle_trunc = score_with_cache(deepcopy_cache(oracle_trunc_cache), oracle_trunc_len, query, answer)
        nll_random_trunc = score_with_cache(deepcopy_cache(random_trunc_cache), random_trunc_len, query, answer)
        nll_oracle_full = score_with_cache(deepcopy_cache(oracle_full_cache), oracle_full_len, query, answer)
        nll_random_full = score_with_cache(deepcopy_cache(random_full_cache), random_full_len, query, answer)

        result = {
            'idx': idx,
            'query': query,
            'answer': answer,
            'passage_len': len(passage.split()),  # whitespace-delimited words, not tokens
            'nll_bare': nll_bare,
            'nll_oracle_truncated': nll_oracle_trunc,
            'nll_random_truncated': nll_random_trunc,
            'nll_oracle_fullctx': nll_oracle_full,
            'nll_random_fullctx': nll_random_full,
            # Deltas (positive = priming helped)
            'delta_oracle_truncated': nll_bare - nll_oracle_trunc,
            'delta_random_truncated': nll_bare - nll_random_trunc,
            'delta_oracle_fullctx': nll_bare - nll_oracle_full,
            'delta_random_fullctx': nll_bare - nll_random_full,
        }
        results.append(result)

        # Checkpoint every 50 samples. Write to a temp file and rename so a
        # crash mid-write cannot leave a truncated, unloadable checkpoint.
        if len(results) % 50 == 0:
            tmp_checkpoint_path = f'{CHECKPOINT_PATH}.tmp'
            with open(tmp_checkpoint_path, 'w') as f:
                json.dump({'results': results}, f)
            os.replace(tmp_checkpoint_path, CHECKPOINT_PATH)
            print(f"Checkpoint saved at {len(results)} samples")

    except Exception as e:
        # Best effort: report the failing sample and move on to the next one.
        print(f"Error at sample {idx}: {e}")
        continue

# Final save
with open(f'{OUTPUT_DIR}/results.json', 'w') as f:
    json.dump({'results': results}, f, indent=2)
print(f"Saved {len(results)} results to {OUTPUT_DIR}/results.json")

Processing 500 samples...


  0%|          | 0/500 [00:00<?, ?it/s]

Checkpoint saved at 50 samples
Checkpoint saved at 100 samples
Checkpoint saved at 150 samples
Checkpoint saved at 200 samples
Checkpoint saved at 250 samples
Checkpoint saved at 300 samples
Checkpoint saved at 350 samples
Checkpoint saved at 400 samples
Checkpoint saved at 450 samples
Checkpoint saved at 500 samples
Saved 500 results to /home/jupyter/research/directed_kvcache/results/exp18/results.json


In [8]:
# Cell 8: Analysis
from scipy import stats

def cohens_d(x):
    """Cohen's d for an array of paired differences.

    Computed as mean(x) / std(x), using the sample standard deviation
    (ddof=1) for both the zero-variance guard and the divisor.

    Args:
        x: Array-like of per-sample differences.

    Returns:
        The effect size as a Python float, or 0.0 when it is undefined
        (fewer than two values, zero variance, or a non-finite std).
    """
    diffs = np.asarray(x, dtype=float)
    # The sample std needs at least two observations.
    if diffs.size < 2:
        return 0.0
    sd = np.std(diffs, ddof=1)
    # `not sd > 0` also catches NaN, which a plain `sd <= 0` would miss.
    if not sd > 0:
        return 0.0
    return float(np.mean(diffs) / sd)

_banner = "=" * 70
_rule = "-" * 70

print(_banner)
print("EXPERIMENT 18 RESULTS: Truncation vs Full-Context")
print(_banner)

n = len(results)
print(f"\nSamples analyzed: {n}")


def _nll_column(key):
    """Collect one per-sample field from `results` into a NumPy array."""
    return np.array([row[key] for row in results])


def _print_condition_row(label, nll, delta):
    """Print one table row: mean/std NLL, win rate vs. bare, Cohen's d of the delta."""
    win_pct = np.mean(delta > 0) * 100
    print(f"{label:<25} {np.mean(nll):>12.4f} {np.std(nll):>10.4f} {win_pct:>9.1f}% {cohens_d(delta):>10.3f}")


# Per-condition NLL arrays, aligned by sample.
bare = _nll_column('nll_bare')
oracle_trunc = _nll_column('nll_oracle_truncated')
random_trunc = _nll_column('nll_random_truncated')
oracle_full = _nll_column('nll_oracle_fullctx')
random_full = _nll_column('nll_random_fullctx')

# Improvement over the bare baseline (positive = priming lowered the NLL).
delta_oracle_trunc = bare - oracle_trunc
delta_random_trunc = bare - random_trunc
delta_oracle_full = bare - oracle_full
delta_random_full = bare - random_full

print("\n" + _banner)
print("NLL BY CONDITION (lower is better)")
print(_banner)
print(f"{'Condition':<25} {'Mean NLL':>12} {'Std':>10} {'Win%':>10} {'Cohen d':>10}")
print(_rule)
print(f"{'bare (baseline)':<25} {np.mean(bare):>12.4f} {np.std(bare):>10.4f} {'--':>10} {'--':>10}")
print(_rule)
_print_condition_row('oracle_5x_truncated', oracle_trunc, delta_oracle_trunc)
_print_condition_row('random_5x_truncated', random_trunc, delta_random_trunc)
print(_rule)
_print_condition_row('oracle_5x_fullctx', oracle_full, delta_oracle_full)
_print_condition_row('random_5x_fullctx', random_full, delta_random_full)

EXPERIMENT 18 RESULTS: Truncation vs Full-Context

Samples analyzed: 500

NLL BY CONDITION (lower is better)
Condition                     Mean NLL        Std       Win%    Cohen d
----------------------------------------------------------------------
bare (baseline)                 2.2359     1.8192         --         --
----------------------------------------------------------------------
oracle_5x_truncated             2.2722     1.8597      41.4%     -0.105
random_5x_truncated             2.3750     1.9641      30.0%     -0.288
----------------------------------------------------------------------
oracle_5x_fullctx               2.4159     2.0258      31.6%     -0.293
random_5x_fullctx               2.4214     2.0210      30.8%     -0.315


In [9]:
# Cell 9: Key Comparisons

def _paired_comparison(heading, first, second, first_name, second_name):
    """Run and report a paired t-test between two aligned NLL arrays.

    Prints the share of samples where `first` has the lower NLL, the test
    statistic and p-value, and names the lower-mean condition when p < 0.05.

    Returns:
        (t_stat, p_value, first_win_pct)
    """
    t_stat, p_value = stats.ttest_rel(first, second)
    first_win_pct = np.mean(first < second) * 100
    print(f"\n{heading}")
    print(f"   {first_name} wins: {first_win_pct:.1f}%")
    print(f"   t={t_stat:.3f}, p={p_value:.4f}")
    if p_value < 0.05:
        better = first_name if np.mean(first) < np.mean(second) else second_name
        print(f"   -> {better} significantly better (p<0.05)")
    else:
        print(f"   -> No significant difference")
    return t_stat, p_value, first_win_pct


print("\n" + "="*70)
print("KEY COMPARISONS (paired t-tests)")
print("="*70)

# Comparison 1: oracle vs random prefix, both truncated.
t1, p1, oracle_wins_trunc = _paired_comparison(
    "1. TRUNCATED: Oracle vs Random", oracle_trunc, random_trunc, "Oracle", "Random"
)

# Comparison 2: oracle vs random prefix, both full-context.
t2, p2, oracle_wins_full = _paired_comparison(
    "2. FULL-CONTEXT: Oracle vs Random", oracle_full, random_full, "Oracle", "Random"
)

# Comparison 3: truncated vs full-context with the oracle prefix.
t3, p3, trunc_wins_oracle = _paired_comparison(
    "3. ORACLE: Truncated vs Full-Context", oracle_trunc, oracle_full, "Truncated", "Full-context"
)

# Comparison 4: truncated vs full-context with the random prefix.
t4, p4, trunc_wins_random = _paired_comparison(
    "4. RANDOM: Truncated vs Full-Context", random_trunc, random_full, "Truncated", "Full-context"
)


KEY COMPARISONS (paired t-tests)

1. TRUNCATED: Oracle vs Random
   Oracle wins: 61.8%
   t=-5.309, p=0.0000
   -> Oracle significantly better (p<0.05)

2. FULL-CONTEXT: Oracle vs Random
   Oracle wins: 50.0%
   t=-0.206, p=0.8371
   -> No significant difference

3. ORACLE: Truncated vs Full-Context
   Truncated wins: 62.6%
   t=-7.089, p=0.0000
   -> Truncated significantly better (p<0.05)

4. RANDOM: Truncated vs Full-Context
   Truncated wins: 48.2%
   t=-2.956, p=0.0033
   -> Truncated significantly better (p<0.05)


In [10]:
# Cell 10: Interpretation

# Turns the paired-test results (p1..p4 and the win percentages) into
# plain-language answers. Each branch keys on significance at p < 0.05 plus the
# direction of the mean NLL difference.
print("\n" + "="*70)
print("INTERPRETATION")
print("="*70)

print("\n## Key Questions Answered:\n")

# Q1: Does truncated oracle beat truncated random?
# NOTE(review): these branches compare oracle against random only. A "YES"
# here does not show that either prefix improves on the bare baseline.
print("Q1: Does TRUNCATED oracle beat TRUNCATED random?")
print("    (Tests: Does semantic content help with pure value contamination?)")
if p1 < 0.05 and np.mean(oracle_trunc) < np.mean(random_trunc):
    print(f"    ANSWER: YES - Oracle wins {oracle_wins_trunc:.1f}% of the time (p={p1:.4f})")
    print("    -> Semantic value contamination provides benefit beyond structural.")
elif p1 < 0.05 and np.mean(oracle_trunc) > np.mean(random_trunc):
    # 100 - oracle win% counts exact ties as random wins.
    print(f"    ANSWER: NO - Random wins {100-oracle_wins_trunc:.1f}% of the time (p={p1:.4f})")
    print("    -> Even with truncation, random is better. Value contamination is structural.")
else:
    print(f"    ANSWER: NO DIFFERENCE - Oracle wins {oracle_wins_trunc:.1f}% (p={p1:.4f}, not significant)")
    print("    -> Semantic content doesn't matter for pure value contamination.")

print("")

# Q2: Does full-context replicate Exps 15-17 (random beats oracle)?
print("Q2: Does FULL-CONTEXT replicate Exps 15-17 (random beats oracle)?")
print("    (Tests: Does semantic interference hurt with visible prefix?)")
if p2 < 0.05 and np.mean(random_full) < np.mean(oracle_full):
    print(f"    ANSWER: YES - Random wins {100-oracle_wins_full:.1f}% of the time (p={p2:.4f})")
    print("    -> Confirms Exps 15-17: semantic interference hurts with visible prefix.")
elif p2 < 0.05 and np.mean(oracle_full) < np.mean(random_full):
    print(f"    ANSWER: NO - Oracle wins {oracle_wins_full:.1f}% of the time (p={p2:.4f})")
    print("    -> Does NOT replicate Exps 15-17. Oracle helps with visible prefix.")
else:
    print(f"    ANSWER: NO DIFFERENCE - Oracle wins {oracle_wins_full:.1f}% (p={p2:.4f}, not significant)")

print("")

# Q3: Is truncation better than full-context?
# NOTE(review): the "NO" branches below cover both "not significant" and
# "significant in the other direction"; the message does not distinguish them.
print("Q3: Is TRUNCATION better than FULL-CONTEXT?")
print("    (Tests: Does removing prefix help by eliminating interference?)")
if p3 < 0.05 and np.mean(oracle_trunc) < np.mean(oracle_full):
    print(f"    For ORACLE: YES - Truncated wins {trunc_wins_oracle:.1f}% (p={p3:.4f})")
    print("    -> Removing oracle prefix eliminates interference, improves performance.")
else:
    print(f"    For ORACLE: NO - Truncated wins {trunc_wins_oracle:.1f}% (p={p3:.4f})")

# The YES/NO decision uses the mean NLL, so a "YES" can be printed together
# with a per-sample win rate below 50%.
if p4 < 0.05 and np.mean(random_trunc) < np.mean(random_full):
    print(f"    For RANDOM: YES - Truncated wins {trunc_wins_random:.1f}% (p={p4:.4f})")
else:
    print(f"    For RANDOM: NO - Truncated wins {trunc_wins_random:.1f}% (p={p4:.4f})")


INTERPRETATION

## Key Questions Answered:

Q1: Does TRUNCATED oracle beat TRUNCATED random?
    (Tests: Does semantic content help with pure value contamination?)
    ANSWER: YES - Oracle wins 61.8% of the time (p=0.0000)
    -> Semantic value contamination provides benefit beyond structural.

Q2: Does FULL-CONTEXT replicate Exps 15-17 (random beats oracle)?
    (Tests: Does semantic interference hurt with visible prefix?)
    ANSWER: NO DIFFERENCE - Oracle wins 50.0% (p=0.8371, not significant)

Q3: Is TRUNCATION better than FULL-CONTEXT?
    (Tests: Does removing prefix help by eliminating interference?)
    For ORACLE: YES - Truncated wins 62.6% (p=0.0000)
    -> Removing oracle prefix eliminates interference, improves performance.
    For RANDOM: YES - Truncated wins 48.2% (p=0.0033)


In [11]:
# Cell 11: Save Final Analysis

def _condition_summary(nll, delta):
    """Summary stats for one condition: mean NLL, win rate vs. bare, Cohen's d."""
    return {
        'mean_nll': float(np.mean(nll)),
        'win_rate': float(np.mean(delta > 0)),
        'cohens_d': float(cohens_d(delta)),
    }


def _paired_test_summary(t_stat, p_value, wins_key, wins_pct):
    """Paired t-test results, with the win percentage stored under `wins_key`."""
    return {
        't_stat': float(t_stat),
        'p_value': float(p_value),
        wins_key: float(wins_pct),
    }


# Everything is cast to plain Python floats so the structure is JSON-serializable.
analysis = {
    'n_samples': n,
    'mean_bare_nll': float(np.mean(bare)),
    'truncated': {
        'oracle': _condition_summary(oracle_trunc, delta_oracle_trunc),
        'random': _condition_summary(random_trunc, delta_random_trunc),
        'oracle_vs_random': _paired_test_summary(t1, p1, 'oracle_wins_pct', oracle_wins_trunc),
    },
    'fullcontext': {
        'oracle': _condition_summary(oracle_full, delta_oracle_full),
        'random': _condition_summary(random_full, delta_random_full),
        'oracle_vs_random': _paired_test_summary(t2, p2, 'oracle_wins_pct', oracle_wins_full),
    },
    'truncated_vs_fullcontext': {
        'oracle': _paired_test_summary(t3, p3, 'truncated_wins_pct', trunc_wins_oracle),
        'random': _paired_test_summary(t4, p4, 'truncated_wins_pct', trunc_wins_random),
    },
}

with open(f'{OUTPUT_DIR}/analysis.json', 'w') as f:
    json.dump(analysis, f, indent=2)

print(f"Analysis saved to {OUTPUT_DIR}/analysis.json")

Analysis saved to /home/jupyter/research/directed_kvcache/results/exp18/analysis.json
