# Epistemic Probing: Cross-Model Analysis

This notebook analyzes epistemic transparency across 8 models (4 families × base/instruct).

**Key Question:** Do language models know what they don't know, and does that knowledge leak through entropy?

In [None]:
import sys
import io
from contextlib import redirect_stdout
import pandas as pd
import numpy as np

sys.path.insert(0, '.')
from analysis.loader import load_model_data
from analysis.effects import compute_roc_auc
from analysis.core import failure_mode_analysis

In [None]:
# Per-family metadata: family -> (architecture lineage, training-data origin)
META = dict(
    qwen=('Custom', 'Chinese'),
    mistral=('Custom', 'English'),
    yi=('LLaMA-derived', 'Chinese'),
    llama=('LLaMA', 'English'),
)

# Every family is analysed as a base/instruct pair, named '<family>_<variant>'
MODELS = [
    f'{family}_{variant}'
    for family in META
    for variant in ('base', 'instruct')
]

## 1. Load All Models

In [None]:
# Load every model listed in MODELS, keyed by model name
models_data = {}
for model in MODELS:
    # Anything the loader prints is captured in a throw-away buffer
    with redirect_stdout(io.StringIO()):
        loaded = load_model_data(model, re_evaluate=True)
    models_data[model] = loaded
    print(f"Loaded {model}: {len(loaded.df)} samples")

## 2. Core Metrics Table

In [None]:
# Assemble one row of headline metrics per model
results = []
for model in MODELS:
    data = models_data[model]
    # Model names are '<family>_<variant>', e.g. 'qwen_base'
    family, variant = model.split('_')[:2]
    arch, training = META[family]

    # ROC/AUC for entropy and for the best probe layer; stdout is discarded
    with redirect_stdout(io.StringIO()):
        roc = compute_roc_auc(data, print_output=False)
    entropy_auc = roc['entropy']['auc']
    probe_auc = roc['best_layer']['auc']

    frame = data.df
    # hall_det is the mean of `correct` over the 'confident_incorrect' category
    confident_wrong = frame.loc[frame['category'] == 'confident_incorrect', 'correct']

    results.append(dict(
        model=model,
        family=family.capitalize(),
        variant=variant,
        arch=arch,
        training=training,
        entropy_auc=entropy_auc,
        probe_auc=probe_auc,
        # hidden info: how far the probe AUC exceeds the entropy AUC
        hidden_info=probe_auc - entropy_auc,
        hall_det=confident_wrong.mean(),
        mean_entropy=frame['entropy'].mean(),
        std_entropy=frame['entropy'].std(),
        overall_acc=frame['correct'].mean(),
    ))

core_df = pd.DataFrame(results)
core_df

## 3. Key Comparison: Training Data vs Architecture

In [None]:
# Keep only the base variants and order them by hidden info (ascending)
base_df = (
    core_df
    .loc[
        core_df['variant'] == 'base',
        ['family', 'arch', 'training', 'entropy_auc', 'probe_auc', 'hidden_info'],
    ]
    .sort_values('hidden_info')
)
base_df

In [None]:
# Yi vs Llama: the two families whose META architecture is LLaMA / LLaMA-derived
yi_llama = base_df[base_df['family'].isin(['Yi', 'Llama'])]
print("Same LLaMA architecture, different training:")
print(yi_llama.to_string(index=False))

# Ratio of Yi's hidden info to Llama's (first matching base row of each)
yi_hidden = yi_llama.loc[yi_llama['family'] == 'Yi', 'hidden_info'].iloc[0]
llama_hidden = yi_llama.loc[yi_llama['family'] == 'Llama', 'hidden_info'].iloc[0]
hidden_ratio = yi_hidden / llama_hidden
print(f"\nHidden info ratio: {hidden_ratio:.1f}x")

In [None]:
# Average hidden info of the base models within each training origin
print("Mean hidden info by training origin (base models):")
hidden_by_training = base_df.groupby('training')['hidden_info'].mean()
print(hidden_by_training)

## 4. Instruct Tuning Effects

In [None]:
# Instruct-minus-base difference of each metric, one row per family
deltas = []
for family in ['Qwen', 'Mistral', 'Yi', 'Llama']:
    family_rows = core_df[core_df['family'] == family]
    base = family_rows[family_rows['variant'] == 'base'].iloc[0]
    inst = family_rows[family_rows['variant'] == 'instruct'].iloc[0]

    row = {'family': family, 'training': base['training']}
    # Insertion order here fixes the column order of delta_df
    for metric in ('entropy_auc', 'probe_auc', 'hidden_info', 'hall_det', 'mean_entropy'):
        row[f'{metric}_delta'] = inst[metric] - base[metric]
    deltas.append(row)

delta_df = pd.DataFrame(deltas)
delta_df

In [None]:
# Print the mean of every delta column; (label, column, format spec) per line
print("Mean effect of instruct tuning across all models:")
for label, column, spec in (
    ("Entropy AUC:  ", 'entropy_auc_delta', '+.3f'),
    ("Probe AUC:    ", 'probe_auc_delta', '+.3f'),
    ("Hidden Info:  ", 'hidden_info_delta', '+.1%'),
    ("Hall. Det:    ", 'hall_det_delta', '+.1%'),
    ("Mean Entropy: ", 'mean_entropy_delta', '+.3f'),
):
    print(f"  {label}{delta_df[column].mean():{spec}}")

## 5. Entropy Distribution

In [None]:
# Per-model entropy mean and standard deviation, with variant and training origin
entropy_df = core_df.loc[
    :, ['model', 'variant', 'training', 'mean_entropy', 'std_entropy']
].copy()
entropy_df

In [None]:
# Mean entropy, accuracy and sample count for each (model, category) pair
cat_entropy = []
for model, data in models_data.items():
    frame = data.df
    for cat in frame['category'].unique():
        in_cat = frame['category'] == cat
        cat_entropy.append(dict(
            model=model,
            category=cat,
            mean_entropy=frame.loc[in_cat, 'entropy'].mean(),
            accuracy=frame.loc[in_cat, 'correct'].mean(),
            n=int(in_cat.sum()),
        ))

cat_entropy_df = pd.DataFrame(cat_entropy)

# Categories as rows, models as columns, mean entropy as the cell value
entropy_by_category = cat_entropy_df.pivot(index='category', columns='model', values='mean_entropy')
entropy_by_category.round(2)

## 6. Hallucination Analysis

In [None]:
# Hallucination detection rate per model, plus a printable percentage string
hall_df = core_df[['model', 'family', 'variant', 'training', 'hall_det']].assign(
    hall_det_pct=lambda frame: frame['hall_det'].mul(100).round(1).astype(str) + '%'
)

# One row per family, one column per variant. Family is used as the index because
# it is unique per variant, whereas training origin repeats across families.
hall_pivot = (
    hall_df
    .pivot(index='family', columns='variant', values='hall_det')
    .round(3)
    .assign(improvement=lambda table: table['instruct'] - table['base'])
)
hall_pivot.sort_values('instruct', ascending=False)

In [None]:
# Model with the highest hall_det across all eight models
top_idx = core_df['hall_det'].idxmax()
best_model = core_df.at[top_idx, 'model']
best_rate = core_df.at[top_idx, 'hall_det']

# Instruct model with the lowest hall_det
instruct_only = core_df[core_df['variant'].eq('instruct')]
bottom_idx = instruct_only['hall_det'].idxmin()
worst_instruct = instruct_only.at[bottom_idx, 'model']
worst_rate = instruct_only.at[bottom_idx, 'hall_det']

print(f"Best hallucination detection: {best_model} ({best_rate:.1%})")
print(f"Worst instruct model: {worst_instruct} ({worst_rate:.1%})")

## 7. Summary Statistics

In [None]:
# Presentation-ready table: selected metrics, human-readable headers, 3 decimals
summary = (
    core_df[['model', 'training', 'entropy_auc', 'probe_auc', 'hidden_info', 'hall_det', 'mean_entropy']]
    .rename(columns={
        'model': 'Model',
        'training': 'Training',
        'entropy_auc': 'Entropy AUC',
        'probe_auc': 'Probe AUC',
        'hidden_info': 'Hidden Info',
        'hall_det': 'Hall. Det',
        'mean_entropy': 'Mean Entropy',
    })
    .round(3)
)
summary

In [None]:
# Export to CSV if needed
# summary.to_csv('epistemic_summary.csv', index=False)

## Key Findings

1. **Training data drives epistemic transparency, not architecture**
   - "Hidden info" is a model's probe AUC minus its entropy AUC
   - Yi (LLaMA arch, Chinese): ~10% hidden info
   - Llama (LLaMA arch, English): ~2% hidden info
   - Same architecture, 4x difference

2. **Instruct tuning degrades entropy informativeness universally**
   - All models show +10-18% hidden info after instruct tuning
   - Entropy becomes compressed (lower mean and SD)

3. **Probe accuracy remains stable**
   - ~94-97% AUC across all models
   - Information exists internally, just hidden from entropy

4. **Hallucination detection improves with instruct tuning**
   - But varies widely by model (19-69%)