# Coherence Variants Evaluation - Visualization and Analysis

This notebook visualizes and analyzes the results from evaluating coherence-based hallucination detection variants on the wiki_bio_gpt3_hallucination dataset.

## Features
- Load evaluation results from JSON files
- Plot ROC curves for all three variants
- Plot Precision-Recall curves
- Analyze score distributions by ground truth labels
- Compare against SelfCheckAPIPrompt baseline
- Per-sentence analysis for interesting cases

In [None]:
import json
import os
import glob
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import (
    roc_curve,
    precision_recall_curve,
    average_precision_score,
    roc_auc_score
)
from datasets import load_dataset

# Plotting style
plt.style.use('seaborn-v0_8-darkgrid')
%matplotlib inline

## Load Evaluation Results

Load the most recent evaluation results JSON file.

In [None]:
# Locate saved evaluation runs and load the newest one.
results_dir = "../results"
result_files = glob.glob(os.path.join(results_dir, "coherence_evaluation_*.json"))

# Bind eval_results unconditionally: the summary cell at the end of the
# notebook looks up 'cache_stats' / 'cost_estimates' in it and would raise
# NameError if no results file was found here.
eval_results = {}

if not result_files:
    print("No evaluation results found!")
    print("Please run: python scripts/evaluate_coherence.py --variant all")
else:
    # Pick the file whose contents were written last. getmtime is used rather
    # than getctime because on Unix ctime also changes on metadata-only
    # operations (chmod, rename), which says nothing about result freshness.
    latest_file = max(result_files, key=os.path.getmtime)
    print(f"Loading results from: {latest_file}")

    with open(latest_file, 'r', encoding='utf-8') as f:
        eval_results = json.load(f)

    # Missing sections are treated as empty rather than aborting the cell.
    print("\nEvaluation Metadata:")
    for key, value in eval_results.get('metadata', {}).items():
        print(f"  {key}: {value}")

    print("\nMetrics Summary:")
    for variant, metrics in eval_results.get('results', {}).items():
        print(f"  {variant}:")
        print(f"    AUC-PR: {metrics['auc_pr']*100:.2f}%")
        print(f"    PCC: {metrics['pcc']*100:.2f}%")
        print(f"    AUC-ROC: {metrics['auc_roc']*100:.2f}%")

## Re-run Evaluation to Get Detailed Scores

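The saved JSON is used here only for its aggregate metrics, but the plots below need the per-sentence score arrays. To obtain them, we re-run the evaluation on a small subset of passages; note that this invokes the configured model for every passage and variant.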

In [None]:
from selfcheckgpt.modeling_coherence import SelfCheckShogenji, SelfCheckFitelson, SelfCheckOlsson
from tqdm.notebook import tqdm

# Settings for the subset re-run
MODEL = "gpt-4o-mini"
NUM_SAMPLES = 3
MAX_PASSAGES = 50  # Evaluate subset for faster visualization

# Fetch the benchmark and keep only its evaluation split
print("Loading dataset...")
hallucination_benchmark = load_dataset("potsawee/wiki_bio_gpt3_hallucination")
dataset = hallucination_benchmark["evaluation"]

# Echo the effective settings so the run is self-describing
config_report = [
    "\nConfiguration:",
    f"  Model: {MODEL}",
    f"  Num samples: {NUM_SAMPLES}",
    f"  Max passages: {MAX_PASSAGES}",
    f"  Total passages in dataset: {len(dataset)}",
]
print("\n".join(config_report))

In [None]:
# Build one detector per coherence measure, all backed by the same model
print("Initializing coherence variants...")

variants = {
    'Shogenji': SelfCheckShogenji(model=MODEL),
    'Fitelson': SelfCheckFitelson(model=MODEL),
    'Olsson': SelfCheckOlsson(model=MODEL),
}

# Individual handles, for ad-hoc use outside the `variants` mapping
selfcheck_shogenji = variants['Shogenji']
selfcheck_fitelson = variants['Fitelson']
selfcheck_olsson = variants['Olsson']

In [None]:
# Collect sentence-level scores for every variant together with binary
# ground-truth labels, keeping all arrays index-aligned.


def _to_binary_label(annotation):
    """Map one sentence annotation to 0 (accurate) or 1 (inaccurate).

    The dataset stores annotations as the strings 'accurate',
    'minor_inaccurate' and 'major_inaccurate'; both inaccurate grades count
    as positives for the non-factual detection task. Numeric annotations are
    accepted too and treated as inaccurate when greater than zero
    (NOTE(review): numeric input is not expected from this dataset; the
    branch only exists so pre-converted labels do not crash the cell).
    """
    if isinstance(annotation, str):
        return int(annotation != 'accurate')
    return int(float(annotation) > 0)


variant_scores = {name: [] for name in variants}
all_labels = []
skipped_passages = []  # passages dropped because at least one variant failed

num_eval_passages = min(MAX_PASSAGES, len(dataset))

for passage_idx in tqdm(range(num_eval_passages), desc="Evaluating passages"):
    passage_data = dataset[passage_idx]

    sentences = passage_data['gpt3_sentences']
    annotations = passage_data['annotation']

    # Use the independently sampled passages shipped with the dataset.
    # Repeating the evaluated passage itself would make every sentence
    # trivially consistent with its "samples".
    sampled_passages = passage_data['gpt3_text_samples'][:NUM_SAMPLES]

    # Score the passage with every variant before committing anything, so a
    # failure in one variant cannot leave the arrays misaligned.
    passage_scores = {}
    for variant_name, variant in variants.items():
        try:
            scores = variant.predict(
                sentences=sentences,
                sampled_passages=sampled_passages,
                verbose=False
            )
            scores = np.asarray(scores, dtype=float).ravel()
            if len(scores) != len(sentences):
                raise ValueError(
                    f"expected {len(sentences)} scores, got {len(scores)}"
                )
            passage_scores[variant_name] = scores.tolist()
        except Exception as e:
            print(f"Error with {variant_name} on passage {passage_idx}: {e}")
            break  # passage will be skipped; no point scoring remaining variants

    # Drop the passage for ALL variants rather than padding with made-up
    # zeros, which would silently bias every downstream metric.
    if len(passage_scores) != len(variants) or len(annotations) != len(sentences):
        skipped_passages.append(passage_idx)
        continue

    for variant_name, scores in passage_scores.items():
        variant_scores[variant_name].extend(scores)
    all_labels.extend(_to_binary_label(a) for a in annotations)

# Convert to numpy arrays
for variant_name in variant_scores:
    variant_scores[variant_name] = np.array(variant_scores[variant_name])

all_labels = np.array(all_labels)

print(f"\nEvaluation complete!")
print(f"Total sentences: {len(all_labels)}")
print(f"Accurate: {np.sum(all_labels == 0)}, Inaccurate: {np.sum(all_labels == 1)}")
if skipped_passages:
    print(f"Skipped {len(skipped_passages)} passage(s) due to errors: {skipped_passages}")

## ROC Curves

Plot Receiver Operating Characteristic curves for all three variants.

In [None]:
# Line colour per variant (also read by the precision-recall cell)
colors = {'Shogenji': 'blue', 'Fitelson': 'green', 'Olsson': 'red'}

fig, ax = plt.subplots(figsize=(10, 8))

# One ROC trace per variant, annotated with its area under the curve
for variant_name, scores in variant_scores.items():
    fpr, tpr = roc_curve(all_labels, scores)[:2]
    auc_roc = roc_auc_score(all_labels, scores)
    ax.plot(
        fpr,
        tpr,
        label=f'{variant_name} (AUC={auc_roc:.3f})',
        color=colors[variant_name],
        linewidth=2,
    )

# Chance-level diagonal for reference
ax.plot([0, 1], [0, 1], 'k--', linewidth=1, label='Random Baseline')

ax.set_xlabel('False Positive Rate', fontsize=12)
ax.set_ylabel('True Positive Rate', fontsize=12)
ax.set_title('ROC Curves: Coherence-Based Hallucination Detection', fontsize=14, fontweight='bold')
ax.legend(loc='lower right', fontsize=11)
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

## Precision-Recall Curves

Plot Precision-Recall curves showing performance at different operating points.

In [None]:
fig, ax = plt.subplots(figsize=(10, 8))

# One precision-recall trace per variant, annotated with average precision
for variant_name, scores in variant_scores.items():
    precision, recall = precision_recall_curve(all_labels, scores)[:2]
    auc_pr = average_precision_score(all_labels, scores)
    ax.plot(
        recall,
        precision,
        label=f'{variant_name} (AP={auc_pr:.3f})',
        color=colors[variant_name],
        linewidth=2,
    )

# A random scorer's precision equals the share of inaccurate sentences
baseline = np.sum(all_labels == 1) / len(all_labels)
ax.axhline(y=baseline, color='k', linestyle='--', linewidth=1, label=f'Random Baseline ({baseline:.3f})')

# Reference level for the SelfCheckAPIPrompt baseline (93.42 AP)
ax.axhline(y=0.9342, color='purple', linestyle=':', linewidth=2, label='SelfCheckAPIPrompt Baseline (0.934)')

ax.set_xlabel('Recall', fontsize=12)
ax.set_ylabel('Precision', fontsize=12)
ax.set_title('Precision-Recall Curves: Coherence-Based Hallucination Detection', fontsize=14, fontweight='bold')
ax.legend(loc='lower left', fontsize=11)
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

## Score Distributions by Ground Truth Label

Visualize how well each variant separates accurate from inaccurate sentences.

In [None]:
# One histogram panel per variant. The panel count follows the data instead
# of being fixed at three, and squeeze=False keeps `axes` two-dimensional
# even when there is a single variant.
n_variants = len(variant_scores)
fig, axes = plt.subplots(1, n_variants, figsize=(6 * n_variants, 5), squeeze=False)

# (mean accurate, mean inaccurate) per variant, reused for the printed table
class_means = {}

for ax, (variant_name, scores) in zip(axes[0], variant_scores.items()):
    # Separate scores by ground-truth label
    per_class = (
        ('Accurate', scores[all_labels == 0], 'green'),
        ('Inaccurate', scores[all_labels == 1], 'red'),
    )

    means = []
    for class_name, class_scores, class_color in per_class:
        if class_scores.size == 0:
            # Nothing to draw for an absent class; NaN keeps the table honest
            means.append(float('nan'))
            continue

        class_mean = np.mean(class_scores)
        means.append(class_mean)

        ax.hist(
            class_scores,
            bins=30,
            alpha=0.6,
            color=class_color,
            label=f'{class_name} (n={len(class_scores)})',
            density=True
        )
        ax.axvline(
            class_mean,
            color=class_color,
            linestyle='--',
            linewidth=2,
            alpha=0.8,
            label=f'Mean {class_name}: {class_mean:.3f}'
        )
    class_means[variant_name] = tuple(means)

    ax.set_xlabel('Hallucination Score', fontsize=11)
    ax.set_ylabel('Density', fontsize=11)
    ax.set_title(f'{variant_name}', fontsize=12, fontweight='bold')
    # Build the legend only after every artist exists: legend() snapshots the
    # labelled artists present at call time, so calling it before the mean
    # lines are drawn would leave them out.
    ax.legend(fontsize=10)
    ax.grid(alpha=0.3)

plt.suptitle('Hallucination Score Distributions by Ground Truth', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

# Print separation statistics
print("\nSeparation Statistics (Mean Inaccurate - Mean Accurate):")
for variant_name, (accurate_mean, inaccurate_mean) in class_means.items():
    separation = inaccurate_mean - accurate_mean
    print(f"  {variant_name}: {separation:.4f}")

## Per-Sentence Analysis: Interesting Cases

Examine cases where variants disagree or show unusual scores.

In [None]:
# Per-variant score vectors (also used by the false-negative/positive cells)
shogenji_scores = variant_scores['Shogenji']
fitelson_scores = variant_scores['Fitelson']
olsson_scores = variant_scores['Olsson']

# The largest pairwise absolute difference among three values is simply
# their spread (max minus min), computed per sentence.
stacked_scores = np.stack([shogenji_scores, fitelson_scores, olsson_scores])
max_diff = stacked_scores.max(axis=0) - stacked_scores.min(axis=0)

# Ten sentences with the widest spread, largest first
disagreement_indices = np.argsort(max_diff)[::-1][:10]

print("Top 10 Cases with Highest Variant Disagreement:\n")
print(f"{'Index':<8} {'Label':<10} {'Shogenji':<12} {'Fitelson':<12} {'Olsson':<12} {'Max Diff':<12}")
print("-" * 80)

for idx in disagreement_indices:
    if all_labels[idx] == 0:
        label_str = "Accurate"
    else:
        label_str = "Inaccurate"
    row_cells = [
        f"{idx:<8}",
        f"{label_str:<10}",
        f"{shogenji_scores[idx]:<12.4f}",
        f"{fitelson_scores[idx]:<12.4f}",
        f"{olsson_scores[idx]:<12.4f}",
        f"{max_diff[idx]:<12.4f}",
    ]
    print(" ".join(row_cells))

In [None]:
# Inaccurate sentences that received the lowest mean score across variants
inaccurate_indices = np.flatnonzero(all_labels == 1)
avg_scores = (shogenji_scores + fitelson_scores + olsson_scores) / 3

# (index, mean score) pairs ordered from lowest to highest score
false_negative_candidates = sorted(
    zip(inaccurate_indices, avg_scores[inaccurate_indices]),
    key=lambda pair: pair[1],
)

print("\nTop 5 Potential False Negatives (Inaccurate with Low Scores):\n")
print(f"{'Index':<8} {'Avg Score':<12} {'Shogenji':<12} {'Fitelson':<12} {'Olsson':<12}")
print("-" * 80)

for idx, avg_score in false_negative_candidates[:5]:
    row_cells = [
        f"{idx:<8}",
        f"{avg_score:<12.4f}",
        f"{shogenji_scores[idx]:<12.4f}",
        f"{fitelson_scores[idx]:<12.4f}",
        f"{olsson_scores[idx]:<12.4f}",
    ]
    print(" ".join(row_cells))

In [None]:
# Accurate sentences that received the highest mean score across variants
accurate_indices = np.flatnonzero(all_labels == 0)

# (index, mean score) pairs ordered from highest to lowest score
false_positive_candidates = sorted(
    zip(accurate_indices, avg_scores[accurate_indices]),
    key=lambda pair: pair[1],
    reverse=True,
)

print("\nTop 5 Potential False Positives (Accurate with High Scores):\n")
print(f"{'Index':<8} {'Avg Score':<12} {'Shogenji':<12} {'Fitelson':<12} {'Olsson':<12}")
print("-" * 80)

for idx, avg_score in false_positive_candidates[:5]:
    row_cells = [
        f"{idx:<8}",
        f"{avg_score:<12.4f}",
        f"{shogenji_scores[idx]:<12.4f}",
        f"{fitelson_scores[idx]:<12.4f}",
        f"{olsson_scores[idx]:<12.4f}",
    ]
    print(" ".join(row_cells))

## Comparison to Baseline

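Compare coherence variants to the SelfCheckAPIPrompt (GPT-3.5) baseline. The baseline figures are taken from the paper, whereas the variant metrics below come from the `MAX_PASSAGES` subset evaluated in this notebook, so treat the differences as indicative rather than exact.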

In [None]:
# Imported once at the top of the cell rather than inside the metrics loop
from scipy.stats import pearsonr

# Baseline metrics from paper
baseline_auc_pr = 0.9342
baseline_pcc = 0.7832
baseline_label = 'Baseline (GPT-3.5)'

# Calculate metrics for each variant
# NOTE(review): `pcc` below is a sentence-level Pearson correlation against
# the labels in `all_labels`. Confirm the paper's PCC baseline is computed at
# the same granularity before reading the PCC panel as a like-for-like
# comparison.
variant_metrics = {}
for variant_name, scores in variant_scores.items():
    pcc, _ = pearsonr(scores, all_labels)
    variant_metrics[variant_name] = {
        'auc_pr': average_precision_score(all_labels, scores),
        'pcc': pcc,
        'auc_roc': roc_auc_score(all_labels, scores)
    }

# Create comparison bar chart
variants_list = list(variant_metrics.keys()) + [baseline_label]
x = np.arange(len(variants_list))

# Colours are looked up per variant (grey for any unknown name) so the list
# always has exactly one entry per bar, however many variants were evaluated.
variant_palette = {'Shogenji': 'blue', 'Fitelson': 'green', 'Olsson': 'red'}
bar_colors = [variant_palette.get(name, 'gray') for name in variant_metrics] + ['purple']

# (metric key, baseline value, y-axis label, panel title) for each panel
panels = [
    ('auc_pr', baseline_auc_pr, 'AUC-PR (%)', 'AUC-PR Comparison'),
    ('pcc', baseline_pcc, 'PCC (%)', 'Pearson Correlation Comparison'),
]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for ax, (metric_key, baseline_value, ylabel, title) in zip(axes, panels):
    values = [m[metric_key] * 100 for m in variant_metrics.values()] + [baseline_value * 100]
    bars = ax.bar(x, values, color=bar_colors, alpha=0.7)
    ax.set_ylabel(ylabel, fontsize=12)
    ax.set_title(title, fontsize=13, fontweight='bold')
    ax.set_xticks(x)
    ax.set_xticklabels(variants_list, rotation=15, ha='right')
    ax.grid(axis='y', alpha=0.3)
    ax.axhline(y=baseline_value * 100, color='purple', linestyle='--', linewidth=1, alpha=0.5)

    # Add value labels; a negative bar (possible for PCC) gets its label
    # below the bar end so it does not overlap the bar itself.
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height,
                f'{height:.2f}',
                ha='center', va='bottom' if height >= 0 else 'top', fontsize=10)

plt.suptitle('Coherence Variants vs SelfCheckAPIPrompt Baseline', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

# Print detailed comparison
print("\nDetailed Metrics Comparison:\n")
print(f"{'Variant':<20} {'AUC-PR':<12} {'vs Baseline':<15} {'PCC':<12} {'AUC-ROC':<12}")
print("-" * 80)

for variant_name, metrics in variant_metrics.items():
    diff_pr = (metrics['auc_pr'] - baseline_auc_pr) * 100
    # Format the delta first, then pad it to the header's 15-character column
    # so the PCC and AUC-ROC columns line up with their headings.
    diff_cell = f"{diff_pr:+.2f}%"
    print(
        f"{variant_name:<20} "
        f"{metrics['auc_pr']*100:<12.2f} "
        f"{diff_cell:<15} "
        f"{metrics['pcc']*100:<12.2f} "
        f"{metrics['auc_roc']*100:<12.2f}"
    )

print("-" * 80)
print(
    f"{baseline_label:<20} "
    f"{baseline_auc_pr*100:<12.2f} "
    f"{'---':<15} "
    f"{baseline_pcc*100:<12.2f} "
    f"{'---':<12}"
)

## Summary and Insights

Key observations from the evaluation:

In [None]:
# Final textual summary: best variant, spread across variants, and any
# cache / cost information stored with the saved evaluation run.
print("=" * 80)
print("EVALUATION SUMMARY")
print("=" * 80)

# The saved-results dict only exists if the loading cell found a file; fall
# back to an empty dict so the summary still prints for the subset re-run.
saved_results = globals().get('eval_results') or {}

if variant_metrics:
    # Best performing variant
    best_variant = max(variant_metrics.items(), key=lambda x: x[1]['auc_pr'])
    print(f"\nBest Performing Variant (by AUC-PR): {best_variant[0]}")
    print(f"  AUC-PR: {best_variant[1]['auc_pr']*100:.2f}%")
    print(f"  Improvement over baseline: {(best_variant[1]['auc_pr'] - baseline_auc_pr)*100:+.2f}%")

    # Consistency across variants
    auc_pr_std = np.std([m['auc_pr'] for m in variant_metrics.values()])
    print(f"\nVariant Consistency (AUC-PR std): {auc_pr_std*100:.2f}%")
else:
    # max() on an empty mapping would raise; say what is missing instead
    print("\nNo variant metrics available.")

# Cache efficiency
if 'cache_stats' in saved_results:
    print("\nCache Efficiency:")
    for variant_name, stats in saved_results['cache_stats'].items():
        print(f"  {variant_name}: {stats['hit_rate']*100:.2f}% hit rate")

# Cost estimates
if 'cost_estimates' in saved_results:
    print("\nCost Estimates:")
    total_cost = 0
    for variant_name, costs in saved_results['cost_estimates'].items():
        cost = costs['estimated_cost_usd']
        total_cost += cost
        print(f"  {variant_name}: ${cost:.4f} USD")
    print(f"  Total: ${total_cost:.4f} USD")

print("\n" + "=" * 80)

## Conclusion

This notebook provided comprehensive visualization and analysis of the coherence-based hallucination detection variants. The results show:

1. **Performance**: How each variant compares to the SelfCheckAPIPrompt baseline
2. **Discrimination**: How well variants separate accurate from inaccurate sentences
3. **Consistency**: Where variants agree or disagree in their assessments
4. **Efficiency**: Cache hit rates and API cost management

For production deployment, consider:
- The best-performing variant based on your metric priorities (AUC-PR vs PCC)
- API cost constraints and caching strategies
- Ensemble approaches combining multiple variants for robustness