In [1]:
"""
Stage 1 BnF Comparative Evaluation

Compares Mistral OCR structured predictions against BnF's unstructured OCR text.

Input:  Gold standard from data/gold_standard/cleaned/{magazine_name}/
        Predictions from data/predictions/{magazine_name}/
        BnF OCR from data/bnf_ocr/{magazine_name}/
Output: Comparative metrics and analysis
Schema: schemas/stage1_page.py
"""

from pathlib import Path
import json
from typing import Dict, List, Tuple, Optional, Set
from collections import defaultdict, Counter
import re
import pandas as pd
import numpy as np
import Levenshtein
import unicodedata

# Project imports
from utils.paths import PROJECT_ROOT, PREDICTIONS, GOLD_CLEAN, BNF_OCR
from schemas.stage1_page import Stage1PageModel
from utils.text_processing import (
    normalize_text_strict,
    normalize_text_standard,
    normalize_text_letters_only
)
from utils.ocr_metrics import (
    character_error_rate,
    word_error_rate,
    evaluate_text_quality
)

# Input roots: local aliases for the project-wide path constants used by the cells below
GOLD_ROOT, PRED_ROOT, BNF_ROOT = GOLD_CLEAN, PREDICTIONS, BNF_OCR

# Emit the banner and the resolved locations in one call, one entry per line
print(
    "Stage 1 BnF Comparative Evaluation",
    "=" * 60,
    f"Project root: {PROJECT_ROOT}",
    "\nDirectories:",
    f"  Gold standard: {GOLD_ROOT}",
    f"  Predictions:   {PRED_ROOT}",
    f"  BnF OCR:       {BNF_ROOT}",
    f"  Schema:        {Stage1PageModel.__name__}",
    sep="\n",
)

Stage 1 BnF Comparative Evaluation
Project root: /home/fabian-ramirez/Documents/These/Code/magazine_graphs

Directories:
  Gold standard: /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/gold_standard/cleaned
  Predictions:   /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/predictions
  BnF OCR:       /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/bnf_ocr
  Schema:        Stage1PageModel


In [2]:
"""
Find Magazine Triplets for Comparison
"""

def find_magazine_triplets() -> List[Tuple[str, Path, Path, Path, int]]:
    """
    Find magazines that have gold standard, predictions, AND BnF OCR files.

    A magazine qualifies when a sub-directory with the same name exists under
    GOLD_ROOT, PRED_ROOT and BNF_ROOT, and at least one page stem is shared by
    its gold ``*.json``, prediction ``*.json`` and BnF ``*.txt`` files.

    A root that does not exist (or is not a directory) contributes no
    magazines, so the function returns an empty list instead of raising
    FileNotFoundError; the caller already reports the "nothing found" case.

    Returns:
        List of (magazine_name, gold_dir, pred_dir, bnf_dir, num_matching_files)
        tuples, ordered by magazine name.
    """
    def _magazine_dirs(root) -> Dict[str, Path]:
        # Map sub-directory name -> path; a missing root yields an empty mapping
        root = Path(root)
        if not root.is_dir():
            return {}
        return {d.name: d for d in root.iterdir() if d.is_dir()}

    def _page_stems(directory: Path, pattern: str) -> Set[str]:
        # File stems (name without extension) of the files matching the glob pattern
        return {f.stem for f in directory.glob(pattern)}

    gold_magazines = _magazine_dirs(GOLD_ROOT)
    pred_magazines = _magazine_dirs(PRED_ROOT)
    bnf_magazines = _magazine_dirs(BNF_ROOT)

    # Only magazines present in all three sources can be compared
    common_magazines = gold_magazines.keys() & pred_magazines.keys() & bnf_magazines.keys()

    triplets = []
    for mag_name in sorted(common_magazines):
        gold_dir = gold_magazines[mag_name]
        pred_dir = pred_magazines[mag_name]
        bnf_dir = bnf_magazines[mag_name]

        # Pages are matched by stem: gold/pred are JSON, BnF OCR is plain text
        matching_stems = (
            _page_stems(gold_dir, "*.json")
            & _page_stems(pred_dir, "*.json")
            & _page_stems(bnf_dir, "*.txt")
        )

        # Magazines without a single shared page are skipped
        if matching_stems:
            triplets.append((mag_name, gold_dir, pred_dir, bnf_dir, len(matching_stems)))

    return triplets

# Discover comparable magazines and report per-source file counts
print("\n" + "=" * 60)
print("Finding Magazine Triplets")
print("=" * 60 + "\n")

magazine_triplets = find_magazine_triplets()

if magazine_triplets:
    print(f"Found {len(magazine_triplets)} magazine(s) for comparison:\n")
    for mag_name, gold_dir, pred_dir, bnf_dir, num_files in magazine_triplets:
        # Counts are taken per source directory; "Matching" is the shared-stem count
        print(f"{mag_name}:")
        print(f"  Gold files: {sum(1 for _ in gold_dir.glob('*.json'))}")
        print(f"  Pred files: {sum(1 for _ in pred_dir.glob('*.json'))}")
        print(f"  BnF files:  {sum(1 for _ in bnf_dir.glob('*.txt'))}")
        print(f"  Matching:   {num_files}")
        print()
else:
    # Nothing to compare: list what must line up across the three sources
    print(
        "No matching magazine triplets found.",
        "\nCheck that:",
        "  1. Gold standard files exist in gold_standard/cleaned/",
        "  2. Prediction files exist in predictions/",
        "  3. BnF OCR files exist in bnf_ocr/",
        "  4. Magazine names and page filenames match across all three",
        sep="\n",
    )


Finding Magazine Triplets

Found 1 magazine(s) for comparison:

La_Plume_bpt6k1212187t_15-11-1893:
  Gold files: 1
  Pred files: 34
  BnF files:  1
  Matching:   1



In [3]:
"""
Text Extraction Functions
"""

def load_bnf_text(txt_path: Path) -> str:
    """
    Load BnF OCR text file.

    The file is decoded as UTF-8 first. The 'utf-8-sig' codec is used so that
    a leading byte-order mark, if present, is dropped instead of surviving as
    a U+FEFF character that would count as an OCR error in the metrics.
    Files that are not valid UTF-8 are re-read as latin-1, which accepts any
    byte sequence.

    Args:
        txt_path: Path to BnF .txt file

    Returns:
        Raw text content (without a leading BOM)

    Raises:
        OSError: if the file cannot be opened or read.
    """
    try:
        return txt_path.read_text(encoding='utf-8-sig')
    except UnicodeDecodeError:
        # Fallback to latin-1 if UTF-8 fails
        return txt_path.read_text(encoding='latin-1')


def extract_text_from_json(json_path: Path) -> str:
    """
    Build one text string from a gold or prediction JSON page file.

    Reads the top-level 'items' list and joins every non-empty
    'item_text_raw' value, in file order, with single spaces.

    Args:
        json_path: Path to JSON file

    Returns:
        The joined text. Any failure (unreadable file, invalid JSON,
        unexpected structure) is reported on stdout and yields "".
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as handle:
            page = json.load(handle)

        # Keep only items that carry non-empty raw text
        fragments = [
            entry['item_text_raw']
            for entry in page.get('items', [])
            if entry.get('item_text_raw')
        ]

        # A space separator keeps words from adjacent items apart
        return ' '.join(fragments)

    except Exception as e:
        print(f"ERROR loading {json_path.name}: {e}")
        return ""


# Test extraction on first available page
print("\n" + "=" * 60)
print("Text Extraction Test")
print("=" * 60 + "\n")

if magazine_triplets:
    mag_name, gold_dir, pred_dir, bnf_dir, _ = magazine_triplets[0]

    # Get first matching page (pages are matched across sources by file stem)
    gold_files = {f.stem: f for f in gold_dir.glob("*.json")}
    pred_files = {f.stem: f for f in pred_dir.glob("*.json")}
    bnf_files = {f.stem: f for f in bnf_dir.glob("*.txt")}

    matching_stems = sorted(set(gold_files.keys()) & set(pred_files.keys()) & set(bnf_files.keys()))

    if matching_stems:
        test_stem = matching_stems[0]

        gold_text = extract_text_from_json(gold_files[test_stem])
        pred_text = extract_text_from_json(pred_files[test_stem])
        bnf_text = load_bnf_text(bnf_files[test_stem])

        print(f"Test page: {test_stem}")
        print(f"  Gold length: {len(gold_text):,} chars, {len(gold_text.split()):,} words")
        print(f"  Pred length: {len(pred_text):,} chars, {len(pred_text.split()):,} words")
        print(f"  BnF length:  {len(bnf_text):,} chars, {len(bnf_text.split()):,} words")

        # extract_text_from_json returns "" on a load error, so an empty gold
        # text is possible; guard the ratios instead of dividing by zero.
        if gold_text:
            print(f"\n  Pred/Gold ratio: {len(pred_text)/len(gold_text):.2f}")
            print(f"  BnF/Gold ratio:  {len(bnf_text)/len(gold_text):.2f}")
        else:
            print("\n  Pred/Gold ratio: n/a (gold text is empty)")
            print("  BnF/Gold ratio:  n/a (gold text is empty)")
else:
    print("No magazine triplets available for testing")


Text Extraction Test

Test page: La_Plume_bpt6k1212187t_15-11-1893__page-001
  Gold length: 4,474 chars, 717 words
  Pred length: 4,470 chars, 718 words
  BnF length:  4,495 chars, 753 words

  Pred/Gold ratio: 1.00
  BnF/Gold ratio:  1.00


In [4]:
"""
Text Normalization Functions

Three normalization levels matching 01c:
- Strict: Only Unicode NFC normalization (preserves everything)
- Standard: Normalize whitespace to single spaces (RECOMMENDED)
- Letters Only: Remove all whitespace and punctuation (pure character recognition)
"""

from utils.text_processing import (
    normalize_text_strict,
    normalize_text_standard,
    normalize_text_letters_only,
    token_sort_text
)

# Show what each normalization level does to one short sample
print("\n" + "=" * 60)
print("Normalization Test")
print("=" * 60 + "\n")

test_text = "L'Affiche Illustrée — 1893  \n\n  Si loin qu'on remonte..."

print(f"Original text:", f"  '{test_text}'", "", sep="\n")

# Label -> normalizer, applied in insertion order
for level_name, normalize_func in {
    'STRICT': normalize_text_strict,
    'STANDARD': normalize_text_standard,
    'LETTERS ONLY': normalize_text_letters_only,
}.items():
    normalized = normalize_func(test_text)
    print(f"{level_name}:", f"  '{normalized}'", "", sep="\n")


Normalization Test

Original text:
  'L'Affiche Illustrée — 1893  

  Si loin qu'on remonte...'

STRICT:
  'L'Affiche Illustrée — 1893  

  Si loin qu'on remonte...'

STANDARD:
  'L'Affiche Illustrée — 1893 Si loin qu'on remonte...'

LETTERS ONLY:
  'LAfficheIllustrée1893Siloinquonremonte'



In [5]:
"""
Error Rate Calculation Functions

Calculate Character Error Rate (CER) and Word Error Rate (WER) using Levenshtein distance.
Supports all three normalization levels.
"""

from utils.ocr_metrics import character_error_rate, word_error_rate


# Test error rate calculations
print("\n" + "=" * 60)
print("Error Rate Calculation Test")
print("=" * 60 + "\n")

if magazine_triplets:
    mag_name, gold_dir, pred_dir, bnf_dir, _ = magazine_triplets[0]

    # Pages are matched across the three sources by file stem
    gold_files = {f.stem: f for f in gold_dir.glob("*.json")}
    pred_files = {f.stem: f for f in pred_dir.glob("*.json")}
    bnf_files = {f.stem: f for f in bnf_dir.glob("*.txt")}

    matching_stems = sorted(set(gold_files.keys()) & set(pred_files.keys()) & set(bnf_files.keys()))

    if matching_stems:
        test_stem = matching_stems[0]

        gold_text = extract_text_from_json(gold_files[test_stem])
        pred_text = extract_text_from_json(pred_files[test_stem])
        bnf_text = load_bnf_text(bnf_files[test_stem])

        print(f"Test page: {test_stem}\n")

        for level in ['strict', 'standard', 'letters_only']:
            print(f"{level.upper().replace('_', ' ')}:")

            # CER is computed at every normalization level (both systems vs gold)
            pred_cer = character_error_rate(gold_text, pred_text, level)
            bnf_cer = character_error_rate(gold_text, bnf_text, level)

            # WER is not computed at letters_only. Show "n/a" there rather than
            # a placeholder 0.000, which would read as a perfect score.
            if level != 'letters_only':
                pred_wer = word_error_rate(gold_text, pred_text, level)
                bnf_wer = word_error_rate(gold_text, bnf_text, level)
                pred_wer_display = f"{pred_wer:.3f}"
                bnf_wer_display = f"{bnf_wer:.3f}"
            else:
                pred_wer = bnf_wer = float('nan')
                pred_wer_display = bnf_wer_display = "n/a"

            print(f"  Mistral CER: {pred_cer:.3f}  |  WER: {pred_wer_display}")
            print(f"  BnF CER:     {bnf_cer:.3f}  |  WER: {bnf_wer_display}")
            print()
else:
    print("No magazine triplets available for testing")


Error Rate Calculation Test

Test page: La_Plume_bpt6k1212187t_15-11-1893__page-001

STRICT:
  Mistral CER: 0.004  |  WER: 0.013
  BnF CER:     0.058  |  WER: 0.172

STANDARD:
  Mistral CER: 0.004  |  WER: 0.013
  BnF CER:     0.039  |  WER: 0.172

LETTERS ONLY:
  Mistral CER: 0.003  |  WER: 0.000
  BnF CER:     0.024  |  WER: 0.000



In [6]:
"""
Bag-of-Words Coverage Metrics

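Order-agnostic word-level comparison over the sets of unique words
(a repeated word counts once, so this is a set overlap rather than a
multiset bag-of-words):
- Precision: % of unique predicted words that appear in reference
- Recall: % of unique reference words that appear in predictions
- F1: Harmonic mean of precision and recall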

This tells us about content coverage regardless of order.
"""

def calculate_word_coverage(reference: str, hypothesis: str, normalization: str = 'standard') -> Dict[str, float]:
    """
    Word-level precision, recall and F1 between two texts, ignoring word order.

    Both texts are normalized, split on whitespace and reduced to sets, so
    every distinct word counts once no matter how often it occurs.

    Args:
        reference: Reference text (gold standard)
        hypothesis: Hypothesis text (OCR output)
        normalization: 'strict' or 'standard' select the matching normalizer;
            'letters_only' falls back to the standard normalizer because word
            boundaries are required; any other value leaves the texts as given

    Returns:
        Dict with precision, recall, f1 and the unique-word counts
        (shared_words, unique_to_hyp, unique_to_ref, total_ref_words,
        total_hyp_words). Precision is 0.0 for an empty hypothesis, recall is
        0.0 for an empty reference, and f1 is 0.0 when both are zero.
    """
    # Pick the normalizer; unknown levels mean "use the raw text"
    if normalization == 'strict':
        normalizer = normalize_text_strict
    elif normalization in ('standard', 'letters_only'):
        normalizer = normalize_text_standard
    else:
        normalizer = None

    if normalizer is None:
        ref_text, hyp_text = reference, hypothesis
    else:
        ref_text, hyp_text = normalizer(reference), normalizer(hypothesis)

    ref_vocab = set(ref_text.split())
    hyp_vocab = set(hyp_text.split())
    overlap = len(ref_vocab & hyp_vocab)

    # Share of hypothesis words found in the reference, and vice versa
    precision = overlap / len(hyp_vocab) if hyp_vocab else 0.0
    recall = overlap / len(ref_vocab) if ref_vocab else 0.0

    f1 = 0.0 if precision + recall == 0 else 2 * precision * recall / (precision + recall)

    return {
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'shared_words': overlap,
        'unique_to_hyp': len(hyp_vocab - ref_vocab),
        'unique_to_ref': len(ref_vocab - hyp_vocab),
        'total_ref_words': len(ref_vocab),
        'total_hyp_words': len(hyp_vocab)
    }


# Word-coverage smoke test on the first page shared by all three sources
print("\n" + "=" * 60)
print("Bag-of-Words Coverage Test")
print("=" * 60 + "\n")

if not magazine_triplets:
    print("No magazine triplets available for testing")
else:
    mag_name, gold_dir, pred_dir, bnf_dir, _ = magazine_triplets[0]

    # Index each source by file stem so pages can be matched across sources
    gold_files = {f.stem: f for f in gold_dir.glob("*.json")}
    pred_files = {f.stem: f for f in pred_dir.glob("*.json")}
    bnf_files = {f.stem: f for f in bnf_dir.glob("*.txt")}

    matching_stems = sorted(gold_files.keys() & pred_files.keys() & bnf_files.keys())

    if matching_stems:
        test_stem = matching_stems[0]

        gold_text = extract_text_from_json(gold_files[test_stem])
        pred_text = extract_text_from_json(pred_files[test_stem])
        bnf_text = load_bnf_text(bnf_files[test_stem])

        print(f"Test page: {test_stem}\n")

        # letters_only is left out here: only the two word-preserving levels are shown
        for level in ('strict', 'standard'):
            print(f"{level.upper()}:")

            # Both systems are scored against the same gold text
            pred_cov = calculate_word_coverage(gold_text, pred_text, level)
            bnf_cov = calculate_word_coverage(gold_text, bnf_text, level)

            print(f"  Mistral - P: {pred_cov['precision']:.3f}  R: {pred_cov['recall']:.3f}  F1: {pred_cov['f1']:.3f}")
            print(f"            Shared: {pred_cov['shared_words']}  Unique: {pred_cov['unique_to_hyp']}  Missing: {pred_cov['unique_to_ref']}")
            print(f"  BnF     - P: {bnf_cov['precision']:.3f}  R: {bnf_cov['recall']:.3f}  F1: {bnf_cov['f1']:.3f}")
            print(f"            Shared: {bnf_cov['shared_words']}  Unique: {bnf_cov['unique_to_hyp']}  Missing: {bnf_cov['unique_to_ref']}")
            print()


Bag-of-Words Coverage Test

Test page: La_Plume_bpt6k1212187t_15-11-1893__page-001

STRICT:
  Mistral - P: 0.984  R: 0.986  F1: 0.985
            Shared: 432  Unique: 7  Missing: 6
  BnF     - P: 0.778  R: 0.842  F1: 0.809
            Shared: 369  Unique: 105  Missing: 69

STANDARD:
  Mistral - P: 0.984  R: 0.986  F1: 0.985
            Shared: 432  Unique: 7  Missing: 6
  BnF     - P: 0.778  R: 0.842  F1: 0.809
            Shared: 369  Unique: 105  Missing: 69



In [7]:
"""
Per-Page Evaluation

Process all matching pages and calculate comprehensive metrics:
1. Direct sequence comparison (preserves order)
2. Order-agnostic comparison (token sort)
3. Bag-of-words coverage
All at three normalization levels: strict, standard, letters_only
"""

def evaluate_page(
    gold_path: Path,
    pred_path: Path,
    bnf_path: Path
) -> Dict:
    """
    Evaluate a single page triplet across all metrics.

    For each normalization level ('strict', 'standard', 'letters_only') and
    each system ('mistral' = prediction JSON, 'bnf' = BnF text file) the
    following are computed against the gold text:

    1. cer_direct / wer_direct: error rates on the texts in their given order.
    2. cer_sorted / wer_sorted: error rates after normalizing and passing both
       texts through token_sort_text.
    3. word_precision / word_recall / word_f1: from calculate_word_coverage.

    Only cer_direct is computed at 'letters_only'. The other six metrics are
    stored as NaN at that level to mark them "not applicable"; a placeholder
    of 0.0 would read as a perfect WER or a worst-case F1.
    compute_aggregate_stats skips NaN values when aggregating.

    Args:
        gold_path: Gold-standard JSON page file
        pred_path: Prediction JSON page file
        bnf_path: BnF OCR text file

    Returns:
        Dict with page name, character/word counts of the three texts, and
        result['mistral'][level] / result['bnf'][level] metric dicts.
    """
    not_applicable = float('nan')

    # Levels at which word-level metrics are computed, with their normalizers
    word_level_normalizers = {
        'strict': normalize_text_strict,
        'standard': normalize_text_standard,
    }

    # Extract texts
    gold_text = extract_text_from_json(gold_path)
    pred_text = extract_text_from_json(pred_path)
    bnf_text = load_bnf_text(bnf_path)
    hypothesis_texts = {'mistral': pred_text, 'bnf': bnf_text}

    result = {
        'page_name': gold_path.stem,
        'gold_chars': len(gold_text),
        'pred_chars': len(pred_text),
        'bnf_chars': len(bnf_text),
        'gold_words': len(gold_text.split()),
        'pred_words': len(pred_text.split()),
        'bnf_words': len(bnf_text.split()),
        'mistral': {},
        'bnf': {}
    }

    # Calculate metrics at each normalization level
    for level in ['strict', 'standard', 'letters_only']:
        normalizer = word_level_normalizers.get(level)
        if normalizer is not None:
            # Normalize first, then sort; the gold side is shared by both systems
            gold_sorted = token_sort_text(normalizer(gold_text))

        for system, hyp_text in hypothesis_texts.items():
            # 1. DIRECT SEQUENCE COMPARISON (preserves order)
            cer_direct = character_error_rate(gold_text, hyp_text, level)

            if normalizer is None:
                # letters_only: word-level metrics are not applicable
                wer_direct = cer_sorted = wer_sorted = not_applicable
                word_precision = word_recall = word_f1 = not_applicable
            else:
                wer_direct = word_error_rate(gold_text, hyp_text, level)

                # 2. ORDER-AGNOSTIC COMPARISON (token sort)
                hyp_sorted = token_sort_text(normalizer(hyp_text))
                # Inputs are already normalized, so 'strict' is passed here
                cer_sorted = character_error_rate(gold_sorted, hyp_sorted, 'strict')
                wer_sorted = word_error_rate(gold_sorted, hyp_sorted, 'strict')

                # 3. WORD COVERAGE
                coverage = calculate_word_coverage(gold_text, hyp_text, level)
                word_precision = coverage['precision']
                word_recall = coverage['recall']
                word_f1 = coverage['f1']

            result[system][level] = {
                'cer_direct': cer_direct,
                'wer_direct': wer_direct,
                'cer_sorted': cer_sorted,
                'wer_sorted': wer_sorted,
                'word_precision': word_precision,
                'word_recall': word_recall,
                'word_f1': word_f1
            }

    return result


def evaluate_all_pages() -> List[Dict]:
    """
    Run evaluate_page on every page shared by all three sources.

    Iterates over the module-level magazine_triplets list. Within a magazine,
    pages are matched by file stem and processed in sorted stem order; each
    result dict is tagged with its magazine name under 'magazine'.

    Returns:
        List of page evaluation results
    """
    collected = []

    for mag_name, gold_dir, pred_dir, bnf_dir, _ in magazine_triplets:
        print(f"Processing {mag_name}...")

        # Index each source directory by file stem
        gold_by_stem = {path.stem: path for path in gold_dir.glob("*.json")}
        pred_by_stem = {path.stem: path for path in pred_dir.glob("*.json")}
        bnf_by_stem = {path.stem: path for path in bnf_dir.glob("*.txt")}

        # Only stems present in all three sources can be evaluated
        shared_stems = sorted(gold_by_stem.keys() & pred_by_stem.keys() & bnf_by_stem.keys())

        for stem in shared_stems:
            page_result = evaluate_page(gold_by_stem[stem], pred_by_stem[stem], bnf_by_stem[stem])
            page_result['magazine'] = mag_name
            collected.append(page_result)

        print(f"  ✓ Evaluated {len(shared_stems)} pages")

    return collected


# Evaluate every matched page and keep the per-page results for the cells below
print("\n" + "=" * 60, "Evaluating All Pages", "=" * 60 + "\n", sep="\n")

all_pages = evaluate_all_pages()

print(f"\n✓ Total pages evaluated: {len(all_pages)}")


Evaluating All Pages

Processing La_Plume_bpt6k1212187t_15-11-1893...
  ✓ Evaluated 1 pages

✓ Total pages evaluated: 1


In [8]:
"""
Aggregate Statistics Across All Pages

Calculate means, medians, and standard deviations for all metrics.
Compare Mistral vs BnF performance.
"""

def compute_aggregate_stats(all_pages: List[Dict]) -> Dict:
    """
    Summarize per-page metrics across all evaluated pages.

    For each system ('mistral', 'bnf'), normalization level and metric, the
    values are collected over all pages (skipping +inf and NaN) and reduced
    to mean, median, std, min, max and n. A metric with no usable value gets
    all-zero statistics with n == 0. Mean character/word counts of the three
    texts are stored under 'length'.

    Args:
        all_pages: Page result dicts as produced by evaluate_page

    Returns:
        Dict laid out as stats[system][level][metric][statistic], plus
        stats['length'][field]
    """
    systems = ('mistral', 'bnf')
    levels = ('strict', 'standard', 'letters_only')
    metric_names = (
        'cer_direct', 'wer_direct',
        'cer_sorted', 'wer_sorted',
        'word_precision', 'word_recall', 'word_f1',
    )
    length_fields = (
        'gold_chars', 'pred_chars', 'bnf_chars',
        'gold_words', 'pred_words', 'bnf_words',
    )

    def _usable(value) -> bool:
        # Drop +inf and NaN so they cannot distort the statistics
        return not (value == float('inf') or np.isnan(value))

    def _describe(values) -> Dict:
        # Descriptive statistics of one metric; zeros when nothing was collected
        if not values:
            return {
                'mean': 0, 'median': 0, 'std': 0,
                'min': 0, 'max': 0, 'n': 0
            }
        return {
            'mean': np.mean(values),
            'median': np.median(values),
            'std': np.std(values),
            'min': np.min(values),
            'max': np.max(values),
            'n': len(values)
        }

    stats = {system: {} for system in systems}

    for system in systems:
        for level in levels:
            collected = {name: [] for name in metric_names}

            for page in all_pages:
                page_metrics = page[system][level]
                for name, bucket in collected.items():
                    value = page_metrics[name]
                    if _usable(value):
                        bucket.append(value)

            stats[system][level] = {
                name: _describe(values) for name, values in collected.items()
            }

    # Length statistics
    stats['length'] = {
        field: np.mean([page[field] for page in all_pages])
        for field in length_fields
    }

    return stats


# Aggregate the per-page results and print one report section per normalization level
print("\n" + "=" * 60, "Aggregate Statistics", "=" * 60 + "\n", sep="\n")

aggregate_stats = compute_aggregate_stats(all_pages)

for level in ('strict', 'standard', 'letters_only'):
    print(f"\n{level.upper().replace('_', ' ')} Normalization:")
    print("-" * 60)

    mistral_stats = aggregate_stats['mistral'][level]
    bnf_stats = aggregate_stats['bnf'][level]

    # Direct CER is reported at every level
    print(f"\nDirect Sequence Comparison (preserves order):")
    print(f"  Character Error Rate (CER):")
    print(f"    Mistral:  {mistral_stats['cer_direct']['mean']:.3f} (±{mistral_stats['cer_direct']['std']:.3f})")
    print(f"    BnF:      {bnf_stats['cer_direct']['mean']:.3f} (±{bnf_stats['cer_direct']['std']:.3f})")
    if mistral_stats['cer_direct']['mean'] > 0:
        print(f"    → Mistral is {bnf_stats['cer_direct']['mean'] / mistral_stats['cer_direct']['mean']:.1f}x more accurate")
    else:
        # The ratio is undefined for a zero Mistral error; an empty line is printed instead
        print("")

    # Word-level sections are not reported for letters_only
    if level == 'letters_only':
        continue

    print(f"  Word Error Rate (WER):")
    print(f"    Mistral:  {mistral_stats['wer_direct']['mean']:.3f} (±{mistral_stats['wer_direct']['std']:.3f})")
    print(f"    BnF:      {bnf_stats['wer_direct']['mean']:.3f} (±{bnf_stats['wer_direct']['std']:.3f})")
    if mistral_stats['wer_direct']['mean'] > 0:
        print(f"    → Mistral is {bnf_stats['wer_direct']['mean'] / mistral_stats['wer_direct']['mean']:.1f}x more accurate")
    else:
        print("")

    print(f"\nOrder-Agnostic Comparison (token sort):")
    for heading, key in (
        ("Character Error Rate (CER)", 'cer_sorted'),
        ("Word Error Rate (WER)", 'wer_sorted'),
    ):
        print(f"  {heading}:")
        print(f"    Mistral:  {mistral_stats[key]['mean']:.3f} (±{mistral_stats[key]['std']:.3f})")
        print(f"    BnF:      {bnf_stats[key]['mean']:.3f} (±{bnf_stats[key]['std']:.3f})")

    print(f"\nBag-of-Words Coverage:")
    for heading, key in (
        ("Word F1", 'word_f1'),
        ("Word Precision", 'word_precision'),
        ("Word Recall", 'word_recall'),
    ):
        print(f"  {heading}:")
        print(f"    Mistral:  {mistral_stats[key]['mean']:.3f}")
        print(f"    BnF:      {bnf_stats[key]['mean']:.3f}")

# Mean page sizes of the three texts and their size relative to gold
length_stats = aggregate_stats['length']

print(f"\n\n{'='*60}")
print("Length Statistics")
print("=" * 60)
print(f"Average page length:")
print(f"  Gold Standard:       {length_stats['gold_chars']:.0f} chars, {length_stats['gold_words']:.0f} words")
print(f"  Mistral Predictions: {length_stats['pred_chars']:.0f} chars, {length_stats['pred_words']:.0f} words")
print(f"  BnF OCR:             {length_stats['bnf_chars']:.0f} chars, {length_stats['bnf_words']:.0f} words")
print(f"\n  Mistral/Gold ratio:  {length_stats['pred_chars']/length_stats['gold_chars']:.2f}")
print(f"  BnF/Gold ratio:      {length_stats['bnf_chars']/length_stats['gold_chars']:.2f}")
print("=" * 60)


Aggregate Statistics


STRICT Normalization:
------------------------------------------------------------

Direct Sequence Comparison (preserves order):
  Character Error Rate (CER):
    Mistral:  0.004 (±0.000)
    BnF:      0.058 (±0.000)
    → Mistral is 15.3x more accurate
  Word Error Rate (WER):
    Mistral:  0.013 (±0.000)
    BnF:      0.172 (±0.000)
    → Mistral is 13.7x more accurate

Order-Agnostic Comparison (token sort):
  Character Error Rate (CER):
    Mistral:  0.017 (±0.000)
    BnF:      0.218 (±0.000)
  Word Error Rate (WER):
    Mistral:  0.017 (±0.000)
    BnF:      0.229 (±0.000)

Bag-of-Words Coverage:
  Word F1:
    Mistral:  0.985
    BnF:      0.809
  Word Precision:
    Mistral:  0.984
    BnF:      0.778
  Word Recall:
    Mistral:  0.986
    BnF:      0.842

STANDARD Normalization:
------------------------------------------------------------

Direct Sequence Comparison (preserves order):
  Character Error Rate (CER):
    Mistral:  0.004 (±0.000)
    BnF: 

In [9]:
"""
Create Final Summary Table

Synthesize all findings into a comprehensive comparison table.
"""

def create_summary_table(stats: Optional[Dict] = None) -> pd.DataFrame:
    """
    Create comprehensive summary table comparing Mistral vs BnF.

    One row is produced per (normalization level, metric):
    - direct CER at every level;
    - direct WER, token-sorted CER/WER and word F1 at 'strict' and 'standard'.

    For error rates the 'Mistral Advantage' column is the ratio of the BnF
    mean to the Mistral mean. When the Mistral mean is 0 the ratio is
    undefined and the cell reads "n/a" (it previously read "0.0x better",
    which understated a perfect score). For word F1 the advantage is the
    difference in percentage points.

    Args:
        stats: Aggregate statistics laid out as stats[system][level][metric]['mean'].
            Defaults to the module-level aggregate_stats.

    Returns:
        DataFrame with columns Normalization, Approach, Metric, Mistral, BnF,
        Mistral Advantage (all values formatted as strings)
    """
    if stats is None:
        stats = aggregate_stats

    def _ratio_label(mistral_error: float, bnf_error: float) -> str:
        # How many times larger the BnF error is; undefined for a zero Mistral error
        if mistral_error > 0:
            return f"{bnf_error / mistral_error:.1f}x better"
        return "n/a"

    summary_data = []

    for level in ['strict', 'standard', 'letters_only']:
        mistral_stats = stats['mistral'][level]
        bnf_stats = stats['bnf'][level]
        level_label = level.replace('_', ' ').title()

        # (approach, metric label, stats key) for the error-rate rows of this level
        error_rows = [('Direct Sequence', 'CER', 'cer_direct')]
        if level != 'letters_only':
            # Word-level and token-sorted metrics only exist for strict/standard
            error_rows += [
                ('Direct Sequence', 'WER', 'wer_direct'),
                ('Order-Agnostic', 'CER (sorted)', 'cer_sorted'),
                ('Order-Agnostic', 'WER (sorted)', 'wer_sorted'),
            ]

        for approach, metric_label, key in error_rows:
            mistral_value = mistral_stats[key]['mean']
            bnf_value = bnf_stats[key]['mean']
            summary_data.append({
                'Normalization': level_label,
                'Approach': approach,
                'Metric': metric_label,
                'Mistral': f"{mistral_value:.3f}",
                'BnF': f"{bnf_value:.3f}",
                'Mistral Advantage': _ratio_label(mistral_value, bnf_value)
            })

        if level != 'letters_only':
            # Bag-of-Words: higher is better, so report the gap in percentage points
            mistral_f1 = mistral_stats['word_f1']['mean']
            bnf_f1 = bnf_stats['word_f1']['mean']

            summary_data.append({
                'Normalization': level_label,
                'Approach': 'Bag-of-Words',
                'Metric': 'Word F1',
                'Mistral': f"{mistral_f1:.3f}",
                'BnF': f"{bnf_f1:.3f}",
                'Mistral Advantage': f"{(mistral_f1 - bnf_f1)*100:.1f}pp higher"
            })

    return pd.DataFrame(summary_data)


# Generate and display summary
print("\n" + "=" * 80)
print("STAGE 1 BnF COMPARATIVE EVALUATION - FINAL SUMMARY")
print("=" * 80)
print()

summary_df = create_summary_table()
print(summary_df.to_string(index=False))

print("\n" + "=" * 80)
print("KEY FINDINGS")
print("=" * 80)

# Headline numbers are the page-level means at the 'standard' normalization level
mistral_cer_std = aggregate_stats['mistral']['standard']['cer_direct']['mean']
bnf_cer_std = aggregate_stats['bnf']['standard']['cer_direct']['mean']
mistral_wer_std = aggregate_stats['mistral']['standard']['wer_direct']['mean']
bnf_wer_std = aggregate_stats['bnf']['standard']['wer_direct']['mean']
mistral_f1 = aggregate_stats['mistral']['standard']['word_f1']['mean']
bnf_f1 = aggregate_stats['bnf']['standard']['word_f1']['mean']

# Section 3 reports the word-overlap F1 (harmonic mean of word precision and
# recall), so it is labelled as F1 rather than as a share of words captured.
print(f"""
1. OCR ACCURACY (Standard normalization):
   - Mistral CER: {mistral_cer_std:.1%}
   - BnF CER:     {bnf_cer_std:.1%}

2. WORD-LEVEL ACCURACY:
   - Mistral WER: {mistral_wer_std:.1%}
   - BnF WER:     {bnf_wer_std:.1%}

3. CONTENT COVERAGE (word-overlap F1 against gold):
   - Mistral word F1: {mistral_f1:.1%}
   - BnF word F1:     {bnf_f1:.1%}

""")

print("=" * 80)


STAGE 1 BnF COMPARATIVE EVALUATION - FINAL SUMMARY

Normalization        Approach       Metric Mistral   BnF Mistral Advantage
       Strict Direct Sequence          CER   0.004 0.058      15.3x better
       Strict Direct Sequence          WER   0.013 0.172      13.7x better
       Strict  Order-Agnostic CER (sorted)   0.017 0.218      12.6x better
       Strict  Order-Agnostic WER (sorted)   0.017 0.229      13.7x better
       Strict    Bag-of-Words      Word F1   0.985 0.809     17.6pp higher
     Standard Direct Sequence          CER   0.004 0.039      10.9x better
     Standard Direct Sequence          WER   0.013 0.172      13.7x better
     Standard  Order-Agnostic CER (sorted)   0.017 0.218      12.6x better
     Standard  Order-Agnostic WER (sorted)   0.017 0.229      13.7x better
     Standard    Bag-of-Words      Word F1   0.985 0.809     17.6pp higher
 Letters Only Direct Sequence          CER   0.003 0.024       7.1x better

KEY FINDINGS

1. OCR ACCURACY (Standard normal