In [1]:
"""
Stage 1 BnF Comparative Evaluation

Compares Mistral OCR structured predictions against BnF's unstructured OCR text
using ORDER-AGNOSTIC bag-of-words metrics.

Input:  Gold standard from data/gold_standard/cleaned/{magazine_name}/
        Predictions from data/predictions/{magazine_name}/
        BnF OCR from data/bnf_ocr/{magazine_name}/
Output: Comparative metrics and analysis
Schema: schemas/stage1_page.py
"""

from pathlib import Path
import json
from typing import Dict, List, Tuple, Optional, Set
from collections import defaultdict, Counter
import re
import pandas as pd
import numpy as np
import Levenshtein
import unicodedata

# Project imports
from utils.paths import PROJECT_ROOT, PREDICTIONS, GOLD_CLEAN, BNF_OCR
from schemas.stage1_page import Stage1PageModel
from utils.text_processing import (
    normalize_text_strict,
    normalize_text_standard,
    normalize_text_letters_only
)

# Notebook-wide aliases for the three input roots (gold standard, Mistral
# predictions, BnF OCR) taken from the project path helpers.
GOLD_ROOT, PRED_ROOT, BNF_ROOT = GOLD_CLEAN, PREDICTIONS, BNF_OCR

# Startup banner: echo the resolved locations and the schema class name so a
# reader can verify the notebook is pointed at the expected data.
print(
    "Stage 1 BnF Comparative Evaluation",
    "=" * 60,
    f"Project root: {PROJECT_ROOT}",
    "\nDirectories:",
    f"  Gold standard: {GOLD_ROOT}",
    f"  Predictions:   {PRED_ROOT}",
    f"  BnF OCR:       {BNF_ROOT}",
    f"  Schema:        {Stage1PageModel.__name__}",
    sep="\n",
)

Stage 1 BnF Comparative Evaluation
Project root: /home/fabian-ramirez/Documents/These/Code/magazine_graphs

Directories:
  Gold standard: /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/gold_standard/cleaned
  Predictions:   /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/predictions
  BnF OCR:       /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/bnf_ocr
  Schema:        Stage1PageModel


In [2]:
"""
Find Magazine Triplets for Comparison
"""

def find_magazine_triplets() -> List[Tuple[str, Path, Path, Path, int]]:
    """
    Locate magazines present in all three sources and count comparable pages.

    A magazine qualifies when a directory with the same name exists under
    GOLD_ROOT, PRED_ROOT and BNF_ROOT, and at least one page stem is shared by
    the gold ``*.json``, prediction ``*.json`` and BnF ``*.txt`` files.

    Returns:
        List of (magazine_name, gold_dir, pred_dir, bnf_dir, num_matching_files)
        tuples, ordered by magazine name.
    """
    def _subdirs(root: Path) -> Dict[str, Path]:
        # Map directory name -> directory path for the immediate children of root.
        return {entry.name: entry for entry in root.iterdir() if entry.is_dir()}

    def _stems(folder: Path, pattern: str) -> Set[str]:
        # File stems (name without extension) matching the glob pattern.
        return {path.stem for path in folder.glob(pattern)}

    gold_by_name = _subdirs(GOLD_ROOT)
    pred_by_name = _subdirs(PRED_ROOT)
    bnf_by_name = _subdirs(BNF_ROOT)

    shared_names = gold_by_name.keys() & pred_by_name.keys() & bnf_by_name.keys()

    found = []
    for name in sorted(shared_names):
        gold_folder = gold_by_name[name]
        pred_folder = pred_by_name[name]
        bnf_folder = bnf_by_name[name]

        # Pages are paired purely by file stem across the three folders.
        shared_stems = (
            _stems(gold_folder, "*.json")
            & _stems(pred_folder, "*.json")
            & _stems(bnf_folder, "*.txt")
        )

        # Magazines without a single comparable page are left out.
        if shared_stems:
            found.append((name, gold_folder, pred_folder, bnf_folder, len(shared_stems)))

    return found

# Discover comparable magazines and report per-source file counts
print("\n" + "=" * 60, "Finding Magazine Triplets", "=" * 60 + "\n", sep="\n")

magazine_triplets = find_magazine_triplets()

if magazine_triplets:
    print(f"Found {len(magazine_triplets)} magazine(s) for comparison:\n")
    for mag_name, gold_dir, pred_dir, bnf_dir, num_files in magazine_triplets:
        # One summary paragraph per magazine, followed by a blank line.
        print(
            f"{mag_name}:",
            f"  Gold files: {len(list(gold_dir.glob('*.json')))}",
            f"  Pred files: {len(list(pred_dir.glob('*.json')))}",
            f"  BnF files:  {len(list(bnf_dir.glob('*.txt')))}",
            f"  Matching:   {num_files}",
            "",
            sep="\n",
        )
else:
    # Nothing to compare: list the likely causes for the reader.
    print(
        "No matching magazine triplets found.",
        "\nCheck that:",
        "  1. Gold standard files exist in gold_standard/cleaned/",
        "  2. Prediction files exist in predictions/",
        "  3. BnF OCR files exist in bnf_ocr/",
        "  4. Magazine names and page filenames match across all three",
        sep="\n",
    )


Finding Magazine Triplets

Found 1 magazine(s) for comparison:

La_Plume_bpt6k1212187t_15-11-1893:
  Gold files: 6
  Pred files: 34
  BnF files:  6
  Matching:   6



In [3]:
"""
Text Extraction Functions
"""

def load_bnf_text(txt_path: Path) -> str:
    """
    Load a BnF OCR text file.

    The file is decoded as UTF-8 first; a leading byte-order mark, if present,
    is dropped so it cannot attach itself to the first word and spoil the
    word-level comparison. Files that are not valid UTF-8 are re-read as
    latin-1, which accepts any byte sequence.

    Args:
        txt_path: Path to BnF .txt file

    Returns:
        Raw text content (without a leading BOM)
    """
    try:
        # 'utf-8-sig' reads plain UTF-8 unchanged and strips a leading BOM.
        return txt_path.read_text(encoding='utf-8-sig')
    except UnicodeDecodeError:
        # Fallback to latin-1 if UTF-8 fails
        return txt_path.read_text(encoding='latin-1')


def extract_text_from_json(json_path: Path) -> str:
    """
    Build the full page text from a gold or prediction JSON file.

    Every entry of the top-level ``items`` list that carries a non-empty
    ``item_text_raw`` contributes its text, in file order. Any failure
    (unreadable file, invalid JSON, unexpected structure) is reported on
    stdout and yields an empty string instead of raising.

    Args:
        json_path: Path to JSON file

    Returns:
        The item texts joined by single spaces, or "" on error
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)

        # Items with a missing or empty raw text are skipped.
        fragments = [
            entry['item_text_raw']
            for entry in payload.get('items', [])
            if entry.get('item_text_raw')
        ]

        # A single space between items keeps adjacent words from fusing.
        return ' '.join(fragments)

    except Exception as exc:
        print(f"ERROR loading {json_path.name}: {exc}")
        return ""


# Test extraction on first available page
print("\n" + "=" * 60)
print("Text Extraction Test")
print("=" * 60 + "\n")

if magazine_triplets:
    mag_name, gold_dir, pred_dir, bnf_dir, _ = magazine_triplets[0]
    
    # Get first matching page (pages are paired by file stem)
    gold_files = {f.stem: f for f in gold_dir.glob("*.json")}
    pred_files = {f.stem: f for f in pred_dir.glob("*.json")}
    bnf_files = {f.stem: f for f in bnf_dir.glob("*.txt")}
    
    matching_stems = sorted(set(gold_files.keys()) & set(pred_files.keys()) & set(bnf_files.keys()))
    
    if matching_stems:
        test_stem = matching_stems[0]
        
        gold_text = extract_text_from_json(gold_files[test_stem])
        pred_text = extract_text_from_json(pred_files[test_stem])
        bnf_text = load_bnf_text(bnf_files[test_stem])
        
        print(f"Test page: {test_stem}")
        print(f"  Gold length: {len(gold_text):,} chars, {len(gold_text.split()):,} words")
        print(f"  Pred length: {len(pred_text):,} chars, {len(pred_text.split()):,} words")
        print(f"  BnF length:  {len(bnf_text):,} chars, {len(bnf_text.split()):,} words")

        # extract_text_from_json returns "" when a file cannot be loaded, so
        # the gold text may be empty; avoid dividing by a zero length.
        if gold_text:
            print(f"\n  Pred/Gold ratio: {len(pred_text)/len(gold_text):.2f}")
            print(f"  BnF/Gold ratio:  {len(bnf_text)/len(gold_text):.2f}")
        else:
            print("\n  Pred/Gold ratio: n/a (gold text is empty)")
            print("  BnF/Gold ratio:  n/a (gold text is empty)")
    else:
        # Make the no-op case visible instead of leaving the cell silent.
        print(f"No matching pages found for {mag_name}")
else:
    print("No magazine triplets available for testing")


Text Extraction Test

Test page: La_Plume_bpt6k1212187t_15-11-1893__page-001
  Gold length: 4,475 chars, 717 words
  Pred length: 4,470 chars, 718 words
  BnF length:  4,495 chars, 753 words

  Pred/Gold ratio: 1.00
  BnF/Gold ratio:  1.00


In [4]:
"""
Text Normalization Functions

Three normalization levels matching 01c:
- Strict: Only Unicode NFC normalization (preserves everything)
- Standard: Normalize whitespace to single spaces
- Letters Only: Remove all whitespace and punctuation (pure character recognition)
"""

# Test normalization on real data
print("\n" + "=" * 60)
print("Normalization Test (Random Sample from Corpus)")
print("=" * 60 + "\n")

if magazine_triplets:
    import random

    # A dedicated, seeded generator makes the sampled snippet identical on
    # every Restart & Run All without touching the global `random` state.
    # Change the seed to inspect a different page/snippet.
    NORMALIZATION_SAMPLE_SEED = 42
    sample_rng = random.Random(NORMALIZATION_SAMPLE_SEED)
    
    # Get a (reproducibly) random page from available data
    mag_name, gold_dir, pred_dir, bnf_dir, _ = sample_rng.choice(magazine_triplets)
    # glob() order depends on the filesystem; sort so the seed picks the same file everywhere.
    gold_files = sorted(gold_dir.glob("*.json"))
    
    if gold_files:
        random_file = sample_rng.choice(gold_files)
        test_text = extract_text_from_json(random_file)
        
        # Take a random 150-character snippet (shorter texts are shown whole)
        if len(test_text) > 150:
            start = sample_rng.randint(0, len(test_text) - 150)
            test_text = test_text[start:start+150]
        
        print(f"Source: {mag_name} / {random_file.stem}")
        print(f"Random snippet (150 chars):\n")
        print(f"ORIGINAL:")
        print(f"  '{test_text}'")
        print()
        
        # Show the same snippet under each normalization level side by side
        for level_name, normalize_func in [
            ('STRICT', normalize_text_strict),
            ('STANDARD', normalize_text_standard),
            ('LETTERS ONLY', normalize_text_letters_only)
        ]:
            normalized = normalize_func(test_text)
            print(f"{level_name}:")
            print(f"  '{normalized}'")
            print()
    else:
        print("No gold files found for testing")
else:
    print("No magazine triplets available for testing")


Normalization Test (Random Sample from Corpus)

Source: La_Plume_bpt6k1212187t_15-11-1893 / La_Plume_bpt6k1212187t_15-11-1893__page-006
Random snippet (150 chars):

ORIGINAL:
  'ant, vigoureux, d'un caprice sans pareil. Jules Chéret, n'escamotant jamais la difficulté, en triomphant toujours, malgré les proportions nécessaires '

STRICT:
  'ant, vigoureux, d'un caprice sans pareil. Jules Chéret, n'escamotant jamais la difficulté, en triomphant toujours, malgré les proportions nécessaires '

STANDARD:
  'ant, vigoureux, d'un caprice sans pareil. Jules Chéret, n'escamotant jamais la difficulté, en triomphant toujours, malgré les proportions nécessaires'

LETTERS ONLY:
  'antvigoureuxduncapricesanspareilJulesChéretnescamotantjamaisladifficultéentriomphanttoujoursmalgrélesproportionsnécessaires'



In [5]:
"""
Bag-of-Words Coverage Metrics

Order-agnostic word-level comparison over sets of unique whitespace-separated
tokens (a word repeated on a page counts once):
- Precision: % of predicted words that appear in reference
- Recall: % of reference words that appear in predictions  
- F1: Harmonic mean

This tells us about content coverage regardless of order.
"""

def calculate_word_coverage(reference: str, hypothesis: str, normalization: str = 'standard') -> Dict[str, float]:
    """
    Compare two texts by the sets of unique words they contain.

    Both texts are normalized, split on whitespace and reduced to sets, so
    word order and word repetition have no influence on the scores.

    Args:
        reference: Reference text (gold standard)
        hypothesis: Hypothesis text (OCR output)
        normalization: 'strict' or 'standard' select the matching
            normalize_text_* helper; 'letters_only' also uses the standard
            helper because word boundaries are required here; any other
            value leaves both texts untouched.

    Returns:
        Dict with precision, recall, f1 (floats, 0.0 when undefined) and the
        counts shared_words, unique_to_hyp, unique_to_ref, total_ref_words,
        total_hyp_words.
    """
    # Pick the normalizer lazily so only the selected helper is looked up.
    if normalization == 'strict':
        normalizer = normalize_text_strict
    elif normalization in ('standard', 'letters_only'):
        normalizer = normalize_text_standard
    else:
        normalizer = None

    if normalizer is None:
        ref_text, hyp_text = reference, hypothesis
    else:
        ref_text, hyp_text = normalizer(reference), normalizer(hypothesis)

    # NOTE(review): str.split() already collapses whitespace runs, so if the
    # 'standard' helper only normalizes whitespace the 'strict' and 'standard'
    # scores will coincide - confirm against utils.text_processing.
    ref_vocab = set(ref_text.split())
    hyp_vocab = set(hyp_text.split())
    overlap = ref_vocab & hyp_vocab

    # Precision: share of hypothesis words that also occur in the reference.
    precision = len(overlap) / len(hyp_vocab) if hyp_vocab else 0.0
    # Recall: share of reference words that also occur in the hypothesis.
    recall = len(overlap) / len(ref_vocab) if ref_vocab else 0.0

    # Harmonic mean, defined as 0 when both components are 0.
    f1 = 0.0 if precision + recall == 0 else 2 * precision * recall / (precision + recall)

    return {
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'shared_words': len(overlap),
        'unique_to_hyp': len(hyp_vocab - ref_vocab),
        'unique_to_ref': len(ref_vocab - hyp_vocab),
        'total_ref_words': len(ref_vocab),
        'total_hyp_words': len(hyp_vocab)
    }


# Smoke-test the coverage metric on the first matching page of the first magazine
print("\n" + "=" * 60, "Bag-of-Words Coverage Test", "=" * 60 + "\n", sep="\n")

if not magazine_triplets:
    print("No magazine triplets available for testing")
else:
    mag_name, gold_dir, pred_dir, bnf_dir, _ = magazine_triplets[0]

    # Index each source by file stem so pages can be paired across sources.
    gold_files = {f.stem: f for f in gold_dir.glob("*.json")}
    pred_files = {f.stem: f for f in pred_dir.glob("*.json")}
    bnf_files = {f.stem: f for f in bnf_dir.glob("*.txt")}

    matching_stems = sorted(gold_files.keys() & pred_files.keys() & bnf_files.keys())

    if matching_stems:
        test_stem = matching_stems[0]

        gold_text = extract_text_from_json(gold_files[test_stem])
        pred_text = extract_text_from_json(pred_files[test_stem])
        bnf_text = load_bnf_text(bnf_files[test_stem])

        print(f"Test page: {test_stem}\n")

        for level in ('strict', 'standard'):
            print(f"{level.upper()}:")

            # Both systems are scored against the same gold text.
            pred_cov = calculate_word_coverage(gold_text, pred_text, level)
            bnf_cov = calculate_word_coverage(gold_text, bnf_text, level)

            print(
                f"  Mistral - P: {pred_cov['precision']:.3f}  R: {pred_cov['recall']:.3f}  F1: {pred_cov['f1']:.3f}",
                f"            Shared: {pred_cov['shared_words']}  Unique: {pred_cov['unique_to_hyp']}  Missing: {pred_cov['unique_to_ref']}",
                f"  BnF     - P: {bnf_cov['precision']:.3f}  R: {bnf_cov['recall']:.3f}  F1: {bnf_cov['f1']:.3f}",
                f"            Shared: {bnf_cov['shared_words']}  Unique: {bnf_cov['unique_to_hyp']}  Missing: {bnf_cov['unique_to_ref']}",
                "",
                sep="\n",
            )


Bag-of-Words Coverage Test

Test page: La_Plume_bpt6k1212187t_15-11-1893__page-001

STRICT:
  Mistral - P: 0.982  R: 0.984  F1: 0.983
            Shared: 431  Unique: 8  Missing: 7
  BnF     - P: 0.781  R: 0.845  F1: 0.811
            Shared: 370  Unique: 104  Missing: 68

STANDARD:
  Mistral - P: 0.982  R: 0.984  F1: 0.983
            Shared: 431  Unique: 8  Missing: 7
  BnF     - P: 0.781  R: 0.845  F1: 0.811
            Shared: 370  Unique: 104  Missing: 68



In [6]:
"""
Per-Page Evaluation

Process all matching pages and calculate bag-of-words coverage metrics.
All metrics are order-agnostic (extraction order doesn't affect scores).
Calculated at two normalization levels: strict and standard.
"""

def evaluate_page(
    gold_path: Path,
    pred_path: Path,
    bnf_path: Path
) -> Dict:
    """
    Score one page: Mistral prediction and BnF OCR against the gold text.

    Args:
        gold_path: Gold-standard JSON file for the page
        pred_path: Mistral prediction JSON file for the page
        bnf_path: BnF OCR .txt file for the page

    Returns:
        Dict with the page name (gold file stem), character and word counts
        of the three texts, and under 'mistral' / 'bnf' the word-coverage
        metrics for the 'strict' and 'standard' normalization levels.
    """
    def _as_metrics(coverage: Dict) -> Dict:
        # Keep only the fields reported per page, under their word_* names.
        return {
            'word_precision': coverage['precision'],
            'word_recall': coverage['recall'],
            'word_f1': coverage['f1'],
            'shared_words': coverage['shared_words'],
            'unique_to_hyp': coverage['unique_to_hyp'],
            'unique_to_ref': coverage['unique_to_ref']
        }

    # Load the three versions of the page text.
    gold_text = extract_text_from_json(gold_path)
    pred_text = extract_text_from_json(pred_path)
    bnf_text = load_bnf_text(bnf_path)

    result = {
        'page_name': gold_path.stem,
        'gold_chars': len(gold_text),
        'pred_chars': len(pred_text),
        'bnf_chars': len(bnf_text),
        'gold_words': len(gold_text.split()),
        'pred_words': len(pred_text.split()),
        'bnf_words': len(bnf_text.split()),
        'mistral': {},
        'bnf': {}
    }

    # Bag-of-words coverage (order-agnostic) at each normalization level.
    for level in ('strict', 'standard'):
        mistral_coverage = calculate_word_coverage(gold_text, pred_text, level)
        bnf_coverage = calculate_word_coverage(gold_text, bnf_text, level)
        result['mistral'][level] = _as_metrics(mistral_coverage)
        result['bnf'][level] = _as_metrics(bnf_coverage)

    return result


def evaluate_all_pages() -> List[Dict]:
    """
    Run evaluate_page over every page shared by gold, prediction and BnF data.

    Iterates the module-level ``magazine_triplets`` list, pairs pages by file
    stem, and tags each page result with its magazine name. Progress is
    printed per magazine.

    Returns:
        List of page evaluation results
    """
    collected = []

    for mag_name, gold_dir, pred_dir, bnf_dir, _ in magazine_triplets:
        print(f"Processing {mag_name}...")

        # Index each source folder by file stem.
        gold_by_stem = {path.stem: path for path in gold_dir.glob("*.json")}
        pred_by_stem = {path.stem: path for path in pred_dir.glob("*.json")}
        bnf_by_stem = {path.stem: path for path in bnf_dir.glob("*.txt")}

        # Only pages available in all three sources are evaluated, in stem order.
        common_stems = sorted(gold_by_stem.keys() & pred_by_stem.keys() & bnf_by_stem.keys())

        collected.extend(
            {
                **evaluate_page(gold_by_stem[stem], pred_by_stem[stem], bnf_by_stem[stem]),
                'magazine': mag_name,
            }
            for stem in common_stems
        )

        print(f"  ✓ Evaluated {len(common_stems)} pages")

    return collected


# Evaluate every matching page; the header is printed first because
# evaluate_all_pages() reports its own per-magazine progress.
print("\n" + "=" * 60, "Evaluating All Pages", "=" * 60 + "\n", sep="\n")

all_pages = evaluate_all_pages()

print(f"\n✓ Total pages evaluated: {len(all_pages)}")


Evaluating All Pages

Processing La_Plume_bpt6k1212187t_15-11-1893...


  ✓ Evaluated 6 pages

✓ Total pages evaluated: 6


In [7]:
"""
Aggregate Statistics Across All Pages

Calculate means, medians, and standard deviations for all metrics.
Compare Mistral vs BnF performance.
"""

def compute_aggregate_stats(all_pages: List[Dict]) -> Dict:
    """
    Compute aggregate statistics across all evaluated pages.

    For each system ('mistral', 'bnf') and each normalization level
    ('strict', 'standard', 'letters_only'), collects every per-page metric
    and summarizes it with mean, median, std (numpy default, i.e. population
    std), min, max and n. Levels absent from a page are skipped for that
    page; a metric with no usable values is reported as all zeros with n=0.
    Non-finite values (NaN, +inf, -inf) are excluded from the summaries.

    Args:
        all_pages: Page results; each is expected to hold a per-system dict
            of per-level metric dicts plus the *_chars / *_words length keys.

    Returns:
        Dict with statistics for each metric at each normalization level under
        'mistral' and 'bnf', and mean page lengths under 'length' (NaN when
        all_pages is empty).
    """
    metric_names = (
        'word_precision', 'word_recall', 'word_f1',
        'shared_words', 'unique_to_hyp', 'unique_to_ref'
    )

    stats = {
        'mistral': {},
        'bnf': {}
    }

    for system in ('mistral', 'bnf'):
        for level in ('strict', 'standard', 'letters_only'):
            collected = {name: [] for name in metric_names}

            for page in all_pages:
                # Skip levels (e.g. letters_only) not present in page data
                if level not in page[system]:
                    continue

                metrics = page[system][level]
                for name in metric_names:
                    value = metrics.get(name, 0)
                    # Keep finite numbers only: drops NaN and both infinities.
                    if np.isfinite(value):
                        collected[name].append(value)

            # Compute statistics
            stats[system][level] = {}
            for name, values in collected.items():
                if values:
                    stats[system][level][name] = {
                        'mean': np.mean(values),
                        'median': np.median(values),
                        'std': np.std(values),
                        'min': np.min(values),
                        'max': np.max(values),
                        'n': len(values)
                    }
                else:
                    stats[system][level][name] = {
                        'mean': 0, 'median': 0, 'std': 0,
                        'min': 0, 'max': 0, 'n': 0
                    }

    def _mean_length(key: str):
        # np.mean([]) yields NaN but emits a RuntimeWarning; return the same
        # NaN quietly when there are no pages.
        if not all_pages:
            return np.float64(np.nan)
        return np.mean([page[key] for page in all_pages])

    # Length statistics
    stats['length'] = {
        key: _mean_length(key)
        for key in (
            'gold_chars', 'pred_chars', 'bnf_chars',
            'gold_words', 'pred_words', 'bnf_words'
        )
    }

    return stats


# Aggregate the per-page results, then print them level by level
print("\n" + "=" * 60, "Aggregate Statistics", "=" * 60 + "\n", sep="\n")

aggregate_stats = compute_aggregate_stats(all_pages)

for level in ('strict', 'standard'):
    print(f"\n{level.upper().replace('_', ' ')} Normalization:")
    print("-" * 60)

    mistral_stats = aggregate_stats['mistral'][level]
    bnf_stats = aggregate_stats['bnf'][level]

    print("\nWord Coverage (Bag-of-Words, order-agnostic):")

    # Mean (± std) per system for each ratio metric; F1 also gets the gap line.
    for heading, metric_key in (
        ("  F1 Score:", 'word_f1'),
        ("\n  Precision (% of extracted words that are in gold):", 'word_precision'),
        ("\n  Recall (% of gold words that were extracted):", 'word_recall'),
    ):
        mistral_metric = mistral_stats[metric_key]
        bnf_metric = bnf_stats[metric_key]
        print(heading)
        print(f"    Mistral:  {mistral_metric['mean']:.3f} (±{mistral_metric['std']:.3f})")
        print(f"    BnF:      {bnf_metric['mean']:.3f} (±{bnf_metric['std']:.3f})")
        if metric_key == 'word_f1':
            print(f"    Δ Mistral advantage: {(mistral_metric['mean'] - bnf_metric['mean'])*100:+.1f} percentage points")

    # Mean unique-word counts per page, shown as "Mistral vs BnF".
    print("\n  Average word counts per page:")
    for caption, count_key in (
        ("    Shared (both found):     ", 'shared_words'),
        ("    Unique to hypothesis:    ", 'unique_to_hyp'),
        ("    Missing from hypothesis: ", 'unique_to_ref'),
    ):
        print(f"{caption}{mistral_stats[count_key]['mean']:.0f} vs {bnf_stats[count_key]['mean']:.0f}")

# Mean page lengths and their ratio to the gold standard
length_stats = aggregate_stats['length']

print(f"\n\n{'='*60}")
print("Length Statistics")
print("=" * 60)
print("Average page length:")
for caption, source_key in (
    ("  Gold Standard:       ", 'gold'),
    ("  Mistral Predictions: ", 'pred'),
    ("  BnF OCR:             ", 'bnf'),
):
    print(f"{caption}{length_stats[source_key + '_chars']:.0f} chars, {length_stats[source_key + '_words']:.0f} words")
print(f"\n  Mistral/Gold ratio:  {length_stats['pred_chars']/length_stats['gold_chars']:.2f}")
print(f"  BnF/Gold ratio:      {length_stats['bnf_chars']/length_stats['gold_chars']:.2f}")
print("=" * 60)


Aggregate Statistics


STRICT Normalization:
------------------------------------------------------------

Word Coverage (Bag-of-Words, order-agnostic):
  F1 Score:
    Mistral:  0.946 (±0.072)
    BnF:      0.817 (±0.023)
    Δ Mistral advantage: +12.8 percentage points

  Precision (% of extracted words that are in gold):
    Mistral:  0.973 (±0.023)
    BnF:      0.790 (±0.036)

  Recall (% of gold words that were extracted):
    Mistral:  0.927 (±0.118)
    BnF:      0.848 (±0.021)

  Average word counts per page:
    Shared (both found):     298 vs 263
    Unique to hypothesis:    7 vs 68
    Missing from hypothesis: 11 vs 46

STANDARD Normalization:
------------------------------------------------------------

Word Coverage (Bag-of-Words, order-agnostic):
  F1 Score:
    Mistral:  0.946 (±0.072)
    BnF:      0.817 (±0.023)
    Δ Mistral advantage: +12.8 percentage points

  Precision (% of extracted words that are in gold):
    Mistral:  0.973 (±0.023)
    BnF:      0.790 (±0.0

In [8]:
"""
Create Final Summary Table

Synthesize all findings into a comprehensive comparison table.
"""

def create_summary_table(stats: Optional[Dict] = None) -> pd.DataFrame:
    """
    Create comprehensive summary table comparing Mistral vs BnF.

    One row is produced per normalization level ('strict', 'standard') and
    metric (F1 Score, Precision, Recall), using the mean of each metric.

    Args:
        stats: Aggregate statistics shaped like the output of
            compute_aggregate_stats (stats[system][level][metric]['mean']).
            When omitted, the notebook-level ``aggregate_stats`` is used, which
            keeps existing zero-argument calls working.

    Returns:
        DataFrame with columns Normalization, Metric, Mistral, BnF and
        Difference (Mistral minus BnF, in percentage points); all values are
        pre-formatted strings.
    """
    if stats is None:
        # Backward-compatible default: read the notebook-level result.
        stats = aggregate_stats

    # (display label, key in the stats dict), in output order.
    metric_rows = (
        ('F1 Score', 'word_f1'),
        ('Precision', 'word_precision'),
        ('Recall', 'word_recall'),
    )

    summary_data = []

    for level in ('strict', 'standard'):
        mistral_stats = stats['mistral'][level]
        bnf_stats = stats['bnf'][level]
        level_label = level.replace('_', ' ').title()

        for metric_label, metric_key in metric_rows:
            mistral_mean = mistral_stats[metric_key]['mean']
            bnf_mean = bnf_stats[metric_key]['mean']

            summary_data.append({
                'Normalization': level_label,
                'Metric': metric_label,
                'Mistral': f"{mistral_mean:.3f}",
                'BnF': f"{bnf_mean:.3f}",
                'Difference': f"{(mistral_mean - bnf_mean)*100:+.1f}pp"
            })

    return pd.DataFrame(summary_data)


# Print the summary table followed by the narrative key findings
print(
    "\n" + "=" * 80,
    "STAGE 1 BnF COMPARATIVE EVALUATION - FINAL SUMMARY",
    "=" * 80,
    "",
    sep="\n",
)

summary_df = create_summary_table()
print(summary_df.to_string(index=False))

print("\n" + "=" * 80, "KEY FINDINGS", "=" * 80, sep="\n")

# The key findings are reported at the 'standard' normalization level only.
standard_mistral = aggregate_stats['mistral']['standard']
standard_bnf = aggregate_stats['bnf']['standard']

mistral_f1, bnf_f1 = standard_mistral['word_f1']['mean'], standard_bnf['word_f1']['mean']
mistral_prec, bnf_prec = standard_mistral['word_precision']['mean'], standard_bnf['word_precision']['mean']
mistral_rec, bnf_rec = standard_mistral['word_recall']['mean'], standard_bnf['word_recall']['mean']

mistral_shared, bnf_shared = standard_mistral['shared_words']['mean'], standard_bnf['shared_words']['mean']
mistral_missing, bnf_missing = standard_mistral['unique_to_ref']['mean'], standard_bnf['unique_to_ref']['mean']

print(f"""
1. OVERALL WORD COVERAGE (F1 Score):
   - Mistral: {mistral_f1:.1%}
   - BnF:     {bnf_f1:.1%}
   - Difference: {(mistral_f1 - bnf_f1)*100:+.1f} percentage points in {"Mistral's" if mistral_f1 > bnf_f1 else "BnF's"} favor

2. PRECISION (% of extracted words that are correct):
   - Mistral: {mistral_prec:.1%}
   - BnF:     {bnf_prec:.1%}
   - Interpretation: {"Mistral has fewer false positives" if mistral_prec > bnf_prec else "BnF has fewer false positives"}

3. RECALL (% of gold words that were extracted):
   - Mistral: {mistral_rec:.1%}
   - BnF:     {bnf_rec:.1%}
   - Interpretation: {"Mistral misses fewer words" if mistral_rec > bnf_rec else "BnF misses fewer words"}

4. AVERAGE WORDS PER PAGE:
   - Mistral: {mistral_shared:.0f} correct, {mistral_missing:.0f} missed
   - BnF:     {bnf_shared:.0f} correct, {bnf_missing:.0f} missed

NOTE: These are order-agnostic metrics. Word extraction order differences
      (e.g., extracting columns in different sequence) do not affect scores.
""")

print("=" * 80)


STAGE 1 BnF COMPARATIVE EVALUATION - FINAL SUMMARY

Normalization    Metric Mistral   BnF Difference
       Strict  F1 Score   0.946 0.817    +12.8pp
       Strict Precision   0.973 0.790    +18.4pp
       Strict    Recall   0.927 0.848     +7.9pp
     Standard  F1 Score   0.946 0.817    +12.8pp
     Standard Precision   0.973 0.790    +18.4pp
     Standard    Recall   0.927 0.848     +7.9pp

KEY FINDINGS

1. OVERALL WORD COVERAGE (F1 Score):
   - Mistral: 94.6%
   - BnF:     81.7%
   - Difference: +12.8 percentage points in Mistral's favor

2. PRECISION (% of extracted words that are correct):
   - Mistral: 97.3%
   - BnF:     79.0%
   - Interpretation: Mistral has fewer false positives

3. RECALL (% of gold words that were extracted):
   - Mistral: 92.7%
   - BnF:     84.8%
   - Interpretation: Mistral misses fewer words

4. AVERAGE WORDS PER PAGE:
   - Mistral: 298 correct, 11 missed
   - BnF:     263 correct, 46 missed

NOTE: These are order-agnostic metrics. Word extraction order