In [None]:
import sys
from pathlib import Path
import json
from typing import Dict, List, Tuple, Optional, Set
from collections import defaultdict
from difflib import SequenceMatcher
import Levenshtein
import re
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Path setup
PROJECT_ROOT = Path.cwd().parent
sys.path.append(str(PROJECT_ROOT))

# Import schemas for validation
from schemas.stage1_page import Stage1PageModel

# Paths
GOLD_DIR = PROJECT_ROOT / "data" / "gold_standard" / "cleaned"
PRED_DIR = PROJECT_ROOT / "data" / "interim_pages" / "La_Plume_bpt6k1185893k_1_10_1889"

print("Stage 1 OCR Evaluation")
print("\n")
print(f"Project root: {PROJECT_ROOT}")
print(f"Gold standard: {GOLD_DIR}")
print(f"Predictions: {PRED_DIR}")

# Find common files
def load_page_pairs() -> List[Tuple[Path, Path]]:
    """
    Match gold standard files with prediction files by filename.
    Returns list of (gold_path, pred_path) tuples.
    """
    gold_files = {f.name: f for f in GOLD_DIR.glob("*.json")}
    pred_files = {f.name: f for f in PRED_DIR.glob("*.json")}
    
    common_names = set(gold_files.keys()) & set(pred_files.keys())
    
    pairs = [(gold_files[name], pred_files[name]) for name in sorted(common_names)]
    
    print(f"\nDataset:")
    print(f"  Gold files: {len(gold_files)}")
    print(f"  Pred files: {len(pred_files)}")
    print(f"  Matching pairs: {len(pairs)}")
    
    if len(pairs) < len(gold_files):
        missing = set(gold_files.keys()) - set(pred_files.keys())
        print(f"Warning: {len(missing)} gold standard pages without predictions:")
        for name in sorted(missing):
            print(f"   - {name}")
    
    return pairs

page_pairs = load_page_pairs()

Stage 1 OCR Evaluation


Project root: /home/fabian-ramirez/Documents/These/Code/magazine_graphs
Gold standard: /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/gold_standard/cleaned
Predictions: /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/interim_pages/La_Plume_bpt6k1185893k_1_10_1889

Dataset:
  Gold files: 14
  Pred files: 14
  Matching pairs: 14


In [3]:
"""
Item Matching Functions
Match gold items to predicted items using content-based text similarity.
"""

# Configuration
SIMILARITY_THRESHOLD = 0.7  # Minimum text similarity to consider a match (0.0-1.0)

print("\n")
print("Item Matching Configuration")
print("\n")
print(f"Similarity threshold: {SIMILARITY_THRESHOLD}")


def normalize_text(text: str) -> str:
    """
    Normalize text for similarity comparison.
    
    """
    # Lowercase
    text = text.lower()
    
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    # Normalize all whitespace (spaces, tabs, newlines) to single spaces
    text = re.sub(r'\s+', ' ', text)
    
    # Strip leading and trailing whitespace
    text = text.strip()
    
    return text


def text_similarity(text1: str, text2: str) -> float:
    """
    Calculate similarity ratio between two texts using SequenceMatcher.
        
    Returns:
        Float between 0.0 (completely different) and 1.0 (identical)
    """
    t1 = normalize_text(text1)
    t2 = normalize_text(text2)
    
    if not t1 and not t2:
        return 1.0
    if not t1 or not t2:
        return 0.0
    
    return SequenceMatcher(None, t1, t2).ratio()


def match_items(
    gold_items: List[Dict], 
    pred_items: List[Dict],
    similarity_threshold: float = SIMILARITY_THRESHOLD
) -> Tuple[List[Tuple[int, int, float]], Set[int], Set[int]]:
    """
    Match gold items to prediction items using greedy best-match algorithm.
    
    Algorithm:
        For each gold item, find the best-matching unmatched pred item.
        Accept the match if similarity exceeds threshold.
    
    Args:
        gold_items: List of gold standard items
        pred_items: List of predicted items
        similarity_threshold: Minimum similarity score to consider a match
    
    Returns:
        Tuple of:
        - matches: List of (gold_idx, pred_idx, similarity_score)
        - unmatched_gold: Set of gold indices with no match
        - unmatched_pred: Set of pred indices with no match
    """
    matches = []
    matched_pred_indices = set()
    unmatched_gold = set()
    
    for gold_idx, gold_item in enumerate(gold_items):
        gold_text = gold_item.get('item_text_raw', '')
        
        best_score = 0.0
        best_pred_idx = None
        
        for pred_idx, pred_item in enumerate(pred_items):
            if pred_idx in matched_pred_indices:
                continue
            
            pred_text = pred_item.get('item_text_raw', '')
            score = text_similarity(gold_text, pred_text)
            
            if score > best_score:
                best_score = score
                best_pred_idx = pred_idx
        
        if best_score >= similarity_threshold and best_pred_idx is not None:
            matches.append((gold_idx, best_pred_idx, best_score))
            matched_pred_indices.add(best_pred_idx)
        else:
            unmatched_gold.add(gold_idx)
    
    unmatched_pred = set(range(len(pred_items))) - matched_pred_indices
    
    return matches, unmatched_gold, unmatched_pred


def load_and_match_page(
    gold_path: Path, 
    pred_path: Path,
    similarity_threshold: float = SIMILARITY_THRESHOLD
) -> Dict:
    """
    Load a page pair and match items.
    
    Args:
        gold_path: Path to gold standard JSON
        pred_path: Path to prediction JSON
        similarity_threshold: Minimum similarity for matching
    
    Returns:
        Dict with:
        - gold_items: All gold items
        - pred_items: All pred items
        - matches: List of (gold_idx, pred_idx, score) tuples
        - unmatched_gold: Set of unmatched gold indices
        - unmatched_pred: Set of unmatched pred indices
        - page_name: Filename
    """
    with open(gold_path, 'r', encoding='utf-8') as f:
        gold_data = json.load(f)
    gold_page = Stage1PageModel.model_validate(gold_data)
    gold_data = gold_page.model_dump()
    
    with open(pred_path, 'r', encoding='utf-8') as f:
        pred_data = json.load(f)
    pred_page = Stage1PageModel.model_validate(pred_data)
    pred_data = pred_page.model_dump()
    
    gold_items = gold_data.get('items', [])
    pred_items = pred_data.get('items', [])
    
    matches, unmatched_gold, unmatched_pred = match_items(
        gold_items, pred_items, similarity_threshold
    )
    
    return {
        'gold_items': gold_items,
        'pred_items': pred_items,
        'matches': matches,
        'unmatched_gold': unmatched_gold,
        'unmatched_pred': unmatched_pred,
        'page_name': gold_path.name
    }


# Test matching on first page
print("\n")
print("Item Matching Test")
print("\n")

if page_pairs:
    test_gold, test_pred = page_pairs[0]
    test_result = load_and_match_page(test_gold, test_pred)
    
    print(f"\nTest page: {test_result['page_name']}")
    print(f"  Gold items: {len(test_result['gold_items'])}")
    print(f"  Pred items: {len(test_result['pred_items'])}")
    print(f"  Matches found: {len(test_result['matches'])}")
    print(f"  Unmatched gold: {len(test_result['unmatched_gold'])}")
    print(f"  Unmatched pred: {len(test_result['unmatched_pred'])}")
    
    if test_result['matches']:
        avg_score = sum(score for _, _, score in test_result['matches']) / len(test_result['matches'])
        print(f"  Average match quality: {avg_score:.2%}")



Item Matching Configuration


Similarity threshold: 0.7


Item Matching Test



Test page: La_Plume_bpt6k1185893k_1_10_1889__page-001.json
  Gold items: 8
  Pred items: 5
  Matches found: 4
  Unmatched gold: 4
  Unmatched pred: 1
  Average match quality: 92.47%


In [4]:
"""
Page-Level Diagnostics
Generate diagnostic metrics for each page based on item matches.
"""

def diagnose_page(page_id: str, gold_items: list, pred_items: list, matches: list) -> dict:
    """
    Generate diagnostic metrics for a single page.
    
    Args:
        page_id: Page identifier
        gold_items: List of gold standard items
        pred_items: List of predicted items
        matches: List of (gold_idx, pred_idx, score) tuples
        
    Returns:
        Dictionary with diagnostic metrics
    """
    # Count items by class
    gold_by_class = {}
    pred_by_class = {}
    
    for item in gold_items:
        item_class = item['item_class']
        gold_by_class[item_class] = gold_by_class.get(item_class, 0) + 1
    
    for item in pred_items:
        item_class = item['item_class']
        pred_by_class[item_class] = pred_by_class.get(item_class, 0) + 1
    
    # Count contributions (prose + verse)
    gold_contrib = gold_by_class.get('prose', 0) + gold_by_class.get('verse', 0)
    pred_contrib = pred_by_class.get('prose', 0) + pred_by_class.get('verse', 0)
    
    # Filter matches by contribution class
    contrib_matches = [
        (g_idx, p_idx, score) for g_idx, p_idx, score in matches
        if gold_items[g_idx]['item_class'] in ('prose', 'verse')
    ]
    
    # Calculate match rates
    match_rate = (len(matches) / len(gold_items) * 100) if gold_items else 0
    contrib_match_rate = (len(contrib_matches) / gold_contrib * 100) if gold_contrib else 0
    
    # Calculate average similarity
    avg_similarity = (sum(score for _, _, score in matches) / len(matches)) if matches else 0
    
    # Count continuation flags
    gold_cont_in = sum(1 for item in gold_items if item.get('is_continuation') is True)
    pred_cont_in = sum(1 for item in pred_items if item.get('is_continuation') is True)
    gold_cont_out = sum(1 for item in gold_items if item.get('continues_on_next_page') is True)
    pred_cont_out = sum(1 for item in pred_items if item.get('continues_on_next_page') is True)
    
    # Track matched indices
    matched_gold = {g_idx for g_idx, _, _ in matches}
    matched_pred = {p_idx for _, p_idx, _ in matches}
    
    unmatched_gold = [i for i in range(len(gold_items)) if i not in matched_gold]
    unmatched_pred = [i for i in range(len(pred_items)) if i not in matched_pred]
    
    # Count matches by class
    matches_by_class = {}
    for g_idx, p_idx, score in matches:
        item_class = gold_items[g_idx]['item_class']
        matches_by_class[item_class] = matches_by_class.get(item_class, 0) + 1
    
    return {
        'page_id': page_id,
        'gold_items': len(gold_items),
        'pred_items': len(pred_items),
        'matched': len(matches),
        'match_rate': match_rate,
        'contrib_match_rate': contrib_match_rate,
        'avg_similarity': avg_similarity,
        'gold_cont_in': gold_cont_in,
        'pred_cont_in': pred_cont_in,
        'gold_cont_out': gold_cont_out,
        'pred_cont_out': pred_cont_out,
        'gold_by_class': gold_by_class,
        'pred_by_class': pred_by_class,
        'matches_by_class': matches_by_class,
        'gold_contrib': gold_contrib,
        'pred_contrib': pred_contrib,
        'contrib_matched': len(contrib_matches),
        'unmatched_gold': unmatched_gold,
        'unmatched_pred': unmatched_pred
    }


def flag_page(metrics: dict) -> str:
    """
    Generate flags for problematic pages based on metrics.
    
    Args:
        metrics: Dictionary from diagnose_page()
        
    Returns:
        Comma-separated string of flags, or empty string if no issues
    """
    flags = []
    
    if metrics['pred_items'] == 0:
        flags.append('ZERO_PREDS')
    
    if metrics['matched'] == 0:
        flags.append('ZERO_MATCHES')
    
    if metrics['match_rate'] < 50:
        flags.append('LOW_MATCH')
    
    if metrics['gold_contrib'] > 0 and metrics['contrib_match_rate'] < 60:
        flags.append('LOW_CONTRIB')
    
    if abs(metrics['gold_items'] - metrics['pred_items']) >= 3:
        flags.append('COUNT_MISMATCH')
    
    return ', '.join(flags)


def run_diagnostics(page_pairs: list) -> pd.DataFrame:
    """
    Run diagnostics on all pages and generate summary table and detailed reports.
    
    Args:
        page_pairs: List of (gold_path, pred_path) tuples from load_page_pairs()
        
    Returns:
        DataFrame with summary metrics for all pages
    """
    print("Running diagnostics on all pages...\n")
    
    all_metrics = []
    
    for gold_path, pred_path in page_pairs:
        # Extract page_id from filename
        page_id = gold_path.stem
        
        # Load and match page
        result = load_and_match_page(gold_path, pred_path)
        gold_items = result['gold_items']
        pred_items = result['pred_items']
        matches = result['matches']
        
        # Generate metrics
        metrics = diagnose_page(page_id, gold_items, pred_items, matches)
        metrics['flags'] = flag_page(metrics)
        all_metrics.append(metrics)
    
    # Create summary DataFrame
    summary_data = []
    for m in all_metrics:
        summary_data.append({
            'page_id': m['page_id'],
            'gold_items': m['gold_items'],
            'pred_items': m['pred_items'],
            'matched': m['matched'],
            'match_rate_%': round(m['match_rate'], 1),
            'contrib_match_rate_%': round(m['contrib_match_rate'], 1),
            'avg_similarity': round(m['avg_similarity'], 3),
            'gold_cont_in': m['gold_cont_in'],
            'pred_cont_in': m['pred_cont_in'],
            'gold_cont_out': m['gold_cont_out'],
            'pred_cont_out': m['pred_cont_out'],
            'flags': m['flags']
        })
    
    summary_df = pd.DataFrame(summary_data)
    
    # Print summary table
    print("\n")
    print("SUMMARY TABLE")
    print("\n")
    print(summary_df.to_string(index=False))
    print("\n")
    
    # Print detailed reports for all pages
    print("="*80)
    print("DETAILED REPORTS")
    print("="*80)
    
    for m in all_metrics:
        print(f"\n=== Page {m['page_id']} ===")
        print(f"Items: {m['gold_items']} gold, {m['pred_items']} pred")
        print(f"Matches: {m['matched']} ({m['match_rate']:.1f}% match rate)")
        
        print("\nBy class:")
        all_classes = sorted(set(m['gold_by_class'].keys()) | set(m['pred_by_class'].keys()))
        for cls in all_classes:
            gold_count = m['gold_by_class'].get(cls, 0)
            pred_count = m['pred_by_class'].get(cls, 0)
            matched_count = m['matches_by_class'].get(cls, 0)
            match_pct = (matched_count / gold_count * 100) if gold_count > 0 else 0
            print(f"  {cls:10s} {gold_count} gold, {pred_count} pred, {matched_count} matched ({match_pct:.1f}%)")

        
        print(f"\nContributions: {m['gold_contrib']} gold, {m['pred_contrib']} pred, "
              f"{m['contrib_matched']} matched ({m['contrib_match_rate']:.1f}%)")
        print(f"Avg similarity: {m['avg_similarity']:.3f}")
        
        print(f"\nContinuations:")
        print(f"  is_continuation: {m['gold_cont_in']} gold, {m['pred_cont_in']} pred")
        print(f"  continues_on_next_page: {m['gold_cont_out']} gold, {m['pred_cont_out']} pred")
        
        print(f"\nUnmatched gold items: {m['unmatched_gold']}")
        print(f"Unmatched pred items: {m['unmatched_pred']}")
        
        if m['flags']:
            print(f"\nFLAGS: {m['flags']}")
    
    return summary_df


# Run diagnostics
diagnostic_df = run_diagnostics(page_pairs)

Running diagnostics on all pages...



SUMMARY TABLE


                                   page_id  gold_items  pred_items  matched  match_rate_%  contrib_match_rate_%  avg_similarity  gold_cont_in  pred_cont_in  gold_cont_out  pred_cont_out                                                            flags
La_Plume_bpt6k1185893k_1_10_1889__page-001           8           5        4          50.0                   0.0           0.925             0             0              0              0                                                   COUNT_MISMATCH
La_Plume_bpt6k1185893k_1_10_1889__page-002           2           2        2         100.0                   0.0           1.000             0             0              0              0                                                                 
La_Plume_bpt6k1185893k_1_10_1889__page-003           3           3        1          33.3                   0.0           0.713             0             1              1              2       

In [5]:
"""
Evaluation Helpers
Utility functions for filtering matches and loading all pages efficiently.
These helpers are used by the evaluation cells that follow.
"""

def filter_matches_by_class(
    matches: List[Tuple[int, int, float]],
    gold_items: List[Dict],
    item_classes: List[str]
) -> List[Tuple[int, int, float]]:
    """
    Filter matches to only include items of specified classes.
    
    Args:
        matches: List of (gold_idx, pred_idx, score) tuples
        gold_items: List of gold standard items
        item_classes: List of classes to include (e.g., ['prose', 'verse'])
    
    Returns:
        Filtered list of matches
    """
    return [
        (g_idx, p_idx, score) 
        for g_idx, p_idx, score in matches
        if gold_items[g_idx]['item_class'] in item_classes
    ]


def get_matched_pairs(
    matches: List[Tuple[int, int, float]],
    gold_items: List[Dict],
    pred_items: List[Dict]
) -> List[Tuple[Dict, Dict, float]]:
    """
    Convert match indices to actual item pairs.
    
    Args:
        matches: List of (gold_idx, pred_idx, score) tuples
        gold_items: List of gold standard items
        pred_items: List of predicted items
    
    Returns:
        List of (gold_item, pred_item, similarity_score) tuples
    """
    return [
        (gold_items[g_idx], pred_items[p_idx], score)
        for g_idx, p_idx, score in matches
    ]


def load_all_pages(page_pairs: List[Tuple[Path, Path]]) -> List[Dict]:
    """
    Load and match all pages at once for efficient batch evaluation.
    
    Args:
        page_pairs: List of (gold_path, pred_path) tuples from load_page_pairs()
    
    Returns:
        List of dictionaries, one per page, each containing:
        - page_id: Page identifier
        - gold_items: All gold items
        - pred_items: All pred items
        - matches: List of (gold_idx, pred_idx, score) tuples
        - unmatched_gold: Set of unmatched gold indices
        - unmatched_pred: Set of unmatched pred indices
    """
    all_pages = []
    
    for gold_path, pred_path in page_pairs:
        result = load_and_match_page(gold_path, pred_path)
        result['page_id'] = gold_path.stem
        all_pages.append(result)
    
    return all_pages


# Load all pages once for reuse in subsequent evaluation cells
print("Loading and matching all pages...")
all_pages = load_all_pages(page_pairs)
print(f"Loaded {len(all_pages)} pages")
print(f"Total matches across all pages: {sum(len(page['matches']) for page in all_pages)}")


Loading and matching all pages...
Loaded 14 pages
Total matches across all pages: 37


In [6]:
"""
Text Quality Evaluation
Calculate CER and WER using two complementary approaches:
1. Order-agnostic: Pure OCR quality regardless of reading order
2. Structure-aware: OCR quality on properly aligned content via matching

Each approach calculates three normalization levels:
- Strict: Preserves all whitespace (including \n vs \n\n differences)
- Standard: Normalizes whitespace to single spaces (RECOMMENDED)
- Letters Only: Removes all whitespace and punctuation (pure character recognition)

References:
- Flexible Character Accuracy (FCA) for handling reading order issues:
  https://ocr-d.de/en/spec/ocrd_eval.html
- Token sort ratio for order-agnostic OCR comparison:
  https://urban-institute.medium.com/choosing-the-right-ocr-service-for-extracting-text-data-d7830399ec5
- Unicode normalization and whitespace handling in OCR evaluation:
  https://ocr-d.de/en/spec/ocrd_eval.html
"""

import unicodedata


def normalize_text_strict(text: str) -> str:
    """
    Strict normalization: only Unicode NFC normalization.
    Preserves all whitespace, punctuation, and capitalization.
    """
    return unicodedata.normalize('NFC', text)


def normalize_text_standard(text: str) -> str:
    """
    Standard normalization for fair OCR evaluation:
    - Unicode NFC normalization
    - All whitespace (spaces, tabs, newlines) → single space
    - Preserves punctuation and capitalization
    """
    text = unicodedata.normalize('NFC', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text


def normalize_text_letters_only(text: str) -> str:
    """
    Letter-only normalization for pure character recognition quality:
    - Unicode NFC normalization
    - Remove all whitespace
    - Remove all punctuation
    - Preserves capitalization and diacritics
    """
    text = unicodedata.normalize('NFC', text)
    text = re.sub(r'[^\w]', '', text)
    return text


def character_error_rate(reference: str, hypothesis: str, normalization: str = 'strict') -> float:
    """
    Calculate Character Error Rate using Levenshtein distance.
    
    Args:
        reference: Ground truth text
        hypothesis: OCR output text
        normalization: 'strict', 'standard', or 'letters_only'
    
    Returns:
        CER = (insertions + deletions + substitutions) / total_reference_chars
    """
    # Apply normalization
    if normalization == 'strict':
        ref = normalize_text_strict(reference)
        hyp = normalize_text_strict(hypothesis)
    elif normalization == 'standard':
        ref = normalize_text_standard(reference)
        hyp = normalize_text_standard(hypothesis)
    elif normalization == 'letters_only':
        ref = normalize_text_letters_only(reference)
        hyp = normalize_text_letters_only(hypothesis)
    else:
        ref = reference
        hyp = hypothesis
    
    if not ref:
        return 1.0 if hyp else 0.0
    distance = Levenshtein.distance(ref, hyp)
    return distance / len(ref)


def word_error_rate(reference: str, hypothesis: str, normalization: str = 'strict') -> float:
    """
    Calculate Word Error Rate using Levenshtein distance on words.
    
    Args:
        reference: Ground truth text
        hypothesis: OCR output text
        normalization: 'strict', 'standard', or 'letters_only'
    
    Returns:
        WER = (insertions + deletions + substitutions) / total_reference_words
    """
    # Apply normalization
    if normalization == 'strict':
        ref = normalize_text_strict(reference)
        hyp = normalize_text_strict(hypothesis)
    elif normalization == 'standard':
        ref = normalize_text_standard(reference)
        hyp = normalize_text_standard(hypothesis)
    elif normalization == 'letters_only':
        # For letters only, WER doesn't make sense without word boundaries
        # So we use standard normalization
        ref = normalize_text_standard(reference)
        hyp = normalize_text_standard(hypothesis)
    else:
        ref = reference
        hyp = hypothesis
    
    ref_words = ref.split()
    hyp_words = hyp.split()
    if not ref_words:
        return 1.0 if hyp_words else 0.0
    distance = Levenshtein.distance(ref_words, hyp_words)
    return distance / len(ref_words)


def token_sort_text(text: str) -> str:
    """
    Sort tokens (words) alphabetically for order-agnostic comparison.
    This removes the impact of reading order on text similarity.
    """
    tokens = text.split()
    return ' '.join(sorted(tokens))


def evaluate_order_agnostic(gold_items: List[Dict], pred_items: List[Dict], 
                            item_classes: Optional[List[str]] = None) -> Dict:
    """
    Evaluate text quality without considering reading order.
    Uses token sort ratio approach - sorts all words before comparison.
    Calculates three normalization levels: strict, standard, letters_only.
    
    Args:
        gold_items: List of gold standard items
        pred_items: List of predicted items
        item_classes: If provided, filter to only these classes
    
    Returns:
        Dict with CER, WER for each normalization level, and text statistics
    """
    # Filter by class if specified
    if item_classes:
        gold_items = [item for item in gold_items if item['item_class'] in item_classes]
        pred_items = [item for item in pred_items if item['item_class'] in item_classes]
    
    # Concatenate all text
    gold_text = ' '.join(item.get('item_text_raw', '') for item in gold_items)
    pred_text = ' '.join(item.get('item_text_raw', '') for item in pred_items)
    
    # Sort tokens for order-agnostic comparison
    gold_sorted = token_sort_text(gold_text)
    pred_sorted = token_sort_text(pred_text)
    
    # Calculate for all three normalization levels
    results = {
        'cer_strict': character_error_rate(gold_sorted, pred_sorted, 'strict'),
        'wer_strict': word_error_rate(gold_sorted, pred_sorted, 'strict'),
        'cer_standard': character_error_rate(gold_sorted, pred_sorted, 'standard'),
        'wer_standard': word_error_rate(gold_sorted, pred_sorted, 'standard'),
        'cer_letters': character_error_rate(gold_sorted, pred_sorted, 'letters_only'),
        'gold_chars': len(gold_text),
        'pred_chars': len(pred_text),
        'gold_words': len(gold_text.split()),
        'pred_words': len(pred_text.split())
    }
    
    return results


def evaluate_structure_aware(gold_items: List[Dict], pred_items: List[Dict],
                             matches: List[Tuple[int, int, float]],
                             item_classes: Optional[List[str]] = None) -> Dict:
    """
    Evaluate text quality on matched pairs, respecting document structure.
    Only compares content that was successfully aligned via matching.
    Calculates three normalization levels: strict, standard, letters_only.
    
    Args:
        gold_items: List of gold standard items
        pred_items: List of predicted items
        matches: List of (gold_idx, pred_idx, score) tuples
        item_classes: If provided, filter matches to only these classes
    
    Returns:
        Dict with matched CER/WER for each normalization level and unmatched content statistics
    """
    # Filter matches by class if specified
    if item_classes:
        filtered_matches = filter_matches_by_class(matches, gold_items, item_classes)
    else:
        filtered_matches = matches
    
    # Get matched pairs
    matched_pairs = get_matched_pairs(filtered_matches, gold_items, pred_items)
    
    # Calculate CER/WER on matched content for all normalization levels
    if matched_pairs:
        # Concatenate matched texts in gold order
        gold_matched_text = ' '.join(gold_item.get('item_text_raw', '') 
                                     for gold_item, _, _ in matched_pairs)
        pred_matched_text = ' '.join(pred_item.get('item_text_raw', '') 
                                     for _, pred_item, _ in matched_pairs)
        
        cer_strict = character_error_rate(gold_matched_text, pred_matched_text, 'strict')
        wer_strict = word_error_rate(gold_matched_text, pred_matched_text, 'strict')
        cer_standard = character_error_rate(gold_matched_text, pred_matched_text, 'standard')
        wer_standard = word_error_rate(gold_matched_text, pred_matched_text, 'standard')
        cer_letters = character_error_rate(gold_matched_text, pred_matched_text, 'letters_only')
        
        matched_gold_chars = len(gold_matched_text)
        matched_pred_chars = len(pred_matched_text)
    else:
        cer_strict = 0.0
        wer_strict = 0.0
        cer_standard = 0.0
        wer_standard = 0.0
        cer_letters = 0.0
        matched_gold_chars = 0
        matched_pred_chars = 0
    
    # Calculate unmatched content
    matched_gold_indices = {g_idx for g_idx, _, _ in filtered_matches}
    matched_pred_indices = {p_idx for _, p_idx, _ in filtered_matches}
    
    if item_classes:
        # Only count unmatched items of the specified classes
        unmatched_gold_items = [
            gold_items[i] for i in range(len(gold_items))
            if i not in matched_gold_indices and gold_items[i]['item_class'] in item_classes
        ]
        unmatched_pred_items = [
            pred_items[i] for i in range(len(pred_items))
            if i not in matched_pred_indices and pred_items[i]['item_class'] in item_classes
        ]
        total_gold_chars = sum(len(item.get('item_text_raw', '')) 
                              for item in gold_items if item['item_class'] in item_classes)
    else:
        unmatched_gold_items = [gold_items[i] for i in range(len(gold_items)) 
                               if i not in matched_gold_indices]
        unmatched_pred_items = [pred_items[i] for i in range(len(pred_items)) 
                               if i not in matched_pred_indices]
        total_gold_chars = sum(len(item.get('item_text_raw', '')) for item in gold_items)
    
    unmatched_gold_chars = sum(len(item.get('item_text_raw', '')) 
                               for item in unmatched_gold_items)
    unmatched_pred_chars = sum(len(item.get('item_text_raw', '')) 
                               for item in unmatched_pred_items)
    
    return {
        'cer_strict': cer_strict,
        'wer_strict': wer_strict,
        'cer_standard': cer_standard,
        'wer_standard': wer_standard,
        'cer_letters': cer_letters,
        'matched_gold_chars': matched_gold_chars,
        'matched_pred_chars': matched_pred_chars,
        'unmatched_gold_chars': unmatched_gold_chars,
        'unmatched_pred_chars': unmatched_pred_chars,
        'total_gold_chars': total_gold_chars,
        'matched_percentage': (matched_gold_chars / total_gold_chars * 100) if total_gold_chars else 0
    }


# Evaluate text quality across all pages
print("Evaluating text quality...")
print("\n")

order_agnostic_all = []
order_agnostic_contrib = []
structure_aware_all = []
structure_aware_contrib = []

for page in all_pages:
    page_id = page['page_id']
    gold_items = page['gold_items']
    pred_items = page['pred_items']
    matches = page['matches']
    
    # Order-agnostic evaluation
    oa_all = evaluate_order_agnostic(gold_items, pred_items)
    oa_all['page_id'] = page_id
    order_agnostic_all.append(oa_all)
    
    oa_contrib = evaluate_order_agnostic(gold_items, pred_items, 
                                         item_classes=['prose', 'verse'])
    oa_contrib['page_id'] = page_id
    order_agnostic_contrib.append(oa_contrib)
    
    # Structure-aware evaluation
    sa_all = evaluate_structure_aware(gold_items, pred_items, matches)
    sa_all['page_id'] = page_id
    structure_aware_all.append(sa_all)
    
    sa_contrib = evaluate_structure_aware(gold_items, pred_items, matches,
                                          item_classes=['prose', 'verse'])
    sa_contrib['page_id'] = page_id
    structure_aware_contrib.append(sa_contrib)

# Calculate averages for order-agnostic evaluation
avg_oa_all = {
    'cer_strict': sum(r['cer_strict'] for r in order_agnostic_all) / len(order_agnostic_all),
    'wer_strict': sum(r['wer_strict'] for r in order_agnostic_all) / len(order_agnostic_all),
    'cer_standard': sum(r['cer_standard'] for r in order_agnostic_all) / len(order_agnostic_all),
    'wer_standard': sum(r['wer_standard'] for r in order_agnostic_all) / len(order_agnostic_all),
    'cer_letters': sum(r['cer_letters'] for r in order_agnostic_all) / len(order_agnostic_all)
}

contrib_with_content = [r for r in order_agnostic_contrib if r['gold_chars'] > 0]
avg_oa_contrib = {
    'cer_strict': sum(r['cer_strict'] for r in contrib_with_content) / len(contrib_with_content),
    'wer_strict': sum(r['wer_strict'] for r in contrib_with_content) / len(contrib_with_content),
    'cer_standard': sum(r['cer_standard'] for r in contrib_with_content) / len(contrib_with_content),
    'wer_standard': sum(r['wer_standard'] for r in contrib_with_content) / len(contrib_with_content),
    'cer_letters': sum(r['cer_letters'] for r in contrib_with_content) / len(contrib_with_content)
}

# Calculate averages for structure-aware evaluation
sa_all_with_matches = [r for r in structure_aware_all if r['matched_gold_chars'] > 0]
avg_sa_all = {
    'cer_strict': sum(r['cer_strict'] for r in sa_all_with_matches) / len(sa_all_with_matches),
    'wer_strict': sum(r['wer_strict'] for r in sa_all_with_matches) / len(sa_all_with_matches),
    'cer_standard': sum(r['cer_standard'] for r in sa_all_with_matches) / len(sa_all_with_matches),
    'wer_standard': sum(r['wer_standard'] for r in sa_all_with_matches) / len(sa_all_with_matches),
    'cer_letters': sum(r['cer_letters'] for r in sa_all_with_matches) / len(sa_all_with_matches)
}

sa_contrib_with_matches = [r for r in structure_aware_contrib if r['matched_gold_chars'] > 0]
avg_sa_contrib = {
    'cer_strict': sum(r['cer_strict'] for r in sa_contrib_with_matches) / len(sa_contrib_with_matches),
    'wer_strict': sum(r['wer_strict'] for r in sa_contrib_with_matches) / len(sa_contrib_with_matches),
    'cer_standard': sum(r['cer_standard'] for r in sa_contrib_with_matches) / len(sa_contrib_with_matches),
    'wer_standard': sum(r['wer_standard'] for r in sa_contrib_with_matches) / len(sa_contrib_with_matches),
    'cer_letters': sum(r['cer_letters'] for r in sa_contrib_with_matches) / len(sa_contrib_with_matches)
}

# Calculate total matched percentages
total_sa_all_matched = sum(r['matched_gold_chars'] for r in structure_aware_all)
total_sa_all_gold = sum(r['total_gold_chars'] for r in structure_aware_all)
total_sa_all_unmatched = sum(r['unmatched_gold_chars'] for r in structure_aware_all)

total_sa_contrib_matched = sum(r['matched_gold_chars'] for r in structure_aware_contrib)
total_sa_contrib_gold = sum(r['total_gold_chars'] for r in structure_aware_contrib)
total_sa_contrib_unmatched = sum(r['unmatched_gold_chars'] for r in structure_aware_contrib)

# Print results
print("="*70)
print("TEXT QUALITY SUMMARY")
print("="*70)

print("\n1. ORDER-AGNOSTIC EVALUATION")
print("   (Pure OCR quality, reading order irrelevant)")
print("-"*70)

print(f"\n   All Items:")
print(f"      Strict (with all whitespace):")
print(f"         CER: {avg_oa_all['cer_strict']:.2%}  |  WER: {avg_oa_all['wer_strict']:.2%}")
print(f"      Standard (normalized whitespace):")
print(f"         CER: {avg_oa_all['cer_standard']:.2%}  |  WER: {avg_oa_all['wer_standard']:.2%}")
print(f"      Letters Only (no whitespace/punctuation):")
print(f"         CER: {avg_oa_all['cer_letters']:.2%}")

print(f"\n   Contributions Only (prose + verse):")
print(f"      Strict (with all whitespace):")
print(f"         CER: {avg_oa_contrib['cer_strict']:.2%}  |  WER: {avg_oa_contrib['wer_strict']:.2%}")
print(f"      Standard (normalized whitespace):")
print(f"         CER: {avg_oa_contrib['cer_standard']:.2%}  |  WER: {avg_oa_contrib['wer_standard']:.2%}")
print(f"      Letters Only (no whitespace/punctuation):")
print(f"         CER: {avg_oa_contrib['cer_letters']:.2%}")

print("\n" + "="*70)
print("2. STRUCTURE-AWARE EVALUATION")
print("   (OCR quality on matched content only)")
print("-"*70)

print(f"\n   Matched Content - All Items:")
print(f"      Strict (with all whitespace):")
print(f"         CER: {avg_sa_all['cer_strict']:.2%}  |  WER: {avg_sa_all['wer_strict']:.2%}")
print(f"      Standard (normalized whitespace):")
print(f"         CER: {avg_sa_all['cer_standard']:.2%}  |  WER: {avg_sa_all['wer_standard']:.2%}")
print(f"      Letters Only (no whitespace/punctuation):")
print(f"         CER: {avg_sa_all['cer_letters']:.2%}")
print(f"      Coverage: {total_sa_all_matched:,} chars matched " +
      f"({total_sa_all_matched/total_sa_all_gold*100:.1f}% of gold)")
print(f"      Unmatched: {total_sa_all_unmatched:,} chars " +
      f"({total_sa_all_unmatched/total_sa_all_gold*100:.1f}% of gold)")

print(f"\n   Matched Content - Contributions Only (prose + verse):")
print(f"      Strict (with all whitespace):")
print(f"         CER: {avg_sa_contrib['cer_strict']:.2%}  |  WER: {avg_sa_contrib['wer_strict']:.2%}")
print(f"      Standard (normalized whitespace):")
print(f"         CER: {avg_sa_contrib['cer_standard']:.2%}  |  WER: {avg_sa_contrib['wer_standard']:.2%}")
print(f"      Letters Only (no whitespace/punctuation):")
print(f"         CER: {avg_sa_contrib['cer_letters']:.2%}")
print(f"      Coverage: {total_sa_contrib_matched:,} chars matched " +
      f"({total_sa_contrib_matched/total_sa_contrib_gold*100:.1f}% of gold)")
print(f"      Unmatched: {total_sa_contrib_unmatched:,} chars " +
      f"({total_sa_contrib_unmatched/total_sa_contrib_gold*100:.1f}% of gold)")

print("\n" + "="*70)
print("INTERPRETATION GUIDE:")
print("-"*70)
print("Strict: Most conservative")
print("Standard: Fair baseline - normalizes whitespace")
print("Letters Only: Most lenient - pure character recognition quality")
print("\n" + "="*70)
print("\nKEY INSIGHTS:")
print(f"- Pure OCR quality (standard normalization): {avg_oa_all['cer_standard']:.2%}")
print(f"- Letter recognition quality: {avg_oa_all['cer_letters']:.2%}")
print(f"- Structure failures (unmatched content): {total_sa_all_unmatched/total_sa_all_gold*100:.1f}%")
print(f"- Contributions:")
print(f"    Standard CER: {avg_sa_contrib['cer_standard']:.2%}")
print(f"    Successfully matched: {total_sa_contrib_matched/total_sa_contrib_gold*100:.1f}%")
print("="*70)

Evaluating text quality...


TEXT QUALITY SUMMARY

1. ORDER-AGNOSTIC EVALUATION
   (Pure OCR quality, reading order irrelevant)
----------------------------------------------------------------------

   All Items:
      Strict (with all whitespace):
         CER: 14.83%  |  WER: 17.66%
      Standard (normalized whitespace):
         CER: 14.83%  |  WER: 17.66%
      Letters Only (no whitespace/punctuation):
         CER: 13.48%

   Contributions Only (prose + verse):
      Strict (with all whitespace):
         CER: 18.29%  |  WER: 20.85%
      Standard (normalized whitespace):
         CER: 18.29%  |  WER: 20.85%
      Letters Only (no whitespace/punctuation):
         CER: 17.94%

2. STRUCTURE-AWARE EVALUATION
   (OCR quality on matched content only)
----------------------------------------------------------------------

   Matched Content - All Items:
      Strict (with all whitespace):
         CER: 15.13%  |  WER: 21.12%
      Standard (normalized whitespace):
         CER: 14.61

In [8]:
"""
Page-by-Page Text Diagnostics
Detailed error analysis for each page with three normalization levels.
Shows error type distribution, worst performing pages, and actual text examples.
"""

import pandas as pd
from difflib import SequenceMatcher


def get_levenshtein_operations(reference: str, hypothesis: str) -> Dict[str, int]:
    """
    Get detailed Levenshtein operations breakdown.
    
    Returns:
        Dict with counts of substitutions, deletions, insertions
    """
    if not reference and not hypothesis:
        return {'substitutions': 0, 'deletions': 0, 'insertions': 0, 'total': 0}
    
    if not reference:
        return {'substitutions': 0, 'deletions': 0, 'insertions': len(hypothesis), 'total': len(hypothesis)}
    
    if not hypothesis:
        return {'substitutions': 0, 'deletions': len(reference), 'insertions': 0, 'total': len(reference)}
    
    # Use SequenceMatcher to get operations
    sm = SequenceMatcher(None, reference, hypothesis)
    
    substitutions = 0
    deletions = 0
    insertions = 0
    
    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        if tag == 'replace':
            # Both strings differ - count as substitutions
            substitutions += max(i2 - i1, j2 - j1)
        elif tag == 'delete':
            # Only in reference
            deletions += (i2 - i1)
        elif tag == 'insert':
            # Only in hypothesis
            insertions += (j2 - j1)
    
    return {
        'substitutions': substitutions,
        'deletions': deletions,
        'insertions': insertions,
        'total': substitutions + deletions + insertions
    }


def diagnose_page_text_quality(page: Dict, normalization: str = 'standard') -> Dict:
    """
    Detailed text quality diagnosis for a single page.
    
    Args:
        page: Page data from all_pages
        normalization: 'strict', 'standard', or 'letters_only'
    
    Returns:
        Dict with detailed metrics and error breakdowns
    """
    page_id = page['page_id']
    gold_items = page['gold_items']
    pred_items = page['pred_items']
    matches = page['matches']
    
    # Get matched pairs
    matched_pairs = get_matched_pairs(matches, gold_items, pred_items)
    
    if not matched_pairs:
        total_gold_chars = sum(len(item.get('item_text_raw', '')) for item in gold_items)
        return {
            'page_id': page_id,
            'cer': 0.0,
            'wer': 0.0,
            'matched_chars': 0,
            'total_gold_chars': total_gold_chars,
            'match_coverage': 0.0,
            'substitutions': 0,
            'deletions': 0,
            'insertions': 0,
            'total_errors': 0,
            'items_analyzed': []
        }
    
    # Concatenate matched text
    gold_text = ' '.join(gold_item.get('item_text_raw', '') for gold_item, _, _ in matched_pairs)
    pred_text = ' '.join(pred_item.get('item_text_raw', '') for _, pred_item, _ in matched_pairs)
    
    # Calculate CER/WER
    cer = character_error_rate(gold_text, pred_text, normalization)
    wer = word_error_rate(gold_text, pred_text, normalization)
    
    # Get error breakdown using normalized text
    if normalization == 'strict':
        gold_norm = normalize_text_strict(gold_text)
        pred_norm = normalize_text_strict(pred_text)
    elif normalization == 'standard':
        gold_norm = normalize_text_standard(gold_text)
        pred_norm = normalize_text_standard(pred_text)
    else:  # letters_only
        gold_norm = normalize_text_letters_only(gold_text)
        pred_norm = normalize_text_letters_only(pred_text)
    
    ops = get_levenshtein_operations(gold_norm, pred_norm)
    
    # Analyze individual items
    items_analyzed = []
    for gold_item, pred_item, similarity in matched_pairs:
        gold_item_text = gold_item.get('item_text_raw', '')
        pred_item_text = pred_item.get('item_text_raw', '')
        
        item_cer = character_error_rate(gold_item_text, pred_item_text, normalization)
        
        items_analyzed.append({
            'gold_class': gold_item.get('item_class'),
            'cer': item_cer,
            'gold_preview': gold_item_text[:100],
            'pred_preview': pred_item_text[:100],
            'gold_length': len(gold_item_text),
            'pred_length': len(pred_item_text)
        })
    
    total_gold_chars = sum(len(item.get('item_text_raw', '')) for item in gold_items)
    
    return {
        'page_id': page_id,
        'cer': cer,
        'wer': wer,
        'matched_chars': len(gold_text),
        'total_gold_chars': total_gold_chars,
        'match_coverage': len(gold_text) / total_gold_chars * 100 if total_gold_chars > 0 else 0,
        'substitutions': ops['substitutions'],
        'deletions': ops['deletions'],
        'insertions': ops['insertions'],
        'total_errors': ops['total'],
        'items_analyzed': items_analyzed
    }


# Diagnose all pages for all three normalizations
print("Running detailed page-by-page diagnostics...")
print("\n")

page_diagnostics_strict = []
page_diagnostics_standard = []
page_diagnostics_letters = []

for page in all_pages:
    diag_strict = diagnose_page_text_quality(page, 'strict')
    page_diagnostics_strict.append(diag_strict)
    
    diag_standard = diagnose_page_text_quality(page, 'standard')
    page_diagnostics_standard.append(diag_standard)
    
    diag_letters = diagnose_page_text_quality(page, 'letters_only')
    page_diagnostics_letters.append(diag_letters)

# Create summary DataFrames
def create_summary_df(diagnostics, normalization_name):
    """Create summary DataFrame from diagnostics."""
    data = []
    for d in diagnostics:
        if d['matched_chars'] > 0:
            sub_pct = d['substitutions'] / d['matched_chars'] * 100
            del_pct = d['deletions'] / d['matched_chars'] * 100
            ins_pct = d['insertions'] / d['matched_chars'] * 100
        else:
            sub_pct = del_pct = ins_pct = 0
        
        data.append({
            'page_id': d['page_id'],
            'cer_%': round(d['cer'] * 100, 2),
            'wer_%': round(d['wer'] * 100, 2),
            'coverage_%': round(d['match_coverage'], 1),
            'subs_%': round(sub_pct, 2),
            'dels_%': round(del_pct, 2),
            'ins_%': round(ins_pct, 2),
            'matched_chars': d['matched_chars'],
            'total_errors': d['total_errors']
        })
    
    return pd.DataFrame(data)

df_strict = create_summary_df(page_diagnostics_strict, 'Strict')
df_standard = create_summary_df(page_diagnostics_standard, 'Standard')
df_letters = create_summary_df(page_diagnostics_letters, 'Letters Only')

# Print summary tables
print("="*80)
print("PAGE-BY-PAGE TEXT QUALITY SUMMARY")
print("="*80)

print("\n--- STRICT NORMALIZATION (preserves all whitespace) ---")
print(df_strict.to_string(index=False))

print("\n\n--- STANDARD NORMALIZATION (normalized whitespace - RECOMMENDED) ---")
print(df_standard.to_string(index=False))

print("\n\n--- LETTERS ONLY (no whitespace/punctuation) ---")
print(df_letters[['page_id', 'cer_%', 'coverage_%', 'subs_%', 'dels_%', 'ins_%']].to_string(index=False))

# Identify worst pages (using standard normalization)
print("\n\n" + "="*80)
print("WORST PERFORMING PAGES (Standard Normalization)")
print("="*80)

worst_pages = sorted(page_diagnostics_standard, key=lambda x: x['cer'], reverse=True)[:5]

for i, page_diag in enumerate(worst_pages, 1):
    print(f"\n{i}. {page_diag['page_id']}")
    print(f"   CER: {page_diag['cer']:.2%}  |  WER: {page_diag['wer']:.2%}")
    print(f"   Coverage: {page_diag['match_coverage']:.1f}% of gold text")
    print(f"   Errors: {page_diag['substitutions']} subs, {page_diag['deletions']} dels, {page_diag['insertions']} ins")
    
    # Show worst items from this page
    if page_diag['items_analyzed']:
        worst_items = sorted(page_diag['items_analyzed'], key=lambda x: x['cer'], reverse=True)[:2]
        print(f"\n   Worst items on this page:")
        for j, item in enumerate(worst_items, 1):
            print(f"      Item {j} ({item['gold_class']}, CER: {item['cer']:.2%}):")
            print(f"         Gold: \"{item['gold_preview']}{'...' if item['gold_length'] > 100 else ''}\"")
            print(f"         Pred: \"{item['pred_preview']}{'...' if item['pred_length'] > 100 else ''}\"")

# Error distribution analysis
print("\n\n" + "="*80)
print("ERROR TYPE DISTRIBUTION (Standard Normalization)")
print("="*80)

total_errors = sum(d['total_errors'] for d in page_diagnostics_standard)
total_subs = sum(d['substitutions'] for d in page_diagnostics_standard)
total_dels = sum(d['deletions'] for d in page_diagnostics_standard)
total_ins = sum(d['insertions'] for d in page_diagnostics_standard)

print(f"\nTotal errors across all pages: {total_errors:,}")
print(f"   Substitutions: {total_subs:,} ({total_subs/total_errors*100:.1f}%)")
print(f"   Deletions:     {total_dels:,} ({total_dels/total_errors*100:.1f}%)")
print(f"   Insertions:    {total_ins:,} ({total_ins/total_errors*100:.1f}%)")

print("\n" + "="*80)
print("\nKEY FINDINGS:")
print(f"- Average CER (standard): {df_standard['cer_%'].mean():.2f}%")
print(f"- Pages with CER > 20%: {len(df_standard[df_standard['cer_%'] > 20])}")
print(f"- Pages with CER < 5%: {len(df_standard[df_standard['cer_%'] < 5])}")
print(f"- Most common error type: " + 
      ("Substitutions" if total_subs > max(total_dels, total_ins) else 
       "Deletions" if total_dels > total_ins else "Insertions"))
print("="*80)

Running detailed page-by-page diagnostics...


PAGE-BY-PAGE TEXT QUALITY SUMMARY

--- STRICT NORMALIZATION (preserves all whitespace) ---
                                   page_id  cer_%  wer_%  coverage_%  subs_%  dels_%  ins_%  matched_chars  total_errors
La_Plume_bpt6k1185893k_1_10_1889__page-001   6.17   8.70        87.8    1.90    0.76   3.70           1054            67
La_Plume_bpt6k1185893k_1_10_1889__page-002   0.61   0.25       100.0    0.08    0.00   0.53           2475            15
La_Plume_bpt6k1185893k_1_10_1889__page-003  76.32  83.33         1.0    0.00    0.00  76.32             38            29
La_Plume_bpt6k1185893k_1_10_1889__page-004  17.17  20.07        99.8    9.81   15.21   1.36           5219          1377
La_Plume_bpt6k1185893k_1_10_1889__page-005  39.80  42.02        15.1    0.00   39.80   0.00            716           285
La_Plume_bpt6k1185893k_1_10_1889__page-006   2.12   5.83        78.2    6.03   10.75   9.55           3583           943
La_Plume_bpt6k1

In [10]:
"""
Cross-Page Error Analysis
Character-level confusion matrix and systematic error pattern detection.
Analyzes all pages together to identify recurring OCR issues.
"""

from collections import Counter, defaultdict
import unicodedata


def get_character_confusions(reference: str, hypothesis: str) -> list:
    """
    Extract character-level substitutions from aligned strings.
    
    Returns:
        List of (gold_char, pred_char) tuples for substitutions
    """
    confusions = []
    
    sm = SequenceMatcher(None, reference, hypothesis)
    
    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        if tag == 'replace':
            # Character substitution
            gold_substr = reference[i1:i2]
            pred_substr = hypothesis[j1:j2]
            
            # For single character replacements
            if len(gold_substr) == 1 and len(pred_substr) == 1:
                confusions.append((gold_substr, pred_substr))
            # For multi-character replacements (like œ -> oe)
            elif len(gold_substr) > 0 and len(pred_substr) > 0:
                confusions.append((gold_substr, pred_substr))
    
    return confusions


def analyze_character_patterns(confusions: list) -> dict:
    """
    Detect systematic patterns in character confusions.
    
    Returns:
        Dict with pattern names and counts
    """
    patterns = {
        'accent_removal': 0,
        'accent_confusion': 0,
        'ligature_issues': 0,
        'case_errors': 0,
        'punctuation_errors': 0,
        'similar_shape': 0,
        'space_issues': 0
    }
    
    accent_chars = 'àáâãäåèéêëìíîïòóôõöùúûüýÿñçÀÁÂÃÄÅÈÉÊËÌÍÎÏÒÓÔÕÖÙÚÛÜÝŸÑÇ'
    ligatures = 'œæŒÆ'
    
    for gold, pred in confusions:
        # Accent removal (é -> e, à -> a)
        if len(gold) == 1 and len(pred) == 1:
            gold_base = unicodedata.normalize('NFD', gold)[0]
            pred_normalized = unicodedata.normalize('NFD', pred)[0]
            if gold in accent_chars and gold_base == pred:
                patterns['accent_removal'] += 1
            elif gold in accent_chars and pred in accent_chars and gold != pred:
                patterns['accent_confusion'] += 1
            elif gold.lower() == pred.lower():
                patterns['case_errors'] += 1
        
        # Ligature issues (œ -> oe, æ -> ae)
        if gold in ligatures and pred not in ligatures:
            patterns['ligature_issues'] += 1
        
        # Similar shape confusions (common OCR errors)
        similar_pairs = [
            ('l', 'i'), ('i', 'l'), ('rn', 'm'), ('m', 'rn'),
            ('cl', 'd'), ('d', 'cl'), ('o', '0'), ('0', 'o'),
            ('1', 'l'), ('l', '1'), ('s', '5'), ('5', 's')
        ]
        if (gold, pred) in similar_pairs:
            patterns['similar_shape'] += 1
        
        # Punctuation confusion
        if gold in '.,;:!?\'"' or pred in '.,;:!?\'"':
            patterns['punctuation_errors'] += 1
        
        # Space-related issues
        if ' ' in gold or ' ' in pred:
            patterns['space_issues'] += 1
    
    return patterns


# Collect all character confusions across all pages
print("Analyzing character-level confusions across all pages...")
print("\n")

all_confusions = []

for page in all_pages:
    gold_items = page['gold_items']
    pred_items = page['pred_items']
    matches = page['matches']
    
    matched_pairs = get_matched_pairs(matches, gold_items, pred_items)
    
    for gold_item, pred_item, _ in matched_pairs:
        gold_text = gold_item.get('item_text_raw', '')
        pred_text = pred_item.get('item_text_raw', '')
        
        # Use standard normalization for fair comparison
        gold_norm = normalize_text_standard(gold_text)
        pred_norm = normalize_text_standard(pred_text)
        
        confusions = get_character_confusions(gold_norm, pred_norm)
        all_confusions.extend(confusions)

# Count confusion frequencies
confusion_counter = Counter(all_confusions)

print("="*80)
print("CHARACTER CONFUSION MATRIX")
print("="*80)
print(f"\nTotal character substitutions: {len(all_confusions):,}")
print(f"Unique confusion pairs: {len(confusion_counter):,}")

# Top 30 most common confusions
print("\nTop 30 Most Common Character Substitutions:")
print(f"{'Gold → Pred':<30} {'Count':<10}")
print("-"*80)

for (gold, pred), count in confusion_counter.most_common(30):
    # Escape special characters for display
    gold_display = repr(gold)[1:-1] if gold in '\n\t\r' else gold
    pred_display = repr(pred)[1:-1] if pred in '\n\t\r' else pred
    
    # Create display string
    if len(gold) == 1 and len(pred) == 1:
        display = f"'{gold_display}' → '{pred_display}'"
    else:
        display = f'"{gold_display}" → "{pred_display}"'
    
    print(f"{display:<30} {count:<10}")

# Pattern analysis
print("\n\n" + "="*80)
print("SYSTEMATIC ERROR PATTERNS")
print("="*80)

patterns = analyze_character_patterns(all_confusions)
total_categorized = sum(patterns.values())

print(f"\nTotal confusions: {len(all_confusions):,}")
print(f"Categorized: {total_categorized:,} ({total_categorized/len(all_confusions)*100:.1f}%)")
print(f"Uncategorized: {len(all_confusions) - total_categorized:,} " +
      f"({(len(all_confusions) - total_categorized)/len(all_confusions)*100:.1f}%)")

print("\nPattern Breakdown:")
for pattern, count in sorted(patterns.items(), key=lambda x: x[1], reverse=True):
    if count > 0:
        pct = count / len(all_confusions) * 100
        pattern_name = pattern.replace('_', ' ').title()
        print(f"   {pattern_name:<25} {count:>6,} ({pct:>5.1f}%)")

# Specific accent analysis
print("\n\n" + "="*80)
print("ACCENT & DIACRITIC ANALYSIS")
print("="*80)

accent_confusions = [(g, p) for g, p in all_confusions 
                     if len(g) == 1 and len(p) == 1 
                     and any(c in 'àáâãäåèéêëìíîïòóôõöùúûüýÿñçÀÁÂÃÄÅÈÉÊËÌÍÎÏÒÓÔÕÖÙÚÛÜÝŸÑÇ' for c in g)]

if accent_confusions:
    accent_counter = Counter(accent_confusions)
    print(f"\nAccented character confusions: {len(accent_confusions):,}")
    print("\nMost common accented character errors:")
    for (gold, pred), count in accent_counter.most_common(15):
        print(f"   '{gold}' → '{pred}': {count} times")
else:
    print("\nNo accented character confusions detected.")

# Ligature analysis  
print("\n\n" + "="*80)
print("LIGATURE ANALYSIS")
print("="*80)

ligature_confusions = [(g, p) for g, p in all_confusions if g in 'œæŒÆ' or p in 'œæŒÆ']

if ligature_confusions:
    ligature_counter = Counter(ligature_confusions)
    print(f"\nLigature-related confusions: {len(ligature_confusions):,}")
    print("\nLigature substitutions:")
    for (gold, pred), count in ligature_counter.most_common(10):
        print(f"   '{gold}' → '{pred}': {count} times")
else:
    print("\nNo ligature confusions detected.")

# Case sensitivity analysis
print("\n\n" + "="*80)
print("CASE SENSITIVITY ANALYSIS")
print("="*80)

case_confusions = [(g, p) for g, p in all_confusions 
                   if len(g) == 1 and len(p) == 1 and g.lower() == p.lower() and g != p]

if case_confusions:
    case_counter = Counter(case_confusions)
    print(f"\nCase-only differences: {len(case_confusions):,}")
    print("\nMost common case errors:")
    for (gold, pred), count in case_counter.most_common(10):
        print(f"   '{gold}' → '{pred}': {count} times")
else:
    print("\nNo case-only confusions detected.")

# Recommendations
print("\n\n" + "="*80)
print("RECOMMENDATIONS")
print("="*80)

print("\nBased on the error analysis:")

# Check for high accent issues
accent_pct = patterns['accent_removal'] / len(all_confusions) * 100 if all_confusions else 0
if accent_pct > 5:
    print(f"\n⚠ HIGH ACCENT REMOVAL RATE ({accent_pct:.1f}%)")
    print("   - Consider post-processing to restore accents using dictionary lookup")
    print("   - May need model fine-tuning on accented French text")

# Check for ligature issues
ligature_pct = patterns['ligature_issues'] / len(all_confusions) * 100 if all_confusions else 0
if ligature_pct > 2:
    print(f"\n⚠ LIGATURE HANDLING ISSUES ({ligature_pct:.1f}%)")
    print("   - Ligatures (œ, æ) being split or confused")
    print("   - Common in historical French texts")

# Check for case errors
case_pct = patterns['case_errors'] / len(all_confusions) * 100 if all_confusions else 0
if case_pct > 3:
    print(f"\n⚠ CASE SENSITIVITY ISSUES ({case_pct:.1f}%)")
    print("   - Model confusing upper/lowercase")
    print("   - May indicate line/title detection problems")

# General observation
if len(all_confusions) > 0:
    unique_ratio = len(confusion_counter) / len(all_confusions)
    if unique_ratio > 0.5:
        print(f"\n✓ ERROR DIVERSITY IS HIGH (unique ratio: {unique_ratio:.2f})")
        print("   - Errors are diverse, not systematic")
        print("   - Suggests random OCR noise rather than systematic bias")
    else:
        print(f"\n⚠ ERROR CONCENTRATION DETECTED (unique ratio: {unique_ratio:.2f})")
        print("   - Same errors repeat frequently")
        print("   - Suggests systematic model bias that could be corrected")

print("\n" + "="*80)

Analyzing character-level confusions across all pages...


CHARACTER CONFUSION MATRIX

Total character substitutions: 130
Unique confusion pairs: 115

Top 30 Most Common Character Substitutions:
Gold → Pred                    Count     
--------------------------------------------------------------------------------
" ; E" → "; É"                 8         
'a' → 'à'                      3         
'E' → 'É'                      2         
'î' → 'i'                      2         
'e' → 'è'                      2         
"t) " → "l)"                   2         
".)" → ")"                     2         
'd' → 'D'                      2         
"ance" → "ir"                  1         
'l' → 'L'                      1         
'C' → 'G'                      1         
"de" → "DE"                    1         
'é' → 'e'                      1         
" dé" → "dé- "                 1         
'e' → 'é'                      1         
"nec" → "ce"                   1         
'i' → 'I' 

In [None]:
"""
Classification Accuracy Evaluation
Evaluate how well the model classifies items into the five categories:
prose, verse, ad, paratext, unknown

Structure:
1. Overall classification metrics across all pages
2. Per-page classification breakdown
"""

def evaluate_classification(gold_items: List[Dict], pred_items: List[Dict],
                           matches: List[Tuple[int, int, float]]) -> Dict:
    """
    Evaluate classification accuracy on matched pairs.
    
    Args:
        gold_items: List of gold standard items
        pred_items: List of predicted items
        matches: List of (gold_idx, pred_idx, score) tuples
    
    Returns:
        Dict with classification metrics
    """
    if not matches:
        return {
            'gold_classes': [],
            'pred_classes': [],
            'correct': 0,
            'total': 0,
            'accuracy': 0.0
        }
    
    matched_pairs = get_matched_pairs(matches, gold_items, pred_items)
    
    gold_classes = []
    pred_classes = []
    
    for gold_item, pred_item, _ in matched_pairs:
        gold_classes.append(gold_item['item_class'])
        pred_classes.append(pred_item['item_class'])
    
    correct = sum(1 for g, p in zip(gold_classes, pred_classes) if g == p)
    total = len(gold_classes)
    accuracy = correct / total if total > 0 else 0.0
    
    return {
        'gold_classes': gold_classes,
        'pred_classes': pred_classes,
        'correct': correct,
        'total': total,
        'accuracy': accuracy
    }


# Collect classification data from all pages
print("Evaluating classification accuracy...")
print("\n")

all_gold_classes = []
all_pred_classes = []
page_classification_results = []

for page in all_pages:
    page_id = page['page_id']
    gold_items = page['gold_items']
    pred_items = page['pred_items']
    matches = page['matches']
    
    result = evaluate_classification(gold_items, pred_items, matches)
    result['page_id'] = page_id
    page_classification_results.append(result)
    
    all_gold_classes.extend(result['gold_classes'])
    all_pred_classes.extend(result['pred_classes'])

# Calculate overall metrics
total_matched = len(all_gold_classes)
total_correct = sum(1 for g, p in zip(all_gold_classes, all_pred_classes) if g == p)
overall_accuracy = total_correct / total_matched if total_matched > 0 else 0.0

print("="*80)
print("OVERALL CLASSIFICATION METRICS")
print("="*80)

print(f"\nTotal matched items evaluated: {total_matched}")
print(f"Correctly classified: {total_correct} ({overall_accuracy:.2%})")
print(f"Misclassified: {total_matched - total_correct} ({(1-overall_accuracy):.2%})")

# Class labels
class_labels = ['prose', 'verse', 'ad', 'paratext', 'unknown']

# Confusion matrix
if total_matched > 0:
    cm = confusion_matrix(all_gold_classes, all_pred_classes, labels=class_labels)
    
    print("\n\nCONFUSION MATRIX")
    print("-"*80)
    print(f"{'':>12}", end='')
    for label in class_labels:
        print(f"{label:>10}", end='')
    print()
    print("-"*80)
    
    for i, label in enumerate(class_labels):
        print(f"{label:>12}", end='')
        for j in range(len(class_labels)):
            print(f"{cm[i][j]:>10}", end='')
        print()
    
    # Per-class metrics
    print("\n\nPER-CLASS METRICS")
    print("-"*80)
    print(f"{'Class':<12} {'Precision':<12} {'Recall':<12} {'F1-Score':<12} {'Support':<12}")
    print("-"*80)
    
    for i, label in enumerate(class_labels):
        # Calculate metrics for this class
        tp = cm[i][i]
        fp = cm[:, i].sum() - tp
        fn = cm[i, :].sum() - tp
        support = cm[i, :].sum()
        
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
        
        print(f"{label:<12} {precision:<12.2%} {recall:<12.2%} {f1:<12.3f} {support:<12}")
    
    # Macro and weighted averages
    precisions = []
    recalls = []
    f1s = []
    supports = []
    
    for i in range(len(class_labels)):
        tp = cm[i][i]
        fp = cm[:, i].sum() - tp
        fn = cm[i, :].sum() - tp
        support = cm[i, :].sum()
        
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
        
        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)
        supports.append(support)
    
    macro_precision = np.mean(precisions)
    macro_recall = np.mean(recalls)
    macro_f1 = np.mean(f1s)
    
    total_support = sum(supports)
    weighted_precision = sum(p * s for p, s in zip(precisions, supports)) / total_support
    weighted_recall = sum(r * s for r, s in zip(recalls, supports)) / total_support
    weighted_f1 = sum(f * s for f, s in zip(f1s, supports)) / total_support
    
    print("-"*80)
    print(f"{'Macro Avg':<12} {macro_precision:<12.2%} {macro_recall:<12.2%} {macro_f1:<12.3f} {total_support:<12}")
    print(f"{'Weighted Avg':<12} {weighted_precision:<12.2%} {weighted_recall:<12.2%} {weighted_f1:<12.3f} {total_support:<12}")
    
    # Most common misclassifications
    print("\n\nMOST COMMON MISCLASSIFICATIONS")
    print("-"*80)
    
    misclass_counts = []
    for i, gold_label in enumerate(class_labels):
        for j, pred_label in enumerate(class_labels):
            if i != j and cm[i][j] > 0:
                misclass_counts.append((gold_label, pred_label, cm[i][j]))
    
    misclass_counts.sort(key=lambda x: x[2], reverse=True)
    
    if misclass_counts:
        print(f"{'Gold → Predicted':<30} {'Count':<10} {'% of Gold Class'}")
        print("-"*80)
        for gold_label, pred_label, count in misclass_counts[:10]:
            gold_total = cm[class_labels.index(gold_label), :].sum()
            pct = count / gold_total * 100 if gold_total > 0 else 0
            print(f"{gold_label} → {pred_label:<20} {count:<10} {pct:.1f}%")
    else:
        print("No misclassifications detected!")

# Contributions-specific analysis
print("\n\n" + "="*80)
print("CONTRIBUTIONS ANALYSIS (Prose + Verse)")
print("="*80)

contrib_gold = [g for g in all_gold_classes if g in ['prose', 'verse']]
contrib_pred = [p for g, p in zip(all_gold_classes, all_pred_classes) if g in ['prose', 'verse']]

if contrib_gold:
    contrib_correct = sum(1 for g, p in zip(contrib_gold, contrib_pred) if g == p)
    contrib_accuracy = contrib_correct / len(contrib_gold)
    
    print(f"\nTotal contribution items: {len(contrib_gold)}")
    print(f"Correctly classified: {contrib_correct} ({contrib_accuracy:.2%})")
    print(f"Misclassified: {len(contrib_gold) - contrib_correct} ({(1-contrib_accuracy):.2%})")
    
    # Contribution confusion
    contrib_labels = ['prose', 'verse']
    contrib_cm = confusion_matrix(contrib_gold, contrib_pred, labels=contrib_labels)
    
    print("\nContributions Confusion Matrix:")
    print(f"{'':>12}", end='')
    for label in contrib_labels:
        print(f"{label:>10}", end='')
    print()
    print("-"*40)
    for i, label in enumerate(contrib_labels):
        print(f"{label:>12}", end='')
        for j in range(len(contrib_labels)):
            print(f"{contrib_cm[i][j]:>10}", end='')
        print()

# Per-page classification breakdown
print("\n\n" + "="*80)
print("PER-PAGE CLASSIFICATION BREAKDOWN")
print("="*80)

page_class_df_data = []
for result in page_classification_results:
    page_class_df_data.append({
        'page_id': result['page_id'],
        'total_items': result['total'],
        'correct': result['correct'],
        'accuracy_%': round(result['accuracy'] * 100, 1) if result['total'] > 0 else 0.0,
        'misclassified': result['total'] - result['correct']
    })

page_class_df = pd.DataFrame(page_class_df_data)
print("\n" + page_class_df.to_string(index=False))

# Detailed per-page analysis
print("\n\nDETAILED PER-PAGE CLASSIFICATION")
print("="*80)

for result in page_classification_results:
    if result['total'] == 0:
        continue
    
    page_id = result['page_id']
    gold_classes = result['gold_classes']
    pred_classes = result['pred_classes']
    
    print(f"\n{page_id}")
    print("-"*80)
    print(f"Accuracy: {result['accuracy']:.2%} ({result['correct']}/{result['total']})")
    
    # Class distribution
    from collections import Counter
    gold_dist = Counter(gold_classes)
    
    print(f"\nClass distribution:")
    for cls in ['prose', 'verse', 'ad', 'paratext', 'unknown']:
        if cls in gold_dist:
            gold_count = gold_dist[cls]
            pred_count = sum(1 for g, p in zip(gold_classes, pred_classes) 
                           if g == cls and p == cls)
            print(f"   {cls:<12} {pred_count}/{gold_count} correct")
    
    # Misclassifications for this page
    misclass_page = [(g, p) for g, p in zip(gold_classes, pred_classes) if g != p]
    if misclass_page:
        print(f"\nMisclassifications ({len(misclass_page)}):")
        misclass_counter = Counter(misclass_page)
        for (gold, pred), count in misclass_counter.most_common():
            print(f"   {gold} → {pred}: {count} time{'s' if count > 1 else ''}")

print("\n" + "="*80)
print("\nKEY FINDINGS:")
print(f"- Overall classification accuracy: {overall_accuracy:.2%}")
print(f"- Best performing class: {class_labels[np.argmax([recalls[i] for i in range(len(class_labels))])]}")
print(f"- Most challenging class: {class_labels[np.argmin([recalls[i] for i in range(len(class_labels))])]}")
if misclass_counts:
    print(f"- Most common confusion: {misclass_counts[0][0]} → {misclass_counts[0][1]} ({misclass_counts[0][2]} times)")
print("="*80)

Evaluating classification accuracy...


OVERALL CLASSIFICATION METRICS

Total matched items evaluated: 37
Correctly classified: 25 (67.57%)
Misclassified: 12 (32.43%)


CONFUSION MATRIX
--------------------------------------------------------------------------------
                 prose     verse        ad  paratext   unknown
--------------------------------------------------------------------------------
       prose         7         1         0         0         0
       verse         0         7         0         0         0
          ad         1         0         0         0         0
    paratext        10         0         0        11         0
     unknown         0         0         0         0         0


PER-CLASS METRICS
--------------------------------------------------------------------------------
Class        Precision    Recall       F1-Score     Support     
--------------------------------------------------------------------------------
prose        38.89%       8

In [12]:
"""
Metadata Extraction Evaluation
Evaluate title and author extraction accuracy on matched items.

Metrics:
- Exact match: Field matches exactly
- Partial match: String similarity above threshold
- Presence: Field is present (not None) in both gold and pred
- Precision: Of predicted fields, how many are correct?
- Recall: Of gold fields, how many were extracted?
- F1: Harmonic mean of precision and recall
"""


def normalize_metadata_string(s: Optional[str]) -> str:
    """
    Normalize metadata string for comparison.
    - Lowercase
    - Remove extra whitespace
    - Remove punctuation at start/end
    """
    if s is None:
        return ""
    s = s.lower().strip()
    s = re.sub(r'\s+', ' ', s)
    s = s.strip('.,;:!?')
    return s


def metadata_similarity(gold: Optional[str], pred: Optional[str]) -> float:
    """
    Calculate similarity between two metadata strings.
    Returns 1.0 for exact match, 0.0 for no match, partial scores for similarity.
    """
    gold_norm = normalize_metadata_string(gold)
    pred_norm = normalize_metadata_string(pred)
    
    if not gold_norm and not pred_norm:
        return 1.0  # Both null
    if not gold_norm or not pred_norm:
        return 0.0  # One null, one not
    
    if gold_norm == pred_norm:
        return 1.0  # Exact match
    
    # Use SequenceMatcher for partial similarity
    return SequenceMatcher(None, gold_norm, pred_norm).ratio()


def evaluate_metadata_field(gold_items: List[Dict], pred_items: List[Dict],
                            matches: List[Tuple[int, int, float]],
                            field_name: str,
                            similarity_threshold: float = 0.8) -> Dict:
    """
    Evaluate a specific metadata field (title or author).
    
    Args:
        gold_items: List of gold items
        pred_items: List of pred items
        matches: List of (gold_idx, pred_idx, score) tuples
        field_name: 'item_title' or 'item_author'
        similarity_threshold: Minimum similarity for partial match
    
    Returns:
        Dict with metrics
    """
    if not matches:
        return {
            'gold_present': 0,
            'pred_present': 0,
            'exact_matches': 0,
            'partial_matches': 0,
            'precision': 0.0,
            'recall': 0.0,
            'f1': 0.0,
            'examples': []
        }
    
    matched_pairs = get_matched_pairs(matches, gold_items, pred_items)
    
    gold_present = 0  # Gold has non-null value
    pred_present = 0  # Pred has non-null value
    exact_matches = 0
    partial_matches = 0
    examples = []
    
    for gold_item, pred_item, _ in matched_pairs:
        gold_value = gold_item.get(field_name)
        pred_value = pred_item.get(field_name)
        
        gold_has_value = gold_value is not None and gold_value.strip() != ''
        pred_has_value = pred_value is not None and pred_value.strip() != ''
        
        if gold_has_value:
            gold_present += 1
        
        if pred_has_value:
            pred_present += 1
        
        if gold_has_value and pred_has_value:
            similarity = metadata_similarity(gold_value, pred_value)
            
            if similarity == 1.0:
                exact_matches += 1
                partial_matches += 1
            elif similarity >= similarity_threshold:
                partial_matches += 1
                # Store example for partial matches
                if len(examples) < 5:
                    examples.append({
                        'gold': gold_value,
                        'pred': pred_value,
                        'similarity': similarity,
                        'item_class': gold_item.get('item_class')
                    })
    
    # Calculate metrics based on partial matches (more lenient)
    # Precision: of predicted values, how many match gold?
    precision = partial_matches / pred_present if pred_present > 0 else 0.0
    
    # Recall: of gold values, how many were extracted?
    recall = partial_matches / gold_present if gold_present > 0 else 0.0
    
    # F1
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
    
    return {
        'gold_present': gold_present,
        'pred_present': pred_present,
        'exact_matches': exact_matches,
        'partial_matches': partial_matches,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'examples': examples
    }


# Evaluate metadata across all pages
print("Evaluating metadata extraction (titles and authors)...")
print("\n")

page_metadata_results = []

for page in all_pages:
    page_id = page['page_id']
    gold_items = page['gold_items']
    pred_items = page['pred_items']
    matches = page['matches']
    
    # Evaluate titles
    title_metrics = evaluate_metadata_field(gold_items, pred_items, matches, 'item_title')
    
    # Evaluate authors
    author_metrics = evaluate_metadata_field(gold_items, pred_items, matches, 'item_author')
    
    page_metadata_results.append({
        'page_id': page_id,
        'title': title_metrics,
        'author': author_metrics
    })

# Aggregate overall metrics
overall_title = {
    'gold_present': sum(r['title']['gold_present'] for r in page_metadata_results),
    'pred_present': sum(r['title']['pred_present'] for r in page_metadata_results),
    'exact_matches': sum(r['title']['exact_matches'] for r in page_metadata_results),
    'partial_matches': sum(r['title']['partial_matches'] for r in page_metadata_results)
}

overall_author = {
    'gold_present': sum(r['author']['gold_present'] for r in page_metadata_results),
    'pred_present': sum(r['author']['pred_present'] for r in page_metadata_results),
    'exact_matches': sum(r['author']['exact_matches'] for r in page_metadata_results),
    'partial_matches': sum(r['author']['partial_matches'] for r in page_metadata_results)
}

# Calculate overall precision, recall, F1
overall_title['precision'] = (overall_title['partial_matches'] / overall_title['pred_present'] 
                              if overall_title['pred_present'] > 0 else 0.0)
overall_title['recall'] = (overall_title['partial_matches'] / overall_title['gold_present'] 
                          if overall_title['gold_present'] > 0 else 0.0)
overall_title['f1'] = (2 * overall_title['precision'] * overall_title['recall'] / 
                       (overall_title['precision'] + overall_title['recall']) 
                       if (overall_title['precision'] + overall_title['recall']) > 0 else 0.0)

overall_author['precision'] = (overall_author['partial_matches'] / overall_author['pred_present'] 
                               if overall_author['pred_present'] > 0 else 0.0)
overall_author['recall'] = (overall_author['partial_matches'] / overall_author['gold_present'] 
                           if overall_author['gold_present'] > 0 else 0.0)
overall_author['f1'] = (2 * overall_author['precision'] * overall_author['recall'] / 
                        (overall_author['precision'] + overall_author['recall']) 
                        if (overall_author['precision'] + overall_author['recall']) > 0 else 0.0)

# Print overall results
print("="*80)
print("OVERALL METADATA EXTRACTION METRICS")
print("="*80)

print("\n--- TITLE EXTRACTION ---")
print(f"Gold items with titles:       {overall_title['gold_present']}")
print(f"Predicted items with titles:  {overall_title['pred_present']}")
print(f"Exact matches:                {overall_title['exact_matches']}")
print(f"Partial matches (≥80% sim):   {overall_title['partial_matches']}")
print(f"\nPrecision: {overall_title['precision']:.2%}")
print(f"Recall:    {overall_title['recall']:.2%}")
print(f"F1 Score:  {overall_title['f1']:.3f}")

print("\n--- AUTHOR EXTRACTION ---")
print(f"Gold items with authors:      {overall_author['gold_present']}")
print(f"Predicted items with authors: {overall_author['pred_present']}")
print(f"Exact matches:                {overall_author['exact_matches']}")
print(f"Partial matches (≥80% sim):   {overall_author['partial_matches']}")
print(f"\nPrecision: {overall_author['precision']:.2%}")
print(f"Recall:    {overall_author['recall']:.2%}")
print(f"F1 Score:  {overall_author['f1']:.3f}")

# Collect examples from all pages
all_title_examples = []
all_author_examples = []

for result in page_metadata_results:
    all_title_examples.extend(result['title']['examples'])
    all_author_examples.extend(result['author']['examples'])

# Show examples of partial matches (not exact)
if all_title_examples:
    print("\n\nEXAMPLES OF PARTIAL TITLE MATCHES")
    print("-"*80)
    for i, ex in enumerate(all_title_examples[:5], 1):
        print(f"\n{i}. {ex['item_class'].upper()} (Similarity: {ex['similarity']:.2%})")
        print(f"   Gold: \"{ex['gold']}\"")
        print(f"   Pred: \"{ex['pred']}\"")

if all_author_examples:
    print("\n\nEXAMPLES OF PARTIAL AUTHOR MATCHES")
    print("-"*80)
    for i, ex in enumerate(all_author_examples[:5], 1):
        print(f"\n{i}. {ex['item_class'].upper()} (Similarity: {ex['similarity']:.2%})")
        print(f"   Gold: \"{ex['gold']}\"")
        print(f"   Pred: \"{ex['pred']}\"")

# Per-page breakdown
print("\n\n" + "="*80)
print("PER-PAGE METADATA EXTRACTION")
print("="*80)

page_meta_df_data = []
for result in page_metadata_results:
    page_meta_df_data.append({
        'page_id': result['page_id'],
        'title_gold': result['title']['gold_present'],
        'title_pred': result['title']['pred_present'],
        'title_F1': round(result['title']['f1'], 3),
        'author_gold': result['author']['gold_present'],
        'author_pred': result['author']['pred_present'],
        'author_F1': round(result['author']['f1'], 3)
    })

page_meta_df = pd.DataFrame(page_meta_df_data)
print("\n" + page_meta_df.to_string(index=False))

# Detailed per-page analysis
print("\n\nDETAILED PER-PAGE ANALYSIS")
print("="*80)

for result in page_metadata_results:
    page_id = result['page_id']
    title_metrics = result['title']
    author_metrics = result['author']
    
    if title_metrics['gold_present'] == 0 and author_metrics['gold_present'] == 0:
        continue
    
    print(f"\n{page_id}")
    print("-"*80)
    
    if title_metrics['gold_present'] > 0:
        print(f"Titles:  {title_metrics['partial_matches']}/{title_metrics['gold_present']} extracted " +
              f"(P: {title_metrics['precision']:.2%}, R: {title_metrics['recall']:.2%}, " +
              f"F1: {title_metrics['f1']:.3f})")
    else:
        print(f"Titles:  No gold titles on this page")
    
    if author_metrics['gold_present'] > 0:
        print(f"Authors: {author_metrics['partial_matches']}/{author_metrics['gold_present']} extracted " +
              f"(P: {author_metrics['precision']:.2%}, R: {author_metrics['recall']:.2%}, " +
              f"F1: {author_metrics['f1']:.3f})")
    else:
        print(f"Authors: No gold authors on this page")

print("\n" + "="*80)
print("\nKEY FINDINGS:")
print(f"- Title extraction F1: {overall_title['f1']:.3f}")
print(f"- Author extraction F1: {overall_author['f1']:.3f}")
print(f"- Title exact match rate: {overall_title['exact_matches']}/{overall_title['gold_present']} " +
      f"({overall_title['exact_matches']/overall_title['gold_present']*100:.1f}%)" 
      if overall_title['gold_present'] > 0 else "- Title exact match rate: N/A")
print(f"- Author exact match rate: {overall_author['exact_matches']}/{overall_author['gold_present']} " +
      f"({overall_author['exact_matches']/overall_author['gold_present']*100:.1f}%)"
      if overall_author['gold_present'] > 0 else "- Author exact match rate: N/A")
print("="*80)

Evaluating metadata extraction (titles and authors)...


OVERALL METADATA EXTRACTION METRICS

--- TITLE EXTRACTION ---
Gold items with titles:       13
Predicted items with titles:  12
Exact matches:                11
Partial matches (≥80% sim):   11

Precision: 91.67%
Recall:    84.62%
F1 Score:  0.880

--- AUTHOR EXTRACTION ---
Gold items with authors:      12
Predicted items with authors: 10
Exact matches:                10
Partial matches (≥80% sim):   10

Precision: 100.00%
Recall:    83.33%
F1 Score:  0.909


PER-PAGE METADATA EXTRACTION

                                   page_id  title_gold  title_pred  title_F1  author_gold  author_pred  author_F1
La_Plume_bpt6k1185893k_1_10_1889__page-001           1           0     0.000            0            0      0.000
La_Plume_bpt6k1185893k_1_10_1889__page-002           1           1     1.000            0            0      0.000
La_Plume_bpt6k1185893k_1_10_1889__page-003           0           0     0.000            0            0     

In [21]:
"""
Continuation Tracking Evaluation

Evaluates the accuracy of continuation fields (is_continuation, continues_on_next_page)
across ALL items in the dataset, including unmatched items.

Fields are treated as binary:
- True = continuation exists
- False/None = no continuation (treated identically)

Evaluation logic:
- Matched items: Compare gold vs pred continuation fields directly
- Unmatched gold items with continuation=True: False Negatives (model missed them)
- Unmatched pred items with continuation=True: False Positives (model hallucinated them)

Metrics: Precision, Recall, F1 for each field
Reports: Global aggregates first, then per-page breakdown
"""

def evaluate_continuation_all_items(
    gold_items: List[Dict],
    pred_items: List[Dict],
    matches: List[Tuple[int, int, float]],
    unmatched_gold: Set[int],
    unmatched_pred: Set[int]
) -> Dict:
    """
    Evaluate continuation field accuracy across ALL items.
    
    Args:
        gold_items: Gold standard items
        pred_items: Predicted items
        matches: List of (gold_idx, pred_idx, similarity) tuples
        unmatched_gold: Set of unmatched gold indices
        unmatched_pred: Set of unmatched pred indices
        
    Returns:
        Dict with metrics for is_continuation and continues_on_next_page
    """
    # Initialize counters for both fields
    is_cont_tp = is_cont_fp = is_cont_fn = is_cont_tn = 0
    continues_tp = continues_fp = continues_fn = continues_tn = 0
    
    # 1. Evaluate matched items
    for gold_idx, pred_idx, _ in matches:
        gold_item = gold_items[gold_idx]
        pred_item = pred_items[pred_idx]
        
        # Evaluate is_continuation
        gold_is_cont = gold_item.get('is_continuation') is True
        pred_is_cont = pred_item.get('is_continuation') is True
        
        if gold_is_cont and pred_is_cont:
            is_cont_tp += 1
        elif not gold_is_cont and pred_is_cont:
            is_cont_fp += 1
        elif gold_is_cont and not pred_is_cont:
            is_cont_fn += 1
        else:
            is_cont_tn += 1
        
        # Evaluate continues_on_next_page
        gold_continues = gold_item.get('continues_on_next_page') is True
        pred_continues = pred_item.get('continues_on_next_page') is True
        
        if gold_continues and pred_continues:
            continues_tp += 1
        elif not gold_continues and pred_continues:
            continues_fp += 1
        elif gold_continues and not pred_continues:
            continues_fn += 1
        else:
            continues_tn += 1
    
    # 2. Evaluate unmatched gold items (missed continuations = False Negatives)
    for gold_idx in unmatched_gold:
        gold_item = gold_items[gold_idx]
        
        # If gold has continuation=True but item wasn't matched, that's a FN
        if gold_item.get('is_continuation') is True:
            is_cont_fn += 1
        
        if gold_item.get('continues_on_next_page') is True:
            continues_fn += 1
    
    # 3. Evaluate unmatched pred items (hallucinated continuations = False Positives)
    for pred_idx in unmatched_pred:
        pred_item = pred_items[pred_idx]
        
        # If pred has continuation=True but item wasn't matched, that's a FP
        if pred_item.get('is_continuation') is True:
            is_cont_fp += 1
        
        if pred_item.get('continues_on_next_page') is True:
            continues_fp += 1
    
    # Calculate metrics for is_continuation
    is_cont_p = is_cont_tp / (is_cont_tp + is_cont_fp) if (is_cont_tp + is_cont_fp) > 0 else 0.0
    is_cont_r = is_cont_tp / (is_cont_tp + is_cont_fn) if (is_cont_tp + is_cont_fn) > 0 else 0.0
    is_cont_f1 = 2 * is_cont_p * is_cont_r / (is_cont_p + is_cont_r) if (is_cont_p + is_cont_r) > 0 else 0.0
    
    # Calculate metrics for continues_on_next_page
    continues_p = continues_tp / (continues_tp + continues_fp) if (continues_tp + continues_fp) > 0 else 0.0
    continues_r = continues_tp / (continues_tp + continues_fn) if (continues_tp + continues_fn) > 0 else 0.0
    continues_f1 = 2 * continues_p * continues_r / (continues_p + continues_r) if (continues_p + continues_r) > 0 else 0.0
    
    return {
        'is_continuation': {
            'tp': is_cont_tp,
            'fp': is_cont_fp,
            'fn': is_cont_fn,
            'tn': is_cont_tn,
            'precision': is_cont_p,
            'recall': is_cont_r,
            'f1': is_cont_f1
        },
        'continues_on_next_page': {
            'tp': continues_tp,
            'fp': continues_fp,
            'fn': continues_fn,
            'tn': continues_tn,
            'precision': continues_p,
            'recall': continues_r,
            'f1': continues_f1
        },
        'n_matched': len(matches),
        'n_unmatched_gold': len(unmatched_gold),
        'n_unmatched_pred': len(unmatched_pred)
    }


# Evaluate continuation tracking on all pages
print("Evaluating continuation tracking (all items)...")
print()

continuation_results = []

for page in all_pages:
    gold_items = page['gold_items']
    pred_items = page['pred_items']
    matches = page['matches']
    unmatched_gold = page['unmatched_gold']
    unmatched_pred = page['unmatched_pred']
    page_name = page['page_name']
    
    result = evaluate_continuation_all_items(
        gold_items, pred_items, matches, 
        unmatched_gold, unmatched_pred
    )
    result['page'] = page_name
    
    continuation_results.append(result)

# Aggregate global metrics
total_is_cont = {
    'tp': sum(r['is_continuation']['tp'] for r in continuation_results),
    'fp': sum(r['is_continuation']['fp'] for r in continuation_results),
    'fn': sum(r['is_continuation']['fn'] for r in continuation_results),
    'tn': sum(r['is_continuation']['tn'] for r in continuation_results)
}

total_continues = {
    'tp': sum(r['continues_on_next_page']['tp'] for r in continuation_results),
    'fp': sum(r['continues_on_next_page']['fp'] for r in continuation_results),
    'fn': sum(r['continues_on_next_page']['fn'] for r in continuation_results),
    'tn': sum(r['continues_on_next_page']['tn'] for r in continuation_results)
}

# Calculate global metrics
is_cont_p = total_is_cont['tp'] / (total_is_cont['tp'] + total_is_cont['fp']) if (total_is_cont['tp'] + total_is_cont['fp']) > 0 else 0.0
is_cont_r = total_is_cont['tp'] / (total_is_cont['tp'] + total_is_cont['fn']) if (total_is_cont['tp'] + total_is_cont['fn']) > 0 else 0.0
is_cont_f1 = 2 * is_cont_p * is_cont_r / (is_cont_p + is_cont_r) if (is_cont_p + is_cont_r) > 0 else 0.0

continues_p = total_continues['tp'] / (total_continues['tp'] + total_continues['fp']) if (total_continues['tp'] + total_continues['fp']) > 0 else 0.0
continues_r = total_continues['tp'] / (total_continues['tp'] + total_continues['fn']) if (total_continues['tp'] + total_continues['fn']) > 0 else 0.0
continues_f1 = 2 * continues_p * continues_r / (continues_p + continues_r) if (continues_p + continues_r) > 0 else 0.0

total_matched = sum(r['n_matched'] for r in continuation_results)
total_unmatched_gold = sum(r['n_unmatched_gold'] for r in continuation_results)
total_unmatched_pred = sum(r['n_unmatched_pred'] for r in continuation_results)

# Count how many items have True values in gold
gold_is_cont_count = total_is_cont['tp'] + total_is_cont['fn']
gold_continues_count = total_continues['tp'] + total_continues['fn']

# Count how many True values the model predicted
pred_is_cont_count = total_is_cont['tp'] + total_is_cont['fp']
pred_continues_count = total_continues['tp'] + total_continues['fp']

# Calculate quantity mismatch
is_cont_mismatch = pred_is_cont_count - gold_is_cont_count
continues_mismatch = pred_continues_count - gold_continues_count

# Print global summary
print(f"{'='*70}")
print(f"CONTINUATION TRACKING - GLOBAL SUMMARY (All Items)")
print(f"{'='*70}")
print()
print(f"Dataset coverage:")
print(f"  Matched items:        {total_matched}")
print(f"  Unmatched gold items: {total_unmatched_gold}")
print(f"  Unmatched pred items: {total_unmatched_pred}")
print()
print(f"is_continuation field:")
print(f"  Gold positives (True):     {gold_is_cont_count}")
print(f"  Pred positives (True):     {pred_is_cont_count}  (mismatch: {is_cont_mismatch:+d})")
print(f"  True Positives (TP):       {total_is_cont['tp']}")
print(f"  False Positives (FP):      {total_is_cont['fp']}")
print(f"  False Negatives (FN):      {total_is_cont['fn']}")
print(f"  True Negatives (TN):       {total_is_cont['tn']}")
print(f"  Precision:                 {is_cont_p:.2%}")
print(f"  Recall:                    {is_cont_r:.2%}")
print(f"  F1 Score:                  {is_cont_f1:.3f}")
print()
print(f"continues_on_next_page field:")
print(f"  Gold positives (True):     {gold_continues_count}")
print(f"  Pred positives (True):     {pred_continues_count}  (mismatch: {continues_mismatch:+d})")
print(f"  True Positives (TP):       {total_continues['tp']}")
print(f"  False Positives (FP):      {total_continues['fp']}")
print(f"  False Negatives (FN):      {total_continues['fn']}")
print(f"  True Negatives (TN):       {total_continues['tn']}")
print(f"  Precision:                 {continues_p:.2%}")
print(f"  Recall:                    {continues_r:.2%}")
print(f"  F1 Score:                  {continues_f1:.3f}")
print()

# Per-page breakdown
print(f"{'='*70}")
print(f"PER-PAGE BREAKDOWN")
print(f"{'='*70}")
print()

for result in continuation_results:
    page = result['page']
    n_matched = result['n_matched']
    n_unmatch_gold = result['n_unmatched_gold']
    n_unmatch_pred = result['n_unmatched_pred']
    
    is_cont = result['is_continuation']
    continues = result['continues_on_next_page']
    
    # Count gold positives for this page
    page_is_cont_gold = is_cont['tp'] + is_cont['fn']
    page_continues_gold = continues['tp'] + continues['fn']
    
    # Count pred positives for this page
    page_is_cont_pred = is_cont['tp'] + is_cont['fp']
    page_continues_pred = continues['tp'] + continues['fp']
    
    # Calculate mismatch
    is_cont_mismatch = page_is_cont_pred - page_is_cont_gold
    continues_mismatch = page_continues_pred - page_continues_gold
    
    print(f"{page}")
    print(f"  Matched: {n_matched}  |  Unmatched gold: {n_unmatch_gold}  |  Unmatched pred: {n_unmatch_pred}")
    print()
    print(f"  is_continuation:")
    print(f"    Gold: {page_is_cont_gold}  Pred: {page_is_cont_pred}  (mismatch: {is_cont_mismatch:+d})")
    print(f"    TP: {is_cont['tp']}  FP: {is_cont['fp']}  FN: {is_cont['fn']}")
    print(f"    P: {is_cont['precision']:.2%}  R: {is_cont['recall']:.2%}  F1: {is_cont['f1']:.3f}")
    print()
    print(f"  continues_on_next_page:")
    print(f"    Gold: {page_continues_gold}  Pred: {page_continues_pred}  (mismatch: {continues_mismatch:+d})")
    print(f"    TP: {continues['tp']}  FP: {continues['fp']}  FN: {continues['fn']}")
    print(f"    P: {continues['precision']:.2%}  R: {continues['recall']:.2%}  F1: {continues['f1']:.3f}")
    print()

Evaluating continuation tracking (all items)...

CONTINUATION TRACKING - GLOBAL SUMMARY (All Items)

Dataset coverage:
  Matched items:        37
  Unmatched gold items: 33
  Unmatched pred items: 15

is_continuation field:
  Gold positives (True):     7
  Pred positives (True):     3  (mismatch: -4)
  True Positives (TP):       1
  False Positives (FP):      2
  False Negatives (FN):      6
  True Negatives (TN):       33
  Precision:                 33.33%
  Recall:                    14.29%
  F1 Score:                  0.200

continues_on_next_page field:
  Gold positives (True):     7
  Pred positives (True):     7  (mismatch: +0)
  True Positives (TP):       3
  False Positives (FP):      4
  False Negatives (FN):      4
  True Negatives (TN):       34
  Precision:                 42.86%
  Recall:                    42.86%
  F1 Score:                  0.429

PER-PAGE BREAKDOWN

La_Plume_bpt6k1185893k_1_10_1889__page-001.json
  Matched: 4  |  Unmatched gold: 4  |  Unmatched pred: 1

In [25]:
"""
Final Summary Report

Synthesizes all evaluation findings into a summary that aggregates metrics across all evaluation dimensions and identifies problematic pages
"""

# Collect aggregate metrics from all previous evaluations
# These values should be computed from the previous cells

# Helper function to create summary table
def create_summary_table():
    """
    Create aggregate metrics table summarizing all evaluation dimensions.
    """
    summary_data = []
    
    # 1. STRUCTURE METRICS (from Cell 3)
    total_gold_items = sum(len(page['gold_items']) for page in all_pages)
    total_pred_items = sum(len(page['pred_items']) for page in all_pages)
    total_matches = sum(len(page['matches']) for page in all_pages)
    
    # Count contributions
    total_gold_contrib = sum(
        len([item for item in page['gold_items'] 
             if item['item_class'] in ['prose', 'verse']])
        for page in all_pages
    )
    total_pred_contrib = sum(
        len([item for item in page['pred_items'] 
             if item['item_class'] in ['prose', 'verse']])
        for page in all_pages
    )
    contrib_matches = sum(
        len(filter_matches_by_class(page['matches'], page['gold_items'], ['prose', 'verse']))
        for page in all_pages
    )
    
    summary_data.append({
        'Dimension': 'Structure Detection',
        'Metric': 'Item Match Rate',
        'Value': f"{(total_matches/total_gold_items)*100:.1f}%",
        'Details': f"{total_matches}/{total_gold_items} items matched"
    })
    
    summary_data.append({
        'Dimension': 'Structure Detection',
        'Metric': 'Contribution Match Rate',
        'Value': f"{(contrib_matches/total_gold_contrib)*100:.1f}%",
        'Details': f"{contrib_matches}/{total_gold_contrib} contributions matched"
    })
    
    # 2. TEXT QUALITY METRICS (from Cell 5 - reference computed values)
    summary_data.append({
        'Dimension': 'Text Quality (OCR)',
        'Metric': 'CER (Standard, All)',
        'Value': f"{avg_oa_all['cer_standard']:.2%}",
        'Details': 'Order-agnostic evaluation'
    })

    summary_data.append({
        'Dimension': 'Text Quality (OCR)',
        'Metric': 'CER (Standard, Contrib)',
        'Value': f"{avg_sa_contrib['cer_standard']:.2%}",
        'Details': 'Structure-aware, matched only'
    })

    summary_data.append({
        'Dimension': 'Text Quality (OCR)',
        'Metric': 'Coverage',
        'Value': f"{total_sa_contrib_matched/total_sa_contrib_gold*100:.1f}%",
        'Details': 'Contribution chars successfully matched'
    })
    
    # 3. CLASSIFICATION METRICS (from Cell 8 - recalculate)
    total_matched = 0
    total_correct = 0
    contrib_matched = 0
    contrib_correct = 0
    
    for page in all_pages:
        gold_items = page['gold_items']
        pred_items = page['pred_items']
        matches = page['matches']
        
        for g_idx, p_idx, _ in matches:
            gold_class = gold_items[g_idx]['item_class']
            pred_class = pred_items[p_idx]['item_class']
            
            total_matched += 1
            if gold_class == pred_class:
                total_correct += 1
            
            if gold_class in ['prose', 'verse']:
                contrib_matched += 1
                if gold_class == pred_class:
                    contrib_correct += 1
    
    overall_acc = (total_correct / total_matched * 100) if total_matched > 0 else 0
    contrib_acc = (contrib_correct / contrib_matched * 100) if contrib_matched > 0 else 0
    
    summary_data.append({
        'Dimension': 'Classification',
        'Metric': 'Overall Accuracy',
        'Value': f"{overall_acc:.1f}%",
        'Details': f"{total_correct}/{total_matched} items"
    })
    
    summary_data.append({
        'Dimension': 'Classification',
        'Metric': 'Contribution Accuracy',
        'Value': f"{contrib_acc:.1f}%",
        'Details': f"{contrib_correct}/{contrib_matched} prose/verse items"
    })
    
    # 4. METADATA METRICS (from Cell 9 - recalculate)
    
    METADATA_SIMILARITY_THRESHOLD = 0.8  # Same as Cell 9
    
    title_gold = 0
    title_pred = 0
    title_correct = 0
    author_gold = 0
    author_pred = 0
    author_correct = 0
    
    for page in all_pages:
        gold_items = page['gold_items']
        pred_items = page['pred_items']
        matches = page['matches']
        
        for g_idx, p_idx, _ in matches:
            gold_item = gold_items[g_idx]
            pred_item = pred_items[p_idx]
            
            # Title evaluation (exact or ≥80% similar using text_similarity)
            if gold_item.get('item_title'):
                title_gold += 1
                if pred_item.get('item_title'):
                    title_pred += 1
                    # Check exact match or high similarity
                    if (gold_item['item_title'].strip() == pred_item['item_title'].strip() or
                        text_similarity(gold_item['item_title'], pred_item['item_title']) >= METADATA_SIMILARITY_THRESHOLD):
                        title_correct += 1
            
            # Author evaluation (exact or ≥80% similar using text_similarity)
            if gold_item.get('item_author'):
                author_gold += 1
                if pred_item.get('item_author'):
                    author_pred += 1
                    # Check exact match or high similarity
                    if (gold_item['item_author'].strip() == pred_item['item_author'].strip() or
                        text_similarity(gold_item['item_author'], pred_item['item_author']) >= METADATA_SIMILARITY_THRESHOLD):
                        author_correct += 1
    
    title_p = (title_correct / title_pred * 100) if title_pred > 0 else 0
    title_r = (title_correct / title_gold * 100) if title_gold > 0 else 0
    title_f1 = 2 * title_p * title_r / (title_p + title_r) if (title_p + title_r) > 0 else 0
    
    author_p = (author_correct / author_pred * 100) if author_pred > 0 else 0
    author_r = (author_correct / author_gold * 100) if author_gold > 0 else 0
    author_f1 = 2 * author_p * author_r / (author_p + author_r) if (author_p + author_r) > 0 else 0
    
    summary_data.append({
        'Dimension': 'Metadata Extraction',
        'Metric': 'Title F1',
        'Value': f"{title_f1/100:.3f}",
        'Details': f"P: {title_p:.0f}%, R: {title_r:.0f}%"
    })
    
    summary_data.append({
        'Dimension': 'Metadata Extraction',
        'Metric': 'Author F1',
        'Value': f"{author_f1/100:.3f}",
        'Details': f"P: {author_p:.0f}%, R: {author_r:.0f}%"
    })
    
    # 5. CONTINUATION TRACKING (from Cell 10 - recalculate with ALL items)
    is_cont_tp = is_cont_fp = is_cont_fn = 0
    continues_tp = continues_fp = continues_fn = 0
    
    for page in all_pages:
        gold_items = page['gold_items']
        pred_items = page['pred_items']
        matches = page['matches']
        unmatched_gold = page['unmatched_gold']
        unmatched_pred = page['unmatched_pred']
        
        # 1. Evaluate matched items
        for g_idx, p_idx, _ in matches:
            gold_item = gold_items[g_idx]
            pred_item = pred_items[p_idx]
            
            # is_continuation
            gold_is_cont = gold_item.get('is_continuation') is True
            pred_is_cont = pred_item.get('is_continuation') is True
            
            if gold_is_cont and pred_is_cont:
                is_cont_tp += 1
            elif not gold_is_cont and pred_is_cont:
                is_cont_fp += 1
            elif gold_is_cont and not pred_is_cont:
                is_cont_fn += 1
            
            # continues_on_next_page
            gold_continues = gold_item.get('continues_on_next_page') is True
            pred_continues = pred_item.get('continues_on_next_page') is True
            
            if gold_continues and pred_continues:
                continues_tp += 1
            elif not gold_continues and pred_continues:
                continues_fp += 1
            elif gold_continues and not pred_continues:
                continues_fn += 1
        
        # 2. Evaluate unmatched gold items (missed continuations = FN)
        for gold_idx in unmatched_gold:
            gold_item = gold_items[gold_idx]
            if gold_item.get('is_continuation') is True:
                is_cont_fn += 1
            if gold_item.get('continues_on_next_page') is True:
                continues_fn += 1
        
        # 3. Evaluate unmatched pred items (hallucinated continuations = FP)
        for pred_idx in unmatched_pred:
            pred_item = pred_items[pred_idx]
            if pred_item.get('is_continuation') is True:
                is_cont_fp += 1
            if pred_item.get('continues_on_next_page') is True:
                continues_fp += 1
    
    is_cont_p = is_cont_tp / (is_cont_tp + is_cont_fp) if (is_cont_tp + is_cont_fp) > 0 else 0
    is_cont_r = is_cont_tp / (is_cont_tp + is_cont_fn) if (is_cont_tp + is_cont_fn) > 0 else 0
    is_cont_f1 = 2 * is_cont_p * is_cont_r / (is_cont_p + is_cont_r) if (is_cont_p + is_cont_r) > 0 else 0
    
    continues_p = continues_tp / (continues_tp + continues_fp) if (continues_tp + continues_fp) > 0 else 0
    continues_r = continues_tp / (continues_tp + continues_fn) if (continues_tp + continues_fn) > 0 else 0
    continues_f1 = 2 * continues_p * continues_r / (continues_p + continues_r) if (continues_p + continues_r) > 0 else 0
    
    summary_data.append({
        'Dimension': 'Continuation Tracking',
        'Metric': 'is_continuation F1',
        'Value': f"{is_cont_f1:.3f}",
        'Details': f"P: {is_cont_p*100:.0f}%, R: {is_cont_r*100:.0f}%"
    })
    
    summary_data.append({
        'Dimension': 'Continuation Tracking',
        'Metric': 'continues_on_next F1',
        'Value': f"{continues_f1:.3f}",
        'Details': f"P: {continues_p*100:.0f}%, R: {continues_r*100:.0f}%"
    })
    
    return pd.DataFrame(summary_data)


def identify_problem_pages():
    """
    Identify pages with significant issues across multiple dimensions.
    Returns list of (page_name, issues) tuples.
    """
    problem_pages = {}
    
    for page in all_pages:
        page_name = page['page_name']
        issues = []
        
        gold_items = page['gold_items']
        pred_items = page['pred_items']
        matches = page['matches']
        
        # Skip empty pages
        if len(gold_items) == 0:
            continue
        
        # Issue 1: Low match rate
        match_rate = len(matches) / len(gold_items) if len(gold_items) > 0 else 0
        if match_rate < 0.5:
            issues.append(f"Low match rate ({match_rate*100:.0f}%)")
        
        # Issue 2: Zero predictions
        if len(pred_items) == 0:
            issues.append("Zero predictions")
        
        # Issue 3: Low contribution match rate
        gold_contrib = [item for item in gold_items if item['item_class'] in ['prose', 'verse']]
        if gold_contrib:
            contrib_matches = filter_matches_by_class(matches, gold_items, ['prose', 'verse'])
            contrib_match_rate = len(contrib_matches) / len(gold_contrib) if len(gold_contrib) > 0 else 0
            if contrib_match_rate < 0.5:
                issues.append(f"Low contribution matching ({contrib_match_rate*100:.0f}%)")
        
        # Issue 4: High classification errors
        if matches:
            misclassified = 0
            for g_idx, p_idx, _ in matches:
                if gold_items[g_idx]['item_class'] != pred_items[p_idx]['item_class']:
                    misclassified += 1
            error_rate = misclassified / len(matches)
            if error_rate > 0.3:
                issues.append(f"High classification errors ({error_rate*100:.0f}%)")
        
        if issues:
            problem_pages[page_name] = issues
    
    return problem_pages


# Generate summary table
print("=" * 80)
print("STAGE 1 OCR EVALUATION - FINAL SUMMARY")
print("=" * 80)
print()

summary_df = create_summary_table()
print("AGGREGATE METRICS")
print("-" * 80)
print(summary_df.to_string(index=False))
print()

# Problem pages
print("=" * 80)
print("PROBLEM PAGES")
print("=" * 80)
print()

problem_pages = identify_problem_pages()
if problem_pages:
    print(f"Identified {len(problem_pages)} pages with significant issues:")
    print()
    
    # Sort by number of issues
    sorted_problems = sorted(problem_pages.items(), key=lambda x: len(x[1]), reverse=True)
    
    for page_name, issues in sorted_problems:
        print(f"{page_name}:")
        for issue in issues:
            print(f"  • {issue}")
        print()
else:
    print("No pages identified with critical issues.")
    print()

print("=" * 80)

STAGE 1 OCR EVALUATION - FINAL SUMMARY

AGGREGATE METRICS
--------------------------------------------------------------------------------
            Dimension                  Metric  Value                                 Details
  Structure Detection         Item Match Rate  52.9%                     37/70 items matched
  Structure Detection Contribution Match Rate  60.0%             15/25 contributions matched
   Text Quality (OCR)     CER (Standard, All) 14.83%               Order-agnostic evaluation
   Text Quality (OCR) CER (Standard, Contrib) 12.42%           Structure-aware, matched only
   Text Quality (OCR)                Coverage  48.8% Contribution chars successfully matched
       Classification        Overall Accuracy  67.6%                             25/37 items
       Classification   Contribution Accuracy  93.3%                 14/15 prose/verse items
  Metadata Extraction                Title F1  0.960                         P: 100%, R: 92%
  Metadata Extraction   