In [1]:
import sys
from pathlib import Path
import json
from typing import Dict, List, Tuple, Optional, Set
from collections import defaultdict
from difflib import SequenceMatcher
import Levenshtein
import re
import pandas as pd

# Path setup
PROJECT_ROOT = Path.cwd().parent
sys.path.append(str(PROJECT_ROOT))

# Import schemas for validation
from schemas.stage1_page import Stage1PageModel

# Paths
GOLD_DIR = PROJECT_ROOT / "data" / "gold_standard" / "cleaned"
PRED_DIR = PROJECT_ROOT / "data" / "interim_pages" / "La_Plume_bpt6k1185893k_1_10_1889"

print("Stage 1 OCR Evaluation")
print("\n")
print(f"Project root: {PROJECT_ROOT}")
print(f"Gold standard: {GOLD_DIR}")
print(f"Predictions: {PRED_DIR}")

# Find common files
def load_page_pairs() -> List[Tuple[Path, Path]]:
    """
    Match gold standard files with prediction files by filename.
    Returns list of (gold_path, pred_path) tuples.
    """
    gold_files = {f.name: f for f in GOLD_DIR.glob("*.json")}
    pred_files = {f.name: f for f in PRED_DIR.glob("*.json")}
    
    common_names = set(gold_files.keys()) & set(pred_files.keys())
    
    pairs = [(gold_files[name], pred_files[name]) for name in sorted(common_names)]
    
    print(f"\nDataset:")
    print(f"  Gold files: {len(gold_files)}")
    print(f"  Pred files: {len(pred_files)}")
    print(f"  Matching pairs: {len(pairs)}")
    
    if len(pairs) < len(gold_files):
        missing = set(gold_files.keys()) - set(pred_files.keys())
        print(f"Warning: {len(missing)} gold standard pages without predictions:")
        for name in sorted(missing):
            print(f"   - {name}")
    
    return pairs

page_pairs = load_page_pairs()

Stage 1 OCR Evaluation


Project root: /home/fabian-ramirez/Documents/These/Code/magazine_graphs
Gold standard: /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/gold_standard/cleaned
Predictions: /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/interim_pages/La_Plume_bpt6k1185893k_1_10_1889

Dataset:
  Gold files: 14
  Pred files: 14
  Matching pairs: 14


In [2]:
"""
Item Matching Functions
Match gold items to predicted items using content-based text similarity.
"""

# Configuration
SIMILARITY_THRESHOLD = 0.7  # Minimum text similarity to consider a match (0.0-1.0)

print("\n")
print("Item Matching Configuration")
print("\n")
print(f"Similarity threshold: {SIMILARITY_THRESHOLD}")


def normalize_text(text: str) -> str:
    """
    Normalize text for similarity comparison.
    
    """
    # Lowercase
    text = text.lower()
    
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    # Normalize all whitespace (spaces, tabs, newlines) to single spaces
    text = re.sub(r'\s+', ' ', text)
    
    # Strip leading and trailing whitespace
    text = text.strip()
    
    return text


def text_similarity(text1: str, text2: str) -> float:
    """
    Calculate similarity ratio between two texts using SequenceMatcher.
        
    Returns:
        Float between 0.0 (completely different) and 1.0 (identical)
    """
    t1 = normalize_text(text1)
    t2 = normalize_text(text2)
    
    if not t1 and not t2:
        return 1.0
    if not t1 or not t2:
        return 0.0
    
    return SequenceMatcher(None, t1, t2).ratio()


def match_items(
    gold_items: List[Dict], 
    pred_items: List[Dict],
    similarity_threshold: float = SIMILARITY_THRESHOLD
) -> Tuple[List[Tuple[int, int, float]], Set[int], Set[int]]:
    """
    Match gold items to prediction items using greedy best-match algorithm.
    
    Algorithm:
        For each gold item, find the best-matching unmatched pred item.
        Accept the match if similarity exceeds threshold.
    
    Args:
        gold_items: List of gold standard items
        pred_items: List of predicted items
        similarity_threshold: Minimum similarity score to consider a match
    
    Returns:
        Tuple of:
        - matches: List of (gold_idx, pred_idx, similarity_score)
        - unmatched_gold: Set of gold indices with no match
        - unmatched_pred: Set of pred indices with no match
    """
    matches = []
    matched_pred_indices = set()
    unmatched_gold = set()
    
    for gold_idx, gold_item in enumerate(gold_items):
        gold_text = gold_item.get('item_text_raw', '')
        
        best_score = 0.0
        best_pred_idx = None
        
        for pred_idx, pred_item in enumerate(pred_items):
            if pred_idx in matched_pred_indices:
                continue
            
            pred_text = pred_item.get('item_text_raw', '')
            score = text_similarity(gold_text, pred_text)
            
            if score > best_score:
                best_score = score
                best_pred_idx = pred_idx
        
        if best_score >= similarity_threshold and best_pred_idx is not None:
            matches.append((gold_idx, best_pred_idx, best_score))
            matched_pred_indices.add(best_pred_idx)
        else:
            unmatched_gold.add(gold_idx)
    
    unmatched_pred = set(range(len(pred_items))) - matched_pred_indices
    
    return matches, unmatched_gold, unmatched_pred


def load_and_match_page(
    gold_path: Path, 
    pred_path: Path,
    similarity_threshold: float = SIMILARITY_THRESHOLD
) -> Dict:
    """
    Load a page pair and match items.
    
    Args:
        gold_path: Path to gold standard JSON
        pred_path: Path to prediction JSON
        similarity_threshold: Minimum similarity for matching
    
    Returns:
        Dict with:
        - gold_items: All gold items
        - pred_items: All pred items
        - matches: List of (gold_idx, pred_idx, score) tuples
        - unmatched_gold: Set of unmatched gold indices
        - unmatched_pred: Set of unmatched pred indices
        - page_name: Filename
    """
    with open(gold_path, 'r', encoding='utf-8') as f:
        gold_data = json.load(f)
    gold_page = Stage1PageModel.model_validate(gold_data)
    gold_data = gold_page.model_dump()
    
    with open(pred_path, 'r', encoding='utf-8') as f:
        pred_data = json.load(f)
    pred_page = Stage1PageModel.model_validate(pred_data)
    pred_data = pred_page.model_dump()
    
    gold_items = gold_data.get('items', [])
    pred_items = pred_data.get('items', [])
    
    matches, unmatched_gold, unmatched_pred = match_items(
        gold_items, pred_items, similarity_threshold
    )
    
    return {
        'gold_items': gold_items,
        'pred_items': pred_items,
        'matches': matches,
        'unmatched_gold': unmatched_gold,
        'unmatched_pred': unmatched_pred,
        'page_name': gold_path.name
    }


# Test matching on first page
print("\n")
print("Item Matching Test")
print("\n")

if page_pairs:
    test_gold, test_pred = page_pairs[0]
    test_result = load_and_match_page(test_gold, test_pred)
    
    print(f"\nTest page: {test_result['page_name']}")
    print(f"  Gold items: {len(test_result['gold_items'])}")
    print(f"  Pred items: {len(test_result['pred_items'])}")
    print(f"  Matches found: {len(test_result['matches'])}")
    print(f"  Unmatched gold: {len(test_result['unmatched_gold'])}")
    print(f"  Unmatched pred: {len(test_result['unmatched_pred'])}")
    
    if test_result['matches']:
        avg_score = sum(score for _, _, score in test_result['matches']) / len(test_result['matches'])
        print(f"  Average match quality: {avg_score:.2%}")



Item Matching Configuration


Similarity threshold: 0.7


Item Matching Test



Test page: La_Plume_bpt6k1185893k_1_10_1889__page-001.json
  Gold items: 8
  Pred items: 5
  Matches found: 4
  Unmatched gold: 4
  Unmatched pred: 1
  Average match quality: 92.47%


In [3]:
"""
Page-Level Diagnostics
Generate diagnostic metrics for each page based on item matches.
"""

def diagnose_page(page_id: str, gold_items: list, pred_items: list, matches: list) -> dict:
    """
    Generate diagnostic metrics for a single page.
    
    Args:
        page_id: Page identifier
        gold_items: List of gold standard items
        pred_items: List of predicted items
        matches: List of (gold_idx, pred_idx, score) tuples
        
    Returns:
        Dictionary with diagnostic metrics
    """
    # Count items by class
    gold_by_class = {}
    pred_by_class = {}
    
    for item in gold_items:
        item_class = item['item_class']
        gold_by_class[item_class] = gold_by_class.get(item_class, 0) + 1
    
    for item in pred_items:
        item_class = item['item_class']
        pred_by_class[item_class] = pred_by_class.get(item_class, 0) + 1
    
    # Count contributions (prose + verse)
    gold_contrib = gold_by_class.get('prose', 0) + gold_by_class.get('verse', 0)
    pred_contrib = pred_by_class.get('prose', 0) + pred_by_class.get('verse', 0)
    
    # Filter matches by contribution class
    contrib_matches = [
        (g_idx, p_idx, score) for g_idx, p_idx, score in matches
        if gold_items[g_idx]['item_class'] in ('prose', 'verse')
    ]
    
    # Calculate match rates
    match_rate = (len(matches) / len(gold_items) * 100) if gold_items else 0
    contrib_match_rate = (len(contrib_matches) / gold_contrib * 100) if gold_contrib else 0
    
    # Calculate average similarity
    avg_similarity = (sum(score for _, _, score in matches) / len(matches)) if matches else 0
    
    # Count continuation flags
    gold_cont_in = sum(1 for item in gold_items if item.get('is_continuation') is True)
    pred_cont_in = sum(1 for item in pred_items if item.get('is_continuation') is True)
    gold_cont_out = sum(1 for item in gold_items if item.get('continues_on_next_page') is True)
    pred_cont_out = sum(1 for item in pred_items if item.get('continues_on_next_page') is True)
    
    # Track matched indices
    matched_gold = {g_idx for g_idx, _, _ in matches}
    matched_pred = {p_idx for _, p_idx, _ in matches}
    
    unmatched_gold = [i for i in range(len(gold_items)) if i not in matched_gold]
    unmatched_pred = [i for i in range(len(pred_items)) if i not in matched_pred]
    
    # Count matches by class
    matches_by_class = {}
    for g_idx, p_idx, score in matches:
        item_class = gold_items[g_idx]['item_class']
        matches_by_class[item_class] = matches_by_class.get(item_class, 0) + 1
    
    return {
        'page_id': page_id,
        'gold_items': len(gold_items),
        'pred_items': len(pred_items),
        'matched': len(matches),
        'match_rate': match_rate,
        'contrib_match_rate': contrib_match_rate,
        'avg_similarity': avg_similarity,
        'gold_cont_in': gold_cont_in,
        'pred_cont_in': pred_cont_in,
        'gold_cont_out': gold_cont_out,
        'pred_cont_out': pred_cont_out,
        'gold_by_class': gold_by_class,
        'pred_by_class': pred_by_class,
        'matches_by_class': matches_by_class,
        'gold_contrib': gold_contrib,
        'pred_contrib': pred_contrib,
        'contrib_matched': len(contrib_matches),
        'unmatched_gold': unmatched_gold,
        'unmatched_pred': unmatched_pred
    }


def flag_page(metrics: dict) -> str:
    """
    Generate flags for problematic pages based on metrics.
    
    Args:
        metrics: Dictionary from diagnose_page()
        
    Returns:
        Comma-separated string of flags, or empty string if no issues
    """
    flags = []
    
    if metrics['pred_items'] == 0:
        flags.append('ZERO_PREDS')
    
    if metrics['matched'] == 0:
        flags.append('ZERO_MATCHES')
    
    if metrics['match_rate'] < 50:
        flags.append('LOW_MATCH')
    
    if metrics['gold_contrib'] > 0 and metrics['contrib_match_rate'] < 60:
        flags.append('LOW_CONTRIB')
    
    if abs(metrics['gold_items'] - metrics['pred_items']) >= 3:
        flags.append('COUNT_MISMATCH')
    
    return ', '.join(flags)


def run_diagnostics(page_pairs: list) -> pd.DataFrame:
    """
    Run diagnostics on all pages and generate summary table and detailed reports.
    
    Args:
        page_pairs: List of (gold_path, pred_path) tuples from load_page_pairs()
        
    Returns:
        DataFrame with summary metrics for all pages
    """
    print("Running diagnostics on all pages...\n")
    
    all_metrics = []
    
    for gold_path, pred_path in page_pairs:
        # Extract page_id from filename
        page_id = gold_path.stem
        
        # Load and match page
        result = load_and_match_page(gold_path, pred_path)
        gold_items = result['gold_items']
        pred_items = result['pred_items']
        matches = result['matches']
        
        # Generate metrics
        metrics = diagnose_page(page_id, gold_items, pred_items, matches)
        metrics['flags'] = flag_page(metrics)
        all_metrics.append(metrics)
    
    # Create summary DataFrame
    summary_data = []
    for m in all_metrics:
        summary_data.append({
            'page_id': m['page_id'],
            'gold_items': m['gold_items'],
            'pred_items': m['pred_items'],
            'matched': m['matched'],
            'match_rate_%': round(m['match_rate'], 1),
            'contrib_match_rate_%': round(m['contrib_match_rate'], 1),
            'avg_similarity': round(m['avg_similarity'], 3),
            'gold_cont_in': m['gold_cont_in'],
            'pred_cont_in': m['pred_cont_in'],
            'gold_cont_out': m['gold_cont_out'],
            'pred_cont_out': m['pred_cont_out'],
            'flags': m['flags']
        })
    
    summary_df = pd.DataFrame(summary_data)
    
    # Print summary table
    print("\n")
    print("SUMMARY TABLE")
    print("\n")
    print(summary_df.to_string(index=False))
    print("\n")
    
    # Print detailed reports for all pages
    print("="*80)
    print("DETAILED REPORTS")
    print("="*80)
    
    for m in all_metrics:
        print(f"\n=== Page {m['page_id']} ===")
        print(f"Items: {m['gold_items']} gold, {m['pred_items']} pred")
        print(f"Matches: {m['matched']} ({m['match_rate']:.1f}% match rate)")
        
        print("\nBy class:")
        all_classes = sorted(set(m['gold_by_class'].keys()) | set(m['pred_by_class'].keys()))
        for cls in all_classes:
            gold_count = m['gold_by_class'].get(cls, 0)
            pred_count = m['pred_by_class'].get(cls, 0)
            matched_count = m['matches_by_class'].get(cls, 0)
            match_pct = (matched_count / gold_count * 100) if gold_count > 0 else 0
            print(f"  {cls:10s} {gold_count} gold, {pred_count} pred, {matched_count} matched ({match_pct:.1f}%)")

        
        print(f"\nContributions: {m['gold_contrib']} gold, {m['pred_contrib']} pred, "
              f"{m['contrib_matched']} matched ({m['contrib_match_rate']:.1f}%)")
        print(f"Avg similarity: {m['avg_similarity']:.3f}")
        
        print(f"\nContinuations:")
        print(f"  is_continuation: {m['gold_cont_in']} gold, {m['pred_cont_in']} pred")
        print(f"  continues_on_next_page: {m['gold_cont_out']} gold, {m['pred_cont_out']} pred")
        
        print(f"\nUnmatched gold items: {m['unmatched_gold']}")
        print(f"Unmatched pred items: {m['unmatched_pred']}")
        
        if m['flags']:
            print(f"\nFLAGS: {m['flags']}")
    
    return summary_df


# Run diagnostics
diagnostic_df = run_diagnostics(page_pairs)

Running diagnostics on all pages...



SUMMARY TABLE


                                   page_id  gold_items  pred_items  matched  match_rate_%  contrib_match_rate_%  avg_similarity  gold_cont_in  pred_cont_in  gold_cont_out  pred_cont_out                                                            flags
La_Plume_bpt6k1185893k_1_10_1889__page-001           8           5        4          50.0                   0.0           0.925             0             0              0              0                                                   COUNT_MISMATCH
La_Plume_bpt6k1185893k_1_10_1889__page-002           2           2        2         100.0                   0.0           1.000             0             0              0              0                                                                 
La_Plume_bpt6k1185893k_1_10_1889__page-003           3           3        1          33.3                   0.0           0.713             0             1              1              2       

In [4]:
"""
Evaluation Helpers
Utility functions for filtering matches and loading all pages efficiently.
These helpers are used by the evaluation cells that follow.
"""

def filter_matches_by_class(
    matches: List[Tuple[int, int, float]],
    gold_items: List[Dict],
    item_classes: List[str]
) -> List[Tuple[int, int, float]]:
    """
    Filter matches to only include items of specified classes.
    
    Args:
        matches: List of (gold_idx, pred_idx, score) tuples
        gold_items: List of gold standard items
        item_classes: List of classes to include (e.g., ['prose', 'verse'])
    
    Returns:
        Filtered list of matches
    """
    return [
        (g_idx, p_idx, score) 
        for g_idx, p_idx, score in matches
        if gold_items[g_idx]['item_class'] in item_classes
    ]


def get_matched_pairs(
    matches: List[Tuple[int, int, float]],
    gold_items: List[Dict],
    pred_items: List[Dict]
) -> List[Tuple[Dict, Dict, float]]:
    """
    Convert match indices to actual item pairs.
    
    Args:
        matches: List of (gold_idx, pred_idx, score) tuples
        gold_items: List of gold standard items
        pred_items: List of predicted items
    
    Returns:
        List of (gold_item, pred_item, similarity_score) tuples
    """
    return [
        (gold_items[g_idx], pred_items[p_idx], score)
        for g_idx, p_idx, score in matches
    ]


def load_all_pages(page_pairs: List[Tuple[Path, Path]]) -> List[Dict]:
    """
    Load and match all pages at once for efficient batch evaluation.
    
    Args:
        page_pairs: List of (gold_path, pred_path) tuples from load_page_pairs()
    
    Returns:
        List of dictionaries, one per page, each containing:
        - page_id: Page identifier
        - gold_items: All gold items
        - pred_items: All pred items
        - matches: List of (gold_idx, pred_idx, score) tuples
        - unmatched_gold: Set of unmatched gold indices
        - unmatched_pred: Set of unmatched pred indices
    """
    all_pages = []
    
    for gold_path, pred_path in page_pairs:
        result = load_and_match_page(gold_path, pred_path)
        result['page_id'] = gold_path.stem
        all_pages.append(result)
    
    return all_pages


# Load all pages once for reuse in subsequent evaluation cells
print("Loading and matching all pages...")
all_pages = load_all_pages(page_pairs)
print(f"Loaded {len(all_pages)} pages")
print(f"Total matches across all pages: {sum(len(page['matches']) for page in all_pages)}")


Loading and matching all pages...
Loaded 14 pages
Total matches across all pages: 37


In [9]:
# %%
"""
Text Quality Evaluation
Calculate CER and WER using two complementary approaches:
1. Order-agnostic: Pure OCR quality regardless of reading order
2. Structure-aware: OCR quality on properly aligned content via matching

Each approach calculates three normalization levels:
- Strict: Preserves all whitespace (including \n vs \n\n differences)
- Standard: Normalizes whitespace to single spaces (RECOMMENDED)
- Letters Only: Removes all whitespace and punctuation (pure character recognition)

References:
- Flexible Character Accuracy (FCA) for handling reading order issues:
  https://ocr-d.de/en/spec/ocrd_eval.html
- Token sort ratio for order-agnostic OCR comparison:
  https://urban-institute.medium.com/choosing-the-right-ocr-service-for-extracting-text-data-d7830399ec5
- Unicode normalization and whitespace handling in OCR evaluation:
  https://ocr-d.de/en/spec/ocrd_eval.html
"""

import unicodedata


def normalize_text_strict(text: str) -> str:
    """
    Strict normalization: only Unicode NFC normalization.
    Preserves all whitespace, punctuation, and capitalization.
    """
    return unicodedata.normalize('NFC', text)


def normalize_text_standard(text: str) -> str:
    """
    Standard normalization for fair OCR evaluation:
    - Unicode NFC normalization
    - All whitespace (spaces, tabs, newlines) ‚Üí single space
    - Preserves punctuation and capitalization
    """
    text = unicodedata.normalize('NFC', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text


def normalize_text_letters_only(text: str) -> str:
    """
    Letter-only normalization for pure character recognition quality:
    - Unicode NFC normalization
    - Remove all whitespace
    - Remove all punctuation
    - Preserves capitalization and diacritics
    """
    text = unicodedata.normalize('NFC', text)
    text = re.sub(r'[^\w]', '', text)
    return text


def character_error_rate(reference: str, hypothesis: str, normalization: str = 'strict') -> float:
    """
    Calculate Character Error Rate using Levenshtein distance.
    
    Args:
        reference: Ground truth text
        hypothesis: OCR output text
        normalization: 'strict', 'standard', or 'letters_only'
    
    Returns:
        CER = (insertions + deletions + substitutions) / total_reference_chars
    """
    # Apply normalization
    if normalization == 'strict':
        ref = normalize_text_strict(reference)
        hyp = normalize_text_strict(hypothesis)
    elif normalization == 'standard':
        ref = normalize_text_standard(reference)
        hyp = normalize_text_standard(hypothesis)
    elif normalization == 'letters_only':
        ref = normalize_text_letters_only(reference)
        hyp = normalize_text_letters_only(hypothesis)
    else:
        ref = reference
        hyp = hypothesis
    
    if not ref:
        return 1.0 if hyp else 0.0
    distance = Levenshtein.distance(ref, hyp)
    return distance / len(ref)


def word_error_rate(reference: str, hypothesis: str, normalization: str = 'strict') -> float:
    """
    Calculate Word Error Rate using Levenshtein distance on words.
    
    Args:
        reference: Ground truth text
        hypothesis: OCR output text
        normalization: 'strict', 'standard', or 'letters_only'
    
    Returns:
        WER = (insertions + deletions + substitutions) / total_reference_words
    """
    # Apply normalization
    if normalization == 'strict':
        ref = normalize_text_strict(reference)
        hyp = normalize_text_strict(hypothesis)
    elif normalization == 'standard':
        ref = normalize_text_standard(reference)
        hyp = normalize_text_standard(hypothesis)
    elif normalization == 'letters_only':
        # For letters only, WER doesn't make sense without word boundaries
        # So we use standard normalization
        ref = normalize_text_standard(reference)
        hyp = normalize_text_standard(hypothesis)
    else:
        ref = reference
        hyp = hypothesis
    
    ref_words = ref.split()
    hyp_words = hyp.split()
    if not ref_words:
        return 1.0 if hyp_words else 0.0
    distance = Levenshtein.distance(ref_words, hyp_words)
    return distance / len(ref_words)


def token_sort_text(text: str) -> str:
    """
    Sort tokens (words) alphabetically for order-agnostic comparison.
    This removes the impact of reading order on text similarity.
    """
    tokens = text.split()
    return ' '.join(sorted(tokens))


def evaluate_order_agnostic(gold_items: List[Dict], pred_items: List[Dict], 
                            item_classes: Optional[List[str]] = None) -> Dict:
    """
    Evaluate text quality without considering reading order.
    Uses token sort ratio approach - sorts all words before comparison.
    Calculates three normalization levels: strict, standard, letters_only.
    
    Args:
        gold_items: List of gold standard items
        pred_items: List of predicted items
        item_classes: If provided, filter to only these classes
    
    Returns:
        Dict with CER, WER for each normalization level, and text statistics
    """
    # Filter by class if specified
    if item_classes:
        gold_items = [item for item in gold_items if item['item_class'] in item_classes]
        pred_items = [item for item in pred_items if item['item_class'] in item_classes]
    
    # Concatenate all text
    gold_text = ' '.join(item.get('item_text_raw', '') for item in gold_items)
    pred_text = ' '.join(item.get('item_text_raw', '') for item in pred_items)
    
    # Sort tokens for order-agnostic comparison
    gold_sorted = token_sort_text(gold_text)
    pred_sorted = token_sort_text(pred_text)
    
    # Calculate for all three normalization levels
    results = {
        'cer_strict': character_error_rate(gold_sorted, pred_sorted, 'strict'),
        'wer_strict': word_error_rate(gold_sorted, pred_sorted, 'strict'),
        'cer_standard': character_error_rate(gold_sorted, pred_sorted, 'standard'),
        'wer_standard': word_error_rate(gold_sorted, pred_sorted, 'standard'),
        'cer_letters': character_error_rate(gold_sorted, pred_sorted, 'letters_only'),
        'gold_chars': len(gold_text),
        'pred_chars': len(pred_text),
        'gold_words': len(gold_text.split()),
        'pred_words': len(pred_text.split())
    }
    
    return results


def evaluate_structure_aware(gold_items: List[Dict], pred_items: List[Dict],
                             matches: List[Tuple[int, int, float]],
                             item_classes: Optional[List[str]] = None) -> Dict:
    """
    Evaluate text quality on matched pairs, respecting document structure.
    Only compares content that was successfully aligned via matching.
    Calculates three normalization levels: strict, standard, letters_only.
    
    Args:
        gold_items: List of gold standard items
        pred_items: List of predicted items
        matches: List of (gold_idx, pred_idx, score) tuples
        item_classes: If provided, filter matches to only these classes
    
    Returns:
        Dict with matched CER/WER for each normalization level and unmatched content statistics
    """
    # Filter matches by class if specified
    if item_classes:
        filtered_matches = filter_matches_by_class(matches, gold_items, item_classes)
    else:
        filtered_matches = matches
    
    # Get matched pairs
    matched_pairs = get_matched_pairs(filtered_matches, gold_items, pred_items)
    
    # Calculate CER/WER on matched content for all normalization levels
    if matched_pairs:
        # Concatenate matched texts in gold order
        gold_matched_text = ' '.join(gold_item.get('item_text_raw', '') 
                                     for gold_item, _, _ in matched_pairs)
        pred_matched_text = ' '.join(pred_item.get('item_text_raw', '') 
                                     for _, pred_item, _ in matched_pairs)
        
        cer_strict = character_error_rate(gold_matched_text, pred_matched_text, 'strict')
        wer_strict = word_error_rate(gold_matched_text, pred_matched_text, 'strict')
        cer_standard = character_error_rate(gold_matched_text, pred_matched_text, 'standard')
        wer_standard = word_error_rate(gold_matched_text, pred_matched_text, 'standard')
        cer_letters = character_error_rate(gold_matched_text, pred_matched_text, 'letters_only')
        
        matched_gold_chars = len(gold_matched_text)
        matched_pred_chars = len(pred_matched_text)
    else:
        cer_strict = 0.0
        wer_strict = 0.0
        cer_standard = 0.0
        wer_standard = 0.0
        cer_letters = 0.0
        matched_gold_chars = 0
        matched_pred_chars = 0
    
    # Calculate unmatched content
    matched_gold_indices = {g_idx for g_idx, _, _ in filtered_matches}
    matched_pred_indices = {p_idx for _, p_idx, _ in filtered_matches}
    
    if item_classes:
        # Only count unmatched items of the specified classes
        unmatched_gold_items = [
            gold_items[i] for i in range(len(gold_items))
            if i not in matched_gold_indices and gold_items[i]['item_class'] in item_classes
        ]
        unmatched_pred_items = [
            pred_items[i] for i in range(len(pred_items))
            if i not in matched_pred_indices and pred_items[i]['item_class'] in item_classes
        ]
        total_gold_chars = sum(len(item.get('item_text_raw', '')) 
                              for item in gold_items if item['item_class'] in item_classes)
    else:
        unmatched_gold_items = [gold_items[i] for i in range(len(gold_items)) 
                               if i not in matched_gold_indices]
        unmatched_pred_items = [pred_items[i] for i in range(len(pred_items)) 
                               if i not in matched_pred_indices]
        total_gold_chars = sum(len(item.get('item_text_raw', '')) for item in gold_items)
    
    unmatched_gold_chars = sum(len(item.get('item_text_raw', '')) 
                               for item in unmatched_gold_items)
    unmatched_pred_chars = sum(len(item.get('item_text_raw', '')) 
                               for item in unmatched_pred_items)
    
    return {
        'cer_strict': cer_strict,
        'wer_strict': wer_strict,
        'cer_standard': cer_standard,
        'wer_standard': wer_standard,
        'cer_letters': cer_letters,
        'matched_gold_chars': matched_gold_chars,
        'matched_pred_chars': matched_pred_chars,
        'unmatched_gold_chars': unmatched_gold_chars,
        'unmatched_pred_chars': unmatched_pred_chars,
        'total_gold_chars': total_gold_chars,
        'matched_percentage': (matched_gold_chars / total_gold_chars * 100) if total_gold_chars else 0
    }


# Evaluate text quality across all pages
print("Evaluating text quality...")
print("\n")

order_agnostic_all = []
order_agnostic_contrib = []
structure_aware_all = []
structure_aware_contrib = []

for page in all_pages:
    page_id = page['page_id']
    gold_items = page['gold_items']
    pred_items = page['pred_items']
    matches = page['matches']
    
    # Order-agnostic evaluation
    oa_all = evaluate_order_agnostic(gold_items, pred_items)
    oa_all['page_id'] = page_id
    order_agnostic_all.append(oa_all)
    
    oa_contrib = evaluate_order_agnostic(gold_items, pred_items, 
                                         item_classes=['prose', 'verse'])
    oa_contrib['page_id'] = page_id
    order_agnostic_contrib.append(oa_contrib)
    
    # Structure-aware evaluation
    sa_all = evaluate_structure_aware(gold_items, pred_items, matches)
    sa_all['page_id'] = page_id
    structure_aware_all.append(sa_all)
    
    sa_contrib = evaluate_structure_aware(gold_items, pred_items, matches,
                                          item_classes=['prose', 'verse'])
    sa_contrib['page_id'] = page_id
    structure_aware_contrib.append(sa_contrib)

# Calculate averages for order-agnostic evaluation
avg_oa_all = {
    'cer_strict': sum(r['cer_strict'] for r in order_agnostic_all) / len(order_agnostic_all),
    'wer_strict': sum(r['wer_strict'] for r in order_agnostic_all) / len(order_agnostic_all),
    'cer_standard': sum(r['cer_standard'] for r in order_agnostic_all) / len(order_agnostic_all),
    'wer_standard': sum(r['wer_standard'] for r in order_agnostic_all) / len(order_agnostic_all),
    'cer_letters': sum(r['cer_letters'] for r in order_agnostic_all) / len(order_agnostic_all)
}

contrib_with_content = [r for r in order_agnostic_contrib if r['gold_chars'] > 0]
avg_oa_contrib = {
    'cer_strict': sum(r['cer_strict'] for r in contrib_with_content) / len(contrib_with_content),
    'wer_strict': sum(r['wer_strict'] for r in contrib_with_content) / len(contrib_with_content),
    'cer_standard': sum(r['cer_standard'] for r in contrib_with_content) / len(contrib_with_content),
    'wer_standard': sum(r['wer_standard'] for r in contrib_with_content) / len(contrib_with_content),
    'cer_letters': sum(r['cer_letters'] for r in contrib_with_content) / len(contrib_with_content)
}

# Calculate averages for structure-aware evaluation
sa_all_with_matches = [r for r in structure_aware_all if r['matched_gold_chars'] > 0]
avg_sa_all = {
    'cer_strict': sum(r['cer_strict'] for r in sa_all_with_matches) / len(sa_all_with_matches),
    'wer_strict': sum(r['wer_strict'] for r in sa_all_with_matches) / len(sa_all_with_matches),
    'cer_standard': sum(r['cer_standard'] for r in sa_all_with_matches) / len(sa_all_with_matches),
    'wer_standard': sum(r['wer_standard'] for r in sa_all_with_matches) / len(sa_all_with_matches),
    'cer_letters': sum(r['cer_letters'] for r in sa_all_with_matches) / len(sa_all_with_matches)
}

sa_contrib_with_matches = [r for r in structure_aware_contrib if r['matched_gold_chars'] > 0]
avg_sa_contrib = {
    'cer_strict': sum(r['cer_strict'] for r in sa_contrib_with_matches) / len(sa_contrib_with_matches),
    'wer_strict': sum(r['wer_strict'] for r in sa_contrib_with_matches) / len(sa_contrib_with_matches),
    'cer_standard': sum(r['cer_standard'] for r in sa_contrib_with_matches) / len(sa_contrib_with_matches),
    'wer_standard': sum(r['wer_standard'] for r in sa_contrib_with_matches) / len(sa_contrib_with_matches),
    'cer_letters': sum(r['cer_letters'] for r in sa_contrib_with_matches) / len(sa_contrib_with_matches)
}

# Calculate total matched percentages
total_sa_all_matched = sum(r['matched_gold_chars'] for r in structure_aware_all)
total_sa_all_gold = sum(r['total_gold_chars'] for r in structure_aware_all)
total_sa_all_unmatched = sum(r['unmatched_gold_chars'] for r in structure_aware_all)

total_sa_contrib_matched = sum(r['matched_gold_chars'] for r in structure_aware_contrib)
total_sa_contrib_gold = sum(r['total_gold_chars'] for r in structure_aware_contrib)
total_sa_contrib_unmatched = sum(r['unmatched_gold_chars'] for r in structure_aware_contrib)

# Print results
print("="*70)
print("TEXT QUALITY SUMMARY")
print("="*70)

print("\n1. ORDER-AGNOSTIC EVALUATION")
print("   (Pure OCR quality, reading order irrelevant)")
print("-"*70)

print(f"\n   All Items:")
print(f"      Strict (with all whitespace):")
print(f"         CER: {avg_oa_all['cer_strict']:.2%}  |  WER: {avg_oa_all['wer_strict']:.2%}")
print(f"      Standard (normalized whitespace):")
print(f"         CER: {avg_oa_all['cer_standard']:.2%}  |  WER: {avg_oa_all['wer_standard']:.2%}")
print(f"      Letters Only (no whitespace/punctuation):")
print(f"         CER: {avg_oa_all['cer_letters']:.2%}")

print(f"\n   Contributions Only (prose + verse):")
print(f"      Strict (with all whitespace):")
print(f"         CER: {avg_oa_contrib['cer_strict']:.2%}  |  WER: {avg_oa_contrib['wer_strict']:.2%}")
print(f"      Standard (normalized whitespace):")
print(f"         CER: {avg_oa_contrib['cer_standard']:.2%}  |  WER: {avg_oa_contrib['wer_standard']:.2%}")
print(f"      Letters Only (no whitespace/punctuation):")
print(f"         CER: {avg_oa_contrib['cer_letters']:.2%}")

print("\n" + "="*70)
print("2. STRUCTURE-AWARE EVALUATION")
print("   (OCR quality on matched content only)")
print("-"*70)

print(f"\n   Matched Content - All Items:")
print(f"      Strict (with all whitespace):")
print(f"         CER: {avg_sa_all['cer_strict']:.2%}  |  WER: {avg_sa_all['wer_strict']:.2%}")
print(f"      Standard (normalized whitespace):")
print(f"         CER: {avg_sa_all['cer_standard']:.2%}  |  WER: {avg_sa_all['wer_standard']:.2%}")
print(f"      Letters Only (no whitespace/punctuation):")
print(f"         CER: {avg_sa_all['cer_letters']:.2%}")
print(f"      Coverage: {total_sa_all_matched:,} chars matched " +
      f"({total_sa_all_matched/total_sa_all_gold*100:.1f}% of gold)")
print(f"      Unmatched: {total_sa_all_unmatched:,} chars " +
      f"({total_sa_all_unmatched/total_sa_all_gold*100:.1f}% of gold)")

print(f"\n   Matched Content - Contributions Only (prose + verse):")
print(f"      Strict (with all whitespace):")
print(f"         CER: {avg_sa_contrib['cer_strict']:.2%}  |  WER: {avg_sa_contrib['wer_strict']:.2%}")
print(f"      Standard (normalized whitespace):")
print(f"         CER: {avg_sa_contrib['cer_standard']:.2%}  |  WER: {avg_sa_contrib['wer_standard']:.2%}")
print(f"      Letters Only (no whitespace/punctuation):")
print(f"         CER: {avg_sa_contrib['cer_letters']:.2%}")
print(f"      Coverage: {total_sa_contrib_matched:,} chars matched " +
      f"({total_sa_contrib_matched/total_sa_contrib_gold*100:.1f}% of gold)")
print(f"      Unmatched: {total_sa_contrib_unmatched:,} chars " +
      f"({total_sa_contrib_unmatched/total_sa_contrib_gold*100:.1f}% of gold)")

print("\n" + "="*70)
print("INTERPRETATION GUIDE:")
print("-"*70)
print("Strict: Most conservative")
print("Standard: Fair baseline - normalizes whitespace")
print("Letters Only: Most lenient - pure character recognition quality")
print("\n" + "="*70)
print("\nKEY INSIGHTS:")
print(f"- Pure OCR quality (standard normalization): {avg_oa_all['cer_standard']:.2%}")
print(f"- Letter recognition quality: {avg_oa_all['cer_letters']:.2%}")
print(f"- Structure failures (unmatched content): {total_sa_all_unmatched/total_sa_all_gold*100:.1f}%")
print(f"- Contributions:")
print(f"    Standard CER: {avg_sa_contrib['cer_standard']:.2%}")
print(f"    Successfully matched: {total_sa_contrib_matched/total_sa_contrib_gold*100:.1f}%")
print("="*70)

Evaluating text quality...


TEXT QUALITY SUMMARY

1. ORDER-AGNOSTIC EVALUATION
   (Pure OCR quality, reading order irrelevant)
----------------------------------------------------------------------

   All Items:
      Strict (with all whitespace):
         CER: 14.83%  |  WER: 17.66%
      Standard (normalized whitespace):
         CER: 14.83%  |  WER: 17.66%
      Letters Only (no whitespace/punctuation):
         CER: 13.48%

   Contributions Only (prose + verse):
      Strict (with all whitespace):
         CER: 18.29%  |  WER: 20.85%
      Standard (normalized whitespace):
         CER: 18.29%  |  WER: 20.85%
      Letters Only (no whitespace/punctuation):
         CER: 17.94%

2. STRUCTURE-AWARE EVALUATION
   (OCR quality on matched content only)
----------------------------------------------------------------------

   Matched Content - All Items:
      Strict (with all whitespace):
         CER: 15.13%  |  WER: 21.12%
      Standard (normalized whitespace):
         CER: 14.61

In [4]:
def evaluate_item_boundaries(gold_path: Path, pred_path: Path, 
                            item_classes: Optional[List[str]] = None,
                            tolerance: int = 20) -> Dict:
    """
    Evaluate whether item boundaries are correctly detected.
    
    Args:
        gold_path: Path to gold standard JSON
        pred_path: Path to prediction JSON
        item_classes: If provided, only evaluate these item classes
        tolerance: Character tolerance window for boundary matching
    
    Returns:
        Dict with precision, recall, F1
    """
    # Load and validate gold standard
    with open(gold_path, 'r', encoding='utf-8') as f:
        gold_data = json.load(f)
    gold_page = Stage1PageModel.model_validate(gold_data)
    gold_data = gold_page.model_dump()
    
    # Load and validate prediction
    with open(pred_path, 'r', encoding='utf-8') as f:
        pred_data = json.load(f)
    pred_page = Stage1PageModel.model_validate(pred_data)
    pred_data = pred_page.model_dump()
    
    def get_boundaries(items, filter_classes=None):
        """Get character positions where items start."""
        boundaries = []
        pos = 0
        for item in items:
            if filter_classes is None or item.get('item_class') in filter_classes:
                boundaries.append(pos)
            # Add text length + separator
            pos += len(item.get('item_text_raw', '')) + 2
        return set(boundaries)
    
    gold_items = gold_data.get('items', [])
    pred_items = pred_data.get('items', [])
    
    gold_bounds = get_boundaries(gold_items, item_classes)
    pred_bounds = get_boundaries(pred_items, item_classes)
    
    # Match boundaries within tolerance
    tp = 0
    for pred_b in pred_bounds:
        if any(abs(pred_b - gold_b) <= tolerance for gold_b in gold_bounds):
            tp += 1
    
    fp = len(pred_bounds) - tp
    fn = len(gold_bounds) - tp
    
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    
    return {
        'tp': tp,
        'fp': fp,
        'fn': fn,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'gold_boundaries': len(gold_bounds),
        'pred_boundaries': len(pred_bounds)
    }

# Evaluate item boundaries
print("Evaluating item boundary detection...\n")

boundary_results_all = []
boundary_results_contrib = []

for gold_path, pred_path in page_pairs:
    # All items
    result_all = evaluate_item_boundaries(gold_path, pred_path)
    result_all['page'] = gold_path.name
    boundary_results_all.append(result_all)
    
    # Contributions only
    result_contrib = evaluate_item_boundaries(gold_path, pred_path,
                                             item_classes=['prose', 'verse'])
    result_contrib['page'] = gold_path.name
    boundary_results_contrib.append(result_contrib)
    
    print(f"‚úì {gold_path.name}")
    print(f"   All items - P: {result_all['precision']:.2%}, "
          f"R: {result_all['recall']:.2%}, F1: {result_all['f1']:.3f}")
    print(f"   Contributions - P: {result_contrib['precision']:.2%}, "
          f"R: {result_contrib['recall']:.2%}, F1: {result_contrib['f1']:.3f}\n")

# Compute micro-averages (sum all TP/FP/FN, then compute metrics)
total_tp_all = sum(r['tp'] for r in boundary_results_all)
total_fp_all = sum(r['fp'] for r in boundary_results_all)
total_fn_all = sum(r['fn'] for r in boundary_results_all)

precision_all = total_tp_all / (total_tp_all + total_fp_all) if (total_tp_all + total_fp_all) > 0 else 0
recall_all = total_tp_all / (total_tp_all + total_fn_all) if (total_tp_all + total_fn_all) > 0 else 0
f1_all = 2 * precision_all * recall_all / (precision_all + recall_all) if (precision_all + recall_all) > 0 else 0

total_tp_contrib = sum(r['tp'] for r in boundary_results_contrib)
total_fp_contrib = sum(r['fp'] for r in boundary_results_contrib)
total_fn_contrib = sum(r['fn'] for r in boundary_results_contrib)

precision_contrib = total_tp_contrib / (total_tp_contrib + total_fp_contrib) if (total_tp_contrib + total_fp_contrib) > 0 else 0
recall_contrib = total_tp_contrib / (total_tp_contrib + total_fn_contrib) if (total_tp_contrib + total_fn_contrib) > 0 else 0
f1_contrib = 2 * precision_contrib * recall_contrib / (precision_contrib + recall_contrib) if (precision_contrib + recall_contrib) > 0 else 0

print(f"{'='*60}")
print(f"ITEM BOUNDARY DETECTION SUMMARY")
print(f"{'='*60}")
print(f"\nAll Items:")
print(f"   Precision: {precision_all:.2%}")
print(f"   Recall: {recall_all:.2%}")
print(f"   F1: {f1_all:.3f}")
print(f"\nContributions Only (prose + verse):")
print(f"   Precision: {precision_contrib:.2%}")
print(f"   Recall: {recall_contrib:.2%}")
print(f"   F1: {f1_contrib:.3f}")


Evaluating item boundary detection...

‚úì La_Plume_bpt6k1185893k_1_10_1889__page-001.json
   All items - P: 40.00%, R: 25.00%, F1: 0.308
   Contributions - P: 0.00%, R: 0.00%, F1: 0.000

‚úì La_Plume_bpt6k1185893k_1_10_1889__page-002.json
   All items - P: 100.00%, R: 100.00%, F1: 1.000
   Contributions - P: 0.00%, R: 0.00%, F1: 0.000

‚úì La_Plume_bpt6k1185893k_1_10_1889__page-003.json
   All items - P: 66.67%, R: 66.67%, F1: 0.667
   Contributions - P: 50.00%, R: 100.00%, F1: 0.667

‚úì La_Plume_bpt6k1185893k_1_10_1889__page-004.json
   All items - P: 75.00%, R: 60.00%, F1: 0.667
   Contributions - P: 75.00%, R: 100.00%, F1: 0.857

‚úì La_Plume_bpt6k1185893k_1_10_1889__page-005.json
   All items - P: 25.00%, R: 20.00%, F1: 0.222
   Contributions - P: 25.00%, R: 33.33%, F1: 0.286

‚úì La_Plume_bpt6k1185893k_1_10_1889__page-006.json
   All items - P: 50.00%, R: 33.33%, F1: 0.400
   Contributions - P: 50.00%, R: 50.00%, F1: 0.500

‚úì La_Plume_bpt6k1185893k_1_10_1889__page-007.json
   

In [5]:
def evaluate_classification(gold_path: Path, pred_path: Path,
                           item_classes: Optional[List[str]] = None) -> Dict:
    """
    Evaluate item_class classification accuracy.
    
    Assumes items are in same order (or uses simple alignment).
    
    Returns:
        Dict with accuracy, per-class metrics, confusion matrix
    """
    # Load and validate gold standard
    with open(gold_path, 'r', encoding='utf-8') as f:
        gold_data = json.load(f)
    gold_page = Stage1PageModel.model_validate(gold_data)
    gold_data = gold_page.model_dump()
    
    # Load and validate prediction
    with open(pred_path, 'r', encoding='utf-8') as f:
        pred_data = json.load(f)
    pred_page = Stage1PageModel.model_validate(pred_data)
    pred_data = pred_page.model_dump()
    
    gold_items = gold_data.get('items', [])
    pred_items = pred_data.get('items', [])
    
    # Filter by item class if specified
    if item_classes:
        gold_items = [item for item in gold_items 
                     if item.get('item_class') in item_classes]
        pred_items = [item for item in pred_items 
                     if item.get('item_class') in item_classes]
    
    # Simple alignment: assume same number and order
    if len(gold_items) != len(pred_items):
        print(f"Item count mismatch: gold={len(gold_items)}, pred={len(pred_items)}")
    
    min_len = min(len(gold_items), len(pred_items))
    
    correct = 0
    confusion = defaultdict(lambda: defaultdict(int))
    
    for i in range(min_len):
        gold_class = gold_items[i].get('item_class', 'unknown')
        pred_class = pred_items[i].get('item_class', 'unknown')
        
        confusion[gold_class][pred_class] += 1
        if gold_class == pred_class:
            correct += 1
    
    accuracy = correct / min_len if min_len > 0 else 0
    
    return {
        'correct': correct,
        'total': min_len,
        'accuracy': accuracy,
        'confusion': dict(confusion),
        'gold_count': len(gold_items),
        'pred_count': len(pred_items)
    }

# Evaluate classification
print("Evaluating item classification...\n")

classification_results_all = []
classification_results_contrib = []

for gold_path, pred_path in page_pairs:
    # All items
    result_all = evaluate_classification(gold_path, pred_path)
    result_all['page'] = gold_path.name
    classification_results_all.append(result_all)
    
    # Contributions only
    result_contrib = evaluate_classification(gold_path, pred_path,
                                            item_classes=['prose', 'verse'])
    result_contrib['page'] = gold_path.name
    classification_results_contrib.append(result_contrib)
    
    print(f"‚úì {gold_path.name}")
    print(f"   All items - Accuracy: {result_all['accuracy']:.2%} "
          f"({result_all['correct']}/{result_all['total']})")
    if result_contrib['total'] > 0:
        print(f"   Contributions - Accuracy: {result_contrib['accuracy']:.2%} "
              f"({result_contrib['correct']}/{result_contrib['total']})")
    print()

# Compute overall accuracy
total_correct_all = sum(r['correct'] for r in classification_results_all)
total_items_all = sum(r['total'] for r in classification_results_all)
overall_accuracy_all = total_correct_all / total_items_all if total_items_all > 0 else 0

total_correct_contrib = sum(r['correct'] for r in classification_results_contrib)
total_items_contrib = sum(r['total'] for r in classification_results_contrib)
overall_accuracy_contrib = total_correct_contrib / total_items_contrib if total_items_contrib > 0 else 0

# Aggregate confusion matrix
all_confusion = defaultdict(lambda: defaultdict(int))
for result in classification_results_all:
    for gold_class, pred_dict in result['confusion'].items():
        for pred_class, count in pred_dict.items():
            all_confusion[gold_class][pred_class] += count

print(f"{'='*60}")
print(f"CLASSIFICATION ACCURACY SUMMARY")
print(f"{'='*60}")
print(f"\nAll Items:")
print(f"   Overall Accuracy: {overall_accuracy_all:.2%} ({total_correct_all}/{total_items_all})")
print(f"\nContributions Only (prose + verse):")
print(f"   Overall Accuracy: {overall_accuracy_contrib:.2%} ({total_correct_contrib}/{total_items_contrib})")

print(f"\nConfusion Matrix (All Items):")
print(f"{'Gold / Pred':<15}", end="")
all_classes = sorted(set(list(all_confusion.keys()) + 
                        [pred for preds in all_confusion.values() for pred in preds.keys()]))
for pred_class in all_classes:
    print(f"{pred_class:<12}", end="")
print()
for gold_class in all_classes:
    print(f"{gold_class:<15}", end="")
    for pred_class in all_classes:
        count = all_confusion[gold_class][pred_class]
        print(f"{count:<12}", end="")
    print()

Evaluating item classification...

Item count mismatch: gold=8, pred=5
‚úì La_Plume_bpt6k1185893k_1_10_1889__page-001.json
   All items - Accuracy: 100.00% (5/5)

Item count mismatch: gold=0, pred=1
‚úì La_Plume_bpt6k1185893k_1_10_1889__page-002.json
   All items - Accuracy: 50.00% (1/2)

Item count mismatch: gold=1, pred=2
‚úì La_Plume_bpt6k1185893k_1_10_1889__page-003.json
   All items - Accuracy: 66.67% (2/3)
   Contributions - Accuracy: 100.00% (1/1)

Item count mismatch: gold=5, pred=4
Item count mismatch: gold=3, pred=4
‚úì La_Plume_bpt6k1185893k_1_10_1889__page-004.json
   All items - Accuracy: 0.00% (0/4)
   Contributions - Accuracy: 33.33% (1/3)

Item count mismatch: gold=5, pred=4
Item count mismatch: gold=3, pred=4
‚úì La_Plume_bpt6k1185893k_1_10_1889__page-005.json
   All items - Accuracy: 50.00% (2/4)
   Contributions - Accuracy: 100.00% (3/3)

Item count mismatch: gold=6, pred=4
‚úì La_Plume_bpt6k1185893k_1_10_1889__page-006.json
   All items - Accuracy: 50.00% (2/4)
   C

In [6]:
def evaluate_metadata(gold_path: Path, pred_path: Path,
                     item_classes: Optional[List[str]] = None) -> Dict:
    """
    Evaluate title and author extraction accuracy.
    
    Returns:
        Dict with title/author presence detection and exact match metrics
    """
    # Load and validate gold standard
    with open(gold_path, 'r', encoding='utf-8') as f:
        gold_data = json.load(f)
    gold_page = Stage1PageModel.model_validate(gold_data)
    gold_data = gold_page.model_dump()
    
    # Load and validate prediction
    with open(pred_path, 'r', encoding='utf-8') as f:
        pred_data = json.load(f)
    pred_page = Stage1PageModel.model_validate(pred_data)
    pred_data = pred_page.model_dump()
    
    gold_items = gold_data.get('items', [])
    pred_items = pred_data.get('items', [])
    
    # Filter by item class if specified
    if item_classes:
        gold_items = [item for item in gold_items 
                     if item.get('item_class') in item_classes]
        pred_items = [item for item in pred_items 
                     if item.get('item_class') in item_classes]
    
    min_len = min(len(gold_items), len(pred_items))
    
    title_metrics = {'tp': 0, 'fp': 0, 'fn': 0, 'exact_match': 0}
    author_metrics = {'tp': 0, 'fp': 0, 'fn': 0, 'exact_match': 0}
    
    for i in range(min_len):
        gold_item = gold_items[i]
        pred_item = pred_items[i]
        
        # Title evaluation
        gold_title = gold_item.get('item_title')
        pred_title = pred_item.get('item_title')
        
        if gold_title and pred_title:
            title_metrics['tp'] += 1
            if gold_title == pred_title:
                title_metrics['exact_match'] += 1
        elif not gold_title and pred_title:
            title_metrics['fp'] += 1
        elif gold_title and not pred_title:
            title_metrics['fn'] += 1
        
        # Author evaluation
        gold_author = gold_item.get('item_author')
        pred_author = pred_item.get('item_author')
        
        if gold_author and pred_author:
            author_metrics['tp'] += 1
            if gold_author == pred_author:
                author_metrics['exact_match'] += 1
        elif not gold_author and pred_author:
            author_metrics['fp'] += 1
        elif gold_author and not pred_author:
            author_metrics['fn'] += 1
    
    # Compute F1 for title
    title_p = title_metrics['tp'] / (title_metrics['tp'] + title_metrics['fp']) if (title_metrics['tp'] + title_metrics['fp']) > 0 else 0
    title_r = title_metrics['tp'] / (title_metrics['tp'] + title_metrics['fn']) if (title_metrics['tp'] + title_metrics['fn']) > 0 else 0
    title_f1 = 2 * title_p * title_r / (title_p + title_r) if (title_p + title_r) > 0 else 0
    
    # Compute F1 for author
    author_p = author_metrics['tp'] / (author_metrics['tp'] + author_metrics['fp']) if (author_metrics['tp'] + author_metrics['fp']) > 0 else 0
    author_r = author_metrics['tp'] / (author_metrics['tp'] + author_metrics['fn']) if (author_metrics['tp'] + author_metrics['fn']) > 0 else 0
    author_f1 = 2 * author_p * author_r / (author_p + author_r) if (author_p + author_r) > 0 else 0
    
    return {
        'title': {
            **title_metrics,
            'precision': title_p,
            'recall': title_r,
            'f1': title_f1
        },
        'author': {
            **author_metrics,
            'precision': author_p,
            'recall': author_r,
            'f1': author_f1
        }
    }

# Evaluate metadata extraction
print("Evaluating metadata extraction...\n")

metadata_results_all = []
metadata_results_contrib = []

for gold_path, pred_path in page_pairs:
    # All items
    result_all = evaluate_metadata(gold_path, pred_path)
    result_all['page'] = gold_path.name
    metadata_results_all.append(result_all)
    
    # Contributions only
    result_contrib = evaluate_metadata(gold_path, pred_path,
                                      item_classes=['prose', 'verse'])
    result_contrib['page'] = gold_path.name
    metadata_results_contrib.append(result_contrib)
    
    print(f"‚úì {gold_path.name}")
    print(f"   Title F1: {result_all['title']['f1']:.3f}, "
          f"Author F1: {result_all['author']['f1']:.3f}")

# Aggregate metrics
def aggregate_metadata_metrics(results):
    total_title_tp = sum(r['title']['tp'] for r in results)
    total_title_fp = sum(r['title']['fp'] for r in results)
    total_title_fn = sum(r['title']['fn'] for r in results)
    total_title_exact = sum(r['title']['exact_match'] for r in results)
    
    title_p = total_title_tp / (total_title_tp + total_title_fp) if (total_title_tp + total_title_fp) > 0 else 0
    title_r = total_title_tp / (total_title_tp + total_title_fn) if (total_title_tp + total_title_fn) > 0 else 0
    title_f1 = 2 * title_p * title_r / (title_p + title_r) if (title_p + title_r) > 0 else 0
    
    total_author_tp = sum(r['author']['tp'] for r in results)
    total_author_fp = sum(r['author']['fp'] for r in results)
    total_author_fn = sum(r['author']['fn'] for r in results)
    total_author_exact = sum(r['author']['exact_match'] for r in results)
    
    author_p = total_author_tp / (total_author_tp + total_author_fp) if (total_author_tp + total_author_fp) > 0 else 0
    author_r = total_author_tp / (total_author_tp + total_author_fn) if (total_author_tp + total_author_fn) > 0 else 0
    author_f1 = 2 * author_p * author_r / (author_p + author_r) if (author_p + author_r) > 0 else 0
    
    return {
        'title': {'precision': title_p, 'recall': title_r, 'f1': title_f1, 'exact_match': total_title_exact, 'tp': total_title_tp},
        'author': {'precision': author_p, 'recall': author_r, 'f1': author_f1, 'exact_match': total_author_exact, 'tp': total_author_tp}
    }

agg_all = aggregate_metadata_metrics(metadata_results_all)
agg_contrib = aggregate_metadata_metrics(metadata_results_contrib)

print(f"\n{'='*60}")
print(f"METADATA EXTRACTION SUMMARY")
print(f"{'='*60}")
print(f"\nAll Items:")
print(f"   Title - P: {agg_all['title']['precision']:.2%}, R: {agg_all['title']['recall']:.2%}, F1: {agg_all['title']['f1']:.3f}")
print(f"           Exact matches: {agg_all['title']['exact_match']}/{agg_all['title']['tp']}")
print(f"   Author - P: {agg_all['author']['precision']:.2%}, R: {agg_all['author']['recall']:.2%}, F1: {agg_all['author']['f1']:.3f}")
print(f"            Exact matches: {agg_all['author']['exact_match']}/{agg_all['author']['tp']}")
print(f"\nContributions Only (prose + verse):")
print(f"   Title - P: {agg_contrib['title']['precision']:.2%}, R: {agg_contrib['title']['recall']:.2%}, F1: {agg_contrib['title']['f1']:.3f}")
print(f"   Author - P: {agg_contrib['author']['precision']:.2%}, R: {agg_contrib['author']['recall']:.2%}, F1: {agg_contrib['author']['f1']:.3f}")

Evaluating metadata extraction...

‚úì La_Plume_bpt6k1185893k_1_10_1889__page-001.json
   Title F1: 0.000, Author F1: 0.000
‚úì La_Plume_bpt6k1185893k_1_10_1889__page-002.json
   Title F1: 1.000, Author F1: 0.000
‚úì La_Plume_bpt6k1185893k_1_10_1889__page-003.json
   Title F1: 0.000, Author F1: 0.000
‚úì La_Plume_bpt6k1185893k_1_10_1889__page-004.json
   Title F1: 0.667, Author F1: 0.667
‚úì La_Plume_bpt6k1185893k_1_10_1889__page-005.json
   Title F1: 0.667, Author F1: 0.000
‚úì La_Plume_bpt6k1185893k_1_10_1889__page-006.json
   Title F1: 0.500, Author F1: 0.400
‚úì La_Plume_bpt6k1185893k_1_10_1889__page-007.json
   Title F1: 0.000, Author F1: 0.000
‚úì La_Plume_bpt6k1185893k_1_10_1889__page-008.json
   Title F1: 0.000, Author F1: 0.000
‚úì La_Plume_bpt6k1185893k_1_10_1889__page-009.json
   Title F1: 0.667, Author F1: 0.571
‚úì La_Plume_bpt6k1185893k_1_10_1889__page-010.json
   Title F1: 0.500, Author F1: 0.400
‚úì La_Plume_bpt6k1185893k_1_10_1889__page-011.json
   Title F1: 0.000, Aut

In [7]:
def evaluate_continuation_tracking(gold_path: Path, pred_path: Path,
                                  item_classes: Optional[List[str]] = None) -> Dict:
    """
    Evaluate continuation field accuracy (is_continuation, continues_on_next_page).
    
    Returns:
        Dict with precision, recall, F1 for each continuation field
    """
    # Load and validate gold standard
    with open(gold_path, 'r', encoding='utf-8') as f:
        gold_data = json.load(f)
    gold_page = Stage1PageModel.model_validate(gold_data)
    gold_data = gold_page.model_dump()
    
    # Load and validate prediction
    with open(pred_path, 'r', encoding='utf-8') as f:
        pred_data = json.load(f)
    pred_page = Stage1PageModel.model_validate(pred_data)
    pred_data = pred_page.model_dump()
    
    gold_items = gold_data.get('items', [])
    pred_items = pred_data.get('items', [])
    
    # Filter by item class if specified
    if item_classes:
        gold_items = [item for item in gold_items 
                     if item.get('item_class') in item_classes]
        pred_items = [item for item in pred_items 
                     if item.get('item_class') in item_classes]
    
    min_len = min(len(gold_items), len(pred_items))
    
    # Metrics for is_continuation
    is_cont_metrics = {'tp': 0, 'fp': 0, 'fn': 0, 'tn': 0}
    # Metrics for continues_on_next_page
    continues_metrics = {'tp': 0, 'fp': 0, 'fn': 0, 'tn': 0}
    
    for i in range(min_len):
        gold_item = gold_items[i]
        pred_item = pred_items[i]
        
        # Evaluate is_continuation (treat absent as False)
        gold_is_cont = gold_item.get('is_continuation', False)
        pred_is_cont = pred_item.get('is_continuation', False)
        
        if gold_is_cont and pred_is_cont:
            is_cont_metrics['tp'] += 1
        elif not gold_is_cont and pred_is_cont:
            is_cont_metrics['fp'] += 1
        elif gold_is_cont and not pred_is_cont:
            is_cont_metrics['fn'] += 1
        else:
            is_cont_metrics['tn'] += 1
        
        # Evaluate continues_on_next_page
        gold_continues = gold_item.get('continues_on_next_page', False)
        pred_continues = pred_item.get('continues_on_next_page', False)
        
        if gold_continues and pred_continues:
            continues_metrics['tp'] += 1
        elif not gold_continues and pred_continues:
            continues_metrics['fp'] += 1
        elif gold_continues and not pred_continues:
            continues_metrics['fn'] += 1
        else:
            continues_metrics['tn'] += 1
    
    # Compute metrics for is_continuation
    is_cont_p = is_cont_metrics['tp'] / (is_cont_metrics['tp'] + is_cont_metrics['fp']) if (is_cont_metrics['tp'] + is_cont_metrics['fp']) > 0 else 0
    is_cont_r = is_cont_metrics['tp'] / (is_cont_metrics['tp'] + is_cont_metrics['fn']) if (is_cont_metrics['tp'] + is_cont_metrics['fn']) > 0 else 0
    is_cont_f1 = 2 * is_cont_p * is_cont_r / (is_cont_p + is_cont_r) if (is_cont_p + is_cont_r) > 0 else 0
    
    # Compute metrics for continues_on_next_page
    continues_p = continues_metrics['tp'] / (continues_metrics['tp'] + continues_metrics['fp']) if (continues_metrics['tp'] + continues_metrics['fp']) > 0 else 0
    continues_r = continues_metrics['tp'] / (continues_metrics['tp'] + continues_metrics['fn']) if (continues_metrics['tp'] + continues_metrics['fn']) > 0 else 0
    continues_f1 = 2 * continues_p * continues_r / (continues_p + continues_r) if (continues_p + continues_r) > 0 else 0
    
    return {
        'is_continuation': {
            **is_cont_metrics,
            'precision': is_cont_p,
            'recall': is_cont_r,
            'f1': is_cont_f1
        },
        'continues_on_next_page': {
            **continues_metrics,
            'precision': continues_p,
            'recall': continues_r,
            'f1': continues_f1
        }
    }

# Evaluate continuation tracking
print("Evaluating continuation tracking...\n")

continuation_results = []

for gold_path, pred_path in page_pairs:
    result = evaluate_continuation_tracking(gold_path, pred_path,
                                           item_classes=['prose', 'verse'])
    result['page'] = gold_path.name
    continuation_results.append(result)
    
    print(f"‚úì {gold_path.name}")
    print(f"   is_continuation - F1: {result['is_continuation']['f1']:.3f}")
    print(f"   continues_on_next - F1: {result['continues_on_next_page']['f1']:.3f}\n")

# Aggregate continuation metrics
total_is_cont_tp = sum(r['is_continuation']['tp'] for r in continuation_results)
total_is_cont_fp = sum(r['is_continuation']['fp'] for r in continuation_results)
total_is_cont_fn = sum(r['is_continuation']['fn'] for r in continuation_results)

is_cont_p = total_is_cont_tp / (total_is_cont_tp + total_is_cont_fp) if (total_is_cont_tp + total_is_cont_fp) > 0 else 0
is_cont_r = total_is_cont_tp / (total_is_cont_tp + total_is_cont_fn) if (total_is_cont_tp + total_is_cont_fn) > 0 else 0
is_cont_f1 = 2 * is_cont_p * is_cont_r / (is_cont_p + is_cont_r) if (is_cont_p + is_cont_r) > 0 else 0

total_continues_tp = sum(r['continues_on_next_page']['tp'] for r in continuation_results)
total_continues_fp = sum(r['continues_on_next_page']['fp'] for r in continuation_results)
total_continues_fn = sum(r['continues_on_next_page']['fn'] for r in continuation_results)

continues_p = total_continues_tp / (total_continues_tp + total_continues_fp) if (total_continues_tp + total_continues_fp) > 0 else 0
continues_r = total_continues_tp / (total_continues_tp + total_continues_fn) if (total_continues_tp + total_continues_fn) > 0 else 0
continues_f1 = 2 * continues_p * continues_r / (continues_p + continues_r) if (continues_p + continues_r) > 0 else 0

print(f"{'='*60}")
print(f"CONTINUATION TRACKING SUMMARY (Contributions Only)")
print(f"{'='*60}")
print(f"\nis_continuation:")
print(f"   Precision: {is_cont_p:.2%}")
print(f"   Recall: {is_cont_r:.2%}")
print(f"   F1: {is_cont_f1:.3f}")
print(f"\ncontinues_on_next_page:")
print(f"   Precision: {continues_p:.2%}")
print(f"   Recall: {continues_r:.2%}")
print(f"   F1: {continues_f1:.3f}")

Evaluating continuation tracking...

‚úì La_Plume_bpt6k1185893k_1_10_1889__page-001.json
   is_continuation - F1: 0.000
   continues_on_next - F1: 0.000

‚úì La_Plume_bpt6k1185893k_1_10_1889__page-002.json
   is_continuation - F1: 0.000
   continues_on_next - F1: 0.000

‚úì La_Plume_bpt6k1185893k_1_10_1889__page-003.json
   is_continuation - F1: 0.000
   continues_on_next - F1: 1.000

‚úì La_Plume_bpt6k1185893k_1_10_1889__page-004.json
   is_continuation - F1: 1.000
   continues_on_next - F1: 0.000

‚úì La_Plume_bpt6k1185893k_1_10_1889__page-005.json
   is_continuation - F1: 0.000
   continues_on_next - F1: 1.000

‚úì La_Plume_bpt6k1185893k_1_10_1889__page-006.json
   is_continuation - F1: 0.000
   continues_on_next - F1: 0.000

‚úì La_Plume_bpt6k1185893k_1_10_1889__page-007.json
   is_continuation - F1: 0.000
   continues_on_next - F1: 0.000

‚úì La_Plume_bpt6k1185893k_1_10_1889__page-008.json
   is_continuation - F1: 0.000
   continues_on_next - F1: 0.000

‚úì La_Plume_bpt6k1185893k_

In [8]:
print("\n" + "="*70)
print("COMPREHENSIVE EVALUATION SUMMARY")
print("="*70)
print("\nSecond_try_revised vs Gold Standard")
print(f"Evaluated on {len(page_pairs)} pages\n")

print(f"{'Metric':<30} {'All Items':<20} {'Contributions':<20}")
print("-" * 70)

# Text Quality
print(f"{'TEXT QUALITY':<30}")
print(f"{'  Character Error Rate':<30} {avg_cer_all:>18.2%} {avg_cer_contrib:>18.2%}")
print(f"{'  Word Error Rate':<30} {avg_wer_all:>18.2%} {avg_wer_contrib:>18.2%}")
print()

# Structure Quality
print(f"{'STRUCTURE QUALITY':<30}")
print(f"{'  Boundary Detection F1':<30} {f1_all:>18.3f} {f1_contrib:>18.3f}")
print(f"{'  Classification Accuracy':<30} {overall_accuracy_all:>18.2%} {overall_accuracy_contrib:>18.2%}")
print()

# Metadata Quality
print(f"{'METADATA EXTRACTION':<30}")
print(f"{'  Title F1':<30} {agg_all['title']['f1']:>18.3f} {agg_contrib['title']['f1']:>18.3f}")
print(f"{'  Author F1':<30} {agg_all['author']['f1']:>18.3f} {agg_contrib['author']['f1']:>18.3f}")
print()

# Continuation Tracking
print(f"{'CONTINUATION TRACKING':<30} {'N/A':<20} {'Contributions':<20}")
print(f"{'  is_continuation F1':<30} {'':<20} {is_cont_f1:>18.3f}")
print(f"{'  continues_on_next F1':<30} {'':<20} {continues_f1:>18.3f}")

print("\n" + "="*70)


COMPREHENSIVE EVALUATION SUMMARY

Second_try_revised vs Gold Standard
Evaluated on 14 pages

Metric                         All Items            Contributions       
----------------------------------------------------------------------
TEXT QUALITY                  
  Character Error Rate                     12.63%             31.65%
  Word Error Rate                          17.82%             34.04%

STRUCTURE QUALITY             
  Boundary Detection F1                     0.426              0.459
  Classification Accuracy                  38.78%             85.71%

METADATA EXTRACTION           
  Title F1                                  0.583              0.929
  Author F1                                 0.417              0.889

CONTINUATION TRACKING          N/A                  Contributions       
  is_continuation F1                                             0.500
  continues_on_next F1                                           0.833



In [6]:
import json
from pathlib import Path

PROJECT_ROOT = Path("/home/fabian-ramirez/Documents/These/Code/magazine_graphs")
GOLD_DIR = PROJECT_ROOT / "data" / "gold_standard" / "cleaned"
PRED_DIR = PROJECT_ROOT / "data" / "interim_pages" / "La_Plume_bpt6k1185893k_1_10_1889"

def analyze_continuation_fields(directory, label):
    """Count continuation field usage across all files."""
    stats = {
        'is_continuation': {'true': 0, 'false': 0, 'null': 0, 'absent': 0},
        'continues_on_next_page': {'true': 0, 'false': 0, 'null': 0, 'absent': 0},
        'total_items': 0,
        'files_processed': 0
    }
    
    for json_file in sorted(directory.glob("*.json")):
        try:
            data = json.loads(json_file.read_text(encoding='utf-8'))
            items = data.get('items', [])
            stats['files_processed'] += 1
            
            for item in items:
                stats['total_items'] += 1
                
                # Check is_continuation
                is_cont = item.get('is_continuation')
                if is_cont is True:
                    stats['is_continuation']['true'] += 1
                elif is_cont is False:
                    stats['is_continuation']['false'] += 1
                elif is_cont is None:
                    stats['is_continuation']['null'] += 1
                else:  # key not present
                    stats['is_continuation']['absent'] += 1
                
                # Check continues_on_next_page
                continues = item.get('continues_on_next_page')
                if continues is True:
                    stats['continues_on_next_page']['true'] += 1
                elif continues is False:
                    stats['continues_on_next_page']['false'] += 1
                elif continues is None:
                    stats['continues_on_next_page']['null'] += 1
                else:  # key not present
                    stats['continues_on_next_page']['absent'] += 1
                    
        except Exception as e:
            print(f"Error reading {json_file.name}: {e}")
    
    return stats

print("=" * 70)
print("CONTINUATION FIELD ANALYSIS")
print("=" * 70)

# Analyze gold standard
print("\nüìö GOLD STANDARD:")
gold_stats = analyze_continuation_fields(GOLD_DIR, "Gold")
print(f"  Files processed: {gold_stats['files_processed']}")
print(f"  Total items: {gold_stats['total_items']}")
print(f"\n  is_continuation:")
print(f"    True:   {gold_stats['is_continuation']['true']}")
print(f"    False:  {gold_stats['is_continuation']['false']}")
print(f"    Null:   {gold_stats['is_continuation']['null']}")
print(f"    Absent: {gold_stats['is_continuation']['absent']}")
print(f"\n  continues_on_next_page:")
print(f"    True:   {gold_stats['continues_on_next_page']['true']}")
print(f"    False:  {gold_stats['continues_on_next_page']['false']}")
print(f"    Null:   {gold_stats['continues_on_next_page']['null']}")
print(f"    Absent: {gold_stats['continues_on_next_page']['absent']}")

# Analyze predictions
print("\n\nü§ñ PREDICTIONS:")
pred_stats = analyze_continuation_fields(PRED_DIR, "Predictions")
print(f"  Files processed: {pred_stats['files_processed']}")
print(f"  Total items: {pred_stats['total_items']}")
print(f"\n  is_continuation:")
print(f"    True:   {pred_stats['is_continuation']['true']}")
print(f"    False:  {pred_stats['is_continuation']['false']}")
print(f"    Null:   {pred_stats['is_continuation']['null']}")
print(f"    Absent: {pred_stats['is_continuation']['absent']}")
print(f"\n  continues_on_next_page:")
print(f"    True:   {pred_stats['continues_on_next_page']['true']}")
print(f"    False:  {pred_stats['continues_on_next_page']['false']}")
print(f"    Null:   {pred_stats['continues_on_next_page']['null']}")
print(f"    Absent: {pred_stats['continues_on_next_page']['absent']}")

# Compare
print("\n\n" + "=" * 70)
print("COMPARISON")
print("=" * 70)

print(f"\nis_continuation=True:")
print(f"  Gold has: {gold_stats['is_continuation']['true']}")
print(f"  Pred has: {pred_stats['is_continuation']['true']}")
print(f"  Detection rate: {pred_stats['is_continuation']['true']}/{gold_stats['is_continuation']['true']} = "
      f"{pred_stats['is_continuation']['true']/gold_stats['is_continuation']['true']*100:.1f}%" 
      if gold_stats['is_continuation']['true'] > 0 else "  N/A")

print(f"\ncontinues_on_next_page=True:")
print(f"  Gold has: {gold_stats['continues_on_next_page']['true']}")
print(f"  Pred has: {pred_stats['continues_on_next_page']['true']}")
print(f"  Detection rate: {pred_stats['continues_on_next_page']['true']}/{gold_stats['continues_on_next_page']['true']} = "
      f"{pred_stats['continues_on_next_page']['true']/gold_stats['continues_on_next_page']['true']*100:.1f}%"
      if gold_stats['continues_on_next_page']['true'] > 0 else "  N/A")

print("\n" + "=" * 70)

CONTINUATION FIELD ANALYSIS

üìö GOLD STANDARD:
  Files processed: 14
  Total items: 70

  is_continuation:
    True:   7
    False:  0
    Null:   63
    Absent: 0

  continues_on_next_page:
    True:   7
    False:  0
    Null:   63
    Absent: 0


ü§ñ PREDICTIONS:
  Files processed: 14
  Total items: 52

  is_continuation:
    True:   3
    False:  0
    Null:   49
    Absent: 0

  continues_on_next_page:
    True:   7
    False:  0
    Null:   45
    Absent: 0


COMPARISON

is_continuation=True:
  Gold has: 7
  Pred has: 3
  Detection rate: 3/7 = 42.9%

continues_on_next_page=True:
  Gold has: 7
  Pred has: 7
  Detection rate: 7/7 = 100.0%



In [9]:
print("="*70)
print("ERROR ANALYSIS")
print("="*70)

# Find worst performing pages by CER
worst_pages_cer = sorted(all_results, key=lambda x: x['cer'], reverse=True)[:5]

print("\nWorst 5 Pages by Character Error Rate:")
for i, result in enumerate(worst_pages_cer, 1):
    print(f"{i}. {result['page']}")
    print(f"   CER: {result['cer']:.2%}, WER: {result['wer']:.2%}")
    print(f"   Gold: {result['gold_items']} items, {result['gold_chars']} chars")
    print(f"   Pred: {result['pred_items']} items, {result['pred_chars']} chars")
    print()

# Find pages with item count mismatches
print("\nPages with Item Count Mismatches:")
mismatches = [r for r in classification_results_all if r['gold_count'] != r['pred_count']]
if mismatches:
    for result in mismatches:
        diff = result['pred_count'] - result['gold_count']
        sign = "+" if diff > 0 else ""
        print(f"  ‚Ä¢ {result['page']}: Gold={result['gold_count']}, Pred={result['pred_count']} ({sign}{diff})")
else:
    print("  No mismatches found!")

# Classification errors
print("\nMost Common Classification Errors:")
errors = []
for gold_class, pred_dict in all_confusion.items():
    for pred_class, count in pred_dict.items():
        if gold_class != pred_class and count > 0:
            errors.append((count, gold_class, pred_class))

errors.sort(reverse=True)
for count, gold_class, pred_class in errors[:10]:
    print(f"  ‚Ä¢ {gold_class} ‚Üí {pred_class}: {count} times")

print("\n" + "="*70)

ERROR ANALYSIS

Worst 5 Pages by Character Error Rate:
1. La_Plume_bpt6k1185893k_1_10_1889__page-012.json
   CER: 100.00%, WER: 100.00%
   Gold: 8 items, 3634 chars
   Pred: 0 items, 0 chars

2. La_Plume_bpt6k1185893k_1_10_1889__page-005.json
   CER: 21.48%, WER: 27.55%
   Gold: 5 items, 4745 chars
   Pred: 4 items, 4648 chars

3. La_Plume_bpt6k1185893k_1_10_1889__page-007.json
   CER: 14.49%, WER: 18.18%
   Gold: 2 items, 69 chars
   Pred: 3 items, 69 chars

4. La_Plume_bpt6k1185893k_1_10_1889__page-011.json
   CER: 9.63%, WER: 12.56%
   Gold: 3 items, 5236 chars
   Pred: 1 items, 5174 chars

5. La_Plume_bpt6k1185893k_1_10_1889__page-014.json
   CER: 8.12%, WER: 10.99%
   Gold: 4 items, 1158 chars
   Pred: 6 items, 1190 chars


Pages with Item Count Mismatches:
  ‚Ä¢ La_Plume_bpt6k1185893k_1_10_1889__page-001.json: Gold=8, Pred=5 (-3)
  ‚Ä¢ La_Plume_bpt6k1185893k_1_10_1889__page-004.json: Gold=5, Pred=4 (-1)
  ‚Ä¢ La_Plume_bpt6k1185893k_1_10_1889__page-005.json: Gold=5, Pred=4 (-1)
  

In [10]:
# def standardize_gold_standard_names():
#     """
#     Rename gold standard files to match the PDF-based naming convention.
#     Old: La_Plume___revue_litt√©raire_[...]_bpt6k1185893k__page-001.json
#     New: La_Plume_bpt6k1185893k_1_10_1889__page-001.json
#     """
#     import re
    
#     # The standard name from your PDF
#     STANDARD_BASE = "La_Plume_bpt6k1185893k_1_10_1889"
    
#     for old_path in GOLD_DIR.glob("*.json"):
#         # Extract page number
#         match = re.search(r'page-(\d+)\.json$', old_path.name)
#         if not match:
#             print(f"‚ö†Ô∏è  Skipping (no page number): {old_path.name}")
#             continue
        
#         page_num = match.group(1)
#         new_name = f"{STANDARD_BASE}__page-{page_num}.json"
#         new_path = old_path.parent / new_name
        
#         if old_path.name != new_name:
#             print(f"Renaming: {old_path.name}")
#             print(f"      ‚Üí  {new_name}")
#             old_path.rename(new_path)
    
#     print("\n‚úì Gold standard filenames standardized!")

# # Uncomment to run:
# standardize_gold_standard_names()