In [1]:
"""
Stage 1 Multi-Schema Evaluation
Compares multiple schema variants to identify the best performing schema.

Schema Families:
- WITH continuations: Has is_continuation + continues_on_next_page fields
- WITHOUT continuations: Simpler structure without cross-page tracking

Evaluation Strategy:
1. Auto-detect schema families based on field presence
2. Within-family comparison: Find best schema in each family
3. Cross-family comparison: Compare winners on common dimensions only
4. Final recommendation: Which schema to use and why

Input:  data/predictions/schema_evaluations/{schema_version}/{magazine_name}/
        data/gold_standard/cleaned/{magazine_name}/
Output: Comparative metrics and recommendation
"""
from pathlib import Path
import json
from typing import Dict, List, Tuple, Optional, Set
from collections import defaultdict
import re
import pandas as pd
import numpy as np
import importlib.util
import inspect

# Project imports
from utils.paths import PROJECT_ROOT, PREDICTIONS, GOLD_CLEAN
from utils.config import EVALUATION_CONFIG
from schemas.stage1_page import Stage1PageModel
from utils.text_processing import (
    normalize_text_strict,
    normalize_text_standard,
    normalize_text_letters_only,
    token_sort_text
)
from utils.ocr_metrics import character_error_rate, word_error_rate
from utils.evaluation import (
    match_items,
    load_and_match_page,
    filter_matches_by_class,
    get_matched_pairs,
    evaluate_order_agnostic,
    evaluate_structure_aware,
    evaluate_classification,
    evaluate_metadata_field,
    evaluate_continuation_all_items
)

# Evaluation inputs: gold-standard annotations and model predictions
GOLD_ROOT = GOLD_CLEAN
PRED_ROOT = PREDICTIONS

# Startup banner so the reader can confirm which directories are in use
startup_banner = "\n".join([
    "Stage 1 Multi-Schema Evaluation",
    "=" * 60,
    f"Project root: {PROJECT_ROOT}",
    "\nDirectories:",
    f"  Gold standard: {GOLD_ROOT}",
    f"  Predictions:   {PRED_ROOT}",
])
print(startup_banner)

Stage 1 Multi-Schema Evaluation
Project root: /home/fabian-ramirez/Documents/These/Code/magazine_graphs

Directories:
  Gold standard: /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/gold_standard/cleaned
  Predictions:   /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/predictions


In [2]:
"""
Schema Family Detection
Determine schema families by inspecting schema definition files in schemas/ folder.
This is the source of truth - not the prediction outputs.
"""

def detect_schema_family_from_definition(schema_name: str) -> Optional[str]:
    """
    Detect schema family by checking if continuation fields are defined in the schema class.

    The schema module is loaded from ``PROJECT_ROOT / 'schemas' / '<schema_name>.py'``
    and its classes whose name contains 'Item' are inspected. Classes exposing a
    ``model_fields`` mapping are preferred over classes that merely have 'Item' in
    their name (e.g. an enum or helper), and classes defined in the schema file
    itself are preferred over imported ones. If no candidate exposes
    ``model_fields``, the first candidate is used with an empty field map.

    Args:
        schema_name: Name of schema directory (e.g., 'stage1_page', 'stage1_page_v2')

    Returns:
        'with_continuations' or 'without_continuations' or None if cannot determine
        (schema file missing, module not loadable, or no Item class found)
    """
    # Map schema directory name to schema file
    # Convention: stage1_page -> stage1_page.py, stage1_page_v2 -> stage1_page_v2.py
    schema_file = PROJECT_ROOT / 'schemas' / f'{schema_name}.py'

    if not schema_file.exists():
        print(f"  WARNING: Schema file not found: {schema_file}")
        return None

    try:
        # Load the schema module from its file path
        spec = importlib.util.spec_from_file_location(schema_name, schema_file)
        if spec is None or spec.loader is None:
            return None

        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)

        # All classes visible in the module whose name mentions 'Item'
        # (getmembers returns them sorted by name)
        candidates = [
            obj for name, obj in inspect.getmembers(module, inspect.isclass)
            if 'Item' in name
        ]

        if not candidates:
            print(f"  WARNING: Could not find Item class in {schema_file.name}")
            return None

        # Prefer real models (those exposing model_fields), and among them the ones
        # defined in this schema file rather than imported into it. Fall back to the
        # first candidate so a module without any model still yields a result.
        with_fields = [cls for cls in candidates if hasattr(cls, 'model_fields')]
        defined_here = [cls for cls in with_fields if cls.__module__ == module.__name__]
        item_class = (defined_here or with_fields or candidates)[0]

        # Check if continuation fields are defined in the model
        model_fields = getattr(item_class, 'model_fields', {})

        has_is_continuation = 'is_continuation' in model_fields
        has_continues = 'continues_on_next_page' in model_fields

        if has_is_continuation and has_continues:
            return 'with_continuations'
        if not has_is_continuation and not has_continues:
            return 'without_continuations'

        # Exactly one of the two fields is present - unusual, but the schema still
        # tracks continuations in some form
        print(f"  WARNING: {schema_name} has only one continuation field")
        return 'with_continuations'

    except Exception as e:
        print(f"  ERROR loading schema {schema_name}: {e}")
        return None

# Schema versions are in predictions/schema_evaluations/
SCHEMA_ROOT = PRED_ROOT / 'schema_evaluations'

# Plain (non-f) string, so the single braces are printed literally as placeholders
EXPECTED_STRUCTURE = "predictions/schema_evaluations/{schema_version}/{magazine_name}/"

# Schemas grouped by family; defined unconditionally so the name exists even
# when no schema directory is found
families = {
    'with_continuations': [],
    'without_continuations': []
}

if not SCHEMA_ROOT.exists():
    print(f"ERROR: Schema evaluations directory not found at {SCHEMA_ROOT}")
    print(f"Expected structure: {EXPECTED_STRUCTURE}")
    available_schemas = []
else:
    # Find all schema directories
    available_schemas = sorted([d for d in SCHEMA_ROOT.iterdir() if d.is_dir()])

print("\n" + "=" * 60)
print("Detecting Schema Families (from schema definitions)")
print("=" * 60 + "\n")

if not available_schemas:
    # The missing-root case was already reported above; only report the
    # "root exists but is empty" case here to avoid a duplicate message
    if SCHEMA_ROOT.exists():
        print("ERROR: No schema directories found in predictions/schema_evaluations/")
        print(f"Expected structure: {EXPECTED_STRUCTURE}")
else:
    print(f"Found {len(available_schemas)} schema version(s):\n")
    for schema_dir in available_schemas:
        print(f"  - {schema_dir.name}")

    print(f"\nChecking schema definitions in {PROJECT_ROOT / 'schemas'}/...\n")

    for schema_dir in available_schemas:
        schema_name = schema_dir.name
        family = detect_schema_family_from_definition(schema_name)

        if family:
            families[family].append(schema_name)
            print(f"  {schema_name:<30} -> {family}")
        else:
            print(f"  {schema_name:<30} -> UNKNOWN (could not detect from schema file)")

    # Summary
    print("\n" + "-" * 60)
    print("Family Summary:")
    print(f"  WITH continuations:    {len(families['with_continuations'])} schema(s)")
    for schema in families['with_continuations']:
        print(f"    - {schema}")
    print(f"\n  WITHOUT continuations: {len(families['without_continuations'])} schema(s)")
    for schema in families['without_continuations']:
        print(f"    - {schema}")
    print("-" * 60)


Detecting Schema Families (from schema definitions)

Found 7 schema version(s):

  - stage1_page
  - stage1_page_v2
  - stage1_page_v2_medium
  - stage1_page_v2_medium_pure
  - stage1_page_v2_pure
  - stage1_page_v2_small
  - stage1_page_v2_small_pure

Checking schema definitions in /home/fabian-ramirez/Documents/These/Code/magazine_graphs/schemas/...

  stage1_page                    -> with_continuations
  stage1_page_v2                 -> with_continuations
  stage1_page_v2_medium          -> with_continuations
  stage1_page_v2_medium_pure     -> without_continuations
  stage1_page_v2_pure            -> without_continuations
  stage1_page_v2_small           -> with_continuations
  stage1_page_v2_small_pure      -> without_continuations

------------------------------------------------------------
Family Summary:
  WITH continuations:    4 schema(s)
    - stage1_page
    - stage1_page_v2
    - stage1_page_v2_medium
    - stage1_page_v2_small

  WITHOUT continuations: 3 schema(s)
   

In [3]:
"""
Find Magazine Pairs for Each Schema
For each schema version, find magazines with matching gold standard data.
Ensures fair comparison by verifying all schemas have same test set.
"""

def find_magazine_pairs_for_schema(schema_name: str) -> List[Tuple[str, Path, Path, int]]:
    """
    Pair up gold-standard and prediction magazine folders for one schema.

    A magazine is kept only if a directory with its name exists under both
    GOLD_ROOT and SCHEMA_ROOT / schema_name, and the two directories share at
    least one ``*.json`` filename.

    Args:
        schema_name: Name of schema directory

    Returns:
        List of (magazine_name, gold_dir, pred_dir, num_matching_files) tuples,
        ordered by magazine name
    """
    def _subdirs_by_name(root):
        # name -> Path for every directory directly under root
        return {entry.name: entry for entry in root.iterdir() if entry.is_dir()}

    def _json_names(folder):
        # filenames (not paths) of the JSON page files in folder
        return {page.name for page in folder.glob("*.json")}

    gold_by_name = _subdirs_by_name(GOLD_ROOT)
    pred_by_name = _subdirs_by_name(SCHEMA_ROOT / schema_name)

    result = []
    for name in sorted(gold_by_name.keys() & pred_by_name.keys()):
        gold_folder = gold_by_name[name]
        pred_folder = pred_by_name[name]
        shared_count = len(_json_names(gold_folder) & _json_names(pred_folder))
        # Magazines without a single page in common are dropped
        if shared_count:
            result.append((name, gold_folder, pred_folder, shared_count))

    return result

# Find pairs for all schemas
print("\n" + "=" * 60)
print("Finding Magazine Pairs for Each Schema")
print("=" * 60 + "\n")

schema_magazine_pairs = {}

for schema_dir in available_schemas:
    schema_name = schema_dir.name
    pairs = find_magazine_pairs_for_schema(schema_name)
    schema_magazine_pairs[schema_name] = pairs

    print(f"{schema_name}:")
    if not pairs:
        print("  No matching magazines found")
    else:
        for mag_name, gold_dir, pred_dir, num_files in pairs:
            print(f"  {mag_name}: {num_files} matching pages")
    print()

# Verify all schemas have same test set (CRITICAL for fair comparison)
all_magazine_sets = [
    set(mag_name for mag_name, _, _, _ in pairs)
    for pairs in schema_magazine_pairs.values()
]

# A test set is the exact set of (magazine, page file) pairs that will be evaluated.
# Comparing magazine names alone would miss schemas whose predictions lack
# different pages of the same magazine.
schema_test_sets = {}
for schema_name, pairs in schema_magazine_pairs.items():
    evaluated_pages = set()
    for mag_name, gold_dir, pred_dir, _ in pairs:
        gold_pages = {f.name for f in gold_dir.glob("*.json")}
        pred_pages = {f.name for f in pred_dir.glob("*.json")}
        evaluated_pages.update((mag_name, page) for page in gold_pages & pred_pages)
    schema_test_sets[schema_name] = frozenset(evaluated_pages)

if not schema_test_sets:
    print("WARNING: No schemas found - nothing to verify")
elif len(set(schema_test_sets.values())) == 1:
    print("VERIFICATION: All schemas have identical test sets")
    common_magazines = all_magazine_sets[0]
    print(f"Common magazines ({len(common_magazines)}): {', '.join(sorted(common_magazines))}")
else:
    print("WARNING: Schemas have different test sets - comparison may not be fair")
    for schema_name, pairs in schema_magazine_pairs.items():
        # Show matching-page counts per magazine so the divergence is visible
        page_counts = {mag_name: num_files for mag_name, _, _, num_files in pairs}
        print(f"  {schema_name}: {page_counts}")


Finding Magazine Pairs for Each Schema

stage1_page:
  La_Plume_bpt6k1185893k_1_10_1889: 14 matching pages
  La_Plume_bpt6k1212187t_15-11-1893: 34 matching pages

stage1_page_v2:
  La_Plume_bpt6k1185893k_1_10_1889: 14 matching pages
  La_Plume_bpt6k1212187t_15-11-1893: 34 matching pages

stage1_page_v2_medium:
  La_Plume_bpt6k1185893k_1_10_1889: 14 matching pages
  La_Plume_bpt6k1212187t_15-11-1893: 34 matching pages

stage1_page_v2_medium_pure:
  La_Plume_bpt6k1185893k_1_10_1889: 14 matching pages
  La_Plume_bpt6k1212187t_15-11-1893: 34 matching pages

stage1_page_v2_pure:
  La_Plume_bpt6k1185893k_1_10_1889: 14 matching pages
  La_Plume_bpt6k1212187t_15-11-1893: 34 matching pages

stage1_page_v2_small:
  La_Plume_bpt6k1185893k_1_10_1889: 14 matching pages
  La_Plume_bpt6k1212187t_15-11-1893: 34 matching pages

stage1_page_v2_small_pure:
  La_Plume_bpt6k1185893k_1_10_1889: 14 matching pages
  La_Plume_bpt6k1212187t_15-11-1893: 34 matching pages

VERIFICATION: All schemas have identica

In [4]:
"""
Evaluation functions imported from utils
All evaluation logic is in utils/evaluation.py
"""
# Report where the evaluation helpers come from and which threshold is configured
print(
    "Evaluation functions loaded from utils/evaluation.py",
    f"Similarity threshold: {EVALUATION_CONFIG.similarity_threshold}",
    sep="\n",
)

Evaluation functions loaded from utils/evaluation.py
Similarity threshold: 0.7


In [5]:
"""
Full Schema Evaluation Pipeline
Evaluates a single schema across all its magazine pairs.
Runs the complete 01c evaluation (all 5 dimensions) and aggregates results.
"""
def evaluate_schema_full(
    schema_name: str,
    magazine_pairs: List[Tuple[str, Path, Path, int]],
    schema_family: str
) -> Dict:
    """
    Evaluate a schema across all its magazine pairs.

    Every gold page that has a prediction file of the same name is loaded and
    matched with ``load_and_match_page``. Per-page items, matches and unmatched
    indices are concatenated into corpus-level collections (page-local indices
    are shifted by the number of items accumulated so far), and each metric is
    then computed once over the whole corpus.

    A page that raises while being loaded or staged is reported and skipped as a
    whole: nothing from it reaches the accumulators, so the indices of later
    pages stay aligned with the accumulated item lists.

    Args:
        schema_name: Name of schema to evaluate
        magazine_pairs: List of (mag_name, gold_dir, pred_dir, n_files) tuples
        schema_family: 'with_continuations' or 'without_continuations'

    Returns:
        Dict with keys 'schema_name', 'schema_family', 'n_pages',
        'structure_detection', 'text_quality', 'classification',
        'metadata_extraction', plus 'continuation' when schema_family is
        'with_continuations'
    """
    import traceback  # only needed to report per-page failures

    print(f"\nEvaluating {schema_name}...")
    print(f"Family: {schema_family}")
    print(f"Magazines: {len(magazine_pairs)}")

    # Accumulators for aggregation
    all_gold_items = []
    all_pred_items = []
    all_matches = []
    all_unmatched_gold = set()
    all_unmatched_pred = set()
    total_pages = 0
    failed_pages = 0

    # Process all pages across all magazines
    for mag_name, gold_dir, pred_dir, _n_files in magazine_pairs:
        for gold_file in sorted(gold_dir.glob("*.json")):
            pred_file = pred_dir / gold_file.name
            if not pred_file.exists():
                continue

            try:
                # Load and match page
                page_data = load_and_match_page(gold_file, pred_file)

                page_gold_items = list(page_data['gold_items'])
                page_pred_items = list(page_data['pred_items'])

                # Offsets are the sizes of the accumulated lists, i.e. the corpus-level
                # index of this page's first gold / predicted item
                gold_offset = len(all_gold_items)
                pred_offset = len(all_pred_items)

                # Stage every index adjustment BEFORE touching the accumulators, so a
                # malformed page cannot leave them half-updated
                page_matches = [
                    (g_idx + gold_offset, p_idx + pred_offset, score)
                    for g_idx, p_idx, score in page_data['matches']
                ]
                page_unmatched_gold = [idx + gold_offset for idx in page_data['unmatched_gold']]
                page_unmatched_pred = [idx + pred_offset for idx in page_data['unmatched_pred']]

            except Exception as e:
                print(f"  ERROR processing {gold_file.name}: {e}")
                traceback.print_exc()
                failed_pages += 1
                continue

            # Commit the fully staged page
            all_gold_items.extend(page_gold_items)
            all_pred_items.extend(page_pred_items)
            all_matches.extend(page_matches)
            all_unmatched_gold.update(page_unmatched_gold)
            all_unmatched_pred.update(page_unmatched_pred)
            total_pages += 1

    print(f"  Processed {total_pages} pages")
    if failed_pages:
        print(f"  Skipped {failed_pages} page(s) due to errors")
    print(f"  Total gold items: {len(all_gold_items)}")
    print(f"  Total pred items: {len(all_pred_items)}")
    print(f"  Total matches: {len(all_matches)}")

    # 1. Structure Detection
    total_gold = len(all_gold_items)
    total_pred = len(all_pred_items)
    total_matched = len(all_matches)

    precision = total_matched / total_pred if total_pred > 0 else 0.0
    recall = total_matched / total_gold if total_gold > 0 else 0.0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    structure_metrics = {
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'match_rate': recall,  # match_rate is same as recall
        'total_gold_items': total_gold,
        'total_pred_items': total_pred,
        'total_matched': total_matched
    }

    # 2. Text Quality (using structure-aware evaluation)
    text_metrics = evaluate_structure_aware(all_gold_items, all_pred_items, all_matches)

    # 3. Classification
    classification_metrics = evaluate_classification(all_gold_items, all_pred_items, all_matches)

    # 4. Metadata - evaluate across all matched items
    metadata_metrics = {}
    for field_name in ['item_title', 'item_author']:
        field_results = evaluate_metadata_field(
            all_gold_items,
            all_pred_items,
            all_matches,
            field_name
        )
        # Store with simplified key name ('item_title' -> 'title')
        simple_name = field_name.replace('item_', '')
        metadata_metrics[simple_name] = field_results

    # 5. Continuation (only if applicable)
    continuation_metrics = None
    if schema_family == 'with_continuations':
        cont_results = evaluate_continuation_all_items(
            all_gold_items,
            all_pred_items,
            all_matches,
            all_unmatched_gold,
            all_unmatched_pred
        )

        # Combined score is the unweighted mean of the two per-field F1 values
        is_cont_f1 = cont_results['is_continuation']['f1']
        continues_f1 = cont_results['continues_on_next_page']['f1']
        combined_f1 = (is_cont_f1 + continues_f1) / 2

        continuation_metrics = {
            'is_continuation_f1': is_cont_f1,
            'continues_f1': continues_f1,
            'combined_f1': combined_f1,
            'is_continuation': cont_results['is_continuation'],
            'continues_on_next_page': cont_results['continues_on_next_page']
        }

    # Compile results
    results = {
        'schema_name': schema_name,
        'schema_family': schema_family,
        'n_pages': total_pages,
        'structure_detection': structure_metrics,
        'text_quality': {
            'structure_aware': {
                'all_items': text_metrics
            }
        },
        'classification': {
            'overall': classification_metrics
        },
        'metadata_extraction': metadata_metrics
    }

    if continuation_metrics is not None:
        results['continuation'] = continuation_metrics

    return results

# Status line printed once this cell, which defines evaluate_schema_full, has run
print("Full schema evaluation pipeline ready")

Full schema evaluation pipeline ready


In [6]:
"""
Execute Full Evaluation for All Schemas
Run the comprehensive evaluation pipeline for each schema and collect results.
"""

import traceback  # used to report a failing schema without aborting the loop

# Store all results: schema_name -> results dict from evaluate_schema_full
all_results = {}

# Keys every results dict must provide for the comparison cells below
REQUIRED_RESULT_KEYS = ['schema_name', 'schema_family', 'n_pages', 'structure_detection',
                        'text_quality', 'classification', 'metadata_extraction']

# Reuse the family assignment from the detection cell instead of re-importing and
# re-executing every schema module a second time. `families` is only consulted when
# schemas were found, which is when the detection cell populates it.
schema_family_lookup = {
    name: family_name
    for family_name, names in families.items()
    for name in names
} if available_schemas else {}

print("\n" + "=" * 60)
print("Running Full Evaluation for All Schemas")
print("=" * 60)

for schema_dir in available_schemas:
    schema_name = schema_dir.name

    # Get schema family
    family = schema_family_lookup.get(schema_name)
    if family is None:
        print(f"\nSkipping {schema_name} - could not determine family")
        continue

    # Get magazine pairs
    magazine_pairs = schema_magazine_pairs.get(schema_name, [])
    if not magazine_pairs:
        print(f"\nSkipping {schema_name} - no magazine pairs found")
        continue

    # Run evaluation
    try:
        results = evaluate_schema_full(schema_name, magazine_pairs, family)

        # Validate results structure
        missing_keys = [k for k in REQUIRED_RESULT_KEYS if k not in results]
        if missing_keys:
            print(f"  WARNING: Results missing keys: {missing_keys}")
            continue

        all_results[schema_name] = results

        # Print summary
        print(f"\n  Results for {schema_name}:")
        print(f"    Structure Match Rate: {results['structure_detection']['match_rate']:.3f}")
        print(f"    Text CER (SA all): {results['text_quality']['structure_aware']['all_items']['cer_standard']:.3f}")
        print(f"    Classification Acc: {results['classification']['overall']['accuracy']:.3f}")

        if 'continuation' in results:
            print(f"    Continuation F1: {results['continuation']['combined_f1']:.3f}")

    except Exception as e:
        # One failing schema must not prevent the remaining ones from being evaluated
        print(f"\n  ERROR evaluating {schema_name}: {e}")
        traceback.print_exc()

print("\n" + "=" * 60)
print(f"Evaluation complete. Processed {len(all_results)} schema(s)")
print("=" * 60)


Running Full Evaluation for All Schemas

Evaluating stage1_page...
Family: with_continuations
Magazines: 2
  Processed 48 pages
  Total gold items: 215
  Total pred items: 178
  Total matches: 99

  Results for stage1_page:
    Structure Match Rate: 0.460
    Text CER (SA all): 0.048
    Classification Acc: 0.949
    Continuation F1: 0.151

Evaluating stage1_page_v2...
Family: with_continuations
Magazines: 2
  Processed 48 pages
  Total gold items: 215
  Total pred items: 221
  Total matches: 135

  Results for stage1_page_v2:
    Structure Match Rate: 0.628
    Text CER (SA all): 0.057
    Classification Acc: 0.881
    Continuation F1: 0.143

Evaluating stage1_page_v2_medium...
Family: with_continuations
Magazines: 2
  Processed 48 pages
  Total gold items: 215
  Total pred items: 207
  Total matches: 131

  Results for stage1_page_v2_medium:
    Structure Match Rate: 0.609
    Text CER (SA all): 0.094
    Classification Acc: 0.847
    Continuation F1: 0.146

Evaluating stage1_page_v

In [7]:
"""
Within-Family Evaluation
Compare schemas within each family on all metrics.
"""
# Bucket every evaluated schema under its family name
family_results = defaultdict(list)
for name, res in all_results.items():
    family_results[res['schema_family']].append((name, res))

# Column layout shared by every family table
header_row = f"{'Schema':<30} {'Match Rate':<12} {'CER':<10} {'Class Acc':<12} {'Meta F1':<10}"

# One table per family, rows in alphabetical schema order
for family_name, members in family_results.items():
    print(f"\nFamily: {family_name}")
    print("=" * 90)
    print(header_row)
    print("-" * 90)

    for name, res in sorted(members, key=lambda entry: entry[0]):
        # Metadata F1 is the unweighted mean over the evaluated metadata fields
        field_f1s = [field['f1'] for field in res['metadata_extraction'].values()]
        mean_meta_f1 = sum(field_f1s) / len(field_f1s) if field_f1s else 0.0

        row_cells = [
            f"{name:<30}",
            f"{res['structure_detection']['match_rate']:<12.3f}",
            f"{res['text_quality']['structure_aware']['all_items']['cer_standard']:<10.3f}",
            f"{res['classification']['overall']['accuracy']:<12.3f}",
            f"{mean_meta_f1:<10.3f}",
        ]
        print(" ".join(row_cells))

        # Continuation score is only present for schemas that track continuations
        if 'continuation' in res:
            print(f"{'  └─ Continuation F1':<30} {res['continuation']['combined_f1']:.3f}")


Family: with_continuations
Schema                         Match Rate   CER        Class Acc    Meta F1   
------------------------------------------------------------------------------------------
stage1_page                    0.460        0.048      0.949        0.727     
  └─ Continuation F1           0.151
stage1_page_v2                 0.628        0.057      0.881        0.634     
  └─ Continuation F1           0.143
stage1_page_v2_medium          0.609        0.094      0.847        0.708     
  └─ Continuation F1           0.146
stage1_page_v2_small           0.367        0.108      0.772        0.675     
  └─ Continuation F1           0.043

Family: without_continuations
Schema                         Match Rate   CER        Class Acc    Meta F1   
------------------------------------------------------------------------------------------
stage1_page_v2_medium_pure     0.577        0.075      0.855        0.752     
stage1_page_v2_pure            0.749        0.087      0.9

In [8]:
"""
Cross-Family Comparison
Compare best-performing schema from each family on common dimensions.
"""

def _cross_family_scores(results):
    """Return the four family-independent scores (all higher-is-better) of one schema."""
    metadata_scores = [field['f1'] for field in results['metadata_extraction'].values()]
    return {
        'structure_detection': results['structure_detection']['match_rate'],
        # CER is an error rate, so invert it to make higher = better
        'text_quality': 1 - results['text_quality']['structure_aware']['all_items']['cer_standard'],
        'classification': results['classification']['overall']['accuracy'],
        'metadata': sum(metadata_scores) / len(metadata_scores) if metadata_scores else 0.0,
    }


if len(family_results) < 2:
    print("\nCross-Family Comparison")
    print("=" * 60)
    print("Only one family present - cross-family comparison not applicable")
else:
    # Define common dimensions for comparison
    common_dimensions = ['structure_detection', 'text_quality', 'classification', 'metadata']

    # Pick each family's representative: the schema with the highest composite
    # score (unweighted mean of the four common dimensions)
    family_representatives = {}
    representative_scores = {}
    for family, schemas in family_results.items():
        scored = [
            (schema_name, results, _cross_family_scores(results))
            for schema_name, results in schemas
        ]
        # max() keeps the first schema on ties and always returns an entry,
        # so a representative can never be left unset
        best_name, best_results, best_dims = max(
            scored, key=lambda entry: sum(entry[2].values()) / len(entry[2])
        )
        best_score = sum(best_dims.values()) / len(best_dims)

        family_representatives[family] = (best_name, best_results)
        representative_scores[family] = best_dims
        print(f"  {family}: {best_name} (score: {best_score:.3f})")

    # Header and rows share one column width so the table stays aligned even for
    # long family names
    col_width = max(len(family) for family in family_representatives) + 3

    # Compare family representatives on each dimension
    print("\nCross-Family Comparison")
    print("=" * 80)
    print(f"{'Dimension':<25} {' '.join(f'{fam:<{col_width}}' for fam in family_representatives.keys())}")
    print("-" * 80)

    # dimension -> {family -> score}
    dimension_scores = {
        dimension: {
            family: representative_scores[family][dimension]
            for family in family_representatives
        }
        for dimension in common_dimensions
    }

    for dimension in common_dimensions:
        best_score_in_dim = max(dimension_scores[dimension].values())

        cells = []
        for family in family_representatives.keys():
            score = dimension_scores[dimension][family]
            # Ties are marked for every family that reaches the best score
            cell = f"{score:.3f} *" if score == best_score_in_dim else f"{score:.3f}"
            cells.append(f"{cell:<{col_width}}")
        print(f"{dimension:<25} {' '.join(cells)}")

    print("\n* = Best in dimension")
    print("\nSummary:")
    for family in family_representatives.keys():
        wins = sum(1 for dim_scores in dimension_scores.values()
                   if dim_scores[family] == max(dim_scores.values()))
        print(f"  {family}: {wins}/{len(common_dimensions)} dimensions won")

  with_continuations: stage1_page (score: 0.772)
  without_continuations: stage1_page_v2_pure (score: 0.813)

Cross-Family Comparison
Dimension                 with_continuations   without_continuations
--------------------------------------------------------------------------------
structure_detection       0.460                  0.749 *                
text_quality              0.952 *                0.913                  
classification            0.949 *                0.944                  
metadata                  0.727 *                0.648                  

* = Best in dimension

Summary:
  with_continuations: 3/4 dimensions won
  without_continuations: 1/4 dimensions won


In [9]:
"""
Detailed Comparison Table
Comprehensive summary table showing all metrics for all schemas across both families.
"""
# Column order of the comparison table; passing it explicitly keeps the frame
# well-formed (and sortable) even when no schema produced results
COMPARISON_COLUMNS = ['Family', 'Schema', 'Match Rate', 'CER', 'WER', 'Class Acc', 'Meta F1', 'Cont F1']

# Prepare data for the table: one record per evaluated schema
comparison_data = []
for family, schemas in family_results.items():
    for schema_name, results in schemas:
        # Extract continuation F1 if available (None -> shown as NaN)
        continuation_f1 = None
        if 'continuation' in results:
            continuation_f1 = results['continuation']['combined_f1']

        # Calculate average metadata F1
        metadata_scores = [field['f1'] for field in results['metadata_extraction'].values()]
        meta_f1 = sum(metadata_scores) / len(metadata_scores) if metadata_scores else 0.0

        text_scores = results['text_quality']['structure_aware']['all_items']
        comparison_data.append({
            'Family': family,
            'Schema': schema_name,
            'Match Rate': results['structure_detection']['match_rate'],
            'CER': text_scores['cer_standard'],
            'WER': text_scores['wer_standard'],
            'Class Acc': results['classification']['overall']['accuracy'],
            'Meta F1': meta_f1,
            'Cont F1': continuation_f1
        })

# Create DataFrame, sorted by family and schema name for consistent display
df_comparison = (
    pd.DataFrame(comparison_data, columns=COMPARISON_COLUMNS)
    .sort_values(by=['Family', 'Schema'])
)

# Display the table
print("\nComprehensive Comparison Table")
print("=" * 100)
if df_comparison.empty:
    print("No schema results available - run the evaluation cell first")
else:
    print(df_comparison.to_string(index=False))
print("\nNotes:")
print("  - Lower CER/WER is better (error rates)")
print("  - Higher Match Rate/Accuracy/F1 is better")
print("  - Cont F1 only applicable to schemas with continuation tracking")


Comprehensive Comparison Table
               Family                     Schema  Match Rate      CER      WER  Class Acc  Meta F1  Cont F1
   with_continuations                stage1_page    0.460465 0.048353 0.107027   0.949495 0.727358 0.150943
   with_continuations             stage1_page_v2    0.627907 0.057294 0.111283   0.881481 0.634259 0.142857
   with_continuations      stage1_page_v2_medium    0.609302 0.094469 0.146861   0.847328 0.708333 0.146075
   with_continuations       stage1_page_v2_small    0.367442 0.107978 0.161254   0.772152 0.674797 0.042553
without_continuations stage1_page_v2_medium_pure    0.576744 0.075431 0.134812   0.854839 0.751938      NaN
without_continuations        stage1_page_v2_pure    0.748837 0.086966 0.132047   0.944099 0.647692      NaN
without_continuations  stage1_page_v2_small_pure    0.372093 0.103073 0.166978   0.750000 0.597297      NaN

Notes:
  - Lower CER/WER is better (error rates)
  - Higher Match Rate/Accuracy/F1 is better
  - Cont F

In [10]:
# %%
"""
Final Recommendation: Overall Winner
"""
print("\n" + "=" * 80)
print("OVERALL WINNER")
print("=" * 80)

# Find best schema across ALL families by composite score: the unweighted mean of
# match rate, (1 - CER), classification accuracy and mean metadata F1
best_overall = None
best_composite = -1

for schema_name, results in all_results.items():
    match_rate = results['structure_detection']['match_rate']
    text_quality = 1 - results['text_quality']['structure_aware']['all_items']['cer_standard']
    class_acc = results['classification']['overall']['accuracy']

    metadata_scores = [field['f1'] for field in results['metadata_extraction'].values()]
    meta_f1 = sum(metadata_scores) / len(metadata_scores) if metadata_scores else 0.0

    composite = (match_rate + text_quality + class_acc + meta_f1) / 4

    # Strict ">" keeps the first schema encountered on ties
    if composite > best_composite:
        best_composite = composite
        best_overall = (schema_name, results)

if best_overall is None:
    # Nothing was evaluated (or no score was comparable): report instead of crashing
    print("\nNo schema results available - cannot determine a winner")
else:
    winner_name, winner_results = best_overall

    print(f"\n WINNER: {winner_name}")
    print(f"   Family: {winner_results['schema_family']}")
    print(f"   Composite Score: {best_composite:.3f}\n")
    print("Key Metrics:")
    print(f"  Match Rate:      {winner_results['structure_detection']['match_rate']:.3f}")
    print(f"  CER:             {winner_results['text_quality']['structure_aware']['all_items']['cer_standard']:.3f}")
    print(f"  Classification:  {winner_results['classification']['overall']['accuracy']:.3f}")

    # Same empty-metadata guard as in the scoring loop above
    metadata_scores = [field['f1'] for field in winner_results['metadata_extraction'].values()]
    meta_f1 = sum(metadata_scores) / len(metadata_scores) if metadata_scores else 0.0
    print(f"  Metadata F1:     {meta_f1:.3f}")

    if 'continuation' in winner_results:
        print(f"  Continuation F1: {winner_results['continuation']['combined_f1']:.3f}")


OVERALL WINNER

 WINNER: stage1_page_v2_pure
   Family: without_continuations
   Composite Score: 0.813

Key Metrics:
  Match Rate:      0.749
  CER:             0.087
  Classification:  0.944
  Metadata F1:     0.648


In [11]:
"""
Complete Rankings
"""
print("\n" + "=" * 100)
print("COMPLETE RANKINGS (by composite score)")
print("=" * 100)

# One tuple per schema: (name, composite, match rate, CER, class acc, meta F1, family)
rankings = []
for schema_name, results in all_results.items():
    match_rate = results['structure_detection']['match_rate']
    cer = results['text_quality']['structure_aware']['all_items']['cer_standard']
    text_quality = 1 - cer  # CER is an error rate; invert so higher = better
    class_acc = results['classification']['overall']['accuracy']

    metadata_scores = [field['f1'] for field in results['metadata_extraction'].values()]
    meta_f1 = sum(metadata_scores) / len(metadata_scores) if metadata_scores else 0.0

    composite = (match_rate + text_quality + class_acc + meta_f1) / 4

    rankings.append((
        schema_name,
        composite,
        match_rate,
        cer,
        class_acc,
        meta_f1,
        results['schema_family']
    ))

# Highest composite first; the sort is stable, so ties keep evaluation order
rankings.sort(key=lambda x: x[1], reverse=True)

print(f"{'Rank':<6} {'Schema':<30} {'Score':<8} {'Match':<8} {'CER':<8} {'Class':<8} {'Meta':<8} {'Family'}")
print("=" * 100)

for i, (name, score, match, cer, cls, meta, family) in enumerate(rankings, 1):
    marker = " WINNER" if i == 1 else ""
    family_short = 'w/cont' if family == 'with_continuations' else 'no cont'
    # Numeric cells use the same 8-wide columns as the header so the table lines up
    print(f"{i:<6} {name:<30} {score:<8.3f} {match:<8.3f} {cer:<8.3f} {cls:<8.3f} {meta:<8.3f} {family_short}{marker}")


COMPLETE RANKINGS (by composite score)
Rank   Schema                         Score    Match    CER      Class    Meta     Family
1      stage1_page_v2_pure            0.813    0.749  0.087  0.944  0.648  no cont WINNER
2      stage1_page_v2_medium_pure     0.777    0.577  0.075  0.855  0.752  no cont
3      stage1_page                    0.772    0.460  0.048  0.949  0.727  w/cont
4      stage1_page_v2                 0.772    0.628  0.057  0.881  0.634  w/cont
5      stage1_page_v2_medium          0.768    0.609  0.094  0.847  0.708  w/cont
6      stage1_page_v2_small           0.677    0.367  0.108  0.772  0.675  w/cont
7      stage1_page_v2_small_pure      0.654    0.372  0.103  0.750  0.597  no cont
