In [None]:
"""
Stage 1 Multi-Schema Evaluation
Compares multiple schema variants to identify the best performing schema.

Schema Families:
- WITH continuations: Has is_continuation + continues_on_next_page fields
- WITHOUT continuations: Simpler structure without cross-page tracking

Evaluation Strategy:
1. Auto-detect schema families based on field presence
2. Within-family comparison: Find best schema in each family
3. Cross-family comparison: Compare winners on common dimensions only
4. Final recommendation: Which schema to use and why

Input:  data/predictions/schema_evaluations/{schema_version}/{magazine_name}/
        data/gold_standard/cleaned/{magazine_name}/
Output: Comparative metrics and recommendation
"""
from pathlib import Path
import json
from typing import Dict, List, Tuple, Optional, Set
from collections import defaultdict, Counter
from difflib import SequenceMatcher
import Levenshtein
import re
import pandas as pd
import numpy as np
import importlib.util
import inspect

# Project imports
from utils.paths import PROJECT_ROOT, PREDICTIONS, GOLD_CLEAN
from utils.config import EVALUATION_CONFIG
from schemas.stage1_page import Stage1PageModel
from utils.text_processing import (
    normalize_text_strict,
    normalize_text_standard,
    normalize_text_letters_only,
    token_sort_text
)
from utils.ocr_metrics import character_error_rate, word_error_rate
from utils.evaluation import (
    match_items,
    load_and_match_page,
    filter_matches_by_class,
    get_matched_pairs,
    evaluate_order_agnostic,
    evaluate_structure_aware,
    evaluate_classification,
    evaluate_metadata_field,
    evaluate_continuation_all_items
)

# Root directories used by every later cell
GOLD_ROOT, PRED_ROOT = GOLD_CLEAN, PREDICTIONS

# Startup banner: echo the resolved locations so a wrong project root
# is visible before any evaluation is run.
print(
    "Stage 1 Multi-Schema Evaluation",
    "=" * 60,
    f"Project root: {PROJECT_ROOT}",
    "\nDirectories:",
    f"  Gold standard: {GOLD_ROOT}",
    f"  Predictions:   {PRED_ROOT}",
    sep="\n",
)

Stage 1 Multi-Schema Evaluation
Project root: /home/fabian-ramirez/Documents/These/Code/magazine_graphs

Directories:
  Gold standard: /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/gold_standard/cleaned
  Predictions:   /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/predictions


In [6]:
"""
Schema Family Detection
Determine schema families by inspecting schema definition files in schemas/ folder.
This is the source of truth - not the prediction outputs.
"""

def detect_schema_family_from_definition(schema_name: str) -> Optional[str]:
    """
    Detect schema family by checking if continuation fields are defined in the schema class.

    The file ``schemas/{schema_name}.py`` under PROJECT_ROOT is imported (its
    top-level code is executed) and its item model is inspected for the
    ``is_continuation`` and ``continues_on_next_page`` fields.

    Item class selection: every class whose name contains ``Item`` is a
    candidate. Because ``inspect.getmembers`` returns members sorted by name,
    simply taking the first candidate can pick a helper such as an
    ``ItemClass`` enum instead of the real model. Candidates are therefore
    ranked: classes exposing ``model_fields`` first, and among those, classes
    defined in the schema file itself before imported ones. Ties keep the
    alphabetical order.

    Args:
        schema_name: Name of schema directory (e.g., 'stage1_page', 'stage1_page_v2')

    Returns:
        'with_continuations' or 'without_continuations' or None if cannot determine
        (missing file, unloadable module, or no Item-like class found).
        A schema defining only one of the two fields is reported with a
        warning and classified as 'with_continuations'.
    """
    # Map schema directory name to schema file
    # Convention: stage1_page -> stage1_page.py, stage1_page_v2 -> stage1_page_v2.py
    schema_file = PROJECT_ROOT / 'schemas' / f'{schema_name}.py'

    if not schema_file.exists():
        print(f"  WARNING: Schema file not found: {schema_file}")
        return None

    try:
        # Load the schema module (this executes the file's top-level code)
        spec = importlib.util.spec_from_file_location(schema_name, schema_file)
        if spec is None or spec.loader is None:
            return None

        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)

        # Collect every class that looks like an item definition
        candidates = [
            obj
            for name, obj in inspect.getmembers(module, inspect.isclass)
            if 'Item' in name
        ]

        if not candidates:
            print(f"  WARNING: Could not find Item class in {schema_file.name}")
            return None

        def _rank(cls) -> Tuple[bool, bool]:
            # Higher tuples win: real models first, locally defined ones preferred
            return (
                hasattr(cls, 'model_fields'),
                getattr(cls, '__module__', None) == module.__name__,
            )

        # max() returns the first maximal element, so alphabetical order breaks ties
        item_class = max(candidates, key=_rank)

        # Check if continuation fields are defined in the model
        model_fields = getattr(item_class, 'model_fields', {})

        has_is_continuation = 'is_continuation' in model_fields
        has_continues = 'continues_on_next_page' in model_fields

        if has_is_continuation and has_continues:
            return 'with_continuations'
        if not has_is_continuation and not has_continues:
            return 'without_continuations'

        # Exactly one of the two fields is present - unusual, but the schema
        # still tracks continuations in some form.
        print(f"  WARNING: {schema_name} has only one continuation field")
        return 'with_continuations'

    except Exception as e:
        # Schema files are arbitrary code; report any failure and let the
        # caller treat the schema as UNKNOWN instead of aborting the cell.
        print(f"  ERROR loading schema {schema_name}: {e}")
        return None

# Schema versions are in predictions/schema_evaluations/
SCHEMA_ROOT = PRED_ROOT / 'schema_evaluations'

if not SCHEMA_ROOT.exists():
    print(f"ERROR: Schema evaluations directory not found at {SCHEMA_ROOT}")
    # Plain string (not an f-string): single braces print as literal placeholders
    print("Expected structure: predictions/schema_evaluations/{schema_version}/{magazine_name}/")
    available_schemas = []
else:
    # Find all schema directories
    available_schemas = sorted([d for d in SCHEMA_ROOT.iterdir() if d.is_dir()])

# Group schemas by family. Defined unconditionally so that later cells can
# rely on `families` existing even when no schema directory was found.
families = {
    'with_continuations': [],
    'without_continuations': []
}

print("\n" + "=" * 60)
print("Detecting Schema Families (from schema definitions)")
print("=" * 60 + "\n")

if not available_schemas:
    print("ERROR: No schema directories found in predictions/schema_evaluations/")
    print("Expected structure: predictions/schema_evaluations/{schema_version}/{magazine_name}/")
else:
    print(f"Found {len(available_schemas)} schema version(s):\n")
    for schema_dir in available_schemas:
        print(f"  - {schema_dir.name}")

    print(f"\nChecking schema definitions in {PROJECT_ROOT / 'schemas'}/...\n")

    # Classify each schema from its definition file; schemas whose family
    # cannot be determined are reported but left out of both families.
    for schema_dir in available_schemas:
        schema_name = schema_dir.name
        family = detect_schema_family_from_definition(schema_name)

        if family:
            families[family].append(schema_name)
            print(f"  {schema_name:<30} -> {family}")
        else:
            print(f"  {schema_name:<30} -> UNKNOWN (could not detect from schema file)")

    # Summary
    print("\n" + "-" * 60)
    print("Family Summary:")
    print(f"  WITH continuations:    {len(families['with_continuations'])} schema(s)")
    for schema in families['with_continuations']:
        print(f"    - {schema}")
    print(f"\n  WITHOUT continuations: {len(families['without_continuations'])} schema(s)")
    for schema in families['without_continuations']:
        print(f"    - {schema}")
    print("-" * 60)


Detecting Schema Families (from schema definitions)

Found 3 schema version(s):

  - stage1_page
  - stage1_page_v2
  - stage1_page_v2_medium

Checking schema definitions in /home/fabian-ramirez/Documents/These/Code/magazine_graphs/schemas/...

  stage1_page                    -> with_continuations
  stage1_page_v2                 -> with_continuations
  stage1_page_v2_medium          -> with_continuations

------------------------------------------------------------
Family Summary:
  WITH continuations:    3 schema(s)
    - stage1_page
    - stage1_page_v2
    - stage1_page_v2_medium

  WITHOUT continuations: 0 schema(s)
------------------------------------------------------------


In [8]:
"""
Find Magazine Pairs for Each Schema
For each schema version, find magazines with matching gold standard data.
Ensures fair comparison by verifying all schemas have same test set.
"""

def find_magazine_pairs_for_schema(schema_name: str) -> List[Tuple[str, Path, Path, int]]:
    """
    Find magazines with both gold standard and predictions for a given schema.

    A magazine qualifies when a directory of the same name exists under both
    GOLD_ROOT and ``SCHEMA_ROOT / schema_name`` and the two directories share
    at least one ``*.json`` filename.

    Args:
        schema_name: Name of schema directory

    Returns:
        List of (magazine_name, gold_dir, pred_dir, num_matching_files) tuples,
        sorted by magazine name. Empty (with a printed warning) when the gold
        root or the schema's prediction directory does not exist.
    """
    schema_path = SCHEMA_ROOT / schema_name

    # iterdir() raises on a missing directory; report it and return no pairs
    # so one misconfigured schema does not abort the whole comparison.
    for label, directory in (("Gold standard", GOLD_ROOT), ("Prediction", schema_path)):
        if not directory.is_dir():
            print(f"  WARNING: {label} directory not found: {directory}")
            return []

    # Index magazine directories by name on both sides
    gold_magazines = {d.name: d for d in GOLD_ROOT.iterdir() if d.is_dir()}
    pred_magazines = {d.name: d for d in schema_path.iterdir() if d.is_dir()}

    pairs = []
    # Only magazines present on both sides can be evaluated
    for mag_name in sorted(gold_magazines.keys() & pred_magazines.keys()):
        gold_dir = gold_magazines[mag_name]
        pred_dir = pred_magazines[mag_name]

        # Pages are paired by identical filename in both directories
        gold_files = {f.name for f in gold_dir.glob("*.json")}
        pred_files = {f.name for f in pred_dir.glob("*.json")}
        matching_files = gold_files & pred_files

        if matching_files:
            pairs.append((mag_name, gold_dir, pred_dir, len(matching_files)))

    return pairs

# Find pairs for all schemas
print("\n" + "=" * 60)
print("Finding Magazine Pairs for Each Schema")
print("=" * 60 + "\n")

schema_magazine_pairs = {}

for schema_dir in available_schemas:
    schema_name = schema_dir.name
    pairs = find_magazine_pairs_for_schema(schema_name)
    schema_magazine_pairs[schema_name] = pairs

    print(f"{schema_name}:")
    if not pairs:
        print("  No matching magazines found")
    else:
        for mag_name, gold_dir, pred_dir, num_files in pairs:
            print(f"  {mag_name}: {num_files} matching pages")
    print()

# Verify all schemas have same test set (CRITICAL for fair comparison).
# Magazine names alone are not enough: two schemas can cover the same
# magazine with a different number of pages, so the page count is part of
# the comparison key. (Equal counts still do not prove the exact same page
# files were matched - only names and counts are available here.)
all_magazine_sets = [
    set(mag_name for mag_name, _, _, _ in pairs)
    for pairs in schema_magazine_pairs.values()
]
test_set_signatures = {
    frozenset((mag_name, num_files) for mag_name, _, _, num_files in pairs)
    for pairs in schema_magazine_pairs.values()
}

if not all_magazine_sets:
    # No schema directories at all: there is nothing to compare
    print("WARNING: No schemas available - nothing to verify")
    common_magazines = set()
elif len(test_set_signatures) == 1:
    print("VERIFICATION: All schemas have identical test sets")
    common_magazines = all_magazine_sets[0]
    print(f"Common magazines ({len(common_magazines)}): {', '.join(sorted(common_magazines))}")
else:
    print("WARNING: Schemas have different test sets - comparison may not be fair")
    for schema_name, pairs in schema_magazine_pairs.items():
        page_counts = {mag_name: num_files for mag_name, _, _, num_files in pairs}
        print(f"  {schema_name}: {page_counts}")
    # Keep `common_magazines` defined for later cells: magazines every schema covers
    common_magazines = set.intersection(*all_magazine_sets)


Finding Magazine Pairs for Each Schema

stage1_page:
  La_Plume_bpt6k1185893k_1_10_1889: 14 matching pages
  La_Plume_bpt6k1212187t_15-11-1893: 34 matching pages

stage1_page_v2:
  La_Plume_bpt6k1185893k_1_10_1889: 14 matching pages
  La_Plume_bpt6k1212187t_15-11-1893: 34 matching pages

stage1_page_v2_medium:
  La_Plume_bpt6k1185893k_1_10_1889: 14 matching pages
  La_Plume_bpt6k1212187t_15-11-1893: 34 matching pages

VERIFICATION: All schemas have identical test sets
Common magazines (2): La_Plume_bpt6k1185893k_1_10_1889, La_Plume_bpt6k1212187t_15-11-1893


In [None]:
"""
Item Matching Functions
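Reused from 01c - match gold items to predicted items using text similarity.
These functions determine which predicted item corresponds to which gold item.
Note: match_items and load_and_match_page defined in this cell shadow the
same-named functions imported from utils.evaluation in the first cell.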
"""

def normalize_text(text: str) -> str:
    """
    Canonicalize a string before similarity scoring.

    Steps, applied in order:
    1. lowercase
    2. drop every character that is neither a word character nor whitespace
       (underscores count as word characters and are kept)
    3. collapse each whitespace run into one space
    4. trim both ends

    The goal is to keep minor OCR differences (case, punctuation, spacing)
    from lowering the similarity score.
    """
    lowered = text.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    return re.sub(r'\s+', ' ', without_punctuation).strip()

def text_similarity(text1: str, text2: str) -> float:
    """
    Score how alike two texts are after normalization.

    Both inputs go through normalize_text, then difflib's SequenceMatcher
    ratio is computed on the normalized strings.

    Returns:
        Float between 0.0 (completely different) and 1.0 (identical).
        Two texts that are both empty after normalization score 1.0;
        exactly one empty text scores 0.0.
    """
    left, right = normalize_text(text1), normalize_text(text2)

    if left and right:
        return SequenceMatcher(None, left, right).ratio()

    # At least one side is empty: identical only if both are
    return 0.0 if (left or right) else 1.0

def match_items(
    gold_items: List[Dict], 
    pred_items: List[Dict],
    similarity_threshold: float = EVALUATION_CONFIG.similarity_threshold
) -> Tuple[List[Tuple[int, int, float]], Set[int], Set[int]]:
    """
    Match gold items to prediction items using greedy best-match algorithm.

    NOTE: this definition shadows the ``match_items`` imported from
    utils.evaluation at the top of the notebook.

    Algorithm:
        For each gold item (in list order):
        1. Compare its text against all unmatched predicted items
        2. Find the best match (highest similarity score; the earliest
           predicted item wins ties, and a score of 0.0 is never a match)
        3. If score >= threshold, accept the match
        4. Mark that predicted item as matched (can't be reused)
        5. Move to next gold item

    This ensures each predicted item matches at most one gold item. Because
    the procedure is greedy, the result depends on the order of gold_items.

    An ``item_text_raw`` that is missing or None is treated as empty text,
    so such items do not crash the matching.

    Args:
        gold_items: List of gold standard items
        pred_items: List of predicted items
        similarity_threshold: Minimum similarity score to consider a match

    Returns:
        Tuple of:
        - matches: List of (gold_idx, pred_idx, similarity_score)
        - unmatched_gold: Set of gold indices with no match
        - unmatched_pred: Set of pred indices with no match
    """
    matches = []
    matched_pred_indices = set()
    unmatched_gold = set()

    # Extract prediction texts once instead of once per gold item.
    # `or ''` covers keys that are present but hold None, which the
    # .get() default alone does not handle.
    pred_texts = [pred_item.get('item_text_raw') or '' for pred_item in pred_items]

    for gold_idx, gold_item in enumerate(gold_items):
        gold_text = gold_item.get('item_text_raw') or ''

        best_score = 0.0
        best_pred_idx = None

        # Find best matching predicted item
        for pred_idx, pred_text in enumerate(pred_texts):
            if pred_idx in matched_pred_indices:
                continue  # Already matched to another gold item

            score = text_similarity(gold_text, pred_text)

            # Strict '>' keeps the earliest candidate on ties
            if score > best_score:
                best_score = score
                best_pred_idx = pred_idx

        # Accept match if above threshold
        if best_pred_idx is not None and best_score >= similarity_threshold:
            matches.append((gold_idx, best_pred_idx, best_score))
            matched_pred_indices.add(best_pred_idx)
        else:
            unmatched_gold.add(gold_idx)

    # Find predicted items that never got matched
    unmatched_pred = set(range(len(pred_items))) - matched_pred_indices

    return matches, unmatched_gold, unmatched_pred

def load_and_match_page(
    gold_path: Path, 
    pred_path: Path,
    similarity_threshold: float = EVALUATION_CONFIG.similarity_threshold
) -> Dict:
    """
    Read one gold/prediction page pair from disk and match their items.

    Each JSON file is validated with Stage1PageModel and dumped back to a
    plain dict before its ``items`` list is taken.
    NOTE(review): both files are validated against Stage1PageModel whichever
    schema version produced the prediction - confirm this is intended for
    the v2 variants.

    Args:
        gold_path: Path to gold standard JSON
        pred_path: Path to prediction JSON
        similarity_threshold: Minimum similarity for matching

    Returns:
        Dict with:
        - gold_items: All gold items
        - pred_items: All pred items
        - matches: List of (gold_idx, pred_idx, score) tuples
        - unmatched_gold: Set of unmatched gold indices
        - unmatched_pred: Set of unmatched pred indices
        - page_name: Filename of the gold page
    """
    def _validated_items(json_path: Path) -> List[Dict]:
        # Parse, validate against the page model, and return the dumped items
        with open(json_path, 'r', encoding='utf-8') as handle:
            raw_page = json.load(handle)
        page_dict = Stage1PageModel.model_validate(raw_page).model_dump()
        return page_dict.get('items', [])

    # Gold is read and validated first, then the prediction
    gold_items = _validated_items(gold_path)
    pred_items = _validated_items(pred_path)

    matches, unmatched_gold, unmatched_pred = match_items(
        gold_items, pred_items, similarity_threshold
    )

    page_result = {
        'gold_items': gold_items,
        'pred_items': pred_items,
        'matches': matches,
        'unmatched_gold': unmatched_gold,
        'unmatched_pred': unmatched_pred,
        'page_name': gold_path.name
    }
    return page_result

# Confirm the cell ran and show the threshold the matcher defaults to
print(
    "Item matching functions loaded",
    f"Similarity threshold: {EVALUATION_CONFIG.similarity_threshold}",
    sep="\n",
)

Item matching functions loaded
Similarity threshold: 0.7
