In [2]:
import sys
from pathlib import Path
import json
from typing import Dict, List, Tuple, Optional
from collections import defaultdict
import Levenshtein

# Path setup
# NOTE(review): assumes the notebook is started from a direct sub-directory of
# the project root, so that the parent of the working directory is the root.
PROJECT_ROOT = Path.cwd().parent
# Guard the insertion so that re-running this cell does not keep appending
# duplicate entries to sys.path.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

# Import schemas (resolved through the sys.path entry added above)
from schemas.stage1_page import Stage1PageModel

# Paths
GOLD_DIR = PROJECT_ROOT / "data" / "gold_standard" / "cleaned"
PRED_DIR = PROJECT_ROOT / "data" / "interim_pages" / "Second_try_revised"

print(f"✓ Project root: {PROJECT_ROOT}")
print(f"✓ Gold standard: {GOLD_DIR}")
print(f"✓ Predictions: {PRED_DIR}")

# Count files (only the top level of each directory, *.json only)
gold_files = list(GOLD_DIR.glob("*.json"))
pred_files = list(PRED_DIR.glob("*.json"))

print(f"\nDataset sizes:")
print(f"   Gold standard files: {len(gold_files)}")
print(f"   Prediction files: {len(pred_files)}")

✓ Project root: /home/fabian-ramirez/Documents/These/Code/magazine_graphs
✓ Gold standard: /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/gold_standard/cleaned
✓ Predictions: /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/interim_pages/Second_try_revised

Dataset sizes:
   Gold standard files: 1
   Prediction files: 14


In [3]:
def load_page_pairs() -> List[Tuple[Path, Path]]:
    """
    Pair each gold-standard JSON file with the prediction file of the same name.

    Files are looked up in the module-level GOLD_DIR and PRED_DIR. A short
    report is printed: the number of pairs found and, when some gold pages
    have no prediction, the count plus the first five missing filenames.

    Returns:
        List of (gold_path, pred_path) tuples, sorted by filename.
    """
    gold_by_name = {path.name: path for path in GOLD_DIR.glob("*.json")}
    pred_by_name = {path.name: path for path in PRED_DIR.glob("*.json")}

    # Filenames present on both sides, in a stable (sorted) order
    shared_names = sorted(gold_by_name.keys() & pred_by_name.keys())
    pairs = [(gold_by_name[name], pred_by_name[name]) for name in shared_names]

    print(f"Found {len(pairs)} matching page pairs")

    if len(pairs) < len(gold_by_name):
        unmatched = sorted(gold_by_name.keys() - pred_by_name.keys())
        print(f"⚠️  {len(unmatched)} gold standard pages without predictions:")
        # Only the first five names are listed to keep the output short
        for name in unmatched[:5]:
            print(f"   • {name}")

    return pairs

# Matched (gold_path, pred_path) pairs; the evaluation loops below iterate over this list.
page_pairs = load_page_pairs()

Found 1 matching page pairs


In [4]:
def character_error_rate(reference: str, hypothesis: str) -> float:
    """
    Character Error Rate: Levenshtein distance divided by the reference length.

    An empty reference cannot be normalised, so it yields 0.0 when the
    hypothesis is empty as well and 1.0 otherwise.
    """
    if reference:
        edits = Levenshtein.distance(reference, hypothesis)
        return edits / len(reference)
    if hypothesis:
        return 1.0
    return 0.0

def word_error_rate(reference: str, hypothesis: str) -> float:
    """
    Word Error Rate: Levenshtein distance over whitespace-split tokens,
    divided by the number of reference tokens.

    A reference without any token yields 0.0 when the hypothesis has no
    token either and 1.0 otherwise.
    """
    reference_tokens = reference.split()
    hypothesis_tokens = hypothesis.split()
    if reference_tokens:
        edits = Levenshtein.distance(reference_tokens, hypothesis_tokens)
        return edits / len(reference_tokens)
    if hypothesis_tokens:
        return 1.0
    return 0.0

def evaluate_text_quality(gold_path: Path, pred_path: Path,
                          item_classes: Optional[List[str]] = None) -> Dict:
    """
    Measure how closely the predicted page text matches the gold page text.

    The 'item_text_raw' of every selected item is joined with a blank line on
    each side, and the two resulting strings are compared with CER and WER.

    Args:
        gold_path: Path to gold standard JSON
        pred_path: Path to prediction JSON
        item_classes: If provided (and non-empty), only items whose
            'item_class' is in this list are taken into account

    Returns:
        Dict with keys 'cer', 'wer', 'gold_chars', 'pred_chars',
        'gold_words', 'pred_words', 'gold_items', 'pred_items'
    """
    def _read_json(path):
        with open(path, 'r', encoding='utf-8') as handle:
            return json.load(handle)

    def _raw_text(entry):
        return entry.get('item_text_raw', '')

    gold_data = _read_json(gold_path)
    pred_data = _read_json(pred_path)

    gold_items = gold_data.get('items', [])
    pred_items = pred_data.get('items', [])

    # Optional restriction to the requested item classes
    if item_classes:
        def _selected(entry):
            return entry.get('item_class') in item_classes

        gold_items = list(filter(_selected, gold_items))
        pred_items = list(filter(_selected, pred_items))

    # One string per side, items separated by a blank line
    separator = "\n\n"
    gold_text = separator.join(map(_raw_text, gold_items))
    pred_text = separator.join(map(_raw_text, pred_items))

    metrics = {
        'cer': character_error_rate(gold_text, pred_text),
        'wer': word_error_rate(gold_text, pred_text),
    }
    metrics['gold_chars'] = len(gold_text)
    metrics['pred_chars'] = len(pred_text)
    metrics['gold_words'] = len(gold_text.split())
    metrics['pred_words'] = len(pred_text.split())
    metrics['gold_items'] = len(gold_items)
    metrics['pred_items'] = len(pred_items)
    return metrics

# Score every matched page twice: over all items, and over contributions
# (prose + verse) only. Both result lists are reused by later cells.
print("Evaluating text quality...\n")

all_results = []
contributions_results = []

for gold_path, pred_path in page_pairs:
    # Every item class
    result_all = {
        **evaluate_text_quality(gold_path, pred_path),
        'page': gold_path.name,
    }
    all_results.append(result_all)

    # prose + verse only
    result_contrib = {
        **evaluate_text_quality(gold_path, pred_path, item_classes=['prose', 'verse']),
        'page': gold_path.name,
    }
    contributions_results.append(result_contrib)

    print(f"✓ {gold_path.name}")
    print(f"   All items - CER: {result_all['cer']:.2%}, WER: {result_all['wer']:.2%}")
    print(f"   Contributions - CER: {result_contrib['cer']:.2%}, WER: {result_contrib['wer']:.2%}\n")

# Per-page (macro) averages; fall back to 0 when no page was evaluated
if all_results:
    avg_cer_all = sum(r['cer'] for r in all_results) / len(all_results)
    avg_wer_all = sum(r['wer'] for r in all_results) / len(all_results)
else:
    avg_cer_all = 0
    avg_wer_all = 0

if contributions_results:
    avg_cer_contrib = sum(r['cer'] for r in contributions_results) / len(contributions_results)
    avg_wer_contrib = sum(r['wer'] for r in contributions_results) / len(contributions_results)
else:
    avg_cer_contrib = 0
    avg_wer_contrib = 0

print(f"{'='*60}")
print(f"TEXT QUALITY SUMMARY")
print(f"{'='*60}")
print(f"\nAll Items:")
print(f"   Average CER: {avg_cer_all:.2%}")
print(f"   Average WER: {avg_wer_all:.2%}")
print(f"\nContributions Only (prose + verse):")
print(f"   Average CER: {avg_cer_contrib:.2%}")
print(f"   Average WER: {avg_wer_contrib:.2%}")

Evaluating text quality...

✓ La_Plume___revue_littéraire_[...]_bpt6k1185893k__page-001.json
   All items - CER: 1.18%, WER: 2.96%
   Contributions - CER: 0.00%, WER: 0.00%

TEXT QUALITY SUMMARY

All Items:
   Average CER: 1.18%
   Average WER: 2.96%

Contributions Only (prose + verse):
   Average CER: 0.00%
   Average WER: 0.00%


In [6]:
def _count_boundary_matches(gold_positions, pred_positions, tolerance):
    """Count one-to-one matches between gold and predicted boundary positions.

    Two positions match when they differ by at most `tolerance`. Each gold
    and each predicted position is used at most once; walking both sorted
    lists in parallel yields the maximum number of such pairs.
    """
    gold_sorted = sorted(gold_positions)
    pred_sorted = sorted(pred_positions)
    matched = 0
    g = p = 0
    while g < len(gold_sorted) and p < len(pred_sorted):
        offset = pred_sorted[p] - gold_sorted[g]
        if abs(offset) <= tolerance:
            matched += 1
            g += 1
            p += 1
        elif offset < 0:
            # Prediction lies too far before every remaining gold boundary
            p += 1
        else:
            # Gold boundary lies too far before every remaining prediction
            g += 1
    return matched


def evaluate_item_boundaries(gold_path: Path, pred_path: Path,
                             item_classes: Optional[List[str]] = None,
                             tolerance: int = 20) -> Dict:
    """
    Evaluate whether item boundaries are correctly detected.

    A boundary is the character offset at which an item starts in the page
    text obtained by joining every item's 'item_text_raw' with a 2-character
    separator. Offsets are always computed over ALL items; `item_classes`
    only selects which items contribute a boundary.

    Predicted and gold boundaries are matched one-to-one: a gold boundary can
    validate at most one prediction. This guarantees tp <= number of gold
    boundaries, hence fn >= 0 and recall <= 1 (counting every prediction that
    is near *some* gold boundary would let several predictions share one gold
    boundary and produce a negative fn).

    Args:
        gold_path: Path to gold standard JSON
        pred_path: Path to prediction JSON
        item_classes: If provided, only evaluate these item classes
        tolerance: Character tolerance window for boundary matching

    Returns:
        Dict with 'tp', 'fp', 'fn', 'precision', 'recall', 'f1',
        'gold_boundaries' and 'pred_boundaries'. Precision, recall and F1
        are 0 when their denominator is 0.
    """
    with open(gold_path, 'r', encoding='utf-8') as f:
        gold_data = json.load(f)
    with open(pred_path, 'r', encoding='utf-8') as f:
        pred_data = json.load(f)

    def get_boundaries(items, filter_classes=None):
        """Get character positions where items start."""
        boundaries = set()
        pos = 0
        for item in items:
            if filter_classes is None or item.get('item_class') in filter_classes:
                boundaries.add(pos)
            # Add text length + separator; a missing or null text counts as empty
            pos += len(item.get('item_text_raw') or '') + 2
        return boundaries

    gold_bounds = get_boundaries(gold_data.get('items', []), item_classes)
    pred_bounds = get_boundaries(pred_data.get('items', []), item_classes)

    # One-to-one matching within the tolerance window
    tp = _count_boundary_matches(gold_bounds, pred_bounds, tolerance)
    fp = len(pred_bounds) - tp
    fn = len(gold_bounds) - tp

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    return {
        'tp': tp,
        'fp': fp,
        'fn': fn,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'gold_boundaries': len(gold_bounds),
        'pred_boundaries': len(pred_bounds)
    }

# Run boundary detection on every matched page, for all items and for
# contributions (prose + verse) only.
print("Evaluating item boundary detection...\n")

boundary_results_all = []
boundary_results_contrib = []

for gold_path, pred_path in page_pairs:
    # Every item class
    result_all = {
        **evaluate_item_boundaries(gold_path, pred_path),
        'page': gold_path.name,
    }
    boundary_results_all.append(result_all)

    # prose + verse only
    result_contrib = {
        **evaluate_item_boundaries(gold_path, pred_path, item_classes=['prose', 'verse']),
        'page': gold_path.name,
    }
    boundary_results_contrib.append(result_contrib)

    print(f"✓ {gold_path.name}")
    print(f"   All items - P: {result_all['precision']:.2%}, "
          f"R: {result_all['recall']:.2%}, F1: {result_all['f1']:.3f}")
    print(f"   Contributions - P: {result_contrib['precision']:.2%}, "
          f"R: {result_contrib['recall']:.2%}, F1: {result_contrib['f1']:.3f}\n")

# Micro-averages: pool TP/FP/FN over all pages, then derive the metrics.
# Any metric whose denominator is zero is reported as 0.
total_tp_all = sum(r['tp'] for r in boundary_results_all)
total_fp_all = sum(r['fp'] for r in boundary_results_all)
total_fn_all = sum(r['fn'] for r in boundary_results_all)

if (total_tp_all + total_fp_all) > 0:
    precision_all = total_tp_all / (total_tp_all + total_fp_all)
else:
    precision_all = 0
if (total_tp_all + total_fn_all) > 0:
    recall_all = total_tp_all / (total_tp_all + total_fn_all)
else:
    recall_all = 0
if (precision_all + recall_all) > 0:
    f1_all = 2 * precision_all * recall_all / (precision_all + recall_all)
else:
    f1_all = 0

total_tp_contrib = sum(r['tp'] for r in boundary_results_contrib)
total_fp_contrib = sum(r['fp'] for r in boundary_results_contrib)
total_fn_contrib = sum(r['fn'] for r in boundary_results_contrib)

if (total_tp_contrib + total_fp_contrib) > 0:
    precision_contrib = total_tp_contrib / (total_tp_contrib + total_fp_contrib)
else:
    precision_contrib = 0
if (total_tp_contrib + total_fn_contrib) > 0:
    recall_contrib = total_tp_contrib / (total_tp_contrib + total_fn_contrib)
else:
    recall_contrib = 0
if (precision_contrib + recall_contrib) > 0:
    f1_contrib = 2 * precision_contrib * recall_contrib / (precision_contrib + recall_contrib)
else:
    f1_contrib = 0

print(f"{'='*60}")
print(f"ITEM BOUNDARY DETECTION SUMMARY")
print(f"{'='*60}")
print(f"\nAll Items:")
print(f"   Precision: {precision_all:.2%}")
print(f"   Recall: {recall_all:.2%}")
print(f"   F1: {f1_all:.3f}")
print(f"\nContributions Only (prose + verse):")
print(f"   Precision: {precision_contrib:.2%}")
print(f"   Recall: {recall_contrib:.2%}")
print(f"   F1: {f1_contrib:.3f}")


Evaluating item boundary detection...

✓ La_Plume___revue_littéraire_[...]_bpt6k1185893k__page-001.json
   All items - P: 100.00%, R: 100.00%, F1: 1.000
   Contributions - P: 0.00%, R: 0.00%, F1: 0.000

ITEM BOUNDARY DETECTION SUMMARY

All Items:
   Precision: 100.00%
   Recall: 100.00%
   F1: 1.000

Contributions Only (prose + verse):
   Precision: 0.00%
   Recall: 0.00%
   F1: 0.000


In [8]:
def evaluate_classification(gold_path: Path, pred_path: Path,
                            item_classes: Optional[List[str]] = None) -> Dict:
    """
    Score the predicted 'item_class' of each item against the gold class.

    Items are aligned purely by position: the i-th gold item is compared with
    the i-th predicted item. When the two lists differ in length a warning is
    printed and only the common prefix is compared. A missing 'item_class'
    is counted as 'unknown'.

    Args:
        gold_path: Path to gold standard JSON
        pred_path: Path to prediction JSON
        item_classes: If provided (and non-empty), both item lists are first
            restricted to items whose own 'item_class' is in this list

    Returns:
        Dict with 'correct', 'total' (number of compared pairs), 'accuracy',
        'confusion' (gold class -> predicted class -> count),
        'gold_count' and 'pred_count'
    """
    with open(gold_path, 'r', encoding='utf-8') as gold_file:
        gold_data = json.load(gold_file)
    with open(pred_path, 'r', encoding='utf-8') as pred_file:
        pred_data = json.load(pred_file)

    gold_items = gold_data.get('items', [])
    pred_items = pred_data.get('items', [])

    # Optional restriction to the requested item classes
    if item_classes:
        gold_items = [entry for entry in gold_items
                      if entry.get('item_class') in item_classes]
        pred_items = [entry for entry in pred_items
                      if entry.get('item_class') in item_classes]

    # Positional alignment: warn when the lists cannot be fully paired
    if len(gold_items) != len(pred_items):
        print(f"   ⚠️  Item count mismatch: gold={len(gold_items)}, pred={len(pred_items)}")

    compared = min(len(gold_items), len(pred_items))

    hits = 0
    confusion = defaultdict(lambda: defaultdict(int))

    # zip stops at the shorter list, i.e. after `compared` pairs
    for gold_entry, pred_entry in zip(gold_items, pred_items):
        gold_label = gold_entry.get('item_class', 'unknown')
        pred_label = pred_entry.get('item_class', 'unknown')

        confusion[gold_label][pred_label] += 1
        if gold_label == pred_label:
            hits += 1

    if compared > 0:
        accuracy = hits / compared
    else:
        accuracy = 0

    return {
        'correct': hits,
        'total': compared,
        'accuracy': accuracy,
        'confusion': dict(confusion),
        'gold_count': len(gold_items),
        'pred_count': len(pred_items)
    }

# Run the classification check on every matched page, for all items and for
# contributions (prose + verse) only.
print("Evaluating item classification...\n")

classification_results_all = []
classification_results_contrib = []

for gold_path, pred_path in page_pairs:
    # Every item class
    result_all = {
        **evaluate_classification(gold_path, pred_path),
        'page': gold_path.name,
    }
    classification_results_all.append(result_all)

    # prose + verse only
    result_contrib = {
        **evaluate_classification(gold_path, pred_path, item_classes=['prose', 'verse']),
        'page': gold_path.name,
    }
    classification_results_contrib.append(result_contrib)

    print(f"✓ {gold_path.name}")
    print(f"   All items - Accuracy: {result_all['accuracy']:.2%} "
          f"({result_all['correct']}/{result_all['total']})")
    # The contributions line is skipped when no prose/verse pair was compared
    if result_contrib['total'] > 0:
        print(f"   Contributions - Accuracy: {result_contrib['accuracy']:.2%} "
              f"({result_contrib['correct']}/{result_contrib['total']})")
    print()

# Overall accuracy pools the compared pairs of every page
total_correct_all = sum(r['correct'] for r in classification_results_all)
total_items_all = sum(r['total'] for r in classification_results_all)
if total_items_all > 0:
    overall_accuracy_all = total_correct_all / total_items_all
else:
    overall_accuracy_all = 0

total_correct_contrib = sum(r['correct'] for r in classification_results_contrib)
total_items_contrib = sum(r['total'] for r in classification_results_contrib)
if total_items_contrib > 0:
    overall_accuracy_contrib = total_correct_contrib / total_items_contrib
else:
    overall_accuracy_contrib = 0

# Sum the per-page confusion matrices (all items) into a single matrix
all_confusion = defaultdict(lambda: defaultdict(int))
for result in classification_results_all:
    for gold_class, pred_dict in result['confusion'].items():
        for pred_class in pred_dict:
            all_confusion[gold_class][pred_class] += pred_dict[pred_class]

print(f"{'='*60}")
print(f"CLASSIFICATION ACCURACY SUMMARY")
print(f"{'='*60}")
print(f"\nAll Items:")
print(f"   Overall Accuracy: {overall_accuracy_all:.2%} ({total_correct_all}/{total_items_all})")
print(f"\nContributions Only (prose + verse):")
print(f"   Overall Accuracy: {overall_accuracy_contrib:.2%} ({total_correct_contrib}/{total_items_contrib})")

print(f"\nConfusion Matrix (All Items):")
print(f"{'Gold / Pred':<15}", end="")
# Every class seen as a gold label or as a predicted label, in sorted order
all_classes = sorted(
    set(all_confusion.keys())
    | {pred for preds in all_confusion.values() for pred in preds.keys()}
)
for pred_class in all_classes:
    print(f"{pred_class:<12}", end="")
print()
for gold_class in all_classes:
    print(f"{gold_class:<15}", end="")
    for pred_class in all_classes:
        # Indexing the nested defaultdict yields 0 for unseen combinations
        count = all_confusion[gold_class][pred_class]
        print(f"{count:<12}", end="")
    print()

Evaluating item classification...

✓ La_Plume___revue_littéraire_[...]_bpt6k1185893k__page-001.json
   All items - Accuracy: 100.00% (7/7)

CLASSIFICATION ACCURACY SUMMARY

All Items:
   Overall Accuracy: 100.00% (7/7)

Contributions Only (prose + verse):
   Overall Accuracy: 0.00% (0/0)

Confusion Matrix (All Items):
Gold / Pred    paratext    
paratext       7           


In [9]:
def evaluate_metadata(gold_path: Path, pred_path: Path,
                      item_classes: Optional[List[str]] = None) -> Dict:
    """
    Score 'item_title' and 'item_author' extraction.

    Items are aligned by position (only the common prefix of the two lists is
    compared). For each field, presence is scored as a detection task:
    tp = non-empty on both sides, fp = only predicted, fn = only in gold.
    'exact_match' counts the tp cases whose values are strictly equal.

    Args:
        gold_path: Path to gold standard JSON
        pred_path: Path to prediction JSON
        item_classes: If provided (and non-empty), both item lists are first
            restricted to items whose 'item_class' is in this list

    Returns:
        Dict with 'title' and 'author' entries, each holding 'tp', 'fp',
        'fn', 'exact_match', 'precision', 'recall' and 'f1'
    """
    def _tally(counts, gold_value, pred_value):
        """Update the counters of one field for one aligned item pair."""
        if gold_value and pred_value:
            counts['tp'] += 1
            if gold_value == pred_value:
                counts['exact_match'] += 1
        elif pred_value:
            # Predicted although the gold item has no value
            counts['fp'] += 1
        elif gold_value:
            # Present in gold but not predicted
            counts['fn'] += 1

    def _with_scores(counts):
        """Return the counters extended with precision, recall and F1."""
        predicted = counts['tp'] + counts['fp']
        expected = counts['tp'] + counts['fn']
        precision = counts['tp'] / predicted if predicted > 0 else 0
        recall = counts['tp'] / expected if expected > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        return {**counts, 'precision': precision, 'recall': recall, 'f1': f1}

    with open(gold_path, 'r', encoding='utf-8') as gold_file:
        gold_data = json.load(gold_file)
    with open(pred_path, 'r', encoding='utf-8') as pred_file:
        pred_data = json.load(pred_file)

    gold_items = gold_data.get('items', [])
    pred_items = pred_data.get('items', [])

    # Optional restriction to the requested item classes
    if item_classes:
        gold_items = [entry for entry in gold_items
                      if entry.get('item_class') in item_classes]
        pred_items = [entry for entry in pred_items
                      if entry.get('item_class') in item_classes]

    title_counts = {'tp': 0, 'fp': 0, 'fn': 0, 'exact_match': 0}
    author_counts = {'tp': 0, 'fp': 0, 'fn': 0, 'exact_match': 0}

    # zip stops at the shorter list, so surplus items on either side are ignored
    for gold_entry, pred_entry in zip(gold_items, pred_items):
        _tally(title_counts, gold_entry.get('item_title'), pred_entry.get('item_title'))
        _tally(author_counts, gold_entry.get('item_author'), pred_entry.get('item_author'))

    return {
        'title': _with_scores(title_counts),
        'author': _with_scores(author_counts),
    }

# Run the title/author check on every matched page, for all items and for
# contributions (prose + verse) only.
print("Evaluating metadata extraction...\n")

metadata_results_all = []
metadata_results_contrib = []

for gold_path, pred_path in page_pairs:
    # Every item class
    result_all = {
        **evaluate_metadata(gold_path, pred_path),
        'page': gold_path.name,
    }
    metadata_results_all.append(result_all)

    # prose + verse only (collected for the aggregate, not printed per page)
    result_contrib = {
        **evaluate_metadata(gold_path, pred_path, item_classes=['prose', 'verse']),
        'page': gold_path.name,
    }
    metadata_results_contrib.append(result_contrib)

    print(f"✓ {gold_path.name}")
    print(f"   Title F1: {result_all['title']['f1']:.3f}, "
          f"Author F1: {result_all['author']['f1']:.3f}")

# Aggregate metrics
def aggregate_metadata_metrics(results):
    """
    Micro-average per-page metadata results.

    Args:
        results: Sequence of dicts with 'title' and 'author' entries, each
            providing the counters 'tp', 'fp', 'fn' and 'exact_match'

    Returns:
        Dict with 'title' and 'author' entries, each holding 'precision',
        'recall', 'f1' (computed from the pooled counters, 0 when the
        denominator is 0), 'exact_match' and 'tp' (pooled totals)
    """
    def _pooled(field):
        tp = sum(page[field]['tp'] for page in results)
        fp = sum(page[field]['fp'] for page in results)
        fn = sum(page[field]['fn'] for page in results)
        exact = sum(page[field]['exact_match'] for page in results)

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        if (precision + recall) > 0:
            f1 = 2 * precision * recall / (precision + recall)
        else:
            f1 = 0

        return {'precision': precision, 'recall': recall, 'f1': f1,
                'exact_match': exact, 'tp': tp}

    title_summary = _pooled('title')
    author_summary = _pooled('author')
    return {'title': title_summary, 'author': author_summary}

# Pool the per-page metadata counters, once for all items and once for
# contributions (prose + verse) only.
agg_all = aggregate_metadata_metrics(metadata_results_all)
agg_contrib = aggregate_metadata_metrics(metadata_results_contrib)

# Summary report. "Exact matches" is shown as exact_match / tp and is only
# printed for the all-items aggregate.
print(f"\n{'='*60}")
print(f"METADATA EXTRACTION SUMMARY")
print(f"{'='*60}")
print(f"\nAll Items:")
print(f"   Title - P: {agg_all['title']['precision']:.2%}, R: {agg_all['title']['recall']:.2%}, F1: {agg_all['title']['f1']:.3f}")
print(f"           Exact matches: {agg_all['title']['exact_match']}/{agg_all['title']['tp']}")
print(f"   Author - P: {agg_all['author']['precision']:.2%}, R: {agg_all['author']['recall']:.2%}, F1: {agg_all['author']['f1']:.3f}")
print(f"            Exact matches: {agg_all['author']['exact_match']}/{agg_all['author']['tp']}")
print(f"\nContributions Only (prose + verse):")
print(f"   Title - P: {agg_contrib['title']['precision']:.2%}, R: {agg_contrib['title']['recall']:.2%}, F1: {agg_contrib['title']['f1']:.3f}")
print(f"   Author - P: {agg_contrib['author']['precision']:.2%}, R: {agg_contrib['author']['recall']:.2%}, F1: {agg_contrib['author']['f1']:.3f}")

Evaluating metadata extraction...

✓ La_Plume___revue_littéraire_[...]_bpt6k1185893k__page-001.json
   Title F1: 1.000, Author F1: 0.000

METADATA EXTRACTION SUMMARY

All Items:
   Title - P: 100.00%, R: 100.00%, F1: 1.000
           Exact matches: 1/1
   Author - P: 0.00%, R: 0.00%, F1: 0.000
            Exact matches: 0/0

Contributions Only (prose + verse):
   Title - P: 0.00%, R: 0.00%, F1: 0.000
   Author - P: 0.00%, R: 0.00%, F1: 0.000


In [10]:
def evaluate_continuation_tracking(gold_path: Path, pred_path: Path,
                                   item_classes: Optional[List[str]] = None) -> Dict:
    """
    Score the boolean continuation flags 'is_continuation' and
    'continues_on_next_page'.

    Items are aligned by position (only the common prefix of the two lists is
    compared). A missing flag is treated as False. Each flag is scored as a
    binary detection task with tp / fp / fn / tn counters.

    Args:
        gold_path: Path to gold standard JSON
        pred_path: Path to prediction JSON
        item_classes: If provided (and non-empty), both item lists are first
            restricted to items whose 'item_class' is in this list

    Returns:
        Dict with 'is_continuation' and 'continues_on_next_page' entries,
        each holding 'tp', 'fp', 'fn', 'tn', 'precision', 'recall' and 'f1'
    """
    tracked_flags = ('is_continuation', 'continues_on_next_page')

    def _with_scores(counts):
        """Return the counters extended with precision, recall and F1."""
        predicted = counts['tp'] + counts['fp']
        expected = counts['tp'] + counts['fn']
        precision = counts['tp'] / predicted if predicted > 0 else 0
        recall = counts['tp'] / expected if expected > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        return {**counts, 'precision': precision, 'recall': recall, 'f1': f1}

    with open(gold_path, 'r', encoding='utf-8') as gold_file:
        gold_data = json.load(gold_file)
    with open(pred_path, 'r', encoding='utf-8') as pred_file:
        pred_data = json.load(pred_file)

    gold_items = gold_data.get('items', [])
    pred_items = pred_data.get('items', [])

    # Optional restriction to the requested item classes
    if item_classes:
        gold_items = [entry for entry in gold_items
                      if entry.get('item_class') in item_classes]
        pred_items = [entry for entry in pred_items
                      if entry.get('item_class') in item_classes]

    counters = {flag: {'tp': 0, 'fp': 0, 'fn': 0, 'tn': 0} for flag in tracked_flags}

    # zip stops at the shorter list, so surplus items on either side are ignored
    for gold_entry, pred_entry in zip(gold_items, pred_items):
        for flag in tracked_flags:
            gold_flag = gold_entry.get(flag, False)
            pred_flag = pred_entry.get(flag, False)

            # Pick the confusion-matrix cell from the two truth values
            if pred_flag:
                cell = 'tp' if gold_flag else 'fp'
            else:
                cell = 'fn' if gold_flag else 'tn'
            counters[flag][cell] += 1

    return {
        'is_continuation': _with_scores(counters['is_continuation']),
        'continues_on_next_page': _with_scores(counters['continues_on_next_page']),
    }

# Run continuation tracking on every matched page; only contributions
# (prose + verse) are evaluated for these flags.
print("Evaluating continuation tracking...\n")

continuation_results = []

for gold_path, pred_path in page_pairs:
    result = {
        **evaluate_continuation_tracking(gold_path, pred_path, item_classes=['prose', 'verse']),
        'page': gold_path.name,
    }
    continuation_results.append(result)

    print(f"✓ {gold_path.name}")
    print(f"   is_continuation - F1: {result['is_continuation']['f1']:.3f}")
    print(f"   continues_on_next - F1: {result['continues_on_next_page']['f1']:.3f}\n")

# Micro-averages: pool TP/FP/FN over all pages, then derive the metrics.
# Any metric whose denominator is zero is reported as 0.
total_is_cont_tp = sum(r['is_continuation']['tp'] for r in continuation_results)
total_is_cont_fp = sum(r['is_continuation']['fp'] for r in continuation_results)
total_is_cont_fn = sum(r['is_continuation']['fn'] for r in continuation_results)

if (total_is_cont_tp + total_is_cont_fp) > 0:
    is_cont_p = total_is_cont_tp / (total_is_cont_tp + total_is_cont_fp)
else:
    is_cont_p = 0
if (total_is_cont_tp + total_is_cont_fn) > 0:
    is_cont_r = total_is_cont_tp / (total_is_cont_tp + total_is_cont_fn)
else:
    is_cont_r = 0
if (is_cont_p + is_cont_r) > 0:
    is_cont_f1 = 2 * is_cont_p * is_cont_r / (is_cont_p + is_cont_r)
else:
    is_cont_f1 = 0

total_continues_tp = sum(r['continues_on_next_page']['tp'] for r in continuation_results)
total_continues_fp = sum(r['continues_on_next_page']['fp'] for r in continuation_results)
total_continues_fn = sum(r['continues_on_next_page']['fn'] for r in continuation_results)

if (total_continues_tp + total_continues_fp) > 0:
    continues_p = total_continues_tp / (total_continues_tp + total_continues_fp)
else:
    continues_p = 0
if (total_continues_tp + total_continues_fn) > 0:
    continues_r = total_continues_tp / (total_continues_tp + total_continues_fn)
else:
    continues_r = 0
if (continues_p + continues_r) > 0:
    continues_f1 = 2 * continues_p * continues_r / (continues_p + continues_r)
else:
    continues_f1 = 0

print(f"{'='*60}")
print(f"CONTINUATION TRACKING SUMMARY (Contributions Only)")
print(f"{'='*60}")
print(f"\nis_continuation:")
print(f"   Precision: {is_cont_p:.2%}")
print(f"   Recall: {is_cont_r:.2%}")
print(f"   F1: {is_cont_f1:.3f}")
print(f"\ncontinues_on_next_page:")
print(f"   Precision: {continues_p:.2%}")
print(f"   Recall: {continues_r:.2%}")
print(f"   F1: {continues_f1:.3f}")

Evaluating continuation tracking...

✓ La_Plume___revue_littéraire_[...]_bpt6k1185893k__page-001.json
   is_continuation - F1: 0.000
   continues_on_next - F1: 0.000

CONTINUATION TRACKING SUMMARY (Contributions Only)

is_continuation:
   Precision: 0.00%
   Recall: 0.00%
   F1: 0.000

continues_on_next_page:
   Precision: 0.00%
   Recall: 0.00%
   F1: 0.000


In [11]:
# Final side-by-side report. This cell computes nothing itself: it only formats
# globals produced by the earlier evaluation cells (avg_cer_*, avg_wer_*,
# f1_all / f1_contrib, overall_accuracy_*, agg_all / agg_contrib, is_cont_f1,
# continues_f1), so those cells must have been run first.
print("\n" + "="*70)
print("COMPREHENSIVE EVALUATION SUMMARY")
print("="*70)
print("\nSecond_try_revised vs Gold Standard")
print(f"Evaluated on {len(page_pairs)} pages\n")

# Column header: metric label, then the all-items and contributions columns
print(f"{'Metric':<30} {'All Items':<20} {'Contributions':<20}")
print("-" * 70)

# Text Quality (macro-averaged over pages)
print(f"{'TEXT QUALITY':<30}")
print(f"{'  Character Error Rate':<30} {avg_cer_all:>18.2%} {avg_cer_contrib:>18.2%}")
print(f"{'  Word Error Rate':<30} {avg_wer_all:>18.2%} {avg_wer_contrib:>18.2%}")
print()

# Structure Quality
print(f"{'STRUCTURE QUALITY':<30}")
print(f"{'  Boundary Detection F1':<30} {f1_all:>18.3f} {f1_contrib:>18.3f}")
print(f"{'  Classification Accuracy':<30} {overall_accuracy_all:>18.2%} {overall_accuracy_contrib:>18.2%}")
print()

# Metadata Quality
print(f"{'METADATA EXTRACTION':<30}")
print(f"{'  Title F1':<30} {agg_all['title']['f1']:>18.3f} {agg_contrib['title']['f1']:>18.3f}")
print(f"{'  Author F1':<30} {agg_all['author']['f1']:>18.3f} {agg_contrib['author']['f1']:>18.3f}")
print()

# Continuation Tracking (only evaluated on contributions, hence the empty
# all-items column)
print(f"{'CONTINUATION TRACKING':<30} {'N/A':<20} {'Contributions':<20}")
print(f"{'  is_continuation F1':<30} {'':<20} {is_cont_f1:>18.3f}")
print(f"{'  continues_on_next F1':<30} {'':<20} {continues_f1:>18.3f}")

print("\n" + "="*70)


COMPREHENSIVE EVALUATION SUMMARY

Second_try_revised vs Gold Standard
Evaluated on 1 pages

Metric                         All Items            Contributions       
----------------------------------------------------------------------
TEXT QUALITY                  
  Character Error Rate                      1.18%              0.00%
  Word Error Rate                           2.96%              0.00%

STRUCTURE QUALITY             
  Boundary Detection F1                     1.000              0.000
  Classification Accuracy                 100.00%              0.00%

METADATA EXTRACTION           
  Title F1                                  1.000              0.000
  Author F1                                 0.000              0.000

CONTINUATION TRACKING          N/A                  Contributions       
  is_continuation F1                                             0.000
  continues_on_next F1                                           0.000



In [12]:
print("="*70)
print("ERROR ANALYSIS")
print("="*70)

# The five pages with the highest all-items CER, worst first
worst_pages_cer = sorted(all_results, key=lambda page: page['cer'], reverse=True)[:5]

print("\nWorst 5 Pages by Character Error Rate:")
for i, result in enumerate(worst_pages_cer, 1):
    print(f"{i}. {result['page']}")
    print(f"   CER: {result['cer']:.2%}, WER: {result['wer']:.2%}")
    print(f"   Gold: {result['gold_items']} items, {result['gold_chars']} chars")
    print(f"   Pred: {result['pred_items']} items, {result['pred_chars']} chars")
    print()

# Pages whose gold and predicted item counts differ (all-items results)
print("\nPages with Item Count Mismatches:")
mismatches = [r for r in classification_results_all if r['gold_count'] != r['pred_count']]
if not mismatches:
    print("  No mismatches found!")
else:
    for result in mismatches:
        diff = result['pred_count'] - result['gold_count']
        # Positive differences get an explicit "+"; negatives already carry "-"
        if diff > 0:
            sign = "+"
        else:
            sign = ""
        print(f"  • {result['page']}: Gold={result['gold_count']}, Pred={result['pred_count']} ({sign}{diff})")

# Off-diagonal cells of the aggregated confusion matrix, most frequent first
print("\nMost Common Classification Errors:")
errors = sorted(
    [
        (count, gold_class, pred_class)
        for gold_class, pred_dict in all_confusion.items()
        for pred_class, count in pred_dict.items()
        if gold_class != pred_class and count > 0
    ],
    reverse=True,
)
for count, gold_class, pred_class in errors[:10]:
    print(f"  • {gold_class} → {pred_class}: {count} times")

print("\n" + "="*70)

ERROR ANALYSIS

Worst 5 Pages by Character Error Rate:
1. La_Plume___revue_littéraire_[...]_bpt6k1185893k__page-001.json
   CER: 1.18%, WER: 2.96%
   Gold: 7 items, 1191 chars
   Pred: 7 items, 1181 chars


Pages with Item Count Mismatches:
  No mismatches found!

Most Common Classification Errors:

