In [1]:
"""
Gold Standard Preparation - Stage 1 OCR

Prepares prediction files for manual annotation and validates corrected gold standard.

Input:  Predictions from data/predictions/{magazine_name}/
Output: Gold standard in data/gold_standard/{raw|cleaned}/{magazine_name}/
Schema: schemas/stage1_page.py
"""

from pathlib import Path
import json
import shutil
from typing import Dict, List
from collections import Counter

# Project imports
from utils.paths import PROJECT_ROOT, PREDICTIONS, GOLD_RAW, GOLD_CLEAN, ensure_data_dirs
from schemas.stage1_page import Stage1PageModel

# Create the data directory tree (delegated to the project helper) before any path is used
ensure_data_dirs()

# Notebook-wide path aliases: predictions root and the parent of the raw gold directory
PRED_ROOT, GOLD_ROOT = PREDICTIONS, GOLD_RAW.parent

# Banner: one print call, one argument per output line
print(
    "Gold Standard Preparation - Stage 1",
    "=" * 60,
    f"Project root: {PROJECT_ROOT}",
    "\nDirectories:",
    f"  Predictions: {PRED_ROOT}",
    f"  Gold raw:    {GOLD_RAW}",
    f"  Gold clean:  {GOLD_CLEAN}",
    f"  Schema:      {Stage1PageModel.__name__}",
    sep="\n",
)

Gold Standard Preparation - Stage 1
Project root: /home/fabian-ramirez/Documents/These/Code/magazine_graphs

Directories:
  Predictions: /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/predictions
  Gold raw:    /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/gold_standard/raw
  Gold clean:  /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/gold_standard/cleaned
  Schema:      Stage1PageModel


In [2]:
"""
Copy Predictions for Annotation
"""

def copy_for_annotation(
    pred_root: Path,
    gold_raw: Path,
    overwrite: bool = False
) -> Dict[str, Dict[str, int]]:
    """
    Copy prediction files to gold_standard/raw/ for annotation.

    Every immediate sub-directory of ``pred_root`` is treated as one magazine.
    Its ``*.json`` files are copied (with metadata, via ``shutil.copy2``) into
    ``gold_raw/<magazine_name>/``. Files already present in the destination are
    left untouched unless ``overwrite`` is True, so manual corrections are not
    clobbered by a re-run.

    Magazine directories that contain no JSON files are reported and skipped;
    no destination directory is created for them.

    Args:
        pred_root: Root predictions directory
        gold_raw: Gold standard raw directory
        overwrite: If True, overwrite existing files

    Returns:
        Statistics: {magazine_name: {'copied': n, 'skipped': n}}.
        Empty dict when ``pred_root`` is missing or has no sub-directories.
    """
    # A missing predictions root is reported like an empty one instead of
    # raising from iterdir().
    if not pred_root.is_dir():
        print("No magazine directories found in predictions/")
        return {}

    magazine_dirs = sorted(d for d in pred_root.iterdir() if d.is_dir())

    if not magazine_dirs:
        print("No magazine directories found in predictions/")
        return {}

    stats: Dict[str, Dict[str, int]] = {}

    for mag_dir in magazine_dirs:
        magazine_name = mag_dir.name

        # Only regular files: a directory that happens to be named "*.json"
        # would make shutil.copy2 fail.
        json_files = sorted(p for p in mag_dir.glob("*.json") if p.is_file())

        if not json_files:
            print(f"Attention: {magazine_name}: No JSON files found")
            continue

        # Create the destination only once there is something to copy, so
        # empty source folders do not leave empty folders in gold raw.
        dest_dir = gold_raw / magazine_name
        dest_dir.mkdir(parents=True, exist_ok=True)

        mag_stats = {'copied': 0, 'skipped': 0}

        for json_file in json_files:
            dest_file = dest_dir / json_file.name

            if dest_file.exists() and not overwrite:
                mag_stats['skipped'] += 1
            else:
                shutil.copy2(json_file, dest_file)
                mag_stats['copied'] += 1

        stats[magazine_name] = mag_stats

        print(f"✓ {magazine_name}: {mag_stats['copied']} copied, {mag_stats['skipped']} skipped")

    return stats

# Run the copy step; existing gold files are kept (overwrite disabled)
print("Copying predictions for annotation...")
print()
copy_stats = copy_for_annotation(PRED_ROOT, GOLD_RAW, overwrite=False)

# Aggregate the per-magazine counters into notebook-wide totals
total_copied = sum(per_mag['copied'] for per_mag in copy_stats.values())
total_skipped = sum(per_mag['skipped'] for per_mag in copy_stats.values())

# Summary block, one argument per output line
print()
print(
    "=" * 60,
    "Copy Summary",
    "=" * 60,
    f"Magazines processed: {len(copy_stats)}",
    f"Files copied:        {total_copied}",
    f"Files skipped:       {total_skipped}",
    sep="\n",
)

Copying predictions for annotation...

✓ La_Plume_bpt6k1185893k_1_10_1889: 0 copied, 14 skipped
✓ La_Plume_bpt6k1212187t_15-11-1893: 0 copied, 34 skipped
Attention: backup: No JSON files found

Copy Summary
Magazines processed: 2
Files copied:        0
Files skipped:       48


In [3]:
"""
Validate Gold Standard Files
"""

def validate_gold_standard(gold_clean: Path) -> Dict:
    """
    Check every JSON page under gold_standard/cleaned/ against the Stage 1 schema.

    Each immediate sub-directory of ``gold_clean`` is one magazine. A page is
    valid when it parses as JSON and ``Stage1PageModel(**data)`` does not raise.
    Magazines without any ``*.json`` file are left out of the result.

    Args:
        gold_clean: Gold standard cleaned directory

    Returns:
        {magazine_name: {'valid': n, 'invalid': n, 'errors': {filename: message}}}
    """
    magazine_dirs = sorted(entry for entry in gold_clean.iterdir() if entry.is_dir())

    if not magazine_dirs:
        print("No magazine directories found in cleaned/")
        return {}

    results = {}

    for mag_dir in magazine_dirs:
        page_paths = sorted(mag_dir.glob("*.json"))
        if not page_paths:
            continue

        valid_count = 0
        errors = {}

        for page_path in page_paths:
            try:
                data = json.loads(page_path.read_text(encoding='utf-8'))
                Stage1PageModel(**data)
            except json.JSONDecodeError as e:
                errors[page_path.name] = f"JSON parse error: {e}"
            except Exception as e:
                # Any other failure (schema violation, unexpected payload shape)
                # is recorded with its message rather than aborting the run.
                errors[page_path.name] = str(e)
            else:
                valid_count += 1

        # File names are unique within a directory, so each failure maps to
        # exactly one entry in `errors`.
        results[mag_dir.name] = {
            'valid': valid_count,
            'invalid': len(errors),
            'errors': errors,
        }

    return results

# Run validation over the cleaned gold standard and report per magazine
print("\n" + "=" * 60, "Validate Gold Standard", "=" * 60 + "\n", sep="\n")

validation_results = validate_gold_standard(GOLD_CLEAN)

if validation_results:
    for magazine_name, result in validation_results.items():
        total = result['valid'] + result['invalid']
        print(f"{magazine_name}: {result['valid']}/{total} valid")

        # An empty errors mapping simply yields no iterations
        for filename, error in result['errors'].items():
            print(f"  ERROR: {filename}", f"    {error}", sep="\n")

    # Totals across all magazines
    total_valid = sum(r['valid'] for r in validation_results.values())
    total_invalid = sum(r['invalid'] for r in validation_results.values())
    print(
        "\n" + "=" * 60,
        "Summary",
        "=" * 60,
        f"Valid:   {total_valid}",
        f"Invalid: {total_invalid}",
        sep="\n",
    )
else:
    print("No magazines found in cleaned/")


Validate Gold Standard

La_Plume_bpt6k1185893k_1_10_1889: 14/14 valid
La_Plume_bpt6k1212187t_15-11-1893: 1/1 valid

Summary
Valid:   15
Invalid: 0


In [4]:
"""
Compute Statistics for Gold Standard
"""

def compute_statistics(gold_clean: Path) -> Dict:
    """
    Compute summary statistics for annotated gold standard.

    Each immediate sub-directory of ``gold_clean`` is one magazine and each
    ``*.json`` file in it is one page. The page's ``items`` list is read
    directly from the JSON (no schema validation is performed here).

    JSON ``null`` values are treated like missing keys: a null ``items`` is an
    empty list, a null ``item_class`` counts as ``'unknown'`` and a null
    ``item_text_raw`` has length 0.

    Args:
        gold_clean: Gold standard cleaned directory

    Returns:
        {magazine_name: stats} where stats holds:
            'pages'        number of JSON files found
            'items'        total number of items
            'item_classes' Counter of item_class values
            'authors'      Counter of non-empty item_author values
            'with_title'   number of items with a non-empty item_title
            'with_author'  number of items with a non-empty item_author
            'text_lengths' list of len(item_text_raw), one entry per item
        Magazines without JSON files are omitted.
    """
    magazine_dirs = [d for d in gold_clean.iterdir() if d.is_dir()]

    if not magazine_dirs:
        print("No magazine directories found in cleaned/")
        return {}

    results = {}

    for mag_dir in sorted(magazine_dirs):
        magazine_name = mag_dir.name
        # Sorted so the counters are filled in page order: Counter.most_common()
        # breaks ties by insertion order, and glob order is not guaranteed.
        json_files = sorted(mag_dir.glob("*.json"))

        if not json_files:
            continue

        mag_stats = {
            'pages': len(json_files),
            'items': 0,
            'item_classes': Counter(),
            'authors': Counter(),
            'with_title': 0,
            'with_author': 0,
            'text_lengths': []
        }

        for json_file in json_files:
            try:
                with open(json_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                # `or` also covers keys that are present but null, which the
                # .get() default alone does not.
                items = data.get('items') or []
                mag_stats['items'] += len(items)

                for item in items:
                    mag_stats['item_classes'][item.get('item_class') or 'unknown'] += 1

                    if item.get('item_title'):
                        mag_stats['with_title'] += 1

                    author = item.get('item_author')
                    if author:
                        mag_stats['with_author'] += 1
                        mag_stats['authors'][author] += 1

                    # A null text would make len() raise and drop the rest of the page
                    text_len = len(item.get('item_text_raw') or '')
                    mag_stats['text_lengths'].append(text_len)

            except Exception as e:
                # Best effort: report the unreadable/malformed page and keep going
                print(f"ERROR reading {json_file.name}: {e}")

        results[magazine_name] = mag_stats

    return results

# Run statistics over the cleaned gold standard and print the report
print("\n" + "=" * 60, "Gold Standard Statistics", "=" * 60 + "\n", sep="\n")

stats = compute_statistics(GOLD_CLEAN)

if stats:
    # Per-magazine breakdown
    for magazine_name, mag_stats in stats.items():
        print(
            f"{magazine_name}:",
            f"  Pages:      {mag_stats['pages']}",
            f"  Items:      {mag_stats['items']}",
            sep="\n",
        )

        # Detail lines only make sense when the magazine has at least one item
        if mag_stats['items'] > 0:
            avg_items = mag_stats['items'] / mag_stats['pages']
            print(f"  Avg/page:   {avg_items:.1f}")

            print(f"  Item classes:")
            for item_class, count in mag_stats['item_classes'].most_common():
                pct = count / mag_stats['items'] * 100
                print(f"    {item_class}: {count} ({pct:.1f}%)")

            if mag_stats['text_lengths']:
                avg_len = sum(mag_stats['text_lengths']) / len(mag_stats['text_lengths'])
                print(
                    f"  Text length:",
                    f"    Avg: {avg_len:.0f} chars",
                    f"    Min: {min(mag_stats['text_lengths'])} chars",
                    f"    Max: {max(mag_stats['text_lengths'])} chars",
                    sep="\n",
                )

            # Zero counts are omitted from the report
            if mag_stats['with_title'] > 0:
                print(f"  With title: {mag_stats['with_title']}")
            if mag_stats['with_author'] > 0:
                print(f"  With author: {mag_stats['with_author']}")

            if mag_stats['authors']:
                print(f"  Top authors:")
                for author, count in mag_stats['authors'].most_common(3):
                    print(f"    {author}: {count}")

        # Blank line between magazines
        print()

    # Totals across all magazines
    total_pages = sum(s['pages'] for s in stats.values())
    total_items = sum(s['items'] for s in stats.values())
    print(
        "=" * 60,
        "Totals",
        "=" * 60,
        f"Magazines:  {len(stats)}",
        f"Pages:      {total_pages}",
        f"Items:      {total_items}",
        sep="\n",
    )

    if total_items > 0:
        # Merge the per-magazine class counters into one
        all_classes = Counter()
        for s in stats.values():
            all_classes.update(s['item_classes'])

        print(f"Item classes:")
        for item_class, count in all_classes.most_common():
            pct = count / total_items * 100
            print(f"  {item_class}: {count} ({pct:.1f}%)")
else:
    print("No magazines found in cleaned/")


Gold Standard Statistics

La_Plume_bpt6k1185893k_1_10_1889:
  Pages:      14
  Items:      70
  Avg/page:   5.0
  Item classes:
    paratext: 44 (62.9%)
    prose: 14 (20.0%)
    verse: 11 (15.7%)
    ad: 1 (1.4%)
  Text length:
    Avg: 673 chars
    Min: 2 chars
    Max: 5222 chars
  With title: 20
  With author: 18
  Top authors:
    Léon Deschamps.: 2
    Gustave Rivet.: 1
    Francis Poictevin.: 1

La_Plume_bpt6k1212187t_15-11-1893:
  Pages:      1
  Items:      3
  Avg/page:   3.0
  Item classes:
    paratext: 2 (66.7%)
    prose: 1 (33.3%)
  Text length:
    Avg: 1491 chars
    Min: 24 chars
    Max: 4406 chars
  With title: 1

Totals
Magazines:  2
Pages:      15
Items:      73
Item classes:
  paratext: 46 (63.0%)
  prose: 15 (20.5%)
  verse: 11 (15.1%)
  ad: 1 (1.4%)
