In [1]:
"""
Gold Standard Preparation - Stage 1 OCR

Prepares prediction files for manual annotation and validates corrected gold standard.

Input:  Predictions from data/predictions/{magazine_name}/
Output: Gold standard in data/gold_standard/{raw|cleaned}/{magazine_name}/
Schema: schemas/stage1_page.py
"""

import sys
from pathlib import Path
import json
import shutil
from typing import Dict, List
from collections import Counter

# Project root detection: start from the current working directory and step
# up one level when the kernel was launched from inside a "notebooks" folder.
PROJECT_ROOT = Path.cwd()
if PROJECT_ROOT.name == "notebooks":
    PROJECT_ROOT = PROJECT_ROOT.parent

print("Gold Standard Preparation - Stage 1")
print("=" * 60)
print(f"Project root: {PROJECT_ROOT}")

# Add schemas to path (prepended only when absent, so re-running this cell
# does not accumulate duplicate sys.path entries)
SCHEMAS_DIR = PROJECT_ROOT / "schemas"
if str(SCHEMAS_DIR) not in sys.path:
    sys.path.insert(0, str(SCHEMAS_DIR))

# Import schema. This must stay below the sys.path update: per the header
# docstring the module is schemas/stage1_page.py, which is only importable
# by its bare name once SCHEMAS_DIR is on sys.path.
from stage1_page import Stage1PageModel

# Directory structure: predictions are the input; gold_standard/raw holds the
# copies to annotate and gold_standard/cleaned holds the corrected files.
PRED_ROOT = PROJECT_ROOT / "data" / "predictions"
GOLD_ROOT = PROJECT_ROOT / "data" / "gold_standard"
GOLD_RAW = GOLD_ROOT / "raw"
GOLD_CLEAN = GOLD_ROOT / "cleaned"

# Create directories (gold-standard folders only; PRED_ROOT is not created here)
for directory in (GOLD_ROOT, GOLD_RAW, GOLD_CLEAN):
    directory.mkdir(parents=True, exist_ok=True)

# Echo the resolved configuration so the reader can verify the paths in use.
print("\nDirectories:")
print(f"  Predictions: {PRED_ROOT}")
print(f"  Gold raw:    {GOLD_RAW}")
print(f"  Gold clean:  {GOLD_CLEAN}")
print(f"  Schema:      {Stage1PageModel.__name__}")

Gold Standard Preparation - Stage 1
Project root: /home/fabian-ramirez/Documents/These/Code/magazine_graphs

Directories:
  Predictions: /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/predictions
  Gold raw:    /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/gold_standard/raw
  Gold clean:  /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/gold_standard/cleaned
  Schema:      Stage1PageModel


In [2]:
"""
Copy Predictions for Annotation
"""

def copy_for_annotation(
    pred_root: Path,
    gold_raw: Path,
    overwrite: bool = False
) -> Dict[str, Dict[str, int]]:
    """
    Copy prediction files to gold_standard/raw/ for annotation.

    Processes every magazine directory found directly under pred_root and
    copies its ``*.json`` files to ``gold_raw/<magazine_name>/``.
    Files that already exist at the destination are skipped unless
    overwrite=True, so annotations in progress are not clobbered.

    A missing predictions directory, or one without magazine
    sub-directories, is reported on stdout and yields an empty result
    instead of raising. Magazines without JSON files are reported and
    left out of the result; no destination directory is created for them.

    Args:
        pred_root: Root predictions directory
        gold_raw: Gold standard raw directory
        overwrite: If True, overwrite existing files

    Returns:
        Statistics: {magazine_name: {'copied': n, 'skipped': n}}
    """
    # Guard first: Path.iterdir() raises FileNotFoundError on a missing directory.
    if not pred_root.is_dir():
        print(f"Predictions directory not found: {pred_root}")
        return {}

    # Find all magazine directories in predictions
    magazine_dirs = sorted(d for d in pred_root.iterdir() if d.is_dir())

    if not magazine_dirs:
        print("No magazine directories found in predictions/")
        return {}

    stats: Dict[str, Dict[str, int]] = {}

    for mag_dir in magazine_dirs:
        magazine_name = mag_dir.name

        # Keep regular files only: a directory named "*.json" would make
        # shutil.copy2 fail.
        json_files = sorted(p for p in mag_dir.glob("*.json") if p.is_file())

        if not json_files:
            print(f"Attention: {magazine_name}: No JSON files found")
            continue

        # Create the destination only once there is something to copy, so
        # gold_standard/raw/ does not collect empty magazine folders.
        dest_dir = gold_raw / magazine_name
        dest_dir.mkdir(parents=True, exist_ok=True)

        mag_stats = {'copied': 0, 'skipped': 0}

        for json_file in json_files:
            dest_file = dest_dir / json_file.name

            if dest_file.exists() and not overwrite:
                mag_stats['skipped'] += 1
            else:
                # copy2 also carries over file metadata such as timestamps.
                shutil.copy2(json_file, dest_file)
                mag_stats['copied'] += 1

        stats[magazine_name] = mag_stats

        print(f"✓ {magazine_name}: {mag_stats['copied']} copied, {mag_stats['skipped']} skipped")

    return stats

# Run copy
# overwrite=False keeps any file already present in gold_standard/raw/,
# so annotations in progress are never replaced by a fresh prediction.
print("Copying predictions for annotation...", end="\n\n")
copy_stats = copy_for_annotation(PRED_ROOT, GOLD_RAW, overwrite=False)

# Summary banner (leading empty string reproduces the blank separator line).
print("", "=" * 60, "Copy Summary", "=" * 60, sep="\n")

# Roll the per-magazine counters up into notebook-wide totals.
total_copied = sum(counts['copied'] for counts in copy_stats.values())
total_skipped = sum(counts['skipped'] for counts in copy_stats.values())

print(
    f"Magazines processed: {len(copy_stats)}",
    f"Files copied:        {total_copied}",
    f"Files skipped:       {total_skipped}",
    sep="\n",
)

Copying predictions for annotation...

✓ La_Plume_bpt6k1185893k_1_10_1889: 0 copied, 14 skipped
✓ La_Plume_bpt6k1212187t_15-11-1893: 3 copied, 3 skipped

Copy Summary
Magazines processed: 2
Files copied:        3
Files skipped:       17


In [3]:
"""
Validate Gold Standard Files
"""

def validate_gold_standard(gold_clean: Path) -> Dict:
    """
    Check every JSON file under gold_standard/cleaned/ against the schema.

    Each sub-directory of gold_clean is treated as one magazine. Every
    ``*.json`` file in it is parsed and passed to Stage1PageModel; a file
    counts as valid when neither step raises. Magazines without JSON
    files are omitted from the result.

    Args:
        gold_clean: Gold standard cleaned directory

    Returns:
        Mapping of magazine name to
        {'valid': n, 'invalid': n, 'errors': {filename: message}}.
        Empty when gold_clean contains no sub-directories.
    """
    candidates = sorted(entry for entry in gold_clean.iterdir() if entry.is_dir())

    if len(candidates) == 0:
        print("No magazine directories found in cleaned/")
        return {}

    report = {}

    for folder in candidates:
        page_paths = sorted(folder.glob("*.json"))

        if len(page_paths) == 0:
            continue

        tally = {'valid': 0, 'invalid': 0, 'errors': {}}

        for page_path in page_paths:
            # None means the file passed; otherwise holds the error message.
            failure = None

            try:
                payload = json.loads(page_path.read_text(encoding='utf-8'))
                # Instantiating the model is the validation step; the
                # resulting object itself is not needed.
                Stage1PageModel(**payload)
            except json.JSONDecodeError as exc:
                failure = f"JSON parse error: {exc}"
            except Exception as exc:
                # Anything else (schema violation, unreadable file, ...) is
                # recorded per file so one bad page does not stop the run.
                failure = str(exc)

            if failure is None:
                tally['valid'] += 1
            else:
                tally['invalid'] += 1
                tally['errors'][page_path.name] = failure

        report[folder.name] = tally

    return report

# Run validation
print("\n" + "=" * 60, "Validate Gold Standard", "=" * 60 + "\n", sep="\n")

validation_results = validate_gold_standard(GOLD_CLEAN)

if validation_results:
    # Per-magazine report: score line first, then one entry per failing file.
    for magazine_name, result in validation_results.items():
        total = result['valid'] + result['invalid']
        print(f"{magazine_name}: {result['valid']}/{total} valid")

        # An empty errors dict simply yields no iterations.
        for filename, error in result['errors'].items():
            print(f"  ERROR: {filename}", f"    {error}", sep="\n")

    print("\n" + "=" * 60, "Summary", "=" * 60, sep="\n")

    # Totals across all magazines.
    total_valid = sum(entry['valid'] for entry in validation_results.values())
    total_invalid = sum(entry['invalid'] for entry in validation_results.values())
    print(f"Valid:   {total_valid}", f"Invalid: {total_invalid}", sep="\n")
else:
    print("No magazines found in cleaned/")


Validate Gold Standard

La_Plume_bpt6k1185893k_1_10_1889: 14/14 valid

Summary
Valid:   14
Invalid: 0
