In [3]:
"""
Gold Standard Preparation - Stage 1 OCR

Prepares prediction files for manual annotation and validates corrected gold standard.

Input:  Predictions from data/predictions/{magazine_name}/
Output: Gold standard in data/gold_standard/{raw|cleaned}/{magazine_name}/
Schema: schemas/stage1_page.py
"""

import json
import shutil
import sys
from collections import Counter, defaultdict
from datetime import datetime
from pathlib import Path
from typing import Dict, List

# Locate the project root: when the kernel was started inside notebooks/,
# the root is one level up; otherwise the working directory is used as-is.
PROJECT_ROOT = Path.cwd()
PROJECT_ROOT = PROJECT_ROOT.parent if PROJECT_ROOT.name == "notebooks" else PROJECT_ROOT

print(
    "Gold Standard Preparation - Stage 1",
    "=" * 60,
    f"Project root: {PROJECT_ROOT}",
    sep="\n",
)

# Make schemas/ importable; the membership test keeps re-runs of this cell
# from inserting the same entry into sys.path repeatedly.
SCHEMAS_DIR = PROJECT_ROOT / "schemas"
if str(SCHEMAS_DIR) not in sys.path:
    sys.path.insert(0, str(SCHEMAS_DIR))

# Page schema, imported from the schemas/ folder added to sys.path above
from stage1_page import Stage1PageModel

# Data layout: predictions are read from PRED_ROOT, gold files live under GOLD_ROOT
PRED_ROOT = PROJECT_ROOT / "data" / "predictions"
GOLD_ROOT = PROJECT_ROOT / "data" / "gold_standard"
GOLD_RAW = GOLD_ROOT / "raw"
GOLD_CLEAN = GOLD_ROOT / "cleaned"

# parents=True also creates GOLD_ROOT, so only the two leaf folders are listed
for directory in (GOLD_RAW, GOLD_CLEAN):
    directory.mkdir(parents=True, exist_ok=True)

print("\nDirectories:")
print(
    f"  Predictions: {PRED_ROOT}\n"
    f"  Gold raw:    {GOLD_RAW}\n"
    f"  Gold clean:  {GOLD_CLEAN}\n"
    f"  Schema:      {Stage1PageModel.__name__}"
)

Gold Standard Preparation - Stage 1
Project root: /home/fabian-ramirez/Documents/These/Code/magazine_graphs

Directories:
  Predictions: /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/predictions
  Gold raw:    /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/gold_standard/raw
  Gold clean:  /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/gold_standard/cleaned
  Schema:      Stage1PageModel


In [2]:
def copy_for_annotation(source_dir: Path, dest_dir: Path, overwrite: bool = False) -> Dict[str, int]:
    """
    Copy JSON files from source to destination for annotation.

    Only regular files matching ``*.json`` directly inside ``source_dir`` are
    considered (no recursion). One status line is printed per file.

    Args:
        source_dir: Directory containing the JSON files to copy.
        dest_dir: Directory receiving the copies; created if it does not exist.
        overwrite: If True, replace files already present in ``dest_dir``
            (use with caution: existing content is overwritten).

    Returns:
        Dict with "total", "copied" and "skipped" counts. When ``source_dir``
        is not an existing directory the counts are all 0 and an additional
        "error" key holds "source_not_found".
    """
    if not source_dir.is_dir():
        print(f"Source directory not found: {source_dir}")
        # Keep the count keys present so callers can index them unconditionally
        return {"total": 0, "copied": 0, "skipped": 0, "error": "source_not_found"}

    # Sorted so the log output is stable between runs (glob order is arbitrary)
    json_files = sorted(p for p in source_dir.glob("*.json") if p.is_file())
    stats = {"total": len(json_files), "copied": 0, "skipped": 0}

    if not json_files:
        print(f"No JSON files found in {source_dir}")
        return stats

    # shutil.copy2 does not create missing parent folders
    dest_dir.mkdir(parents=True, exist_ok=True)

    for json_file in json_files:
        dest_file = dest_dir / json_file.name

        if dest_file.exists() and not overwrite:
            stats["skipped"] += 1
            print(f"Skipped (already exists): {json_file.name}")
        else:
            shutil.copy2(json_file, dest_file)
            stats["copied"] += 1
            print(f"Copied: {json_file.name}")

    return stats

# Run the copy.
# PRED_ROOT and GOLD_RAW are the directories defined in the setup cell; the
# previous names (RAW_DIR, GOLD_DIR_RAW) are not defined anywhere in this
# notebook and raised NameError on a fresh kernel.
# NOTE(review): the header docstring describes per-magazine subfolders
# ({magazine_name}/). If the JSON files live there, pass
# PRED_ROOT / "<magazine_name>" and GOLD_RAW / "<magazine_name>" instead — confirm.
print("Copying files for annotation...\n")
copy_stats = copy_for_annotation(PRED_ROOT, GOLD_RAW, overwrite=False)

print(f"\n{'='*50}")
print("Copy Summary")
print(f"{'='*50}")
# .get() with a default keeps the summary printable even if a key is absent
print(f"Total files found: {copy_stats.get('total', 0)}")
print(f"Copied: {copy_stats.get('copied', 0)}")
print(f"Skipped (already exist): {copy_stats.get('skipped', 0)}")
print("Tip: If you need to re-copy, set overwrite=True in the function call above.")

Copying files for annotation...

Copied: La_Plume___revue_littéraire_[...]_bpt6k1185893k__page-007.json
Copied: La_Plume___revue_littéraire_[...]_bpt6k1185893k__page-001.json
Copied: La_Plume___revue_littéraire_[...]_bpt6k1185893k__page-004.json
Copied: La_Plume___revue_littéraire_[...]_bpt6k1185893k__page-014.json
Copied: La_Plume___revue_littéraire_[...]_bpt6k1185893k__page-010.json
Copied: La_Plume___revue_littéraire_[...]_bpt6k1185893k__page-011.json
Copied: La_Plume___revue_littéraire_[...]_bpt6k1185893k__page-013.json
Copied: La_Plume___revue_littéraire_[...]_bpt6k1185893k__page-003.json
Copied: La_Plume___revue_littéraire_[...]_bpt6k1185893k__page-005.json
Copied: La_Plume___revue_littéraire_[...]_bpt6k1185893k__page-008.json
Copied: La_Plume___revue_littéraire_[...]_bpt6k1185893k__page-006.json
Copied: La_Plume___revue_littéraire_[...]_bpt6k1185893k__page-002.json
Copied: La_Plume___revue_littéraire_[...]_bpt6k1185893k__page-012.json
Copied: La_Plume___revue_littéraire_[...]_bp

In [None]:
# def standardize_gold_standard_names():
#     """
#     Rename gold standard files to match the PDF-based naming convention.
#     Old: La_Plume___revue_littéraire_[...]_bpt6k1185893k__page-001.json
#     New: La_Plume_bpt6k1185893k_1_10_1889__page-001.json
#     """
#     import re
    
#     # The standard name from your PDF
#     STANDARD_BASE = "La_Plume_bpt6k1185893k_1_10_1889"
    
#     for old_path in GOLD_DIR.glob("*.json"):
#         # Extract page number
#         match = re.search(r'page-(\d+)\.json$', old_path.name)
#         if not match:
#             print(f"⚠️  Skipping (no page number): {old_path.name}")
#             continue
        
#         page_num = match.group(1)
#         new_name = f"{STANDARD_BASE}__page-{page_num}.json"
#         new_path = old_path.parent / new_name
        
#         if old_path.name != new_name:
#             print(f"Renaming: {old_path.name}")
#             print(f"      →  {new_name}")
#             old_path.rename(new_path)
    
#     print("\n✓ Gold standard filenames standardized!")

# # Uncomment to run:
# standardize_gold_standard_names()

In [3]:
def validate_json_files(directory: Path, schema_class) -> Dict:
    """
    Check every ``*.json`` file in a folder against a schema class.

    Each file is parsed as JSON and the parsed content is unpacked into
    ``schema_class(**data)``. A file counts as valid when that call does not
    raise; any exception marks it invalid. One status line is printed per file.

    Args:
        directory: Folder scanned for ``*.json`` files (non-recursive).
        schema_class: Callable invoked with the file content as keyword
            arguments, e.g. a Pydantic model class.

    Returns:
        Dict with "total", "valid" and "invalid" counts plus "errors", a
        mapping from file name to the error message for each invalid file.
    """
    paths = sorted(directory.glob("*.json"))
    report = {"total": len(paths), "valid": 0, "invalid": 0, "errors": {}}

    for path in paths:
        name = path.name
        try:
            payload = json.loads(path.read_text(encoding='utf-8'))

            # Instantiating the schema is the validation step
            schema_class(**payload)
            report["valid"] += 1
            print(f"✓ Valid: {name}")

        except json.JSONDecodeError as exc:
            # File is not well-formed JSON
            report["errors"][name] = f"JSON parse error: {exc}"
            report["invalid"] += 1
            print(f"Invalid JSON: {name}")

        except Exception as exc:
            # Anything else is reported as a schema problem
            report["errors"][name] = str(exc)
            report["invalid"] += 1
            print(f"Schema error: {name}")

    return report

# Schema to validate against: the model imported in the setup cell.
# (PageWithContinuation / PageNoContinuation are not imported anywhere in this
# notebook, so referencing them raised NameError on a fresh kernel.)
SCHEMA = Stage1PageModel

print(f"Validating against: {SCHEMA.__name__}\n")
# GOLD_CLEAN is the cleaned gold-standard folder defined in the setup cell.
# NOTE(review): if corrected files live in a per-magazine subfolder, pass
# GOLD_CLEAN / "<magazine_name>" instead — confirm.
validation_results = validate_json_files(GOLD_CLEAN, SCHEMA)

print(f"\n{'='*50}")
print("Validation Summary")
print(f"{'='*50}")
print(f"Total files: {validation_results['total']}")
print(f"Valid: {validation_results['valid']} ✓")
print(f"Invalid: {validation_results['invalid']} x")

# List the recorded error message for every file that failed
if validation_results['errors']:
    print("Errors found:")
    for filename, error in validation_results['errors'].items():
        print(f"  • {filename}:")
        print(f"    {error}")

Validating against: Stage1PageModel

✓ Valid: La_Plume___revue_littéraire_[...]_bpt6k1185893k__page-001.json

Validation Summary
Total files: 1
Valid: 1 ✓
Invalid: 0 x


In [None]:
def consistency_checks(directory: Path) -> Dict:
    """
    Run consistency checks on annotated files.

    For every item under the "items" key of each ``*.json`` file, checks:
      1. "item_class" is one of the allowed classes,
      2. "item_text_raw" is not empty (a missing or null value counts as empty),
      3. the item is not flagged both "is_continuation" and
         "continues_on_next_page",
      4. the text is at least 10 characters long, unless the class is
         "paratext" or "unknown".

    A malformed item or file is reported under "file_error" without stopping
    the checks on the remaining items and files.

    Args:
        directory: Folder scanned for ``*.json`` files (non-recursive).

    Returns:
        Dict mapping warning category ("invalid_class", "empty_text",
        "continuation_logic", "short_text", "file_error") to a list of
        messages. Categories with no warnings are absent, so an empty dict
        means no warnings.
    """
    warnings = defaultdict(list)
    valid_classes = {"prose", "verse", "ad", "paratext", "unknown"}
    # Set iteration order varies between runs; sort once for stable messages
    valid_classes_label = sorted(valid_classes)

    for json_file in sorted(directory.glob("*.json")):
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both JSON syntax errors and bad UTF-8
            warnings['file_error'].append(f"{json_file.name}: {e}")
            continue

        if not isinstance(data, dict):
            warnings['file_error'].append(
                f"{json_file.name}: top-level JSON value is not an object"
            )
            continue

        items = data.get('items') or []
        if not isinstance(items, list):
            warnings['file_error'].append(f"{json_file.name}: 'items' is not a list")
            continue

        for i, item in enumerate(items):
            item_id = f"{json_file.name} - item {i}"

            if not isinstance(item, dict):
                warnings['file_error'].append(f"{item_id}: item is not an object")
                continue

            # Check 1: Valid item class (non-string values are never valid)
            item_class = item.get('item_class')
            if not isinstance(item_class, str) or item_class not in valid_classes:
                warnings['invalid_class'].append(
                    f"{item_id}: '{item_class}' not in {valid_classes_label}"
                )

            # Check 2: Empty text. A null/non-string value is treated as empty
            # instead of raising and aborting the checks for the whole file.
            raw_text = item.get('item_text_raw')
            text = raw_text.strip() if isinstance(raw_text, str) else ''
            if not text:
                warnings['empty_text'].append(
                    f"{item_id}: item_text_raw is empty"
                )

            # Check 3: Continuation logic (fields may be absent from the schema)
            is_cont = item.get('is_continuation', False)
            continues = item.get('continues_on_next_page', False)

            # Suspicious: same item both starts and ends continuation
            if is_cont and continues:
                warnings['continuation_logic'].append(
                    f"{item_id}: both is_continuation AND continues_on_next_page"
                )

            # Check 4: Very short text (potential OCR error)
            if len(text) < 10 and item_class not in ['paratext', 'unknown']:
                warnings['short_text'].append(
                    f"{item_id}: suspiciously short text ({len(text)} chars)"
                )

    return dict(warnings)

print("🔍 Running consistency checks...\n")
# GOLD_CLEAN is the cleaned gold-standard folder from the setup cell; the
# previous name (GOLD_DIR) is not defined anywhere in this notebook.
checks = consistency_checks(GOLD_CLEAN)

print(f"{'='*50}")
print("Consistency Check Results")
print(f"{'='*50}\n")

if not checks:
    print("✓ All checks passed! No warnings found.")
else:
    # One section per warning category, e.g. "EMPTY TEXT (3):"
    for category, messages in checks.items():
        print(f"{category.upper().replace('_', ' ')} ({len(messages)}):")
        for message in messages:
            print(f"   • {message}")
        print()

🔍 Running consistency checks...

📊 Consistency Check Results



In [6]:
def compute_statistics(directory: Path) -> Dict:
    """
    Compute summary statistics for annotated corpus.

    Reads every ``*.json`` file in ``directory`` (non-recursive) and
    aggregates over the entries of its "items" list. Files that cannot be
    processed are reported with a printed message and otherwise skipped.

    Args:
        directory: Folder containing the annotated page files.

    Returns:
        Dict with:
          - "total_pages": number of JSON files found
          - "empty_pages": files whose "items" list is missing or empty
          - "total_items": number of items over all pages
          - "item_classes": Counter of "item_class" values (missing or null
            values are counted as "unknown")
          - "authors": Counter of non-empty "item_author" values
          - "has_title" / "has_author": items with a non-empty title / author
          - "is_continuation" / "continues_next": items with the flag set
          - "text_lengths": length of "item_text_raw" per item (0 when
            missing or null)
          - "avg_text_length": mean of "text_lengths", 0 when there are none
    """
    # Sorted so that "text_lengths" has a reproducible order
    json_files = sorted(directory.glob("*.json"))

    stats = {
        'total_pages': len(json_files),
        'total_items': 0,
        'item_classes': Counter(),
        'authors': Counter(),
        'has_title': 0,
        'has_author': 0,
        'is_continuation': 0,
        'continues_next': 0,
        'text_lengths': [],
        'empty_pages': 0
    }

    for json_file in json_files:
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            items = data.get('items') or []

            if not items:
                stats['empty_pages'] += 1
                continue

            stats['total_items'] += len(items)

            for item in items:
                # Item class; a null value falls back to 'unknown' like a missing key
                stats['item_classes'][item.get('item_class') or 'unknown'] += 1

                # Title/author presence
                if item.get('item_title'):
                    stats['has_title'] += 1
                author = item.get('item_author')
                if author:
                    stats['has_author'] += 1
                    stats['authors'][author] += 1

                # Continuation (fields may be absent from the schema)
                if item.get('is_continuation'):
                    stats['is_continuation'] += 1
                if item.get('continues_on_next_page'):
                    stats['continues_next'] += 1

                # Text length; `or ''` keeps a null text from raising half-way
                # through a page whose items were already counted
                stats['text_lengths'].append(len(item.get('item_text_raw') or ''))

        except Exception as e:
            # Best effort: report the file and keep going with the others
            print(f"Error processing {json_file.name}: {e}")

    # Calculate averages
    if stats['text_lengths']:
        stats['avg_text_length'] = sum(stats['text_lengths']) / len(stats['text_lengths'])
    else:
        stats['avg_text_length'] = 0

    return stats

print("Computing statistics...\n")
# GOLD_CLEAN is the cleaned gold-standard folder from the setup cell; the
# previous name (GOLD_DIR_CLEAN) is not defined anywhere in this notebook.
stats = compute_statistics(GOLD_CLEAN)

print(f"{'='*50}")
print("Gold Standard Statistics")
print(f"{'='*50}\n")

print(f"Pages: {stats['total_pages']}")
print(f"   • Empty pages: {stats['empty_pages']}")
print(f"   • With content: {stats['total_pages'] - stats['empty_pages']}")

print(f"\nItems: {stats['total_items']}")
# max(..., 1) avoids a division by zero when no page was found
print(f"   • Average per page: {stats['total_items'] / max(stats['total_pages'], 1):.1f}")

print("\nItem Classes:")
for item_class, count in stats['item_classes'].most_common():
    percentage = (count / stats['total_items'] * 100) if stats['total_items'] > 0 else 0
    print(f"   • {item_class}: {count} ({percentage:.1f}%)")

# Continuation section is shown only when at least one flag was seen
if stats['is_continuation'] > 0 or stats['continues_next'] > 0:
    print("\nContinuations:")
    print(f"   • Starts with continuation: {stats['is_continuation']}")
    print(f"   • Continues on next page: {stats['continues_next']}")

print("\nMetadata:")
print(f"   • Items with title: {stats['has_title']}")
print(f"   • Items with author: {stats['has_author']}")

if stats['authors']:
    print("\nTop Authors:")
    for author, count in stats['authors'].most_common(5):
        print(f"   • {author}: {count}")

print("\nText Length:")
print(f"   • Average: {stats['avg_text_length']:.0f} characters")
if stats['text_lengths']:
    print(f"   • Min: {min(stats['text_lengths'])}")
    print(f"   • Max: {max(stats['text_lengths'])}")

Computing statistics...

Gold Standard Statistics

Pages: 1
   • Empty pages: 0
   • With content: 1

Items: 7
   • Average per page: 7.0

Item Classes:
   • paratext: 7 (100.0%)

Metadata:
   • Items with title: 1
   • Items with author: 0

Text Length:
   • Average: 168 characters
   • Min: 38
   • Max: 894


In [10]:
def track_progress(directory: Path) -> None:
    """
    Display progress tracking checklist.

    Prints one line per ``*.json`` file in ``directory`` (non-recursive,
    sorted by name) with the time elapsed since its last modification, and
    ticks the files modified less than one hour ago.

    Args:
        directory: Folder containing the files being annotated.

    Returns:
        None. The checklist is written to standard output.
    """
    json_files = sorted(directory.glob("*.json"))

    if not json_files:
        print("No JSON files found to track.")
        return

    print(f"{'='*50}")
    print("Annotation Progress Checklist")
    print(f"{'='*50}\n")
    print(f"Total files: {len(json_files)}\n")

    now = datetime.now()
    # Compare epoch timestamps rather than subtracting naive local datetimes
    now_ts = now.timestamp()

    for i, json_file in enumerate(json_files, 1):
        # Age in seconds; clamped at 0 so an mtime slightly in the future
        # (clock skew) is shown as "just now" instead of a bogus duration
        age_seconds = max(now_ts - json_file.stat().st_mtime, 0.0)

        # Format time ago; >= so that exactly one hour reads "1h ago"
        # (not "60m ago") and exactly one minute reads "1m ago"
        if age_seconds >= 86400:
            time_str = f"{int(age_seconds // 86400)}d ago"
        elif age_seconds >= 3600:
            time_str = f"{int(age_seconds // 3600)}h ago"
        elif age_seconds >= 60:
            time_str = f"{int(age_seconds // 60)}m ago"
        else:
            time_str = "just now"

        # Check if file has been modified recently (within last hour)
        recently_modified = age_seconds < 3600
        indicator = "[✓]" if recently_modified else "[ ]"

        print(f"{indicator} {i:2d}. {json_file.name:<40} ({time_str})")

    print("\nLegend:")
    print("   [ ]  Not recently modified")
    print("   [✓]  Modified within last hour")
    print(f"\nLast check: {now.strftime('%Y-%m-%d %H:%M:%S')}")

# GOLD_CLEAN is the cleaned gold-standard folder from the setup cell; the
# previous name (GOLD_DIR_CLEAN) is not defined anywhere in this notebook.
track_progress(GOLD_CLEAN)

Annotation Progress Checklist

Total files: 1

[✓]  1. La_Plume___revue_littéraire_[...]_bpt6k1185893k__page-001.json (9m ago)

Legend:
   [ ]  Not recently modified
   [✓]  Modified within last hour

Last check: 2025-10-14 18:26:26


In [11]:
def full_validation_report(directory: Path, schema_class) -> None:
    """
    Run complete validation suite and print summary report.

    Runs, in order, validate_json_files, consistency_checks and
    compute_statistics on ``directory`` and prints a short summary of each,
    followed by an overall verdict.

    Args:
        directory: Folder containing the gold-standard JSON files.
        schema_class: Schema class forwarded to validate_json_files.

    Returns:
        None. The report is written to standard output.
    """
    print("\n" + "="*60)
    print("FULL VALIDATION REPORT")
    print("="*60 + "\n")

    # 1. Schema validation
    print("[1] Schema Validation...")
    validation = validate_json_files(directory, schema_class)
    print(f"    {validation['valid']}/{validation['total']} files valid")
    # validate_json_files only prints file names; show the messages here
    for filename, error in validation['errors'].items():
        print(f"      • {filename}: {error}")
    print()

    # 2. Consistency checks
    print("[2] Consistency Checks...")
    checks = consistency_checks(directory)
    total_warnings = sum(len(w) for w in checks.values())
    if total_warnings == 0:
        print("    No warnings found\n")
    else:
        # consistency_checks prints nothing itself, so list the details here
        print(f"    {total_warnings} warnings:")
        for category, messages in checks.items():
            for message in messages:
                print(f"      • [{category}] {message}")
        print()

    # 3. Statistics
    print("[3] Computing Statistics...")
    stats = compute_statistics(directory)
    print(f"    • {stats['total_pages']} pages")
    print(f"    • {stats['total_items']} items")
    print(f"    • {len(stats['item_classes'])} unique item classes\n")

    # Final verdict
    print("="*60)
    if validation['total'] == 0:
        # An empty folder must not be reported as a ready gold standard
        print("[X] NO FILES FOUND")
        print(f"    No JSON files in {directory}.")
    elif validation['invalid'] == 0 and total_warnings == 0:
        print("[✓] GOLD STANDARD READY!")
        print("    All files validated successfully.")
    elif validation['invalid'] == 0:
        print("[!] GOLD STANDARD HAS WARNINGS")
        print("    Schema valid but consistency issues found.")
    else:
        print("[X] GOLD STANDARD HAS ERRORS")
        print("    Fix schema validation errors before proceeding.")
    print("="*60)

# Run full validation on the cleaned gold standard.
# GOLD_CLEAN and Stage1PageModel both come from the setup cell, so this cell
# does not depend on the validation cell having been run first.
full_validation_report(GOLD_CLEAN, Stage1PageModel)


FULL VALIDATION REPORT

[1] Schema Validation...
✓ Valid: La_Plume___revue_littéraire_[...]_bpt6k1185893k__page-001.json
    1/1 files valid

[2] Consistency Checks...

[3] Computing Statistics...
    • 1 pages
    • 7 items
    • 1 unique item classes

[✓] GOLD STANDARD READY!
    All files validated successfully.
