In [None]:
"""
Stage 1 Multi-Schema OCR Extraction - Mistral Document AI

Extracts structured page-level data using multiple schema variants.
Tests all 7 schema versions to compare extraction quality.

Input:  PDF files in data/raw/
Output: JSON files per page in data/predictions/schema_evaluations/{schema_name}/
Schemas: All variants in schemas/stage1_page*.py
"""

from __future__ import annotations

import importlib
import logging
from pathlib import Path
from typing import Dict, List

# Project imports
from utils.paths import PROJECT_ROOT, RAW_DATA, PREDICTIONS, ensure_data_dirs
from utils.config import MISTRAL_CONFIG, EXTRACTION_CONFIG
from utils.extraction import extract_all_pdfs
from mistralai import Mistral

# Logging configuration
# Root logging setup: INFO threshold, "timestamp | level | message" layout.
logging.basicConfig(
    format="%(asctime)s | %(levelname)s | %(message)s",
    level=logging.INFO,
)
# Named logger used by this notebook.
logger = logging.getLogger("multi_schema_extraction")

# Banner: title, a 60-character rule, then the project root from utils.paths.
print(
    "Stage 1 Multi-Schema OCR Extraction",
    "=" * 60,
    f"Project root: {PROJECT_ROOT}",
    sep="\n",
)

Stage 1 Multi-Schema OCR Extraction
Project root: /home/fabian-ramirez/Documents/These/Code/magazine_graphs


In [None]:
"""
Configuration and Schema Definitions
"""

# Make sure the data directories are in place before anything else runs.
ensure_data_dirs()

# PDFs are read from the raw-data directory.
SRC_ROOT = RAW_DATA

# Per-schema results are grouped under <PREDICTIONS>/schema_evaluations/.
SCHEMA_EVAL_ROOT = PREDICTIONS / "schema_evaluations"
SCHEMA_EVAL_ROOT.mkdir(exist_ok=True, parents=True)

# Schema module names to run, one extraction pass per entry.
SCHEMA_VARIANTS = [
    "stage1_page",                 # v1 (original)
    "stage1_page_v2",              # v2 improved (130 lines)
    "stage1_page_v2_medium",       # v2 medium (50 lines)
    "stage1_page_v2_small",        # v2 small (15 lines)
    "stage1_page_v2_pure",         # v2 improved, pure (no continuation)
    "stage1_page_v2_medium_pure",  # v2 medium, pure
    "stage1_page_v2_small_pure",   # v2 small, pure
]

# Human-readable summary of the run configuration (single write, same text).
print(
    "\nConfiguration:\n"
    f"  Source directory: {SRC_ROOT}\n"
    f"  Output root: {SCHEMA_EVAL_ROOT}\n"
    f"  Schemas to extract: {len(SCHEMA_VARIANTS)}\n"
    f"  Model: {MISTRAL_CONFIG.model_name}\n"
    f"  Overwrite existing: {EXTRACTION_CONFIG.overwrite}"
)

# Mistral client factory
def get_mistral_client() -> Mistral:
    """Build a Mistral client using the API key supplied by MISTRAL_CONFIG.

    Returns:
        A new ``Mistral`` instance constructed with
        ``api_key=MISTRAL_CONFIG.get_api_key()``.
    """
    api_key = MISTRAL_CONFIG.get_api_key()
    return Mistral(api_key=api_key)

# Report the actual key status. The previous version printed "Configured"
# unconditionally, so a missing key only surfaced later, at extraction time.
# NOTE(review): assumes get_api_key() either raises or returns a falsy value
# when no key is set — confirm against utils.config.
try:
    _api_key_configured = bool(MISTRAL_CONFIG.get_api_key())
except Exception as exc:  # reported below rather than aborting the config cell
    _api_key_configured = False
    print(f"  API key lookup failed: {type(exc).__name__}")
print(f"  API key: {'Configured' if _api_key_configured else 'NOT CONFIGURED'}")


Configuration:
  Source directory: /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/raw
  Output root: /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/predictions/schema_evaluations
  Schemas to extract: 7
  Model: mistral-ocr-latest
  Overwrite existing: False
  API key: Configured
