In [5]:
# %%
"""
Stage 1 Multi-Schema OCR Extraction - Mistral Document AI

Extracts structured page-level data using MULTIPLE schema variants.
Enables comparison of different schema strategies (description length, continuation fields).

Input:  PDF files in data/raw/
Output: JSON files per page in data/predictions/schema_evaluations/{schema_name}/{magazine}/
Schemas: All variants in schemas/ directory
"""

from __future__ import annotations

import logging
import importlib
from pathlib import Path
from typing import Dict, List

# Project imports
from utils.paths import PROJECT_ROOT, RAW_DATA, PREDICTIONS, ensure_data_dirs
from utils.config import MISTRAL_CONFIG, EXTRACTION_CONFIG
from utils.extraction import extract_all_pdfs
from mistralai import Mistral

# Root logging: INFO and above, each record rendered as "time | LEVEL | message".
logging.basicConfig(
    format="%(asctime)s | %(levelname)s | %(message)s",
    level=logging.INFO,
)
# Named logger used by this notebook's own messages.
logger = logging.getLogger("multi_schema_extraction")

# Startup banner: title, a 60-character rule, and the resolved project root.
print(
    "Stage 1 Multi-Schema OCR Extraction",
    "=" * 60,
    f"Project root: {PROJECT_ROOT}",
    sep="\n",
)

Stage 1 Multi-Schema OCR Extraction
Project root: /home/fabian-ramirez/Documents/These/Code/magazine_graphs


In [6]:
"""
Schema Configuration
Define all schema variants to extract.
"""

# Schema variants to run, in processing order. Each entry is later used both as
# the module name under `schemas.` and as the output sub-directory name.
SCHEMAS_TO_EXTRACT = [
    "stage1_page",                 # v1: the original schema
    "stage1_page_v2",              # v2 improved: long descriptions
    "stage1_page_v2_medium",       # v2 medium: moderate descriptions
    "stage1_page_v2_small",        # v2 small: minimal descriptions
    "stage1_page_v2_pure",         # v2 improved, no continuation fields
    "stage1_page_v2_medium_pure",  # v2 medium, no continuation fields
    "stage1_page_v2_small_pure",   # v2 small, no continuation fields
]

# Project helper; presumably creates the data directory layout - see utils.paths.
ensure_data_dirs()

# PDFs are read from the raw-data directory; every schema writes below its own
# sub-directory of the shared evaluation root.
SRC_ROOT = RAW_DATA
SCHEMA_EVAL_ROOT = PREDICTIONS / "schema_evaluations"

# Echo the effective settings so the run is self-describing in the cell output.
print(
    "\nConfiguration:",
    f"  Source directory: {SRC_ROOT}",
    f"  Output base:      {SCHEMA_EVAL_ROOT}",
    f"  Model:            {MISTRAL_CONFIG.model_name}",
    f"  Overwrite:        {EXTRACTION_CONFIG.overwrite}",
    f"\n  Schemas to extract: {len(SCHEMAS_TO_EXTRACT)}",
    sep="\n",
)
for schema in SCHEMAS_TO_EXTRACT:
    print(f"    - {schema}")


Configuration:
  Source directory: /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/raw
  Output base:      /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/predictions/schema_evaluations
  Model:            mistral-ocr-latest
  Overwrite:        False

  Schemas to extract: 7
    - stage1_page
    - stage1_page_v2
    - stage1_page_v2_medium
    - stage1_page_v2_small
    - stage1_page_v2_pure
    - stage1_page_v2_medium_pure
    - stage1_page_v2_small_pure


In [7]:
"""
Mistral API Client Setup
"""

def get_mistral_client() -> Mistral:
    """Build a Mistral SDK client from the configured API key.

    Returns:
        A ``Mistral`` instance constructed with the value returned by
        ``MISTRAL_CONFIG.get_api_key()``.
    """
    api_key = MISTRAL_CONFIG.get_api_key()
    return Mistral(api_key=api_key)

# Build the client once at notebook level; the extraction cell passes this same
# object to extract_all_pdfs for every schema.
client = get_mistral_client()
# NOTE(review): this message is printed whenever the call above returns without
# raising; nothing in this cell explicitly sends a request, so it presumably
# does not prove the key is accepted by the API - confirm.
print("  API key: Configured ✓")

  API key: Configured ✓


In [8]:
"""
Execute Multi-Schema Extraction
Processes each schema sequentially. Automatically skips existing files.
Can be safely interrupted and resumed.
"""

print("\n" + "=" * 60)
print("STARTING MULTI-SCHEMA EXTRACTION")
print("=" * 60)

# Per-schema outcome: either the value returned by extract_all_pdfs, or
# {"error": "<message>"} when loading the schema or running extraction raised.
all_results = {}
# Names of schemas that raised, so failures are reported after the loop instead
# of being buried in the per-schema output.
errored_schemas = []

for idx, schema_name in enumerate(SCHEMAS_TO_EXTRACT, 1):
    print(f"\n{'='*60}")
    print(f"SCHEMA {idx}/{len(SCHEMAS_TO_EXTRACT)}: {schema_name}")
    print(f"{'='*60}")

    # Dynamic schema import: the module is looked up by name under `schemas.`
    # and must expose a class called Stage1PageModel.
    try:
        schema_module = importlib.import_module(f'schemas.{schema_name}')
        schema_class = schema_module.Stage1PageModel
        print(f"  ✓ Schema loaded: {schema_class.__name__}")
    except Exception as e:
        # logger.exception logs at ERROR level and appends the traceback, which
        # a plain logger.error(str(e)) would discard.
        logger.exception("  ✗ Failed to load schema '%s': %s", schema_name, e)
        all_results[schema_name] = {"error": str(e)}
        errored_schemas.append(schema_name)
        continue

    # Each schema writes below its own sub-directory of the evaluation root.
    out_root = SCHEMA_EVAL_ROOT / schema_name
    print(f"  Output: {out_root}")

    # Extract all PDFs with this schema. A failure here is recorded and the
    # loop moves on, so one broken schema does not block the remaining ones.
    try:
        results = extract_all_pdfs(
            src_root=SRC_ROOT,
            schema_class=schema_class,
            client=client,
            out_root=out_root,
            model_name=MISTRAL_CONFIG.model_name,
            overwrite=EXTRACTION_CONFIG.overwrite,
            zero_pad=EXTRACTION_CONFIG.zero_pad,
            max_retries=MISTRAL_CONFIG.max_retries,
            base_delay=MISTRAL_CONFIG.base_delay,
            max_delay=MISTRAL_CONFIG.max_delay
        )

        all_results[schema_name] = results

        print(f"\n  ✓ {schema_name} complete:")
        print(f"     Written: {results['written']}")
        print(f"     Skipped: {results['skipped']}")
        print(f"     Failed:  {results['failed']}")

    except Exception as e:
        logger.exception("  ✗ Extraction failed for '%s': %s", schema_name, e)
        all_results[schema_name] = {"error": str(e)}
        errored_schemas.append(schema_name)

print("\n" + "=" * 60)
print("MULTI-SCHEMA EXTRACTION COMPLETE")
print("=" * 60)

# The banner above is printed even when schemas raised; list them explicitly so
# a partially failed run is not mistaken for a clean one.
if errored_schemas:
    print(
        f"  ✗ {len(errored_schemas)}/{len(SCHEMAS_TO_EXTRACT)} schema(s) raised errors: "
        f"{', '.join(errored_schemas)}"
    )

2025-11-05 17:41:37,495 | INFO | Found 2 PDF(s) to process
2025-11-05 17:41:37,513 | INFO | Processing La_Plume_bpt6k1185893k_1_10_1889.pdf: 0 to extract, 14 already exist
2025-11-05 17:41:37,513 | INFO | ✓ La_Plume_bpt6k1185893k_1_10_1889.pdf: All pages already extracted
2025-11-05 17:41:37,560 | INFO | Processing La_Plume_bpt6k1212187t_15-11-1893.pdf: 0 to extract, 34 already exist
2025-11-05 17:41:37,561 | INFO | ✓ La_Plume_bpt6k1212187t_15-11-1893.pdf: All pages already extracted
2025-11-05 17:41:37,562 | INFO | Found 2 PDF(s) to process
2025-11-05 17:41:37,579 | INFO | Processing La_Plume_bpt6k1185893k_1_10_1889.pdf: 0 to extract, 14 already exist
2025-11-05 17:41:37,580 | INFO | ✓ La_Plume_bpt6k1185893k_1_10_1889.pdf: All pages already extracted



STARTING MULTI-SCHEMA EXTRACTION

SCHEMA 1/7: stage1_page
  ✓ Schema loaded: Stage1PageModel
  Output: /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/predictions/schema_evaluations/stage1_page

EXTRACTION COMPLETE
Total pages:   48
  Written:     0
  Skipped:     48
  Failed:      0

  ✓ stage1_page complete:
     Written: 0
     Skipped: 48
     Failed:  0

SCHEMA 2/7: stage1_page_v2
  ✓ Schema loaded: Stage1PageModel
  Output: /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/predictions/schema_evaluations/stage1_page_v2


2025-11-05 17:41:37,624 | INFO | Processing La_Plume_bpt6k1212187t_15-11-1893.pdf: 0 to extract, 34 already exist
2025-11-05 17:41:37,625 | INFO | ✓ La_Plume_bpt6k1212187t_15-11-1893.pdf: All pages already extracted
2025-11-05 17:41:37,626 | INFO | Found 2 PDF(s) to process



EXTRACTION COMPLETE
Total pages:   48
  Written:     0
  Skipped:     48
  Failed:      0

  ✓ stage1_page_v2 complete:
     Written: 0
     Skipped: 48
     Failed:  0

SCHEMA 3/7: stage1_page_v2_medium
  ✓ Schema loaded: Stage1PageModel
  Output: /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/predictions/schema_evaluations/stage1_page_v2_medium


2025-11-05 17:41:37,649 | INFO | Processing La_Plume_bpt6k1185893k_1_10_1889.pdf: 0 to extract, 14 already exist
2025-11-05 17:41:37,650 | INFO | ✓ La_Plume_bpt6k1185893k_1_10_1889.pdf: All pages already extracted
2025-11-05 17:41:37,701 | INFO | Processing La_Plume_bpt6k1212187t_15-11-1893.pdf: 0 to extract, 34 already exist
2025-11-05 17:41:37,702 | INFO | ✓ La_Plume_bpt6k1212187t_15-11-1893.pdf: All pages already extracted
2025-11-05 17:41:37,703 | INFO | Found 2 PDF(s) to process
2025-11-05 17:41:37,802 | INFO | Processing La_Plume_bpt6k1185893k_1_10_1889.pdf: 0 to extract, 14 already exist
2025-11-05 17:41:37,803 | INFO | ✓ La_Plume_bpt6k1185893k_1_10_1889.pdf: All pages already extracted



EXTRACTION COMPLETE
Total pages:   48
  Written:     0
  Skipped:     48
  Failed:      0

  ✓ stage1_page_v2_medium complete:
     Written: 0
     Skipped: 48
     Failed:  0

SCHEMA 4/7: stage1_page_v2_small
  ✓ Schema loaded: Stage1PageModel
  Output: /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/predictions/schema_evaluations/stage1_page_v2_small


2025-11-05 17:41:37,855 | INFO | Processing La_Plume_bpt6k1212187t_15-11-1893.pdf: 0 to extract, 34 already exist
2025-11-05 17:41:37,856 | INFO | ✓ La_Plume_bpt6k1212187t_15-11-1893.pdf: All pages already extracted
2025-11-05 17:41:37,857 | INFO | Found 2 PDF(s) to process
2025-11-05 17:41:37,878 | INFO | Processing La_Plume_bpt6k1185893k_1_10_1889.pdf: 0 to extract, 14 already exist
2025-11-05 17:41:37,878 | INFO | ✓ La_Plume_bpt6k1185893k_1_10_1889.pdf: All pages already extracted



EXTRACTION COMPLETE
Total pages:   48
  Written:     0
  Skipped:     48
  Failed:      0

  ✓ stage1_page_v2_small complete:
     Written: 0
     Skipped: 48
     Failed:  0

SCHEMA 5/7: stage1_page_v2_pure
  ✓ Schema loaded: Stage1PageModel
  Output: /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/predictions/schema_evaluations/stage1_page_v2_pure


2025-11-05 17:41:37,923 | INFO | Processing La_Plume_bpt6k1212187t_15-11-1893.pdf: 0 to extract, 34 already exist
2025-11-05 17:41:37,924 | INFO | ✓ La_Plume_bpt6k1212187t_15-11-1893.pdf: All pages already extracted
2025-11-05 17:41:37,925 | INFO | Found 2 PDF(s) to process
2025-11-05 17:41:37,942 | INFO | Processing La_Plume_bpt6k1185893k_1_10_1889.pdf: 0 to extract, 14 already exist
2025-11-05 17:41:37,943 | INFO | ✓ La_Plume_bpt6k1185893k_1_10_1889.pdf: All pages already extracted
2025-11-05 17:41:37,983 | INFO | Processing La_Plume_bpt6k1212187t_15-11-1893.pdf: 2 to extract, 32 already exist



EXTRACTION COMPLETE
Total pages:   48
  Written:     0
  Skipped:     48
  Failed:      0

  ✓ stage1_page_v2_pure complete:
     Written: 0
     Skipped: 48
     Failed:  0

SCHEMA 6/7: stage1_page_v2_medium_pure
  ✓ Schema loaded: Stage1PageModel
  Output: /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/predictions/schema_evaluations/stage1_page_v2_medium_pure


  La_Plume_bpt6k1212187t_15-11-1893.pdf:   0%|          | 0/2 [00:00<?, ?it/s]

2025-11-05 17:42:22,690 | INFO | HTTP Request: POST https://api.mistral.ai/v1/ocr "HTTP/1.1 200 OK"
2025-11-05 17:43:01,811 | INFO | HTTP Request: POST https://api.mistral.ai/v1/ocr "HTTP/1.1 200 OK"
2025-11-05 17:43:01,820 | INFO | ✓ La_Plume_bpt6k1212187t_15-11-1893.pdf: 2 written, 32 skipped, 0 failed
2025-11-05 17:43:01,833 | INFO | Found 2 PDF(s) to process
2025-11-05 17:43:01,881 | INFO | Processing La_Plume_bpt6k1185893k_1_10_1889.pdf: 14 to extract, 0 already exist



EXTRACTION COMPLETE
Total pages:   48
  Written:     2
  Skipped:     46
  Failed:      0

  ✓ stage1_page_v2_medium_pure complete:
     Written: 2
     Skipped: 46
     Failed:  0

SCHEMA 7/7: stage1_page_v2_small_pure
  ✓ Schema loaded: Stage1PageModel
  Output: /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/predictions/schema_evaluations/stage1_page_v2_small_pure


  La_Plume_bpt6k1185893k_1_10_1889.pdf:   0%|          | 0/14 [00:00<?, ?it/s]

2025-11-05 17:43:22,580 | INFO | HTTP Request: POST https://api.mistral.ai/v1/ocr "HTTP/1.1 200 OK"
2025-11-05 17:43:37,119 | INFO | HTTP Request: POST https://api.mistral.ai/v1/ocr "HTTP/1.1 200 OK"
2025-11-05 17:43:58,209 | INFO | HTTP Request: POST https://api.mistral.ai/v1/ocr "HTTP/1.1 200 OK"
2025-11-05 17:44:32,708 | INFO | HTTP Request: POST https://api.mistral.ai/v1/ocr "HTTP/1.1 200 OK"
2025-11-05 17:45:01,169 | INFO | HTTP Request: POST https://api.mistral.ai/v1/ocr "HTTP/1.1 200 OK"
2025-11-05 17:45:20,109 | INFO | HTTP Request: POST https://api.mistral.ai/v1/ocr "HTTP/1.1 200 OK"
2025-11-05 17:45:28,300 | INFO | HTTP Request: POST https://api.mistral.ai/v1/ocr "HTTP/1.1 200 OK"
2025-11-05 17:45:35,978 | INFO | HTTP Request: POST https://api.mistral.ai/v1/ocr "HTTP/1.1 200 OK"
2025-11-05 17:46:00,446 | INFO | HTTP Request: POST https://api.mistral.ai/v1/ocr "HTTP/1.1 200 OK"
2025-11-05 17:46:27,068 | INFO | HTTP Request: POST https://api.mistral.ai/v1/ocr "HTTP/1.1 200 OK"


  La_Plume_bpt6k1212187t_15-11-1893.pdf:   0%|          | 0/34 [00:00<?, ?it/s]

2025-11-05 17:48:47,941 | INFO | HTTP Request: POST https://api.mistral.ai/v1/ocr "HTTP/1.1 200 OK"
2025-11-05 17:49:36,058 | INFO | HTTP Request: POST https://api.mistral.ai/v1/ocr "HTTP/1.1 200 OK"
2025-11-05 17:50:22,847 | INFO | HTTP Request: POST https://api.mistral.ai/v1/ocr "HTTP/1.1 200 OK"
2025-11-05 17:51:06,149 | INFO | HTTP Request: POST https://api.mistral.ai/v1/ocr "HTTP/1.1 200 OK"
2025-11-05 17:51:42,292 | INFO | HTTP Request: POST https://api.mistral.ai/v1/ocr "HTTP/1.1 200 OK"
2025-11-05 17:52:27,440 | INFO | HTTP Request: POST https://api.mistral.ai/v1/ocr "HTTP/1.1 200 OK"
2025-11-05 17:53:14,330 | INFO | HTTP Request: POST https://api.mistral.ai/v1/ocr "HTTP/1.1 200 OK"
2025-11-05 17:53:57,635 | INFO | HTTP Request: POST https://api.mistral.ai/v1/ocr "HTTP/1.1 200 OK"
2025-11-05 17:54:30,195 | INFO | HTTP Request: POST https://api.mistral.ai/v1/ocr "HTTP/1.1 200 OK"
2025-11-05 17:55:03,876 | INFO | HTTP Request: POST https://api.mistral.ai/v1/ocr "HTTP/1.1 200 OK"



EXTRACTION COMPLETE
Total pages:   48
  Written:     48
  Skipped:     0
  Failed:      0

  ✓ stage1_page_v2_small_pure complete:
     Written: 48
     Skipped: 0
     Failed:  0

MULTI-SCHEMA EXTRACTION COMPLETE
