In [None]:
"""
Stage 1 OCR Extraction - Mistral Document AI

Extracts structured page-level data from photographs of historical French literary magazines.

Input:  PDF files in data/raw/
Output: JSON files per page in data/predictions/
Schema: schemas/stage1_page.py
"""

from __future__ import annotations

import os
import json
import base64
import logging
import sys
import time
import random
from pathlib import Path
from typing import Dict, List, Optional


# Project imports
from utils.paths import PROJECT_ROOT, RAW_DATA, PREDICTIONS, ensure_data_dirs
from utils.config import MISTRAL_CONFIG, EXTRACTION_CONFIG
from utils.extraction import extract_all_pdfs

from pypdf import PdfReader
from pydantic import BaseModel, ValidationError

from mistralai import Mistral
from mistralai.extra import response_format_from_pydantic_model

try:
    from tqdm.auto import tqdm
except ImportError:
    def tqdm(x, **kwargs):
        """No-op stand-in for ``tqdm`` used when the package is not installed.

        Args:
            x: The iterable that would have been wrapped in a progress bar.
            **kwargs: Progress-bar options (e.g. ``desc``, ``total``); accepted
                and ignored so call sites do not need to change.

        Returns:
            ``x`` itself, unchanged.
        """
        return x


# Logging: configure the root logger to emit INFO and above, one record per
# line as "<timestamp> | <level> | <message>".
logging.basicConfig(
    format="%(asctime)s | %(levelname)s | %(message)s",
    level=logging.INFO,
)

# Named logger for this notebook's own messages.
logger = logging.getLogger("extraction")


# Banner: title, a 60-character rule, and the resolved project root.
print(
    "Stage 1 OCR Extraction",
    "=" * 60,
    f"Project root: {PROJECT_ROOT}",
    sep="\n",
)

Stage 1 OCR Extraction
Project root: /home/fabian-ramirez/Documents/These/Code/magazine_graphs


In [7]:
"""
Configuration and Path Setup
"""

# Set up the project's data directories before anything reads or writes them.
ensure_data_dirs()

# Local aliases for the centralized paths: PDFs are read from SRC_ROOT and
# per-page predictions are written under DST_PAGES.
SRC_ROOT, DST_PAGES = RAW_DATA, PREDICTIONS

# Summary of the effective settings for this run.
print(
    "\nConfiguration:",
    f"  Source directory: {SRC_ROOT}",
    f"  Output directory: {DST_PAGES}",
    f"  Model: {MISTRAL_CONFIG.model_name}",
    f"  Overwrite existing: {EXTRACTION_CONFIG.overwrite}",
    sep="\n",
)

# API key setup and Mistral client
def get_mistral_client() -> Mistral:
    """Build a Mistral API client from the configured API key.

    The key is obtained from ``MISTRAL_CONFIG.get_api_key()`` at call time.

    Returns:
        A newly constructed ``Mistral`` client; each call creates a new one.
    """
    api_key = MISTRAL_CONFIG.get_api_key()
    return Mistral(api_key=api_key)

# Report the key status from an actual lookup instead of printing
# "Configured" unconditionally, so a missing key is visible in this cell
# rather than only when the extraction cell builds the client.
# NOTE(review): assumes MISTRAL_CONFIG.get_api_key() is cheap to call and
# returns a falsy value (or raises) when no key is available — confirm
# against utils.config.
_api_key_status = "Configured" if MISTRAL_CONFIG.get_api_key() else "MISSING"
print(f"  API key: {_api_key_status}")


Configuration:
  Source directory: /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/raw
  Output directory: /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/predictions
  Model: mistral-ocr-latest
  Overwrite existing: False
  API key: Configured


In [8]:
"""
Load Stage 1 Schema
"""

# Import schema
from schemas.stage1_page import Stage1PageModel, Stage1Item, ITEM_CLASS

# Derive the Mistral API response-format object from the Stage 1 page schema.
DOC_ANNOT_FMT = response_format_from_pydantic_model(Stage1PageModel)

# Summary of the loaded schema and the item classes it declares.
print(
    "\nSchema:",
    f"  Loaded: {Stage1PageModel.__name__}",
    f"  Item classes: {ITEM_CLASS}",
    sep="\n",
)


Schema:
  Loaded: Stage1PageModel
  Item classes: typing.Literal['prose', 'verse', 'ad', 'paratext', 'unknown']


In [None]:
# Run Stage 1 extraction over every PDF found under SRC_ROOT (data/raw/),
# writing results under DST_PAGES.
# NOTE(review): the structure of `results` is defined by
# utils.extraction.extract_all_pdfs and is not visible here.
mistral_client = get_mistral_client()

# Retry settings forwarded to extract_all_pdfs, taken from the Mistral config.
retry_settings = dict(
    max_retries=MISTRAL_CONFIG.max_retries,
    base_delay=MISTRAL_CONFIG.base_delay,
    max_delay=MISTRAL_CONFIG.max_delay,
)

results = extract_all_pdfs(
    src_root=SRC_ROOT,
    schema_class=Stage1PageModel,
    client=mistral_client,
    out_root=DST_PAGES,
    model_name=MISTRAL_CONFIG.model_name,
    overwrite=EXTRACTION_CONFIG.overwrite,
    zero_pad=EXTRACTION_CONFIG.zero_pad,
    **retry_settings,
)