In [1]:
"""
Stage 1 OCR Extraction - Mistral Document AI

Extracts structured page-level data from photographs of historical French literary magazines.

Input:  PDF files in data/raw/
Output: JSON files per page in data/interim_pages/
Schema: schemas/stage1_page.py
"""

from __future__ import annotations

import os
import json
import base64
import logging
import sys
import time
import random
from pathlib import Path
from typing import Dict, List, Optional


from pypdf import PdfReader
from pydantic import BaseModel, ValidationError

from mistralai import Mistral
from mistralai.extra import response_format_from_pydantic_model

try:
    from tqdm.auto import tqdm
except ImportError:
    tqdm = lambda x, **kwargs: x


# Send INFO-and-above records to the root handler with a timestamped, pipe-separated layout.
logging.basicConfig(format="%(asctime)s | %(levelname)s | %(message)s", level=logging.INFO)
logger = logging.getLogger("extraction")

# When the kernel was started from inside a "notebooks" folder, the project root is its parent;
# otherwise the current working directory is taken as the root.
PROJECT_ROOT = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()

# Banner: title, a 60-character rule, and the resolved root (one item per line).
print(
    "Stage 1 OCR Extraction",
    "=" * 60,
    f"Project root: {PROJECT_ROOT}",
    sep="\n",
)

Stage 1 OCR Extraction
Project root: /home/fabian-ramirez/Documents/These/Code/magazine_graphs


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
"""
Configuration and Path Setup
"""

# Source PDFs are read from data/raw; per-page JSON results are written to data/interim_pages.
SRC_ROOT = PROJECT_ROOT.joinpath("data", "raw")
DST_PAGES = PROJECT_ROOT.joinpath("data", "interim_pages")

# Make sure both folders are present before anything tries to read or write them.
for directory in (SRC_ROOT, DST_PAGES):
    directory.mkdir(parents=True, exist_ok=True)

# Tunable extraction settings, read by the functions defined in later cells.
CONFIG = dict(
    model_name="mistral-ocr-latest",
    overwrite=False,   # when False, pages whose JSON already exists are skipped
    zero_pad=3,        # width of the page number in output file names (001, 002, ...)
    max_retries=3,     # attempts per API call
    base_delay=1.0,    # first backoff delay, in seconds
    max_delay=8.0,     # cap on the backoff delay, in seconds
)

# Echo the effective settings so they are recorded in the cell output.
print(
    "\nConfiguration:",
    f"  Source directory: {SRC_ROOT}",
    f"  Output directory: {DST_PAGES}",
    f"  Model: {CONFIG['model_name']}",
    f"  Overwrite existing: {CONFIG['overwrite']}",
    sep="\n",
)
# API key setup
def read_api_key(
    env_var: str = "MISTRAL_API_KEY",
    fallback_file: Path = PROJECT_ROOT / "api_key",
) -> str:
    """
    Read the Mistral API key from an environment variable or a fallback file.

    The environment variable is checked first. A value that is empty or
    whitespace-only is treated as "not set", and the same applies to the
    contents of the fallback file, so an unusable blank key is never returned.

    Args:
        env_var: Name of the environment variable to look up
        fallback_file: Path to a text file (UTF-8) whose content is the key

    Returns:
        The API key with surrounding whitespace removed

    Raises:
        RuntimeError: If neither source yields a non-empty key
    """
    # Strip before testing so that a value such as " " does not count as a key.
    key = (os.environ.get(env_var) or "").strip()
    if key:
        return key

    # is_file() (rather than exists()) keeps a directory of that name from
    # surfacing as an unrelated IsADirectoryError.
    if fallback_file.is_file():
        key = fallback_file.read_text(encoding="utf-8").strip()
        if key:
            return key

    raise RuntimeError(
        f"{env_var} is not set (or is blank) and fallback file '{fallback_file}' "
        f"is missing or empty. Please set the {env_var} environment variable "
        "or put the key in that file."
    )

def get_mistral_client() -> Mistral:
    """
    Build a Mistral client authenticated with the key from read_api_key().

    Returns:
        A new Mistral client instance
    """
    api_key = read_api_key()
    return Mistral(api_key=api_key)

# Actually resolve the key so the status line reflects reality; a missing key is
# reported here instead of surfacing later in the middle of an extraction run.
try:
    read_api_key()
    print("  API key: Configured")
except RuntimeError as exc:
    print(f"  API key: NOT CONFIGURED ({exc})")


Configuration:
  Source directory: /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/raw
  Output directory: /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/interim_pages
  Model: mistral-ocr-latest
  Overwrite existing: False
  API key: Configured


In [3]:
"""
Load Stage 1 Schema
"""

# Add schemas directory to Python path.
# The guard keeps re-running this cell from inserting the same entry repeatedly.
# This must happen before the `stage1_page` import below, which is resolved via sys.path.
SCHEMAS_DIR = PROJECT_ROOT / "schemas"
if str(SCHEMAS_DIR) not in sys.path:
    sys.path.insert(0, str(SCHEMAS_DIR))

# Import schema (module stage1_page, found through the sys.path entry added above)
from stage1_page import Stage1PageModel, Stage1Item, ITEM_CLASS

# Generate response format for Mistral API from the page model;
# it is passed as `document_annotation_format` in the OCR call.
DOC_ANNOT_FMT = response_format_from_pydantic_model(Stage1PageModel)

# Report what was loaded so the cell output documents the schema in use
print("\nSchema:")
print(f"  Loaded: {Stage1PageModel.__name__}")
print(f"  Item classes: {ITEM_CLASS}")


Schema:
  Loaded: Stage1PageModel
  Item classes: typing.Literal['prose', 'verse', 'ad', 'paratext', 'unknown']


In [4]:
"""
PDF Processing Utilities
"""

def count_pages(pdf_path: Path) -> int:
    """
    Return how many pages a PDF has, or 0 when it cannot be read.

    Any exception raised while opening or parsing the file is logged as a
    warning and reported as 0 pages. An encrypted PDF that cannot be
    decrypted with an empty password is likewise reported as 0 pages.

    Args:
        pdf_path: Path to PDF file

    Returns:
        Number of pages (0 if file cannot be read)
    """
    page_total = 0
    try:
        with pdf_path.open("rb") as stream:
            # Prefer the lenient parser; retry without the keyword if this
            # PdfReader does not accept a 'strict' argument.
            try:
                pdf = PdfReader(stream, strict=False)
            except TypeError:
                pdf = PdfReader(stream)

            encrypted = getattr(pdf, "is_encrypted", False)
            if encrypted and pdf.decrypt("") == 0:
                logger.warning(f"Encrypted PDF (cannot decrypt): {pdf_path.name}")
            else:
                page_total = len(pdf.pages)
    except Exception as exc:
        logger.warning(f"Could not read {pdf_path.name}: {exc}")
        page_total = 0
    return page_total

def encode_file_to_data_url(path: Path, mime: str = "application/pdf") -> str:
    """
    Build a base64 data URL from the raw bytes of a file.

    Args:
        path: File whose bytes are embedded in the URL
        mime: MIME type placed in the URL header

    Returns:
        A string of the form data:<mime>;base64,<encoded_content>
    """
    raw_bytes = path.read_bytes()
    # Base64 output is pure ASCII, so it decodes to the same text either way.
    payload = base64.b64encode(raw_bytes).decode("ascii")
    return "data:{};base64,{}".format(mime, payload)

def chunks(seq, size):
    """
    Yield consecutive slices of a sequence, each at most `size` items long.

    The last slice is shorter when len(seq) is not a multiple of size.
    Because this is a generator, the size check runs on first iteration.

    Args:
        seq: Sequence supporting len() and slicing
        size: Maximum number of items per slice; must be at least 1

    Yields:
        Slices of seq, in order

    Raises:
        ValueError: If size is smaller than 1
    """
    # A negative step would make range() empty and silently yield nothing.
    if size < 1:
        raise ValueError(f"size must be a positive integer, got {size!r}")
    for start in range(0, len(seq), size):
        yield seq[start:start + size]

def parse_annotation_response(resp) -> dict:
    """
    Extract the annotation dict from a Mistral OCR response.

    Two locations are tried, in order:
    - resp.document_annotation
    - resp.pages[0].document_annotation (fallback)

    At each location the value may be a dict or a JSON string. A value is
    accepted only if it is (or decodes to) a dict; JSON that decodes to
    something else (list, null, number, ...) is treated as unusable, so the
    function always honours its `dict` return type.

    Args:
        resp: Mistral OCR API response object

    Returns:
        Annotation dict (empty dict if nothing usable is found)
    """
    def _as_dict(value):
        """Return value as a dict, or None when it is not / does not decode to one."""
        if isinstance(value, str):
            try:
                value = json.loads(value)
            except json.JSONDecodeError:
                return None
        return value if isinstance(value, dict) else None

    # Top-level annotation takes precedence when it is usable.
    top_level = _as_dict(getattr(resp, "document_annotation", None))
    if top_level is not None:
        return top_level

    # Otherwise look at the first page of the response.
    pages = getattr(resp, "pages", None) or []
    if pages:
        page_level = _as_dict(getattr(pages[0], "document_annotation", None))
        if page_level is not None:
            return page_level

    return {}

def call_with_retry(fn, *, retries: int = 3, base_delay: float = 1.0, max_delay: float = 8.0):
    """
    Call a function, retrying with exponential backoff when it raises.

    Args:
        fn: Function to call (no arguments)
        retries: Total number of attempts, including the first call. Values
            below 1 are treated as 1 so that fn is always called at least once.
        base_delay: Backoff before the second attempt (seconds); it doubles
            after every further failure
        max_delay: Cap applied to the doubled backoff (seconds). Random jitter
            of up to 25% is added after the cap, so a single sleep can reach
            1.25 * max_delay.

    Returns:
        Whatever fn returns on its first successful call

    Raises:
        Exception: The exception from the final attempt, if every attempt fails
    """
    # Without this floor, retries <= 0 would skip the loop and return None
    # without ever calling fn.
    attempts = max(1, retries)

    for attempt in range(attempts):
        try:
            return fn()
        except Exception as e:
            # Out of attempts: let the caller see the original exception.
            if attempt == attempts - 1:
                raise

            backoff = min(max_delay, base_delay * (2 ** attempt))
            sleep_for = backoff * (1 + 0.25 * random.random())

            logger.warning(f"API call failed ({e}). Retrying in {sleep_for:.1f}s...")
            time.sleep(sleep_for)

def validate_extraction(annot: dict, page_number: int, pdf_name: str) -> tuple[bool, List[str]]:
    """
    Validate an extracted annotation and collect human-readable warnings.

    The checks are defensive: malformed content (a non-list 'items', items
    that are not dicts, non-string text) is reported as a warning instead of
    raising, so the caller can keep treating validation as non-blocking.

    Args:
        annot: Annotation dictionary
        page_number: Page number (1-indexed); not used by the checks themselves
        pdf_name: PDF filename; not used by the checks themselves

    Returns:
        Tuple of (is_valid, list_of_warnings). is_valid is False when 'items'
        is missing or not a list, or when Stage1PageModel rejects the data.
    """
    issues: List[str] = []

    # Check if items exist
    if "items" not in annot:
        issues.append("Missing 'items' field")
        return False, issues

    items = annot["items"]

    # Anything other than a list cannot be inspected item by item.
    if not isinstance(items, list):
        issues.append(f"'items' is not a list (got {type(items).__name__})")
        return False, issues

    # Empty pages are valid but worth noting
    if len(items) == 0:
        issues.append("Zero items extracted (possibly blank page)")

    # Flag suspiciously short items
    for idx, item in enumerate(items):
        if not isinstance(item, dict):
            issues.append(f"Item {idx} is not an object (got {type(item).__name__})")
            continue
        text = item.get("item_text_raw")
        if not isinstance(text, str):
            text = ""  # missing / null text is measured as zero characters
        if len(text) < 3:
            issues.append(f"Item {idx} has very short text ({len(text)} chars)")

    # Schema validation with Pydantic
    try:
        Stage1PageModel(**annot)
    except ValidationError as e:
        issues.append(f"Schema validation failed: {e}")
        return False, issues

    return True, issues

In [7]:
"""
Core Extraction Function - Per-Page Processing
"""

def extract_pdf_pages(
    pdf_path: Path,
    out_root: Path = DST_PAGES,
    overwrite: Optional[bool] = None,
) -> Dict[str, int]:
    """
    Extract structured data from all pages of a PDF.

    Creates one JSON file per page in:
    out_root / <pdf path relative to SRC_ROOT, without suffix> / <pdf_name>__page-001.json
    (a PDF that is not located under SRC_ROOT is written to out_root / <pdf_name>).

    Each page is requested from the OCR endpoint separately, using its
    zero-based index. Pages whose JSON already exists are skipped unless
    overwriting is enabled. A page counts as failed, and no file is written,
    when the API call fails after all retries, when the response carries no
    usable annotation, or when the file cannot be written. Files are written
    to a temporary name and then renamed, so an interrupted run does not
    leave a truncated JSON that a later run would skip.

    Args:
        pdf_path: Path to PDF file
        out_root: Root directory for output
        overwrite: Override CONFIG["overwrite"] if specified (None = use CONFIG)

    Returns:
        Dict with statistics: {"written": n, "skipped": n, "failed": n, "total": n}
    """
    # Count pages
    n_pages = count_pages(pdf_path)
    if n_pages == 0:
        logger.warning(f"No pages found in {pdf_path.name}")
        return {"written": 0, "skipped": 0, "failed": 0, "total": 0}

    # Setup output directory, mirroring the layout below SRC_ROOT when possible
    try:
        rel_path = pdf_path.relative_to(SRC_ROOT).with_suffix("")
    except ValueError:
        # pdf_path is outside SRC_ROOT: fall back to a folder named after the file
        rel_path = Path(pdf_path.stem)
    out_dir = out_root / rel_path
    out_dir.mkdir(parents=True, exist_ok=True)

    # Use CONFIG value unless overridden
    should_overwrite = CONFIG["overwrite"] if overwrite is None else overwrite

    # Statistics
    stats = {"written": 0, "skipped": 0, "failed": 0, "total": n_pages}

    # The client and the base64 payload are only built once a page actually
    # needs processing, so a fully extracted PDF needs neither an API key nor
    # a read of the whole file.
    client = None
    data_url = None

    # Process each page
    logger.info(f"Processing {pdf_path.name} ({n_pages} pages)")

    for page_idx in tqdm(range(n_pages), desc=f"  {pdf_path.name}", leave=False):
        page_num = page_idx + 1
        out_json = out_dir / f"{pdf_path.stem}__page-{page_num:0{CONFIG['zero_pad']}d}.json"

        # Skip if exists and not overwriting
        if out_json.exists() and not should_overwrite:
            stats["skipped"] += 1
            continue

        # Deliberately outside the try below: a missing API key or an unreadable
        # PDF should stop the run, not be logged once per page.
        if client is None:
            client = get_mistral_client()
            data_url = encode_file_to_data_url(pdf_path)

        # Call API with retry logic
        try:
            def _call():
                return client.ocr.process(
                    model=CONFIG["model_name"],
                    document={"type": "document_url", "document_url": data_url},
                    pages=[page_idx],
                    document_annotation_format=DOC_ANNOT_FMT,
                    include_image_base64=False,
                )

            resp = call_with_retry(
                _call,
                retries=CONFIG["max_retries"],
                base_delay=CONFIG["base_delay"],
                max_delay=CONFIG["max_delay"],
            )

        except Exception as e:
            logger.error(f"Page {page_num} failed after {CONFIG['max_retries']} retries: {e}")
            stats["failed"] += 1
            continue

        # Parse response
        annot = parse_annotation_response(resp)

        # An empty or non-dict result means no annotation came back. Writing a
        # placeholder would look like a genuine blank page and be skipped on
        # later runs, so record a failure and leave the page retryable.
        if not isinstance(annot, dict) or not annot:
            logger.error(f"Page {page_num}: no usable annotation in OCR response; nothing written")
            stats["failed"] += 1
            continue

        # Ensure items key exists
        annot.setdefault("items", [])

        # Validate (but don't block writing); an error inside validation is
        # downgraded to a warning for the same reason.
        try:
            _is_valid, warnings = validate_extraction(annot, page_num, pdf_path.name)
        except Exception as e:
            warnings = [f"Validation raised {type(e).__name__}: {e}"]
        for warning in warnings:
            logger.warning(f"Page {page_num}: {warning}")

        # Write output: temp file first, then rename onto the final name
        tmp_json = out_json.with_name(out_json.name + ".tmp")
        try:
            tmp_json.write_text(
                json.dumps(annot, ensure_ascii=False, indent=2),
                encoding="utf-8"
            )
            tmp_json.replace(out_json)
            stats["written"] += 1

        except Exception as e:
            logger.error(f"Failed to write {out_json.name}: {e}")
            stats["failed"] += 1
            # Best-effort removal of a partial temp file; it may not exist.
            try:
                tmp_json.unlink()
            except OSError:
                pass

    # Log summary
    logger.info(
        f"✓ {pdf_path.name}: "
        f"{stats['written']} written, "
        f"{stats['skipped']} skipped, "
        f"{stats['failed']} failed"
    )

    return stats


def extract_all_pdfs(src_root: Path = SRC_ROOT) -> Dict[str, int]:
    """
    Extract every PDF found (recursively) below a source directory.

    Files are matched on a case-insensitive ".pdf" suffix, so "scan.PDF" is
    picked up as well, and are processed in sorted path order. Each file is
    handed to extract_pdf_pages(), and a combined summary is printed at the end.

    Args:
        src_root: Directory that is searched recursively for PDF files

    Returns:
        Combined statistics across all PDFs:
        {"written": n, "skipped": n, "failed": n, "total": n}
    """
    # rglob("*.pdf") would be case-sensitive on POSIX systems, hence the suffix test.
    pdfs = sorted(
        p for p in src_root.rglob("*")
        if p.is_file() and p.suffix.lower() == ".pdf"
    )

    if not pdfs:
        logger.warning(f"No PDF files found in {src_root}")
        return {"written": 0, "skipped": 0, "failed": 0, "total": 0}

    logger.info(f"Found {len(pdfs)} PDF(s) to process")

    # Accumulate statistics
    total_stats = {"written": 0, "skipped": 0, "failed": 0, "total": 0}

    for pdf_path in pdfs:
        stats = extract_pdf_pages(pdf_path)
        for key in total_stats:
            total_stats[key] += stats[key]

    # Final summary
    print("\n" + "=" * 60)
    print("EXTRACTION COMPLETE")
    print("=" * 60)
    print(f"Total pages:   {total_stats['total']}")
    print(f"  Written:     {total_stats['written']}")
    print(f"  Skipped:     {total_stats['skipped']}")
    print(f"  Failed:      {total_stats['failed']}")
    print("=" * 60)

    return total_stats

# Execute extraction on all PDFs in data/raw/ (the default src_root, SRC_ROOT).
# `results` holds the combined counters: written / skipped / failed / total.
results = extract_all_pdfs()

2025-10-22 00:03:57,265 | INFO | Found 1 PDF(s) to process
2025-10-22 00:03:57,415 | INFO | Processing La_Plume_bpt6k1185893k_1_10_1889.pdf (14 pages)
  La_Plume_bpt6k1185893k_1_10_1889.pdf:   0%|          | 0/14 [00:00<?, ?it/s]2025-10-22 00:06:01,558 | INFO | HTTP Request: POST https://api.mistral.ai/v1/ocr "HTTP/1.1 502 Bad Gateway"
<!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js ie6 oldie" lang="en-US"> <![endif]-->
<!--[if IE 7]>    <html class="no-js ie7 oldie" lang="en-US"> <![endif]-->
<!--[if IE 8]>    <html class="no-js ie8 oldie" lang="en-US"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en-US"> <!--<![endif]-->
<head>

<title> | 500: Internal server error</title>
<meta charset="UTF-8" />
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta http-equiv="X-UA-Compatible" content="IE=Edge" />
<meta name="robots" content="noindex, nofollow" />
<meta name="viewport" content="width=device-width,initial-scale=1" />
<link rel="stylesheet"


EXTRACTION COMPLETE
Total pages:   14
  Written:     14
  Skipped:     0
  Failed:      0
