In [9]:
"""
Imports and Setup
"""

from __future__ import annotations

import logging
import importlib
from pathlib import Path
from typing import Dict, List, Set, Tuple

# Project imports
from utils.paths import (
    PROJECT_ROOT,
    RAW_DATA,
    PREDICTIONS,
    build_evaluation_path,
    discover_available_magazines,
    discover_existing_extractions,
    generate_all_combinations,
    calculate_missing_extractions
)
from utils.config import MISTRAL_CONFIG, EXTRACTION_CONFIG
from utils.extraction import extract_pdf_pages
from mistralai import Mistral

# Logging configuration
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
)
logger = logging.getLogger("unified_extraction")

print("Unified Extraction - Stage 1")
print("=" * 60)
print(f"Project root: {PROJECT_ROOT}")

Unified Extraction - Stage 1
Project root: /home/fabian-ramirez/Documents/These/Code/magazine_graphs


In [10]:
"""
Mode Selection

Choose your extraction mode:
- 'manual': You specify everything (magazines, model, schema, prompt)
- 'automatic': Auto-discovers everything, extracts missing combinations
"""

MODE = "automatic"  # "manual" or "automatic" for auto-discovery mode

print("\n" + "=" * 60)
print(f"MODE: {MODE.upper()}")
print("=" * 60)


MODE: AUTOMATIC


In [11]:
"""
Manual Mode Configuration

Specify exactly what to extract:
- MAGAZINES: List of PDF names (without .pdf extension)
- MODELS: List of model names
- SCHEMAS: List of schema names
- PROMPTS: List of prompt names (use [] for OCR-only models)
"""

if MODE == "manual":
    # ========================================
    # EDIT THIS SECTION FOR MANUAL MODE
    # ========================================
    
    # Magazines to extract (PDF filenames without .pdf)
    MAGAZINES = [
        "La_Plume_minimal_test",
        # Add more magazines here
    ]
    
    # Models to use (can be multiple)
    MODELS = [
        # "pixtral-large-latest",
        "mistral-ocr-latest",
        "pixtral-12b-latest",
    ]
    
    # Schemas to use (can be multiple)
    SCHEMAS = [
        "stage1_page",
        "stage1_page_v2",
        # Add more schemas here
    ]
    
    # Prompts to use (can be multiple, use [] for OCR-only)
    PROMPTS = [
        "stage1_page_v2",
        # "detailed_v1",
        # Add more prompts here
    ]
    
    # ========================================
    
    print("\nManual Mode Configuration:")
    print(f"  Magazines: {len(MAGAZINES)}")
    for mag in MAGAZINES:
        print(f"    - {mag}")
    print(f"\n  Models: {len(MODELS)}")
    for model in MODELS:
        print(f"    - {model}")
    print(f"\n  Schemas: {len(SCHEMAS)}")
    for schema in SCHEMAS:
        print(f"    - {schema}")
    print(f"\n  Prompts: {len(PROMPTS)}")
    if PROMPTS:
        for prompt in PROMPTS:
            print(f"    - {prompt}")
    else:
        print(f"    - (none - OCR only)")

In [12]:
"""
Automatic Mode - Auto-Discovery

Discovers everything from your project:
- Magazines: from data/raw/*.pdf
- Schemas: from schemas/*.py
- Prompts: from prompts/*.txt
- Models: from hardcoded list (OCR + Vision)
"""

if MODE == "automatic":
    print("\nAutomatic Mode - Discovering resources...")
    print()
    
    # Discover magazines from data/raw/
    MAGAZINES = discover_available_magazines(RAW_DATA)
    print(f"Discovered {len(MAGAZINES)} magazine(s) in {RAW_DATA}:")
    for mag in MAGAZINES:
        print(f"  - {mag}")
    print()
    
    # Discover schemas from schemas/ directory
    schemas_dir = PROJECT_ROOT / "schemas"
    schema_files = sorted(schemas_dir.glob("stage1_*.py"))
    SCHEMAS = [f.stem for f in schema_files if not f.stem.endswith("__init__")]
    print(f"Discovered {len(SCHEMAS)} schema(s) in {schemas_dir}:")
    for schema in SCHEMAS:
        print(f"  - {schema}")
    print()
    
    # Discover prompts from prompts/ directory
    prompts_dir = PROJECT_ROOT / "prompts"
    prompt_files = sorted(prompts_dir.glob("*.txt")) if prompts_dir.exists() else []
    PROMPTS = [f.stem for f in prompt_files]
    print(f"Discovered {len(PROMPTS)} prompt(s) in {prompts_dir}:")
    for prompt in PROMPTS:
        print(f"  - {prompt}")
    print()
    
    # Hardcoded models list (OCR + Vision)
    MODELS = [
        "mistral-ocr-latest",
        "pixtral-12b-latest",
        "pixtral-large-latest",
    ]
    print(f"Using {len(MODELS)} model(s):")
    for model in MODELS:
        print(f"  - {model}")
    print()
    
    # Check for missing resources
    if not MAGAZINES:
        print("WARNING: No magazines found in data/raw/")
    if not SCHEMAS:
        print("WARNING: No schemas found in schemas/")
    if not PROMPTS:
        print("WARNING: No prompts found in prompts/ (OCR models will still work)")


Automatic Mode - Discovering resources...

Discovered 2 magazine(s) in /home/fabian-ramirez/Documents/These/Code/magazine_graphs/data/raw:
  - La_Plume_bpt6k1185893k_1_10_1889
  - La_Plume_bpt6k1212187t_15-11-1893

Discovered 7 schema(s) in /home/fabian-ramirez/Documents/These/Code/magazine_graphs/schemas:
  - stage1_page
  - stage1_page_v2
  - stage1_page_v2_medium
  - stage1_page_v2_medium_pure
  - stage1_page_v2_pure
  - stage1_page_v2_small
  - stage1_page_v2_small_pure

Discovered 2 prompt(s) in /home/fabian-ramirez/Documents/These/Code/magazine_graphs/prompts:
  - stage1_page_v2
  - stage1_page_v2_pure

Using 3 model(s):
  - mistral-ocr-latest
  - pixtral-12b-latest
  - pixtral-large-latest



In [13]:
"""
Planning & Missing Calculation

Calculate what needs to be extracted:
- Manual mode: Generate extraction plan from configuration
- Automatic mode: Calculate missing = (expected - existing)
"""

print("\n" + "=" * 60)
print("EXTRACTION PLANNING")
print("=" * 60)
print()

if MODE == "manual":
    # Manual mode: generate all combinations from specified lists
    try:
        planned_extractions = generate_all_combinations(
            magazines=MAGAZINES,
            models=MODELS,
            schemas=SCHEMAS,
            prompts=PROMPTS)

        print(f"Manual mode: Planning {len(planned_extractions)} extraction(s)")
        print(f"  Magazines:  {len(MAGAZINES)}")
        print(f"  Models:     {len(MODELS)}")
        print(f"  Schema:     {len(SCHEMAS)}")
        print(f"  Prompt:     {len(PROMPTS) if PROMPTS else 0}")

        # Breakdown
        ocr_extractions = [e for e in planned_extractions if e[3] is None]
        vision_extractions = [e for e in planned_extractions if e[3] is not None]

        if ocr_extractions:
            print(f"\n  OCR Extractions:    {len(ocr_extractions)}")
        if vision_extractions:
            print(f"  Vision Extractions: {len(vision_extractions)}")

    except ValueError as e:
        print(f"\nWARNING ERROR: {e}")
        print("\nFor vision models, add prompt files to PROMPTS list")
        raise
    
elif MODE == "automatic":
    # Automatic mode: calculate missing extractions
    print("Calculating missing extractions...")
    print()
    
    # What already exists?
    existing = discover_existing_extractions(PREDICTIONS / "evaluations")
    print(f"Found {len(existing)} existing extraction(s)")
    
    # What should exist?
    try:
        expected = generate_all_combinations(MAGAZINES, MODELS, SCHEMAS, PROMPTS)
        print(f"Expected {len(expected)} total extraction(s)")
    except ValueError as e:
        print(f"\nWARNING  ERROR: {e}")
        print("\nPlease add prompt files to prompts/ directory or use manual mode.")
        raise
    
    # What's missing?
    planned_extractions = expected - existing
    print(f"\nMissing {len(planned_extractions)} extraction(s) (need to run)")
    
    # Breakdown
    ocr_extractions = [e for e in planned_extractions if e[3] is None]
    vision_extractions = [e for e in planned_extractions if e[3] is not None]
    
    print(f"  OCR:    {len(ocr_extractions)}")
    print(f"  Vision: {len(vision_extractions)}")

print()
print(f"Will extract {len(planned_extractions)} combination(s)")


EXTRACTION PLANNING

Calculating missing extractions...

Found 0 existing extraction(s)
Expected 70 total extraction(s)

Missing 70 extraction(s) (need to run)
  OCR:    14
  Vision: 56

Will extract 70 combination(s)


In [14]:
"""
Extraction Preview

Display what will be extracted before running.
"""

print("\n" + "=" * 60)
print("EXTRACTION PREVIEW")
print("=" * 60)
print()

if len(planned_extractions) == 0:
    print("✓ Nothing to extract - all combinations already exist!")
    print()
    print("You can:")
    print("  - Set EXTRACTION_CONFIG.overwrite = True to re-extract")
    print("  - Add new prompts/schemas and run automatic mode again")
    print("  - Switch to manual mode for specific extractions")
else:
    # Group by magazine for clearer display
    from collections import defaultdict
    by_magazine = defaultdict(list)
    
    for mag, model, schema, prompt in sorted(planned_extractions):
        by_magazine[mag].append((model, schema, prompt))
    
    print(f"Planned extractions: {len(planned_extractions)}")
    print()
    
    # Show first 10 or all if <= 10
    items_to_show = list(planned_extractions)[:10]
    
    print(f"{'Magazine':<35} {'Model':<25} {'Schema':<20} {'Prompt'}")
    print("-" * 100)
    
    for mag, model, schema, prompt in sorted(items_to_show):
        prompt_str = prompt or "none"
        print(f"{mag:<35} {model:<25} {schema:<20} {prompt_str}")
    
    if len(planned_extractions) > 10:
        print(f"... and {len(planned_extractions) - 10} more")
    
    print()
    print(f"Output directory: {PREDICTIONS / 'evaluations'}")
    print(f"Structure: {{magazine}}/model={{model}}/schema={{schema}}/prompt={{prompt}}/")


EXTRACTION PREVIEW

Planned extractions: 70

Magazine                            Model                     Schema               Prompt
----------------------------------------------------------------------------------------------------
La_Plume_bpt6k1185893k_1_10_1889    mistral-ocr-latest        stage1_page_v2_small_pure none
La_Plume_bpt6k1185893k_1_10_1889    pixtral-12b-latest        stage1_page_v2       stage1_page_v2
La_Plume_bpt6k1185893k_1_10_1889    pixtral-12b-latest        stage1_page_v2_medium stage1_page_v2
La_Plume_bpt6k1185893k_1_10_1889    pixtral-12b-latest        stage1_page_v2_small_pure stage1_page_v2_pure
La_Plume_bpt6k1185893k_1_10_1889    pixtral-large-latest      stage1_page_v2_medium stage1_page_v2
La_Plume_bpt6k1212187t_15-11-1893   pixtral-12b-latest        stage1_page_v2_medium_pure stage1_page_v2_pure
La_Plume_bpt6k1212187t_15-11-1893   pixtral-12b-latest        stage1_page_v2_pure  stage1_page_v2
La_Plume_bpt6k1212187t_15-11-1893   pixtral-12b-latest     

In [15]:
"""
API Client Setup
"""

def get_mistral_client() -> Mistral:
    """Initialize Mistral client with API key."""
    return Mistral(api_key=MISTRAL_CONFIG.get_api_key())

client = get_mistral_client()

print("\n" + "=" * 60)
print("API SETUP")
print("=" * 60)
print("✓ Mistral client initialized")
print("✓ API key configured")


API SETUP
✓ Mistral client initialized
✓ API key configured


In [None]:
"""
Execute Extractions

Run extractions for all planned combinations.
Progress tracked per extraction with statistics.
"""

if len(planned_extractions) == 0:
    print("\nSkipping extraction - nothing to do")
else:
    print("\n" + "=" * 60)
    print("EXECUTING EXTRACTIONS")
    print("=" * 60)
    print()
    
    # Track results
    all_results = []
    
    for idx, (magazine, model, schema, prompt) in enumerate(sorted(planned_extractions), 1):
        print(f"\n[{idx}/{len(planned_extractions)}] " + "=" * 50)
        prompt_str = prompt or "none"
        print(f"Magazine: {magazine}")
        print(f"Model:    {model}")
        print(f"Schema:   {schema}")
        print(f"Prompt:   {prompt_str}")
        print("=" * 60)
        
        # Load schema class dynamically
        try:
            schema_module = importlib.import_module(f"schemas.{schema}")
            schema_class = schema_module.Stage1PageModel
        except Exception as e:
            logger.error(f"Failed to load schema '{schema}': {e}")
            all_results.append({
                "magazine": magazine,
                "model": model,
                "schema": schema,
                "prompt": prompt,
                "error": str(e)
            })
            continue
        
        # Build output path (magazine-first structure)
        out_root = build_evaluation_path(
            magazine_name=magazine,
            model_name=model,
            schema_name=schema,
            prompt_name=prompt
        )
        
        # Find PDF file
        pdf_path = None
        for pdf_file in RAW_DATA.rglob(f"{magazine}.pdf"):
            pdf_path = pdf_file
            break
        
        if pdf_path is None:
            logger.error(f"PDF not found: {magazine}.pdf")
            all_results.append({
                "magazine": magazine,
                "model": model,
                "schema": schema,
                "prompt": prompt,
                "error": "PDF not found"
            })
            continue
        
        # Run extraction
        try:
            stats = extract_pdf_pages(
                pdf_path=pdf_path,
                schema_class=schema_class,
                client=client,
                out_root=out_root,
                model_name=model,
                prompt_name=prompt,
                overwrite=EXTRACTION_CONFIG.overwrite,
                zero_pad=EXTRACTION_CONFIG.zero_pad,
                max_retries=MISTRAL_CONFIG.max_retries,
                base_delay=MISTRAL_CONFIG.base_delay,
                max_delay=MISTRAL_CONFIG.max_delay,
                use_providers=True
            )
            
            all_results.append({
                "magazine": magazine,
                "model": model,
                "schema": schema,
                "prompt": prompt,
                "written": stats['written'],
                "skipped": stats['skipped'],
                "failed": stats['failed'],
                "total": stats['total']
            })
            
            print(f"\n✓ Extraction complete")
            print(f"  Written: {stats['written']}")
            print(f"  Skipped: {stats['skipped']}")
            print(f"  Failed:  {stats['failed']}")
            
        except Exception as e:
            logger.error(f"Extraction failed: {e}")
            all_results.append({
                "magazine": magazine,
                "model": model,
                "schema": schema,
                "prompt": prompt,
                "error": str(e)
            })
    
    print("\n" + "=" * 60)
    print("EXTRACTION COMPLETE")
    print("=" * 60)

2025-11-12 18:19:06,389 | INFO | Using provider architecture for extraction with model: mistral-ocr-latest



EXECUTING EXTRACTIONS


Magazine: La_Plume_bpt6k1185893k_1_10_1889
Model:    mistral-ocr-latest
Schema:   stage1_page
Prompt:   none


2025-11-12 18:19:06,506 | INFO | Processing La_Plume_bpt6k1185893k_1_10_1889.pdf: 14 to extract, 0 already exist


  La_Plume_bpt6k1185893k_1_10_1889.pdf:   0%|          | 0/14 [00:00<?, ?it/s]

2025-11-12 18:19:18,011 | INFO | HTTP Request: POST https://api.mistral.ai/v1/ocr "HTTP/1.1 200 OK"


In [None]:
"""
Results Summary
"""

if len(planned_extractions) > 0:
    import pandas as pd
    
    # Convert to DataFrame
    df = pd.DataFrame(all_results)
    
    print("\n" + "=" * 60)
    print("RESULTS SUMMARY")
    print("=" * 60)
    print()
    
    # Check for errors
    if 'error' in df.columns:
        errors = df[df['error'].notna()]
        if len(errors) > 0:
            print(f"WARNING  {len(errors)} extraction(s) failed:")
            print()
            for _, row in errors.iterrows():
                prompt_str = row['prompt'] or 'none'
                print(f"  {row['magazine']} / {row['model']} / {row['schema']} / {prompt_str}")
                print(f"    Error: {row['error']}")
            print()
    
    # Successful extractions
    successful = df[df.get('written', pd.Series()).notna()]
    if len(successful) > 0:
        print(f"✓ {len(successful)} extraction(s) completed successfully")
        print()
        print(f"  Total pages written: {successful['written'].sum()}")
        print(f"  Total pages skipped: {successful['skipped'].sum()}")
        print(f"  Total pages failed:  {successful['failed'].sum()}")
        print()
        
        # Group by magazine
        print("By magazine:")
        for magazine in sorted(successful['magazine'].unique()):
            mag_data = successful[successful['magazine'] == magazine]
            print(f"\n  {magazine}:")
            for _, row in mag_data.iterrows():
                prompt_str = row['prompt'] or 'none'
                print(f"    {row['model']} × {row['schema']} × {prompt_str}")
                print(f"      Written: {row['written']}, Skipped: {row['skipped']}, Failed: {row['failed']}")
    
    print("\n" + "=" * 60)
    print("Output directory:")
    print(f"  {PREDICTIONS / 'evaluations'}")
    print()
    print("Next step: Run evaluation notebook (01g_master_evaluation.ipynb)")
    print("=" * 60)
    
    # Display full results table
    print("\nFull results:")
    display(df)