In [1]:
import os
import argparse
import base64
from pathlib import Path
from mistralai import Mistral
from mistralai import DocumentURLChunk
import json
import time

ModuleNotFoundError: No module named 'mistralai'

# MISTRAL PARSING (synchronous)

In [13]:
def process_pdf(pdf_file: Path, output_dir: Path, client: Mistral):
    """Run Mistral OCR on a single PDF and write the results to ``output_dir``.

    The PDF is uploaded, a signed URL is requested for it, and the OCR model
    is run against that URL. Three artefacts are then written:

    * ``response.json`` - the complete OCR response,
    * ``images/<image id>`` - one file per embedded image that carries data,
    * ``text.txt`` - the markdown of every page, separated by a blank line.

    Args:
        pdf_file: Path to the PDF file to process.
        output_dir: Directory where results will be saved (created if missing).
        client: Mistral client instance.

    Returns:
        None. Exceptions raised by the client calls are not caught here.
    """
    print(f"Processing {pdf_file}...")

    # Upload PDF file to Mistral's OCR service
    uploaded_file = client.files.upload(
        file={
            "file_name": pdf_file.name,
            "content": pdf_file.read_bytes(),
        },
        purpose="ocr",
    )

    # Get URL for the uploaded file
    signed_url = client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)

    # Process PDF with OCR, including embedded images
    pdf_response = client.ocr.process(
        document=DocumentURLChunk(document_url=signed_url.url),
        model="mistral-ocr-latest",
        include_image_base64=True
    )

    # Round-trip through the model's own JSON dump so the saved file holds
    # plain JSON types only.
    response_dict = json.loads(pdf_response.model_dump_json())

    output_dir.mkdir(parents=True, exist_ok=True)
    with open(output_dir / "response.json", "w", encoding="utf-8") as f:
        json.dump(response_dict, f)

    images_dir = output_dir / "images"
    images_dir.mkdir(exist_ok=True)

    for page in pdf_response.pages:
        for img in page.images:
            # An image entry without data has nothing to decode.
            if not img.image_base64:
                continue
            # Accept both "data:<mime>;base64,<payload>" and a bare payload:
            # everything after the first comma, or the whole string if there
            # is no comma.
            img_data = img.image_base64.split(",", 1)[-1]
            img_bytes = base64.b64decode(img_data)
            # Keep only the final path component so an id containing
            # separators cannot write outside images_dir.
            with open(images_dir / Path(img.id).name, "wb") as f:
                f.write(img_bytes)

    # A blank line between pages keeps the last line of one page from fusing
    # with the first line of the next.
    with open(output_dir / "text.txt", "w", encoding="utf-8") as f:
        f.write("\n\n".join(page.markdown for page in pdf_response.pages))

In [14]:
# Client is built from the MISTRAL_API_KEY environment variable.
client = Mistral(api_key=os.getenv("MISTRAL_API_KEY"))

# Source document, and the folder that receives response.json, images/ and text.txt.
# NOTE(review): the recorded run of this cell ended in a 400 error because the
# document exceeded the OCR endpoint's 1000-page limit; the input presumably
# has to be split before it can be processed - confirm.
file_path = "data/codigo_derecho_constitiucional/codigo1.pdf"
output_dir = Path('data/codigo_derecho_constitiucional/')
pdf_file = Path(file_path)

process_pdf(pdf_file, output_dir, client)

Processing data/codigo_derecho_constitiucional/codigo.pdf...


SDKError: API error occurred: Status 400
{"object":"error","message":"This document has 1269 pages, which is more than the maximum allowed of 1000.","type":"invalid_request_error","param":null,"code":null}

# MISTRAL PARSING (asynchronous)    
Batch Processing

In [5]:
def _wait_for_batch_job(client, job_id, poll_seconds=2):
    """Poll a batch job until it leaves QUEUED/RUNNING and return the last job object."""
    while True:
        job = client.batch.jobs.get(job_id=job_id)
        done = job.succeeded_requests + job.failed_requests
        total = job.total_requests
        # Guard against a zero total so the percentage never divides by zero.
        percent = round(done / total * 100, 1) if total else 0.0
        print(f"Status: {job.status}")
        print(f"Progress: {done}/{total} ({percent}%)")
        if job.status not in ("QUEUED", "RUNNING"):
            return job
        time.sleep(poll_seconds)


def _save_batch_result(response_dict, output_folder):
    """Write response.json, images/ and text.txt for one line of the batch output file."""
    pdf_output_dir = output_folder / response_dict["custom_id"]
    pdf_output_dir.mkdir(parents=True, exist_ok=True)

    # Always keep the raw record, even when it carries no OCR pages.
    with open(pdf_output_dir / "response.json", "w", encoding="utf-8") as out_f:
        json.dump(response_dict, out_f)

    content = (response_dict.get("response") or {}).get("body") or {}
    pages = content.get("pages")
    if pages is None:
        print(f"No OCR pages returned for {response_dict['custom_id']}; skipping images and text")
        return

    images_dir = pdf_output_dir / "images"
    images_dir.mkdir(exist_ok=True)

    for page in pages:
        for img in page.get("images") or []:
            encoded = img.get("image_base64")
            # An image entry without data has nothing to decode.
            if not encoded:
                continue
            # Accept both "data:<mime>;base64,<payload>" and a bare payload.
            img_bytes = base64.b64decode(encoded.split(",", 1)[-1])
            # Keep only the final path component so an id containing
            # separators cannot write outside images_dir.
            with open(images_dir / Path(img["id"]).name, "wb") as img_f:
                img_f.write(img_bytes)

    # A blank line between pages keeps the last line of one page from fusing
    # with the first line of the next.
    with open(pdf_output_dir / "text.txt", "w", encoding="utf-8") as text_f:
        text_f.write("\n\n".join(page["markdown"] for page in pages))


def process_pdf_batch(input_folder: Path, output_folder: Path, client: Mistral):
    """Process all PDFs in a folder using Mistral's batch OCR processing.

    Every ``*.pdf`` found recursively under ``input_folder`` is uploaded and
    listed in a JSONL batch file. A batch job is created for that file and
    polled until it is no longer queued or running. The job's output file is
    then downloaded and, for each result line, ``response.json``, ``images/``
    and ``text.txt`` are written to ``output_folder/<custom_id>/``.

    ``custom_id`` is the PDF's stem. When the same stem occurs more than once
    in the tree, the relative path joined with ``__`` is used instead so the
    results do not overwrite each other.

    Args:
        input_folder: Directory containing PDF files
        output_folder: Directory to save the processed output
        client: Mistral client instance

    Raises:
        RuntimeError: If the job stops polling without an output file.
    """
    # Get all PDF files in input folder
    pdf_files = list(input_folder.glob("**/*.pdf"))
    if not pdf_files:
        print(f"No PDF files found in {input_folder}")
        return

    # Create intermediate processing directory
    intermediate_dir = output_folder / "_processing"
    intermediate_dir.mkdir(parents=True, exist_ok=True)

    # Create batch file
    batch_file = intermediate_dir / f"batch_{int(time.time())}.jsonl"

    # Count stems so identically named PDFs in different subfolders can be
    # given distinct identifiers.
    stem_counts = {}
    for pdf_file in pdf_files:
        stem_counts[pdf_file.stem] = stem_counts.get(pdf_file.stem, 0) + 1

    with open(batch_file, "w", encoding="utf-8") as f:
        for pdf_file in pdf_files:
            if stem_counts[pdf_file.stem] == 1:
                # Use the PDF filename as the identifier
                file_id = pdf_file.stem
            else:
                relative = pdf_file.relative_to(input_folder).with_suffix("")
                file_id = "__".join(relative.parts)

            # Upload PDF to Mistral
            uploaded_file = client.files.upload(
                file={
                    "file_name": pdf_file.name,
                    "content": pdf_file.read_bytes(),
                },
                purpose="ocr",
            )

            # NOTE(review): expiry is presumably expressed in hours; a job
            # that stays queued longer than that could lose access to the
            # document - confirm against the API reference.
            signed_url = client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)

            entry = {
                "custom_id": file_id,
                "body": {
                    "document": {
                        "type": "document_url",
                        "document_url": signed_url.url,
                    },
                    "include_image_base64": True
                }
            }
            f.write(json.dumps(entry) + "\n")

    print(f"Created batch file with {len(pdf_files)} PDFs")
    print("Starting batch processing...")

    # Upload batch file; read_bytes() avoids leaving an open file handle behind.
    batch_data = client.files.upload(
        file={
            "file_name": batch_file.name,
            "content": batch_file.read_bytes(),
        },
        purpose="batch",
    )

    # Create and monitor job
    created_job = client.batch.jobs.create(
        input_files=[batch_data.id],
        model="mistral-ocr-latest",
        endpoint="/v1/ocr",
        metadata={"job_type": "pdf_processing"}
    )

    print(f"Job created with ID: {created_job.id}")

    retrieved_job = _wait_for_batch_job(client, created_job.id)

    if not retrieved_job.output_file:
        raise RuntimeError(
            f"Batch job {created_job.id} ended with status {retrieved_job.status} "
            "and produced no output file"
        )

    # Download and process results
    print("Downloading results...")
    downloaded_file = client.files.download(file_id=retrieved_job.output_file)
    download_path = intermediate_dir / f"output_{int(time.time())}.jsonl"

    # Write the raw bytes: decoding chunk by chunk breaks when a multi-byte
    # UTF-8 character is split across two chunks.
    with open(download_path, "wb") as f:
        for chunk in downloaded_file.stream:
            f.write(chunk)

    # Process each result
    with open(download_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            _save_batch_result(json.loads(line), output_folder)

    print(f"Processing complete. Results saved in {output_folder}")


In [9]:
# Client is built from the MISTRAL_API_KEY environment variable.
client = Mistral(api_key=os.getenv("MISTRAL_API_KEY"))

# Folder scanned for PDFs, and the folder that receives one sub-directory per document.
output_dir = Path('data/mistral_parsing_batch/')
input_dir = Path('data/raw_small/')

process_pdf_batch(input_dir, output_dir, client)

Created batch file with 3 PDFs
Starting batch processing...
Job created with ID: 893afcd0-e095-4745-a774-4b706e5cde42
Status: RUNNING
Progress: 0/3 (0.0%)
Status: RUNNING
Progress: 0/3 (0.0%)
Status: RUNNING
Progress: 0/3 (0.0%)
Status: RUNNING
Progress: 0/3 (0.0%)
Status: RUNNING
Progress: 0/3 (0.0%)
Status: SUCCESS
Progress: 3/3 (100.0%)
Downloading results...
Processing complete. Results saved in data/mistral_parsing_batch
