In [31]:
#!/usr/bin/env python3
# surya_batched_processing.py

import json
import os
from PIL import Image
from pypdf import PdfReader
import fitz  # PyMuPDF for rendering PDF pages to images

from surya.recognition import RecognitionPredictor
from surya.detection import DetectionPredictor
from surya.layout import LayoutPredictor
from surya.table_rec import TableRecPredictor

# --- Configuration ---
# Input PDF. Set the SURYA_PDF_PATH environment variable to point at a different
# file; the fallback below is the original author's local path and only exists
# on that machine.
PDF_PATH    = os.environ.get(
    "SURYA_PDF_PATH",
    "/Users/nicholasmcintosh/Documents/sandbox_code/project/data/raw/ANTI_OEDIPUS_FR.pdf",
)
# Number of PDF pages rendered and sent to the predictors per batch.
BATCH_SIZE  = 10
# Directory (relative to the working directory) that receives the per-batch JSON files.
OUTPUT_DIR  = "surya_batch_results"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- Helper: Recursive serialization for Surya result objects ---
def serialize_surya(obj):
    """Recursively convert a result object into JSON-friendly data.

    Rules, checked in this order:
      * an object exposing ``to_dict()`` -> the value returned by ``to_dict()``
        (returned as-is, not recursed into);
      * a dict -> a new dict with the same keys and each value converted;
      * a list or tuple -> a list with each item converted;
      * an object with ``__dict__`` -> a dict of its instance attributes,
        each value converted;
      * anything else -> returned unchanged.

    Args:
        obj: Any value; typically a predictor result or a container of them.

    Returns:
        The converted structure. Leaves that match none of the rules above are
        passed through untouched, so ``json.dump`` may still reject them.
    """
    if hasattr(obj, "to_dict"):
        return obj.to_dict()
    if isinstance(obj, dict):
        # Without this branch a dict is returned untouched and any objects
        # stored in its values reach json.dump unserialized.
        return {key: serialize_surya(value) for key, value in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [serialize_surya(item) for item in obj]
    if hasattr(obj, "__dict__"):
        return {key: serialize_surya(value) for key, value in obj.__dict__.items()}
    return obj


In [26]:
# Cell 2: Initialize Surya Predictors
# Builds the four predictor objects as notebook-level globals. The classes come
# from the import cell at the top, so that cell must have been run first.
# NOTE(review): the saved execution counts show this cell (In [26]) ran before
# the import/config cell (In [31]) — confirm the notebook survives Restart & Run All.

print("Initializing Surya predictors...")

# Text detector. The processing cell passes this object to the recognition
# predictor through its det_predictor argument.
detection_predictor = DetectionPredictor() # detection model instance

# OCR predictor; constructed here without arguments.
recognition_predictor = RecognitionPredictor() # called on each batch of page images

# Layout predictor; called on each batch of page images in the processing cell.
layout_predictor = LayoutPredictor()

# Table recognition predictor. Not referenced by the processing cell shown in
# this notebook — presumably kept for a later table step; verify before removing.
table_rec_predictor = TableRecPredictor()

print("All Surya predictors initialized successfully.")

Initializing Surya predictors...
All Surya predictors initialized successfully.


In [32]:
print(f"Starting batched processing of '{os.path.basename(PDF_PATH)}'...")


def _save_predictions(predictions, out_path):
    """Serialize a list of predictions to JSON at out_path.

    The data is written to ``out_path + ".tmp"`` first and then moved into
    place with os.replace, so an interrupted run never leaves a truncated file
    under the final name.
    """
    tmp_path = out_path + ".tmp"
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump([serialize_surya(p) for p in predictions], f, indent=4, ensure_ascii=False)
    os.replace(tmp_path, out_path)


# Render the PDF in batches of BATCH_SIZE pages; for each batch run OCR and
# layout analysis and write one JSON file per step into OUTPUT_DIR.
doc = fitz.open(PDF_PATH)
try:
    total_pages = len(doc)
    # Ceiling division, computed once instead of inside every progress message.
    total_batches = (total_pages + BATCH_SIZE - 1) // BATCH_SIZE
    print(f"Total pages in PDF: {total_pages}")

    for batch_num, start_idx in enumerate(range(0, total_pages, BATCH_SIZE), start=1):
        end_idx = min(start_idx + BATCH_SIZE, total_pages)
        # 1-based, inclusive page labels used in messages and file names.
        first_page, last_page = start_idx + 1, end_idx
        print(f"\nProcessing pages {first_page}-{last_page} (Batch {batch_num} of {total_batches})...")

        # --- Load batch images from PDF ---
        # Every start_idx produced by range() is < total_pages, so the batch
        # always contains at least one page.
        batch_images = []
        for page_idx in range(start_idx, end_idx):
            page = doc.load_page(page_idx)
            # alpha=False keeps the buffer at 3 bytes/pixel, matching the "RGB"
            # mode passed to Image.frombytes below.
            # NOTE(review): no dpi/matrix is given, so pages render at PyMuPDF's
            # default resolution — consider raising it if OCR quality is poor.
            pix = page.get_pixmap(alpha=False)
            batch_images.append(Image.frombytes("RGB", (pix.width, pix.height), pix.samples))

        # --- Step 1: OCR ---
        print(f"  Running OCR for pages {first_page}-{last_page}...")
        rec_preds = recognition_predictor(batch_images, det_predictor=detection_predictor)
        ocr_path = os.path.join(OUTPUT_DIR, f"ocr_results_pages_{first_page}_to_{last_page}.json")
        _save_predictions(rec_preds, ocr_path)
        print(f"  OCR results saved to {ocr_path}")

        # --- Step 2: Layout Analysis ---
        print(f"  Running Layout Analysis for pages {first_page}-{last_page}...")
        layout_preds = layout_predictor(batch_images)
        layout_path = os.path.join(OUTPUT_DIR, f"layout_results_pages_{first_page}_to_{last_page}.json")
        _save_predictions(layout_preds, layout_path)
        print(f"  Layout results saved to {layout_path}")

except Exception as e:
    # Report, then re-raise: the loop has already been aborted, so swallowing
    # the error would only hide the traceback and let the notebook claim success.
    print(f"Unexpected error: {e}")
    raise
else:
    # Only reached when every batch finished without an exception.
    print("\nBatched processing complete. Results saved to:", OUTPUT_DIR)
finally:
    doc.close()

Starting batched processing of 'ANTI_OEDIPUS_FR.pdf'...
Total pages in PDF: 451

Processing pages 1-10 (Batch 1 of 46)...
  Running OCR for pages 1-10...


Detecting bboxes: 100%|██████████| 2/2 [00:21<00:00, 10.75s/it]
Recognizing Text: 100%|██████████| 215/215 [01:29<00:00,  2.40it/s]


  OCR results saved to surya_batch_results/ocr_results_pages_1_to_10.json
  Running Layout Analysis for pages 1-10...


Recognizing layout: 100%|██████████| 3/3 [00:46<00:00, 15.53s/it]


  Layout results saved to surya_batch_results/layout_results_pages_1_to_10.json

Processing pages 11-20 (Batch 2 of 46)...
  Running OCR for pages 11-20...


Detecting bboxes: 100%|██████████| 2/2 [00:15<00:00,  7.68s/it]
Recognizing Text:  21%|██        | 77/369 [01:20<01:02,  4.66it/s] 

: 