# Batch PDF Processing - Philatelic RAG

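Processes every PDF in the `pdfs/` folder with the Dolphin model and writes one philatelic-enriched JSON file per document to `results/parsed_jsons/`. PDFs that already have an output file are skipped.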

In [None]:
import os
import json
import torch
from pathlib import Path
from tqdm.auto import tqdm
from PIL import Image
from omegaconf import OmegaConf

from demo_page import *
from philatelic_patterns import *
from dolphin_transformer import transform_dolphin_to_oxcart_preserving_labels

In [None]:
# Setup: input/output locations, batch size, and one shared Dolphin model.
config_path = "./config/Dolphin.yaml"
pdfs_dir = Path("./pdfs")
save_dir = "./results"
parsed_jsons_dir = Path("./results/parsed_jsons")
max_batch_size = 8  # Increased from 4 for better throughput

# Initialize the model once; it is reused for every PDF in the batch.
print("🚀 Initializing Dolphin model...")
config = OmegaConf.load(config_path)
model = DOLPHIN(config)

# Keep a handle on the unwrapped method so the wrapper below can delegate to it.
orig_chat = model.chat


def chat_fp16(*args, **kwargs):
    """Run the original ``model.chat`` without gradients under FP16 autocast.

    All positional and keyword arguments are forwarded unchanged and the
    return value of the original ``chat`` is passed back as-is.

    ``torch.autocast`` is used instead of the deprecated
    ``torch.cuda.amp.autocast``. Autocast is only enabled when CUDA is
    available, so on a CPU-only host the call runs in the model's native
    precision without emitting a warning on every invocation.
    """
    with torch.no_grad(), torch.autocast(
        device_type="cuda",
        dtype=torch.float16,
        enabled=torch.cuda.is_available(),
    ):
        return orig_chat(*args, **kwargs)


model.chat = chat_fp16  # Route every chat call through the FP16 wrapper

setup_output_dirs(save_dir)

print(f"✅ Model loaded with FP16 optimization. GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# Discover input PDFs and work out which ones still need processing.
pdf_files = sorted(pdfs_dir.glob("*.pdf"))
existing_processed = set()

# Output files are named "<pdf stem>_philatelic.json". Only the trailing
# suffix is stripped: str.replace() would also remove "_philatelic" from the
# middle of a document name, so such a PDF would never be recognised as done.
output_suffix = "_philatelic"

# Globbing a missing directory is pointless, so only look when it exists.
if parsed_jsons_dir.exists():
    existing_processed = {
        # The glob pattern guarantees the stem ends with output_suffix.
        json_file.stem[: -len(output_suffix)]
        for json_file in parsed_jsons_dir.glob(f"*{output_suffix}.json")
    }

to_process = [f for f in pdf_files if f.stem not in existing_processed]

print(f"📁 Total PDFs: {len(pdf_files)} | Already processed: {len(existing_processed)} | To process: {len(to_process)}")

# Keep the output short: list everything only when there are few entries.
if len(existing_processed) > 10:
    print(f"Sample already processed: {sorted(existing_processed)[:5]}...")
elif existing_processed:
    print(f"Already processed: {sorted(existing_processed)}")

In [None]:
# Batch processing with per-PDF error handling: one failing document is
# recorded in failed_pdfs and the loop moves on to the next one.
failed_pdfs = []
processed_count = 0
memory_cleanup_interval = 10  # Clean memory every N PDFs instead of every PDF
log_interval = 5  # Print a progress line every N PDFs

# Make sure the output directory exists before the first save.
parsed_jsons_dir.mkdir(parents=True, exist_ok=True)


def _page_dims(p):
    """Return the ``size`` of the rendered PNG for page number ``p``.

    The image is opened in a ``with`` block so its file handle is released
    right away instead of staying open until garbage collection.
    """
    with Image.open(f"./results/pages/page_{p:03d}.png") as page_img:
        return page_img.size


# ``attempt`` counts every PDF visited (successful or not). Progress logging
# and cache cleanup are keyed on it rather than on processed_count, which does
# not advance on failures and would make both fire on every failed PDF.
for attempt, pdf_file in enumerate(tqdm(to_process, desc="Processing PDFs"), start=1):
    pdf_name = pdf_file.stem

    try:
        if (attempt - 1) % log_interval == 0:  # Reduce logging frequency
            print(f"\n🔄 Processing: {pdf_name} ({attempt}/{len(to_process)})")

        # Process PDF with Dolphin
        json_path, recognition_results = process_document(
            document_path=str(pdf_file),
            model=model,
            save_dir=save_dir,
            max_batch_size=max_batch_size
        )

        # Transform to OXCART format with philatelic enrichment
        ox = transform_dolphin_to_oxcart_preserving_labels(
            recognition_results,
            doc_id=pdf_name,
            page_dims_provider=_page_dims,
            para_max_chars=1500,
            target_avg_length=300,
            max_chunk_length=1200,
            table_row_block_size=None,
            optimize_for_rag=True
        )

        # Enrich with philatelic metadata
        ox = enrich_all_chunks_advanced_philatelic(ox)

        # Save philatelic JSON
        output_path = parsed_jsons_dir / f"{pdf_name}_philatelic.json"
        save_json(ox, str(output_path))

        processed_count += 1

    except Exception as e:
        # Top-level boundary of the batch: report, remember, and continue.
        error_msg = f"Error processing {pdf_name}: {str(e)}"
        print(f"❌ {error_msg}")
        failed_pdfs.append({"pdf": pdf_name, "error": str(e)})

    finally:
        # Clean GPU memory periodically instead of every PDF
        if torch.cuda.is_available() and attempt % memory_cleanup_interval == 0:
            torch.cuda.empty_cache()

print("\n🎉 Processing complete!")
print(f"✅ Successfully processed: {processed_count}")
print(f"❌ Failed: {len(failed_pdfs)}")

In [None]:
# Final cleanup and summary
# Clean GPU memory one final time
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Save failed PDFs list for retry
if failed_pdfs:
    failed_file = Path("failed_pdfs.json")
    with open(failed_file, 'w', encoding='utf-8') as f:
        json.dump(failed_pdfs, f, indent=2, ensure_ascii=False)

    print(f"\n📋 Failed PDFs saved to: {failed_file}")
    for failed in failed_pdfs[:3]:  # Show only first 3 to reduce output
        print(f"  - {failed['pdf']}: {failed['error']}")
    if len(failed_pdfs) > 3:
        print(f"  ... and {len(failed_pdfs) - 3} more")
else:
    print("\n🎯 All PDFs processed successfully!")

# Count every philatelic JSON currently on disk (0 when the directory is missing).
if parsed_jsons_dir.exists():
    total_processed = len(list(parsed_jsons_dir.glob("*_philatelic.json")))
else:
    total_processed = 0

# The completion rate is measured against the input PDFs only: JSON files left
# over from PDFs that are no longer in the folder must not push it above 100%.
completed_pdfs = sum(
    1
    for pdf in pdf_files
    if (parsed_jsons_dir / f"{pdf.stem}_philatelic.json").exists()
)

print("\n📊 Final Status:")
print(f"Total philatelic JSONs: {total_processed}")
print(f"Total PDFs: {len(pdf_files)}")
if pdf_files:
    print(f"Completion rate: {completed_pdfs/len(pdf_files)*100:.1f}%")
else:
    # An empty input folder would otherwise raise ZeroDivisionError here.
    print("Completion rate: n/a (no PDFs found)")