# Batch PDF Processing - Philatelic RAG (Optimized v4)

Ultra-fast processing of all PDFs in the `pdfs/` folder with minimal overhead.

In [None]:
import os
import json
import torch
from pathlib import Path
from tqdm.auto import tqdm
from PIL import Image
from omegaconf import OmegaConf

from demo_page import *
from philatelic_patterns import *
from dolphin_transformer import transform_dolphin_to_oxcart_preserving_labels

In [None]:
# Setup: paths, batch size, model load, and a mixed-precision inference wrapper.
config_path = "./config/Dolphin.yaml"
pdfs_dir = Path("./pdfs")
save_dir = "./results"
parsed_jsons_dir = Path("./results/parsed_jsons")
max_batch_size = 2  # forwarded to process_document(max_batch_size=...)

# Evaluated once; the FP16 wrapper below must be a no-op on CPU-only hosts.
cuda_available = torch.cuda.is_available()

# Release cached GPU memory left over from earlier runs in this kernel.
if cuda_available:
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

print("🚀 Initializing Dolphin model...")
config = OmegaConf.load(config_path)
model = DOLPHIN(config)

# Wrap model.chat so every call runs without autograd and, on GPU, under
# FP16 autocast.
orig_chat = model.chat


def chat_fp16(*args, **kwargs):
    """Call the original ``model.chat`` with gradients off and FP16 autocast.

    ``torch.autocast(device_type="cuda", ...)`` replaces the deprecated
    ``torch.cuda.amp.autocast(...)`` spelling, which emits a warning on every
    call in recent PyTorch releases. ``enabled`` is tied to CUDA availability
    so that CPU-only runs skip autocast silently instead of warning.
    """
    with torch.no_grad(), torch.autocast(
        device_type="cuda", dtype=torch.float16, enabled=cuda_available
    ):
        return orig_chat(*args, **kwargs)


model.chat = chat_fp16

setup_output_dirs(save_dir)
# The final JSONs are written under parsed_jsons_dir; create it explicitly
# rather than relying on setup_output_dirs to do so (not visible from here).
parsed_jsons_dir.mkdir(parents=True, exist_ok=True)

print(f"✅ Model loaded. GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")

In [None]:
# Discover PDFs recursively and skip the ones that already have an output JSON.
pdf_files = sorted(pdfs_dir.glob("**/*.pdf"))

# Outputs are named "<pdf stem>_philatelic.json". Recover the stem by cutting
# the trailing suffix only: str.replace would also remove "_philatelic" from
# the middle of a name (e.g. "my_philatelic_catalog"), so such PDFs would
# never be recognised as done and would be reprocessed on every run.
output_suffix = "_philatelic"
existing_processed = set()
if parsed_jsons_dir.exists():
    existing_processed = {
        json_file.stem[:-len(output_suffix)]
        for json_file in parsed_jsons_dir.glob(f"*{output_suffix}.json")
    }

# NOTE(review): matching is by stem only, so two PDFs with the same file name
# in different sub-folders share one output file - confirm this is acceptable.
to_process = [f for f in pdf_files if f.stem not in existing_processed]

# Report "done" relative to the discovered PDFs so the three numbers add up
# even if parsed_jsons_dir holds JSONs for PDFs that are no longer present.
done_count = len(pdf_files) - len(to_process)
print(f"📁 {len(pdf_files)} total | {done_count} done | {len(to_process)} to process")

In [None]:
# Batch processing: Dolphin parse -> OXCART transform -> philatelic
# enrichment -> JSON on disk. A failure on one PDF is recorded and the loop
# moves on to the next file.
failed_pdfs = []
processed_count = 0
memory_cleanup_interval = 5  # free cached GPU memory every N attempted PDFs

# save_json below writes into this directory; make sure it exists before
# spending GPU time on a document whose result could not be saved.
parsed_jsons_dir.mkdir(parents=True, exist_ok=True)


def _page_size(p):
    """Return the PIL ``size`` (width, height) of the rendered image of page ``p``.

    The image is opened in a ``with`` block so the file handle is closed
    right away; ``Image.open(...).size`` on its own leaves the file open.
    """
    with Image.open(f"./results/pages/page_{p:03d}.png") as page_img:
        return page_img.size


# Progress bar with a reduced refresh rate.
progress_bar = tqdm(to_process, desc="Processing", mininterval=2.0, maxinterval=10.0)

try:
    for attempt, pdf_file in enumerate(progress_bar, start=1):
        pdf_name = pdf_file.stem
        failed = False

        try:
            # Process PDF with Dolphin
            json_path, recognition_results = process_document(
                document_path=str(pdf_file),
                model=model,
                save_dir=save_dir,
                max_batch_size=max_batch_size
            )

            # Transform to OXCART format
            ox = transform_dolphin_to_oxcart_preserving_labels(
                recognition_results,
                doc_id=pdf_name,
                page_dims_provider=_page_size,
                para_max_chars=1500,
                target_avg_length=300,
                max_chunk_length=1200,
                table_row_block_size=None,
                optimize_for_rag=True
            )

            # Enrich with philatelic metadata
            ox = enrich_all_chunks_advanced_philatelic(ox)

            # Save philatelic JSON
            output_path = parsed_jsons_dir / f"{pdf_name}_philatelic.json"
            save_json(ox, str(output_path))
        except Exception as e:
            # Per-document boundary: record the failure and keep going.
            # The exception type is kept because str(e) can be empty.
            failed = True
            failed_pdfs.append({
                "pdf": pdf_name,
                "error": str(e),
                "error_type": type(e).__name__,
            })
        else:
            processed_count += 1

            # Update progress bar description periodically
            if processed_count % 10 == 0:
                progress_bar.set_description(f"Processed {processed_count}")

        # Cleanup is keyed on the attempt number, not processed_count: the
        # latter stays put on failures, which made the old check fire after
        # every iteration while the count sat at 0 or a multiple of the
        # interval. Also clean up after any failure, since a failed document
        # may leave cached allocations behind.
        if torch.cuda.is_available() and (
            failed or attempt % memory_cleanup_interval == 0
        ):
            torch.cuda.synchronize()
            torch.cuda.empty_cache()
finally:
    # Close the bar even if the run is interrupted (e.g. KeyboardInterrupt).
    progress_bar.close()

print(f"🎉 Complete! Processed: {processed_count} | Failed: {len(failed_pdfs)}")

In [None]:
# Wrap-up: release GPU memory, persist the failure list, print overall totals.
if torch.cuda.is_available():
    torch.cuda.synchronize()
    torch.cuda.empty_cache()

# Write the failure records collected by the batch loop, if there are any.
if failed_pdfs:
    failure_report = json.dumps(failed_pdfs, indent=2, ensure_ascii=False)
    with open("failed_pdfs.json", 'w', encoding='utf-8') as report_file:
        report_file.write(failure_report)
    print(f"❌ {len(failed_pdfs)} failed - saved to failed_pdfs.json")

# Count every philatelic JSON currently on disk (earlier runs included).
if parsed_jsons_dir.exists():
    total_processed = sum(1 for _ in parsed_jsons_dir.glob("*_philatelic.json"))
else:
    total_processed = 0

# Guard against division by zero when no PDFs were discovered.
if pdf_files:
    completion_rate = total_processed / len(pdf_files) * 100
else:
    completion_rate = 0

print(f"📊 {total_processed}/{len(pdf_files)} PDFs ({completion_rate:.1f}%)")

In [None]:
import json
import re

# One-off maintenance script: patch combine_chunks.ipynb in place, rewriting
# calls to should_combine_chunks that pass three arguments so that they pass
# two.
notebook_path = r"C:\Users\VM-SERVER\Desktop\Oxcart RAG\combine_chunks.ipynb"
old_call = 'should_combine_chunks(current_chunk, test_processed_chunks, test_used_headers)'
new_call = 'should_combine_chunks(current_chunk, test_processed_chunks)'

# Read the notebook
with open(notebook_path, 'r', encoding='utf-8') as f:
    notebook = json.load(f)

# Find and fix the incorrect calls to should_combine_chunks
corrections_made = 0

for cell in notebook['cells']:
    if cell['cell_type'] != 'code':
        continue

    source = cell['source']
    # The notebook format allows "source" to be one multi-line string as
    # well as a list of lines. Iterating a string yields characters, so the
    # call would never be found; split it into lines and join it back after.
    source_is_str = isinstance(source, str)
    lines = source.splitlines(keepends=True) if source_is_str else source

    for i, line in enumerate(lines):
        # Rewrite the three-argument call to the two-argument form
        if old_call in line:
            lines[i] = line.replace(old_call, new_call)
            corrections_made += 1
            print(f"Corregido en línea: {line.strip()}")

    if source_is_str:
        cell['source'] = "".join(lines)

# Save the corrected notebook; leave the file untouched if nothing changed
if corrections_made:
    with open(notebook_path, 'w', encoding='utf-8') as f:
        json.dump(notebook, f, ensure_ascii=False, indent=1)

print(f"\nTotal de correcciones realizadas: {corrections_made}")