In [1]:
!pip install pymupdf pillow torch torchvision python-doctr -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m80.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m97.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m71.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m44.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s

In [2]:
import fitz  # PyMuPDF
import tempfile, os
from PIL import Image, ImageDraw
import torch
from doctr.models import ocr_predictor
from doctr.io import DocumentFile

# ================================
# 0. Check GPU and load OCR model
# ================================
# Prefer CUDA whenever torch reports a usable GPU; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Build the pretrained docTR predictor a single time and move it onto the
# selected device, so every later call reuses the same model instance.
ocr_model = ocr_predictor(pretrained=True).to(device)


# ================================
# 1. Define function
# ================================
def extract_pdf_text(pdf_path: str, ocr_model, dpi: int = 300) -> list[str]:
    """
    Extract text from a PDF that may contain both embedded (selectable) text
    and text that only exists as pixels.

    For every page the embedded text blocks are collected, the page is rendered
    to a PNG, the regions already covered by embedded text are painted white,
    and the masked image is passed to the OCR model. The embedded text and the
    OCR text of the *same* page are then joined into one string.

    Args:
        pdf_path (str): Path to the PDF file.
        ocr_model: Pre-loaded docTR OCR model on desired device. It is called
            with the output of ``DocumentFile.from_images`` and must return an
            object exposing ``pages -> blocks -> lines -> words -> value``.
        dpi (int): Resolution used to render each page. Defaults to 300.

    Returns:
        full_text (list[str]): List of strings, one per page (empty list for a
        PDF without pages).
    """
    # Block coordinates come back in PDF points (1/72 inch) while the rendered
    # bitmap is in pixels at `dpi`, so mask rectangles must be scaled up.
    scale = dpi / 72.0

    embedded_text_per_page = []
    image_paths = []

    # The temporary directory (and the PNGs in it) is removed on exit, even if
    # rendering or OCR raises.
    with tempfile.TemporaryDirectory() as temp_dir:
        with fitz.open(pdf_path) as doc:
            for page_index, page in enumerate(doc):
                # -----------------------------
                # a. Extract existing text blocks
                # -----------------------------
                # Each block is (x0, y0, x1, y1, text, block_no, block_type).
                # Only block_type 0 is real text; image blocks carry a metadata
                # string and must neither be emitted as text nor masked, or the
                # scanned content would be hidden from OCR.
                text_blocks = [
                    block
                    for block in page.get_text("blocks")
                    if block[6] == 0 and block[4].strip()
                ]
                embedded_text_per_page.append([block[4] for block in text_blocks])

                # -----------------------------
                # b. Render page as image
                # -----------------------------
                pix = page.get_pixmap(dpi=dpi)
                img_path = os.path.join(temp_dir, f"page_{page_index}.png")
                pix.save(img_path)
                image_paths.append(img_path)

                # -----------------------------
                # c. Mask existing text on image
                # -----------------------------
                if text_blocks:
                    img = Image.open(img_path).convert("RGB")
                    draw = ImageDraw.Draw(img)
                    for x0, y0, x1, y1, *_ in text_blocks:
                        draw.rectangle(
                            [x0 * scale, y0 * scale, x1 * scale, y1 * scale],
                            fill="white",
                        )
                    img.save(img_path)  # overwrite image

        # Nothing was rendered (PDF without pages): skip OCR entirely.
        if not image_paths:
            return []

        # -----------------------------
        # d. Run OCR on masked images
        # -----------------------------
        # Kept inside the `with` block so the PNG files still exist while read.
        doc_tr = DocumentFile.from_images(image_paths)
        result = ocr_model(doc_tr)  # uses GPU if available

    # -----------------------------
    # e. Combine extracted text + OCR text
    # -----------------------------
    # Pair each page's own embedded text with that page's OCR output.
    full_text = []
    for embedded_text, ocr_page in zip(embedded_text_per_page, result.pages):
        ocr_lines = [
            " ".join(word.value for word in line.words)
            for block in ocr_page.blocks
            for line in block.lines
        ]
        combined_text = " ".join(embedded_text) + " " + " ".join(ocr_lines)
        full_text.append(combined_text.strip())

    return full_text


Using device: cuda
Downloading https://doctr-static.mindee.com/models?id=v0.8.1/fast_base-688a8b34.pt&src=0 to /root/.cache/doctr/models/fast_base-688a8b34.pt


  0%|          | 0/65814772 [00:00<?, ?it/s]

Downloading https://doctr-static.mindee.com/models?id=v0.12.0/crnn_vgg16_bn-0417f351.pt&src=0 to /root/.cache/doctr/models/crnn_vgg16_bn-0417f351.pt


  0%|          | 0/63303144 [00:00<?, ?it/s]

In [3]:
# !gdown 1uWLh9YJt4uBNn5ptvT4VnhK2P2ch9sR6

In [4]:
# ================================
# 2. Example usage
# ================================
# pdf_path = "sample.pdf"
# full_text = extract_pdf_text(pdf_path, ocr_model)

# for i, page_text in enumerate(full_text, start=1):
#     print(f"\n--- PAGE {i} ---\n")
#     print(page_text)

In [5]:
# Download the PDF archive from Google Drive by file id; per the recorded cell
# output it is saved as "/kaggle/working/acts 01 - 02.zip".
!gdown 1eCsZYNSpPATG81MeWie4aeJHLqulmzQs

Downloading...
From (original): https://drive.google.com/uc?id=1eCsZYNSpPATG81MeWie4aeJHLqulmzQs
From (redirected): https://drive.google.com/uc?id=1eCsZYNSpPATG81MeWie4aeJHLqulmzQs&confirm=t&uuid=c7ae1ee8-bd13-4376-857f-91a417359b50
To: /kaggle/working/acts 01 - 02.zip
100%|██████████████████████████████████████| 83.5M/83.5M [00:01<00:00, 42.6MB/s]


In [6]:
import zipfile
import os
import json
import tempfile

# ================================
# 1. Unzip PDFs
# ================================
# Unpack the downloaded archive into a fresh scratch directory.
zip_path = "/kaggle/working/acts 01 - 02.zip"
extract_dir = tempfile.mkdtemp()

with zipfile.ZipFile(zip_path, 'r') as archive:
    archive.extractall(extract_dir)

# Collect full paths of the top-level entries whose name ends in ".pdf"
# (case-insensitive); sub-directories are not descended into.
pdf_files = []
for entry in os.listdir(extract_dir):
    if entry.lower().endswith(".pdf"):
        pdf_files.append(os.path.join(extract_dir, entry))


In [7]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import torch

# ================================
# 2. Define wrapper for CPU/GPU pipeline
# ================================
def process_pdf_async(pdf_path):
    """
    Best-effort wrapper around ``extract_pdf_text`` for use in a worker pool.

    Runs ``extract_pdf_text`` on ``pdf_path`` with the module-level
    ``ocr_model``. Any exception is reported with ``print`` and swallowed, so
    one bad file does not abort the whole batch.

    Returns:
        (filename, list_of_texts): the PDF's base name and its per-page texts;
        the list is empty when extraction failed.
    """
    try:
        page_texts = extract_pdf_text(pdf_path, ocr_model)
    except Exception as exc:
        print(f"Error processing {pdf_path}: {exc}")
        return os.path.basename(pdf_path), []
    else:
        return os.path.basename(pdf_path), page_texts

# ================================
# 3. Process PDFs concurrently
# ================================
# Maps PDF base name -> list of per-page texts (empty list if extraction failed).
output_data = {}

# Cap the pool at 4 workers. os.cpu_count() can return None when the core count
# cannot be determined, which would make min() raise TypeError, so fall back to 1.
# NOTE(review): PyMuPDF's documentation states it does not support multi-threaded
# use, and all workers share one OCR model — confirm this is safe here, or set
# max_workers = 1.
max_workers = min(4, os.cpu_count() or 1)  # adjust as needed
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    future_to_pdf = {executor.submit(process_pdf_async, pdf): pdf for pdf in pdf_files}

    # Results are stored in completion order, not submission order.
    for future in as_completed(future_to_pdf):
        pdf_name, texts = future.result()
        output_data[pdf_name] = texts
        print(f"Finished {pdf_name}")

# ================================
# 4. Save results as JSON
# ================================
# ensure_ascii=False keeps non-ASCII characters readable in the output file.
output_json = "extracted_texts 01 - 02.json"
with open(output_json, "w", encoding="utf-8") as f:
    json.dump(output_data, f, indent=2, ensure_ascii=False)

print(f"Extraction complete. Results saved to {output_json}")


Finished 2002-10-18-2002_E.pdf
Finished 2002-7-11-2002_E.pdf
Finished 2001-8-09-2001_E.pdf
Finished 2002-10-17-2002_E.pdf
Finished 2002-12-33-2002_E.pdf
Finished 2002-10-25-2002_E.pdf
Finished 2001-8-10-2001_E.pdf
Finished 2002-12-29-2002_E.pdf
Finished 2002-6-08-2002_E.pdf
Finished 2002-3-03-2002_E.pdf
Finished 2002-12-34-2002_E.pdf
Finished 2002-7-14-2002_E.pdf
Finished 2001-4-03-2001_E.pdf
Finished 2002-12-28-2002_E.pdf
Finished 2002-10-23-2002_E.pdf
Finished 2002-12-30-2002_E.pdf
Finished 2002-3-02-2002_E.pdf
Finished 2002-10-19-2002_E.pdf
Finished 2001-4-05-2001_E.pdf
Finished 2002-10-26-2002_E.pdf
Finished 2001-4-04-2001_E.pdf
Finished 2002-12-35-2002_E.pdf
Finished 2002-5-07-2002_E.pdf
Finished 2002-10-22-2002_E.pdf
Finished 2002-7-13-2002_E.pdf
Finished 2002-3-05-2002_E.pdf
Finished 2001-7-08-2001_E.pdf
Finished 2002-8-16-2002_E.pdf
Finished 2001-4-07-2001_E.pdf
Finished 2002-10-21-2002_E.pdf
Finished 2002-6-09-2002_E.pdf
Finished 2002-3-04-2002_E.pdf
Finished 2002-7-12-2002_E.