In [None]:
import fitz
import os
from pathlib import Path
import json
from tqdm import tqdm
from PIL import Image
import pytesseract
import io

In [15]:
def extract_text_from_pdf(pdf_path):
    """Extract the embedded text layer from every page of a PDF.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``fitz.open``.

    Returns:
        str: The text of all pages concatenated in page order, with leading
        and trailing whitespace stripped. An empty string is returned when
        the pages contain no non-whitespace text.

    Raises:
        Whatever ``fitz.open`` or ``page.get_text`` raise; the document is
        closed before the exception propagates.
    """
    doc = fitz.open(pdf_path)
    try:
        # Join once instead of growing a string page by page.
        text = "".join(page.get_text() for page in doc)
    finally:
        # Release the document even when a page fails to extract.
        doc.close()
    return text.strip()

def extract_text_with_ocr(pdf_path):
    """Extract text from a PDF by rendering each page and running OCR on it.

    Every page is rasterised with ``page.get_pixmap()``, encoded as PNG,
    loaded as a PIL image and handed to ``pytesseract.image_to_string``.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``fitz.open``.

    Returns:
        str: The OCR output of all pages in page order, separated by
        newlines, with leading and trailing whitespace stripped (matching
        ``extract_text_from_pdf``). An empty string is returned when OCR
        finds nothing or the document has no pages.

    Raises:
        Whatever the rendering or OCR calls raise; the document is closed
        before the exception propagates.
    """
    doc = fitz.open(pdf_path)
    try:
        page_texts = []
        for page in doc:
            # Render the page to an in-memory PNG so PIL can decode it.
            pix = page.get_pixmap()
            img = Image.open(io.BytesIO(pix.tobytes("png")))
            page_texts.append(pytesseract.image_to_string(img))
    finally:
        # Release the document even when rendering or OCR fails mid-way.
        doc.close()

    # The accumulated text was previously never returned, so callers always
    # received None and the OCR result was silently discarded.
    return "\n".join(page_texts).strip()


def extract_text(pdf_path):
    """Combine the embedded-text and OCR extractions of one PDF.

    Runs ``extract_text_from_pdf`` and then ``extract_text_with_ocr`` on the
    same file. A falsy result from either helper is replaced by a single
    space, and the two parts are joined with a newline (embedded text first).

    Args:
        pdf_path: Location of the PDF, forwarded to both helpers.

    Returns:
        str: ``"<embedded text>\\n<ocr text>"``.
    """
    embedded = extract_text_from_pdf(pdf_path)
    if not embedded:
        embedded = " "

    recognized = extract_text_with_ocr(pdf_path)
    if not recognized:
        recognized = " "

    return "\n".join((embedded, recognized))

In [18]:
# Directory settings as relative paths: input_dir is searched for PDF files,
# output_dir is created if missing and receives the extracted-text JSON.
input_dir = "../../data/records"
output_dir = "../../data/processed"

In [19]:
input_path = Path(input_dir)
output_path = Path(output_dir)
# Create the output directory (and any missing parents) up front so the
# final save cannot fail on a missing folder after the long extraction run.
output_path.mkdir(parents=True, exist_ok=True)

# glob() yields entries in filesystem order, which varies between machines
# and runs; sorting keeps the processing order and the saved JSON reproducible.
pdf_files = sorted(input_path.glob("*.pdf"))
print(f"Found {len(pdf_files)} PDF files")

Found 185 PDF files


In [None]:
# Documents whose combined text is not longer than this many characters are
# treated as having no usable content and are left out of the results.
MIN_TEXT_CHARS = 50

results = []        # one record per successfully extracted document
failed_files = []   # (filename, error) pairs for documents that raised

for pdf_file in tqdm(pdf_files, desc="Extracting text"):
    try:
        text = extract_text(pdf_file)
    except Exception as exc:
        # One unreadable PDF must not abort the whole long-running batch and
        # discard everything collected so far; record it and report it below.
        failed_files.append((pdf_file.name, repr(exc)))
        continue

    if text and len(text) > MIN_TEXT_CHARS:
        results.append({
            'filename': pdf_file.name,
            'text': text,
            'char_count': len(text),
        })

print(f"Number of documents processed successfully: {len(results)}")
if failed_files:
    print(f"Number of documents that failed: {len(failed_files)}")
    for failed_name, failed_error in failed_files:
        print(f"  {failed_name}: {failed_error}")

Extracting text:  80%|████████  | 148/185 [26:00<06:40, 10.83s/it] 

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed



Extracting text: 100%|██████████| 185/185 [30:32<00:00,  9.91s/it]

Number of documents processed successfully: 185





In [40]:
# Persist every extracted record as one indented JSON array in the output directory.
output_file = output_path / "extracted_texts.json"
with output_file.open('w', encoding='utf-8') as json_handle:
    json.dump(results, json_handle, indent=4)

print(f"Saved to {output_file}")

Saved to ../../data/processed/extracted_texts.json
