In [1]:
import fitz
import os
from pathlib import Path
import json
from tqdm import tqdm

In [2]:
def extract_text_from_pdf(pdf_path):
    """Extract all text from a PDF file."""
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    doc.close()
    return text.strip()

In [3]:
input_dir = "../../data/records"
output_dir = "../../data/processed"

In [4]:
input_path = Path(input_dir)
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)

pdf_files = list(input_path.glob("*.pdf"))
print(f"Found {len(pdf_files)} PDF files")

Found 105 PDF files


In [5]:
results = []

for pdf_file in tqdm(pdf_files, desc="Extracting text"):
    text = extract_text_from_pdf(pdf_file)

    if text:
        result = {
            'filename': pdf_file.name,
            'text': text,
            'char_count': len(text)
        }
        results.append(result)

print(f"Extracted text from {len(results)} PDFs")

Extracting text: 100%|██████████| 105/105 [00:03<00:00, 32.08it/s]

Extracted text from 64 PDFs





In [6]:
output_file = output_path / "extracted_texts.json"
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

print(f"Saved to {output_file}")

Saved to ../../data/processed/extracted_texts.json
