## 01. PDF fetch and data exploration

In [10]:
import json
from pathlib import Path
import pymupdf
import pytesseract
from pdf2image import convert_from_path
from tqdm import tqdm
from os import listdir
from os.path import isfile, join

In [17]:
# Directory that holds the raw input documents, relative to the working directory.
pdf_path = Path("../data/raw/")
# Collect bare file names (not full paths); extract_pdf joins them with pdf_path.
# Only regular files with a .pdf suffix are kept so that stray directory entries
# (hidden files, placeholders, other formats) are never handed to the extractor.
pdf_files = [
    entry.name
    for entry in pdf_path.iterdir()
    if entry.is_file() and entry.suffix.lower() == ".pdf"
]
print(pdf_path)
print(pdf_files)

../data/raw
['Old_school_primer.pdf', 'MORK_BORG_BARE_BONES_EDITION.pdf']


In [24]:

def needs_ocr(text: str, min_chars: int = 50) -> bool:
    """Decide whether extracted page text is poor enough to warrant OCR.

    OCR is requested when any of the following holds:

    * the text, once surrounding whitespace is stripped, has fewer than
      ``min_chars`` characters;
    * the text contains the ``\\u0007`` control character;
    * tab characters make up more than 30% of the raw text, which suggests
      a table was extracted badly.
    """
    too_short = len(text.strip()) < min_chars
    has_bell_char = "\u0007" in text
    # Guard the denominator so an empty string yields a ratio of 0.
    tab_share = text.count("\t") / max(len(text), 1)
    return too_short or has_bell_char or tab_share > 0.3


def extract_page_ocr(path: Path, page_num: int, dpi: int = 300) -> str:
    """Run OCR on one page of a PDF and return the recognised text.

    The page is rendered with ``convert_from_path`` at ``dpi``, limited to
    ``page_num`` by passing it as both ``first_page`` and ``last_page``. The
    first rendered image is then handed to ``pytesseract.image_to_string``.
    """
    render_options = {"first_page": page_num, "last_page": page_num, "dpi": dpi}
    rendered = convert_from_path(path, **render_options)
    # Only one page was requested, so the first image is the page itself.
    page_image = rendered[0]
    return pytesseract.image_to_string(page_image)


def extract_pdf(path: Path, file_name: str, output_path: Path, dpi: int = 300) -> list[dict]:
    """
    Extract PDF with hybrid approach: pymupdf for clean pages, OCR for problematic ones.

    Each page is first read with pymupdf's text extraction. When ``needs_ocr``
    rejects that text, the page is re-read through ``extract_page_ocr`` instead.
    The per-page records are written to ``output_path`` as JSON and returned.

    Args:
        path: Directory containing the PDF.
        file_name: Name of the PDF file inside ``path``.
        output_path: Destination JSON file; missing parent directories are created.
        dpi: Rendering resolution forwarded to the OCR step.

    Returns:
        One dict per page, in page order, with the keys ``page`` (1-based page
        number), ``chars`` (length of the extracted text), ``method``
        (``"pymupdf"`` or ``"ocr"``) and ``text``.
    """
    # Join with pathlib rather than string formatting; kept as a str so the
    # downstream calls receive the same type as before.
    path_to_doc = str(Path(path) / file_name)
    # Accept a plain string destination as well as a Path.
    output_path = Path(output_path)

    pages_data = []
    ocr_count = 0

    # The context manager closes the document even if extraction or OCR raises.
    with pymupdf.open(path_to_doc) as doc:
        print(f"Processing: {path_to_doc}")
        print(f"Pages: {len(doc)}")

        # enumerate() hides the iterable's length, so tqdm gets the total explicitly.
        numbered_pages = enumerate(doc, start=1)
        for page_number, page in tqdm(numbered_pages, total=len(doc), desc="Extracting"):
            text = page.get_text()
            method = "pymupdf"

            if needs_ocr(text):
                # The OCR helper is given the same 1-based page number that is recorded.
                text = extract_page_ocr(path_to_doc, page_number, dpi)
                method = "ocr"
                ocr_count += 1

            pages_data.append({
                "page": page_number,
                "chars": len(text),
                "method": method,
                "text": text,
            })

    # save
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(pages_data, f, ensure_ascii=False, indent=2)

    print(f"Done. OCR used on {ocr_count}/{len(pages_data)} pages")
    print(f"Saved to {output_path}")

    return pages_data


# Batch driver: run the extraction for every file name in pdf_files (defined above).
if __name__ == "__main__":
    for file_name in pdf_files:
        # "_extracted.json" is appended to the full input name, so the original
        # extension is kept (e.g. "x.pdf" -> "x.pdf_extracted.json").
        output_path = Path(f"../data/processed/{file_name}_extracted.json")
        # extract_pdf writes the JSON itself; its returned page list is not used here.
        extract_pdf(pdf_path, file_name, output_path)

Processing: ../data/raw/Old_school_primer.pdf
Pages: 13


Extracting:   0%|          | 0/13 [00:00<?, ?it/s]

Extracting: 100%|██████████| 13/13 [00:00<00:00, 273.78it/s]


Done. OCR used on 0/13 pages
Saved to ../data/processed/Old_school_primer.pdf_extracted.json
Processing: ../data/raw/MORK_BORG_BARE_BONES_EDITION.pdf
Pages: 76


Extracting: 100%|██████████| 76/76 [00:52<00:00,  1.45it/s]

Done. OCR used on 39/76 pages
Saved to ../data/processed/MORK_BORG_BARE_BONES_EDITION.pdf_extracted.json



