In [None]:
import json
from pathlib import Path
from dotenv import load_dotenv

from tqdm import tqdm
from huggingface_hub import InferenceClient


from utils.render import render_pdf_pages
from ocr.read import PyMuPDFPageOCR
from layout.order import heuristic_reading_order
from layout.detect import PaddleLayoutDetector
from blocks.assemble import BlockAssembler
from blocks.serialize import serialize_blocks
from blocks.sections import split_into_sections
from ocr.toc import extract_toc
from vision.vlm import extract_table

In [None]:
# --- Input document -------------------------------------------------------
PDF_PATH = Path("data/raw/dnd_rulebook.pdf")

# Page holding the table of contents. Counted from 1: the TOC cell indexes
# the rendered page list with `TOC_PAGE - 1`.
TOC_PAGE = 2

# --- Output locations -----------------------------------------------------
# Final artefacts go to OUT_DIR; everything re-creatable goes under
# INTERMEDIATE_DIR (rendered page images live in RENDER_DIR).
OUT_DIR = Path("data/processed/")
INTERMEDIATE_DIR = Path("data/intermediate/")
RENDER_DIR = INTERMEDIATE_DIR.joinpath("pages")

In [None]:
# Materialise whatever render_pdf_pages yields into a list: the pages are
# indexed right below and iterated again by the page-processing loop.
pages = [*render_pdf_pages(PDF_PATH, RENDER_DIR)]

# TOC_PAGE is counted from 1, the list from 0.
toc_image_path = pages[TOC_PAGE - 1]["image_path"]

toc = extract_toc(
    pdf_path=PDF_PATH,
    img_path=toc_image_path,
    toc_page=TOC_PAGE,
    num_columns=2,
    out_path=OUT_DIR,
)

In [None]:
# Tunables handed to the layout detector and block assembler constructors.
MIN_CONFIDENCE = 0.8    # -> PaddleLayoutDetector(min_confidence=...)
PROSE_FONT_SIZE = 9.0   # -> BlockAssembler(prose_font_size=...)

# Working directories below the intermediate data folder.
CROP_DIR = INTERMEDIATE_DIR.joinpath("crops")
BLOCKS_DIR = INTERMEDIATE_DIR.joinpath("blocks")

In [None]:
# Pipeline components, built once and reused for every page in the
# page-processing loop.
# OCR reader; the loop calls ocr.ocr_page(PDF_PATH, page_index).
ocr = PyMuPDFPageOCR()
# Layout detector; the loop calls layout_detector.detect_page(image_path).
# NOTE(review): min_confidence presumably drops detections scored below the
# threshold — confirm in layout.detect.
layout_detector = PaddleLayoutDetector(min_confidence=MIN_CONFIDENCE)
# Combines OCR regions, reading order and layout regions into blocks.
# NOTE(review): crop_dir is presumably where region image crops are written,
# and prose_font_size the body-text size used to classify text — confirm in
# blocks.assemble.
assembler = BlockAssembler(crop_dir=CROP_DIR, prose_font_size=PROSE_FONT_SIZE)

In [None]:
# Build one `page_NNNN.blocks.json` file per rendered page under BLOCKS_DIR:
# OCR the page, compute a reading order, detect layout regions, and assemble
# them into blocks. A page whose file already exists is treated as cached and
# skipped, so the cell can be re-run cheaply after an interruption.

# Loop-invariant: create the output directory once instead of on every page.
BLOCKS_DIR.mkdir(parents=True, exist_ok=True)

for page in tqdm(pages, desc="Processing pages"):

    page_index = page["page_index"]
    image_path = page["image_path"]

    # File names use page_index + 1, zero-padded to four digits, so that a
    # lexical sort of the files is also the page order.
    page_out = BLOCKS_DIR / f"page_{page_index + 1:04d}.blocks.json"
    if page_out.exists():
        print(f"Page {page_index + 1} (cached)")
        continue

    ocr_regions = ocr.ocr_page(PDF_PATH, page_index)
    if not ocr_regions:
        # Nothing to assemble. No file is written for such pages, so they are
        # OCR'd again on every run of this cell.
        continue

    # NOTE(review): the literal 2 is presumably the number of text columns
    # (the TOC extraction also uses num_columns=2) — confirm in layout.order.
    reading_order = heuristic_reading_order(ocr_regions, 2)
    layout_regions = layout_detector.detect_page(image_path)

    blocks = assembler.assemble_page(
        page_index=page_index,
        image_path=image_path,
        ocr_regions=ocr_regions,
        reading_order=reading_order,
        layout_regions=layout_regions,
    )

    # The cache check above only tests for existence, so a half-written file
    # (kernel interrupt, serialisation error mid-dump) would be mistaken for a
    # finished page and break the loader later. Write to a sibling temp file
    # and rename it into place only once the dump has completed. The ".tmp"
    # name does not match the "page*.blocks.json" glob used by the loader.
    tmp_out = page_out.with_name(page_out.name + ".tmp")
    with tmp_out.open("w", encoding="utf-8") as f:
        json.dump(
            [block.__dict__ for block in blocks],
            f,
            indent=2,
            ensure_ascii=False,
        )
    tmp_out.replace(page_out)

In [None]:
# Pages excluded from serialisation; passed to serialize_blocks(pages_to_skip=...).
# NOTE(review): presumably 1-based page numbers (the first two pages plus the
# trailing 177-179) — confirm against blocks.serialize.
PAGES_TO_SKIP = [1, 2, *range(177, 180)]

In [None]:
# Gather the cached per-page block files in file-name order and flatten their
# contents into one list.
block_files = sorted(BLOCKS_DIR.glob("page*.blocks.json"))
all_blocks = [
    entry
    for block_file in block_files
    for entry in json.loads(block_file.read_text(encoding="utf-8"))
]

# Serialise the flattened blocks; the call returns the text blocks and the
# table blocks separately.
text_blocks, table_blocks = serialize_blocks(
    blocks=all_blocks,
    out_path=INTERMEDIATE_DIR,
    toc=toc,
    pages_to_skip=PAGES_TO_SKIP,
)

# Split the text blocks into sections, with OUT_DIR as the output location.
text = split_into_sections(text_blocks=text_blocks, out_path=OUT_DIR)

In [None]:
# Model and inference provider handed to InferenceClient for table extraction.
PROVIDER = "novita"
MODEL_ID = "Qwen/Qwen3-VL-8B-Instruct"

# One JSON file per table block is cached here, keyed by block id.
TABLE_CACHE_DIR = INTERMEDIATE_DIR.joinpath("table_cache")

In [None]:
# Load variables from a .env file before the client is built.
# NOTE(review): presumably this supplies the API token InferenceClient needs —
# confirm which variable the provider expects.
load_dotenv()

# Make sure the table cache directory exists before the extraction loop uses it.
TABLE_CACHE_DIR.mkdir(parents=True, exist_ok=True)

client = InferenceClient(provider=PROVIDER, model=MODEL_ID)

In [None]:
# Extract every table block with the vision model, caching each result as
# `<block_id>.json` in TABLE_CACHE_DIR so a re-run only calls the model for
# blocks that have no cached result yet. All tables are finally written to
# OUT_DIR / "tables.json" in the order of `table_blocks`.

# Fail fast: make sure the destination exists before the expensive model
# calls rather than discovering a missing directory at the final write.
OUT_DIR.mkdir(parents=True, exist_ok=True)

tables = []

for block in table_blocks:
    block_id = block["block_id"]
    print(f"Processing table block {block_id}")

    cache_file = TABLE_CACHE_DIR / f"{block_id}.json"
    if cache_file.exists():
        print(f"Loading cached table for block {block_id}")
        with cache_file.open("r", encoding="utf-8") as f:
            tables.append(json.load(f))
    else:
        table = extract_table(client, block)

        # The cache is keyed on file existence, so a partially written file
        # would be loaded as a valid result (or fail to parse) on the next
        # run. Dump to a temp file first and rename it into place only after
        # the dump has completed.
        tmp_file = cache_file.with_name(cache_file.name + ".tmp")
        with tmp_file.open("w", encoding="utf-8") as f:
            json.dump(table, f, indent=2, ensure_ascii=False)
        tmp_file.replace(cache_file)

        tables.append(table)

out_file = OUT_DIR / "tables.json"
with out_file.open("w", encoding="utf-8") as f:
    json.dump(tables, f, indent=2, ensure_ascii=False)