In [None]:
from rich.jupyter import display
%load_ext autoreload
%autoreload 2

from config import GUIDELINES_DIR

from lib.parsing.methods.implementations.mineru import MinerUParser
from lib.parsing.model.options import ParserOptions

In [None]:
# Input PDF and the parser options used for this run (DRAW and EXIST_OK on).
document_path = GUIDELINES_DIR / "thesis_p8.pdf"
options = dict.fromkeys((ParserOptions.DRAW, ParserOptions.EXIST_OK), True)

# Parse with MinerU, VLM mode disabled; `result` feeds the chunkers below.
parser = MinerUParser(use_vlm=False)
result = parser.process_document(document_path, options=options)

In [None]:
from lib.chunking.methods.implementations.fixed_sized import FixedSizeChunker
from lib.chunking.methods.implementations.hierarchical import HierarchicalChunker
from lib.chunking.methods.implementations.recursive import RecursiveChunker

# One chunker per (strategy, token budget) pair, grouped by strategy:
# fixed-size first, then recursive, then hierarchical, each at 128..1024 tokens.
chunkers = [
    build(budget)
    for build in (
        lambda n: FixedSizeChunker(max_tokens=n, overlap=0),
        lambda n: RecursiveChunker(max_tokens=n, overlap=0),
        lambda n: HierarchicalChunker(max_tokens=n),
    )
    for budget in (128, 256, 512, 1024)
]

In [None]:
import pymupdf
from math import floor

from lib.chunking.model.document_chunker import DocumentChunker
from lib.utils.annotate import create_annotation
from IPython.display import Image


def draw_single_box(chunker: DocumentChunker):
    """Show the bounding box of the middle chunk produced by ``chunker``.

    Runs ``chunker`` over the notebook-level parsing ``result`` with geometry
    enabled, keeps only the chunk at index ``floor(chunk_count / 2)``, annotates
    it with ``create_annotation`` (no label, no fill) and displays the first
    page of the annotated file inline as a PNG.

    Args:
        chunker: Chunker whose ``process_document`` is applied to ``result``.

    Note:
        Depends on the global ``result`` defined in an earlier cell; that cell
        must have been executed first.
        Presumably raises ``IndexError`` when the chunker yields no chunks
        (nothing to select) — TODO confirm against the chunk container type.
    """
    # The first notebook cell imports ``display`` from ``rich.jupyter``, whose
    # signature is ``display(segments, text)``; calling it with a single Image
    # raises TypeError. Bind IPython's display locally so the image renders.
    from IPython.display import display

    chunking_result = chunker.process_document(
        result, with_geom=True, draw=False
    )

    chunk_cnt = chunking_result.metadata["chunk_count"]
    middle_point = floor(chunk_cnt / 2)

    # Reduce the result to the single middle chunk so only one box is drawn.
    chunking_result.chunks = [chunking_result.chunks[middle_point]]
    anno_path = create_annotation(chunking_result, with_label=False, with_fill=False)

    # Rasterise page 0 inside a context manager so the document handle is
    # released on every call instead of leaking once per chunker.
    with pymupdf.open(anno_path) as doc:
        img = doc[0].get_pixmap(dpi=150).tobytes("png")

    display(Image(data=img, width=300))

In [None]:
# Render one middle-chunk preview per configured chunker, in list order.
for chunker in chunkers:
    draw_single_box(chunker)