### python api tryout

##### base

In [15]:
import copy
import pprint
import sys
from pathlib import Path
from urllib.parse import unquote

import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.transforms.chunker import BaseChunk, DocChunk
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from docling_core.types import DoclingDocument
from docling_core.types.doc import ImageRefMode, RefItem, DocItem, NodeItem, ProvenanceItem, BoundingBox, PictureItem
from dotenv import dotenv_values

In [16]:
def create_converter():
    """Build a ``DocumentConverter`` configured for PDF input.

    The PDF pipeline is asked to generate picture images at a scale of 3.0.

    Returns:
        A ``DocumentConverter`` with one format option registered for
        ``InputFormat.PDF``.
    """
    pdf_options = PdfPipelineOptions(
        generate_picture_images=True,  # Generate base64-encoded images
        # do_picture_classification=True, # Classify images (optional, but aligns with CLI)
        images_scale=3.0,
    )
    pdf_format = PdfFormatOption(pipeline_options=pdf_options)
    return DocumentConverter(format_options={InputFormat.PDF: pdf_format})

In [17]:
def create_chunker():
    """Return a ``HybridChunker`` constructed with its default arguments."""
    return HybridChunker()

In [18]:
class MyNode(object):
    """Minimal tree node identified by a reference string (or heading title).

    Iterating over a node yields the node itself followed by all of its
    descendants in depth-first, pre-order.
    """
    # Identifier the node was created with; also used as its repr.
    cref_original: str
    # Identifier assigned later by callers; None until they set it.
    cref_new: "str | None"

    def __init__(self, cref_original):
        self.cref_original = cref_original
        # Initialised explicitly so that reading it before assignment yields
        # None instead of raising AttributeError.
        self.cref_new = None
        self.parent = None
        self.children = []

    def __repr__(self):
        return self.cref_original

    # depth first search
    def __iter__(self):
        yield self
        for child in self.children:
            yield from child

In [19]:
def construct_chunk_tree(chunks: list[BaseChunk], tree_title: str = 'root'):
    """Build a tree of ``MyNode`` objects from the heading paths of ``chunks``.

    Each chunk contributes the path ``[tree_title, *headings]``; consecutive
    entries of that path are linked as parent -> child. Nodes are keyed by
    heading text, so identical heading texts share a single node.

    Args:
        chunks: Chunks exposing ``export_json_dict()``.
        tree_title: Title used for the synthetic root node.

    Returns:
        The root ``MyNode``.
    """
    tree = MyNode(tree_title)
    node_dict = {tree_title: tree}

    for chunk in chunks:
        meta = chunk.export_json_dict().get('meta') or {}
        # The 'headings' entry can be missing (or None) for chunks that sit
        # under no heading; treat that as an empty heading path.
        headings = [tree_title, *(meta.get('headings') or [])]

        for heading in headings:
            if heading not in node_dict:
                node_dict[heading] = MyNode(heading)

        # Link every consecutive (parent, child) pair of the heading path.
        for parent_title, child_title in zip(headings, headings[1:]):
            parent_node = node_dict[parent_title]
            child_node = node_dict[child_title]
            if child_node is parent_node:
                # A heading equal to its parent's title would create a
                # self-loop and make tree iteration recurse forever.
                continue
            if child_node not in parent_node.children:
                parent_node.children.append(child_node)
            if child_node.parent is None and child_node is not tree:
                child_node.parent = parent_node

    return tree

In [20]:
def save_as_html(result: ConversionResult, filename: str, image_mode: ImageRefMode, save_dir: str = 'saves'):
    """Save ``result.document`` as HTML under ``save_dir``.

    Args:
        result: Conversion result whose ``document`` is exported.
        filename: Base file name; its suffix (if any) is replaced by ``.html``.
        image_mode: Image reference mode forwarded to ``save_as_html``.
        save_dir: Target directory; created (including parents) if missing.

    Returns:
        A ``(html_path, artifacts_dir)`` tuple of ``Path`` objects, where
        ``artifacts_dir`` is ``<save_dir>/<stem>_artifacts``.
    """
    target_dir = Path(save_dir)
    # parents=True so that a nested save_dir such as 'out/saves' also works.
    target_dir.mkdir(parents=True, exist_ok=True)

    html_path = Path(target_dir, filename).with_suffix(".html")
    artifacts_dir = html_path.with_name(html_path.stem + "_artifacts")

    result.document.save_as_html(filename=html_path, artifacts_dir=artifacts_dir, image_mode=image_mode)

    return html_path, artifacts_dir

In [21]:
def extract_exported_content(soup: BeautifulSoup):
    """Return the direct child tags of the single wrapper element in ``<body>``.

    Raises:
        Exception: If ``<body>`` does not have exactly one direct child tag.
    """
    top_level = soup.body.find_all(name=True, recursive=False)

    if len(top_level) == 1:
        (wrapper,) = top_level
        return wrapper.find_all(name=True, recursive=False)

    raise Exception(f'eyyo {len(top_level)} children in body. Expected only body_content')

In [22]:
async def upload_image(artifact: Path, imgbb_api_key: str):
    """Upload one image file to imgbb and return its public URL.

    NOTE: the function is declared ``async`` because callers ``await`` it, but
    the HTTP request itself is made with the blocking ``requests`` library.

    Args:
        artifact: Path of the image file to upload.
        imgbb_api_key: API key sent as the ``key`` query parameter.

    Returns:
        The value of ``data.url`` from the JSON response, or ``None`` when the
        upload failed (the error is printed, not raised).
    """
    url = "https://api.imgbb.com/1/upload"
    params = {
        # "expiration": 600,  # presumably imgbb's auto-delete delay in seconds — confirm against the API docs
        "key": imgbb_api_key,
    }
    # Pre-bind so the error handler can tell "request never completed"
    # (file could not be opened, connection error) from "server answered".
    response = None
    try:
        with open(artifact, 'rb') as f:
            response = requests.post(url, params=params, files={'image': f}, timeout=60)

        if response.status_code != 200:
            raise Exception(f"Upload failed with status code: {response.status_code}")

        payload = response.json()
        print("Upload successful!")
        print(payload)
        return payload['data']['url']
    except Exception as e:
        # Best-effort: report and return None rather than abort the caller.
        if response is None:
            print(e)
        else:
            print(e, response.text)
        return None

In [23]:
async def upload_images(body_content: list[Tag], artifacts_dir: Path, config: dict):
    """Upload every figure image and point its ``src`` at the uploaded URL.

    Only ``<figure>`` tags whose single direct child is an ``<img>`` are
    considered. Each image's local file is resolved from its own ``src``
    (relative to the parent of ``artifacts_dir``) and must be one of the
    ``*.png`` files in ``artifacts_dir``. This avoids relying on the order in
    which ``glob`` returns files.

    Args:
        body_content: Top-level tags of the exported HTML body.
        artifacts_dir: Directory holding the exported ``*.png`` artifacts.
        config: Mapping that provides ``IMGBB_API_KEY``.

    Raises:
        Exception: If an image's ``src`` does not resolve to a file in
            ``artifacts_dir``. All images are checked before any upload.
    """
    images = []
    for child in body_content:
        if child.name != 'figure':
            continue
        figure_children = child.find_all(name=True, recursive=False)
        if len(figure_children) == 1 and figure_children[0].name == 'img':
            images.append(figure_children[0])

    known_artifacts = set(artifacts_dir.glob('*.png'))

    # Validate everything first so a bad reference aborts before any upload.
    uploads = []
    for img in images:
        img_path = Path(artifacts_dir.parent, unquote(img['src']))
        if img_path not in known_artifacts:
            raise Exception(f'paths unmatched. \n{img_path}\n{artifacts_dir}')
        uploads.append((img, img_path))

    for img, artifact in uploads:
        img_url = await upload_image(artifact, imgbb_api_key=config['IMGBB_API_KEY'])
        # upload_image returns None on failure; keep the local reference then
        # instead of writing None into the src attribute.
        if img_url is not None:
            img['src'] = img_url

In [24]:
def generate_output(body_content: list[Tag], chunk_tree: MyNode):
    """Split the exported HTML body into one HTML string per matched heading.

    A section starts at every ``h1``-``h6`` tag whose text equals a heading
    still pending from ``chunk_tree`` and runs up to the next such tag (or the
    end of ``body_content``). Content before the first matched heading is not
    included in any section.

    Args:
        body_content: Top-level tags of the exported HTML body.
        chunk_tree: Root of the heading tree; the root itself is not matched.

    Returns:
        A list of HTML strings, one per section.
    """
    heading_tags = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

    # Depth-first list of heading titles without the synthetic root.
    chunks_headings = [str(node) for node in chunk_tree][1:]

    indexes_chunk_begin = []
    for idx, child in enumerate(body_content):
        # Match real heading tags only; a substring test on 'h' would also
        # accept names such as 'th' or 'header'.
        if child.name not in heading_tags:
            continue
        if child.string in chunks_headings:
            indexes_chunk_begin.append(idx)
            # Remove the heading that actually matched, not whichever one
            # happens to be first in the pending list.
            chunks_headings.remove(child.string)
    indexes_chunk_begin.append(len(body_content))
    print(chunks_headings)
    print(indexes_chunk_begin)

    html_chunks = []
    for begin, end in zip(indexes_chunk_begin, indexes_chunk_begin[1:]):
        section = body_content[begin:end]
        print(section)
        html_chunks.append(''.join(str(child) for child in section))

    return html_chunks  # TODO: output tree also

##### test

In [25]:
import torch
# Report whether PyTorch sees a CUDA device and, if it does, the name of device 0.
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU detected")
# Smoke test: move a tensor to the GPU.
# NOTE(review): this call is not guarded by is_available(); presumably it raises on a machine without CUDA — confirm.
torch.zeros(1).cuda()

True
NVIDIA GeForce RTX 5060 Laptop GPU


tensor([0.], device='cuda:0')

In [26]:
# o_source = "input/2025 ICT Bachelor thesis template.docx"
o_source = "input/Flower-Parts-Diagrams.pdf"
# Convert the source file; keep both the conversion result and its document for later cells.
o_converter = create_converter()
o_result = o_converter.convert(o_source)
o_document = o_result.document
o_chunker = create_chunker()

# Materialise the chunk iterator into a list so the chunks can be reused.
o_chunk_iter = o_chunker.chunk(o_document)
o_chunks = list(o_chunk_iter)

In [None]:
# Build the heading tree from the chunks produced above.
o_chunk_tree = construct_chunk_tree(o_chunks)

# Export the whole document as HTML with images referenced as separate files.
o_filename: str = 'o_my_raw' #todo
o_save_results: tuple[Path, Path,] = save_as_html(result=o_result, filename=o_filename,
                                                image_mode=ImageRefMode.REFERENCED)
# NOTE: o_filename is rebound here from the base name (str) to the path returned by save_as_html.
o_filename, o_artifacts_dir = o_save_results

# Re-read the exported HTML, stripping each line and joining them before parsing.
with open(o_filename, encoding='utf-8') as o_f:
    o_soup = BeautifulSoup(''.join([o_line.strip() for o_line in o_f.readlines()]), 'html.parser')

o_body_content = extract_exported_content(o_soup)

# Values loaded via dotenv_values(); upload_images reads IMGBB_API_KEY from this mapping.
o_config = dotenv_values()

# await upload_images(body_content=o_body_content, artifacts_dir=o_artifacts_dir, config=o_config)

o_html_chunks = generate_output(body_content=o_body_content, chunk_tree=o_chunk_tree)

In [40]:
def print_list(l: list):
    """Pretty-print ``export_json_dict()`` of every element, with an index banner.

    Each element is preceded by a line made of 30 dashes, its index, and 30
    more dashes. Dictionary key order is preserved.
    """
    rule = "-" * 30
    for position, element in enumerate(l):
        print(f'{rule}{position}{rule}')
        # pprint.pprint(element.export_json_dict()['meta'].keys(), sort_dicts=False)
        exported = element.export_json_dict()
        pprint.pprint(exported, sort_dicts=False)

# Dump the exported JSON dict of every chunk for inspection.
print_list(o_chunks)

------------------------------0------------------------------
{'text': 'Wild Geranium, Geranium maculatum',
 'meta': {'schema_name': 'docling_core.transforms.chunker.DocMeta',
          'version': '1.0.0',
          'doc_items': [{'self_ref': '#/texts/1',
                         'parent': {'$ref': '#/body'},
                         'children': [],
                         'content_layer': 'body',
                         'label': 'text',
                         'prov': [{'page_no': 1,
                                   'bbox': {'l': 133.81,
                                            't': 652.428,
                                            'r': 412.182,
                                            'b': 632.897,
                                            'coord_origin': 'BOTTOMLEFT'},
                                   'charspan': [0, 33]}]}],
          'headings': ['Parts of a Simple Flower'],
          'origin': {'mimetype': 'application/pdf',
                     'binary_hash': 4

In [13]:
def register_node_bottom_up(
        node_item: NodeItem,  # current NodeItem
        cref_to_my_node: dict[str, MyNode],  # dict of ref to MyNode
        dldoc_original: DoclingDocument
):
    """Register ``node_item`` and all of its ancestors in ``cref_to_my_node``.

    Walks upwards through ``parent`` references, creating a ``MyNode`` for
    every cref not seen yet and linking each node to its parent. An item whose
    parent has no ``cref`` attribute is attached to a synthetic node keyed
    ``'_root_'``, which ends the walk.
    """
    current = node_item
    while True:
        own_cref: str = current.get_ref().cref
        parent_cref: str = getattr(current.parent, 'cref', '_root_')

        # Create missing nodes, parent first and then the item itself.
        for cref in (parent_cref, own_cref):
            if cref not in cref_to_my_node:
                cref_to_my_node[cref] = MyNode(cref)

        own_node = cref_to_my_node[own_cref]
        parent_node = cref_to_my_node[parent_cref]
        own_node.parent = parent_node
        if own_node not in parent_node.children:
            parent_node.children.append(own_node)

        if parent_cref == '_root_':
            break
        # Continue with the parent item, resolved against the original document.
        current = current.parent.resolve(dldoc_original)


def register_node_top_down(
        node_item: NodeItem,  # current NodeItem
        cref_to_my_node: dict[str, MyNode],  # dict of ref to MyNode
        dldoc_original: DoclingDocument
):
    """Register all descendants of ``node_item`` in ``cref_to_my_node``.

    ``node_item`` itself must already be registered (a ``KeyError`` is raised
    otherwise). Children that are already registered are reused rather than
    replaced, so a child is never attached to its parent twice.
    """
    if not node_item.children:
        return

    self_node = cref_to_my_node[node_item.get_ref().cref]

    # The children list is only read, so no copy is needed.
    for child_ref_item in node_item.children:
        child_cref = child_ref_item.cref

        child_node = cref_to_my_node.get(child_cref)
        if child_node is None:
            child_node = MyNode(child_cref)
            cref_to_my_node[child_cref] = child_node

        child_node.parent = self_node
        if child_node not in self_node.children:
            self_node.children.append(child_node)

        child_item = child_ref_item.resolve(dldoc_original)
        if child_item.children:
            register_node_top_down(child_item, cref_to_my_node, dldoc_original)

In [14]:
# todo rename to convert_doc_items_to_dldoc
def convert_chunk_to_doc(chunk: DocChunk, dldoc_original: DoclingDocument) -> DoclingDocument:
    """Build a new ``DoclingDocument`` holding only the items used by ``chunk``.

    For every doc item of the chunk, the item, its ancestors and (if it has
    children) its descendants are registered in a ``MyNode`` tree. The tree is
    then walked depth-first and a child-less deep copy of each original item
    is appended to the new document.

    Args:
        chunk: Chunk whose ``meta.doc_items`` select the content.
        dldoc_original: Document the chunk's references are resolved against.

    Returns:
        The newly built document, named after ``dldoc_original``.

    Raises:
        KeyError: If nothing was registered (e.g. the chunk has no doc items),
            because the ``'_root_'`` node does not exist then.
    """
    dldoc_new = DoclingDocument(name=dldoc_original.name)
    doc_items: list[DocItem] = chunk.meta.doc_items

    # Tree of custom nodes mirroring the NodeItems used by the chunk; it drives the order in which items are added to the new doc.
    cref_to_my_node: dict[str, MyNode] = {}

    # note that doc_items are indexed by the order of their appearance in the original document
    for doc_item in doc_items:
        node_item = doc_item.get_ref().resolve(dldoc_original)

        register_node_bottom_up(node_item, cref_to_my_node, dldoc_original)
        if doc_item.children:
            register_node_top_down(node_item, cref_to_my_node, dldoc_original)

    # Depth-first listing of the tree; index 0 is the synthetic '_root_' node.
    ordered_my_node = [node for node in cref_to_my_node['_root_']]

    # iterate using DFS, meaning nodes will be added in top-down order
    for my_node in ordered_my_node[1:]:
        # deepcopy is used so that clearing the child refs does not affect the original object
        item_in_dldoc_original = copy.deepcopy(RefItem(cref=my_node.cref_original).resolve(dldoc_original))  # type: ignore
        item_in_dldoc_original.children.clear()
        # Second path segment of the cref, e.g. 'texts' for '#/texts/1'.
        self_path = my_node.cref_original.split('/')
        item_label = self_path[1]

        # Parents precede their children in DFS order, so a non-root parent already has cref_new set here.
        parent_item_in_dldoc_new = None if my_node.parent.cref_original == '_root_' else RefItem(cref=my_node.parent.cref_new).resolve(dldoc_new)  # type: ignore
        dldoc_new.append_child_item(child=item_in_dldoc_original, parent=parent_item_in_dldoc_new)

        # With a resolved parent, take the ref of the last entry of the new doc's collection named item_label; otherwise keep the original cref.
        # NOTE(review): assumes append_child_item puts the appended item last in that collection — TODO confirm.
        my_node.cref_new = dldoc_new.__getattribute__(item_label)[-1].get_ref().cref if parent_item_in_dldoc_new else my_node.cref_original
    return dldoc_new

In [15]:
def validate_chunk_bboxes(chunk_bboxes):
    """Check that consecutive boxes do not violate the expected ordering.

    For every pair of neighbours, element ``[1]`` of the later box must not be
    greater than element ``[3]`` of the earlier box. Callers in this file pass
    ``(l, t, r, b)`` tuples, so this compares the later box's top with the
    earlier box's bottom.

    Returns:
        ``True`` if no neighbouring pair violates the rule (including for
        empty and single-element input), else ``False``.
    """
    neighbours = zip(chunk_bboxes, chunk_bboxes[1:])
    return not any(later[1] > earlier[3] for earlier, later in neighbours)

In [38]:
def find_picture_position(chunk_bboxes_by_page_no: dict[int, list[tuple[float, float, float, float]]], picture: PictureItem):
    """Find where a picture fits among the chunk boxes of its page.

    Tries every insertion index from 0 to ``len(boxes)`` and returns the first
    one for which the resulting sequence passes ``validate_chunk_bboxes``.

    Args:
        chunk_bboxes_by_page_no: ``(l, t, r, b)`` chunk boxes per page number.
            The mapping is not modified.
        picture: Picture with exactly one provenance entry.

    Returns:
        ``(page_no, pos)`` where ``pos`` is the insertion index on that page.

    Raises:
        Exception: If the picture does not have exactly one provenance entry,
            or if no insertion index yields a valid sequence.
        KeyError: If the picture's page is missing from the mapping.
    """
    prov_list: list[ProvenanceItem] = picture.prov
    if len(prov_list) != 1:
        raise Exception(f'eyyo {len(prov_list)} provenance in doc_item.prov')
    prov: ProvenanceItem = prov_list[0]
    page_no: int = prov.page_no
    bbox: BoundingBox = prov.bbox
    picture_bbox = (bbox.l, bbox.t, bbox.r, bbox.b)

    chunk_bboxes_of_page = chunk_bboxes_by_page_no[page_no]
    # Bounded search: inserting past the end is the same as appending, so
    # there are only len + 1 distinct candidates.
    for pos in range(len(chunk_bboxes_of_page) + 1):
        candidate = [*chunk_bboxes_of_page[:pos], picture_bbox, *chunk_bboxes_of_page[pos:]]
        if validate_chunk_bboxes(candidate):
            return page_no, pos

    raise Exception(f'eyyo can not fit picture')

In [18]:
def get_chunk_bbox(chunk: DocChunk):
    """Return the page number and the union bounding box of a chunk's items.

    The union takes the smallest ``l`` and ``b`` and the largest ``t`` and
    ``r`` over all doc items. With no doc items, the page is ``None`` and the
    box keeps its ``sys.maxsize``-based sentinel values.

    Returns:
        ``(page_no, l, t, r, b)``.

    Raises:
        Exception: If a doc item does not have exactly one provenance entry,
            or if the doc items are spread over different pages.
    """
    page_no = None
    # Each list starts with the sentinel the running min/max begins from.
    lefts = [sys.maxsize]
    tops = [-sys.maxsize]
    rights = [-sys.maxsize]
    bottoms = [sys.maxsize]

    for item in chunk.meta.doc_items:
        provenances: list[ProvenanceItem] = item.prov
        if len(provenances) != 1:
            raise Exception(f'eyyo {len(provenances)} provenance in doc_item.prov')
        (provenance,) = provenances
        if page_no not in (None, provenance.page_no):
            raise Exception(f'eyyo page_no mismatch')
        page_no = provenance.page_no
        box: BoundingBox = provenance.bbox
        lefts.append(box.l)
        tops.append(box.t)
        rights.append(box.r)
        bottoms.append(box.b)

    return page_no, min(lefts), max(tops), max(rights), min(bottoms)

In [37]:
def process_chunks(chunks, document):
    """Attach the document's pictures to chunks and export each chunk as a doc.

    Every picture is positioned among the chunk boxes of its page and added to
    the ``meta.doc_items`` of the chunk just before that position (mutating the
    chunk in place). Each chunk is then converted to its own document and
    saved as ``chunk<idx>.html`` in the current working directory.

    Args:
        chunks: Chunks of ``document``; assumed to be ordered page by page.
        document: The document the chunks were produced from.

    Returns:
        The per-chunk documents, in chunk order.
    """
    chunk_bboxes_by_page_no: dict[int, list[tuple[float, float, float, float]]] = {
        page_no: [] for page_no in document.pages.keys()
    }
    for chunk in chunks:
        page_no, l, t, r, b = get_chunk_bbox(chunk)
        chunk_bboxes_by_page_no[page_no].append((l, t, r, b))

    pictures: list[PictureItem] = document.pictures if chunks else []
    for picture in pictures:
        page_no, pos = find_picture_position(chunk_bboxes_by_page_no, picture)
        # Number of chunks on pages 1 .. page_no - 1.
        chunks_on_earlier_pages = sum(len(chunk_bboxes_by_page_no.get(i, ())) for i in range(1, page_no))
        # The picture goes to the chunk right before its position. Clamp at 0
        # so that a picture above the very first chunk is attached to that
        # chunk instead of wrapping around to chunks[-1].
        idx = max(chunks_on_earlier_pages + pos - 1, 0)
        chunk = chunks[idx]
        chunk.meta.doc_items.append(DocItem(
            self_ref=picture.self_ref,
            parent=picture.parent,
            children=picture.children,
            content_layer=picture.content_layer,
            label=picture.label,
            prov=picture.prov
        ))

    docs = []
    for idx, chunk in enumerate(chunks):
        chunk_as_doc = convert_chunk_to_doc(chunk, document)
        chunk_as_doc.save_as_html(f'chunk{idx}.html', image_mode=ImageRefMode.REFERENCED)
        docs.append(chunk_as_doc)

    return docs

In [40]:
# Convert every chunk into its own document; process_chunks also writes chunk<idx>.html files.
o_docs: list[DoclingDocument] = process_chunks(o_chunks, o_document)

In [46]:
# Export the complete document as a single HTML file, for comparison with the per-chunk files.
o_document.save_as_html('full.html', image_mode=ImageRefMode.REFERENCED)

In [63]:
# Print (page_no, l, t, r, b) for every chunk.
for o_chunk in o_chunks:
    print(get_chunk_bbox(o_chunk))

(1, 133.81, 652.428, 412.182, 632.897)
(1, 46.792, 187.764, 496.681, 38.41300000000001)
(2, 115.32, 656.268, 430.728, 636.737)
(2, 46.492, 161.12400000000002, 499.3, 29.65599999999995)


In [None]:
# for i, chunk in enumerate(o_chunks):
#     print(f"=== {i} ===")
#     print(f"chunk.text:\n{f'{chunk.text[:300]}…'!r}")
#
#     enriched_text = o_chunker.contextualize(chunk=chunk)
#     print(f"chunker.contextualize(chunk):\n{f'{enriched_text[:300]}…'!r}")
#
#     print()

In [55]:
# Manually append picture 0 of the original document to the body of the second chunk document.
# Work on a deep copy: clearing the children of the resolved item directly
# would modify the picture inside o_document itself.
child_item_haha = copy.deepcopy(RefItem(cref='#/pictures/0').resolve(o_document))
child_item_haha.children.clear()
o_docs[1].append_child_item(child=child_item_haha, parent=RefItem(cref='#/body').resolve(o_docs[1]))

In [57]:
# Re-export the second chunk document; this overwrites the chunk1.html written earlier.
o_docs[1].save_as_html('chunk1.html', image_mode=ImageRefMode.REFERENCED)