### python api tryout

##### base

In [16]:
import pprint
from pathlib import Path
from urllib.parse import unquote

import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.transforms.chunker import BaseChunk, DocChunk
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from docling_core.types.doc import ImageRefMode
from dotenv import dotenv_values

In [17]:
def create_converter():
    """Build a DocumentConverter configured for PDF input.

    The PDF pipeline is asked to generate picture images and to render
    images with a scale of 3.0.
    """
    pdf_options = PdfPipelineOptions(
        generate_picture_images=True,
        # do_picture_classification=True,  # optional picture classification, left disabled
        images_scale=3.0,
    )
    pdf_format = PdfFormatOption(pipeline_options=pdf_options)
    return DocumentConverter(format_options={InputFormat.PDF: pdf_format})

In [18]:
def create_chunker():
    """Return a HybridChunker built with its default settings."""
    return HybridChunker()

In [19]:
class Node:
    """A titled tree node holding an ordered list of child nodes."""

    def __init__(self, title, children=None):
        self.title = title
        # Any falsy value (None, empty list) is replaced by a fresh list.
        self.children = children if children else []

    def __str__(self):
        return self.title

    def __iter__(self):
        """Yield this node, then every descendant, depth first (pre-order)."""
        yield self
        for subtree in self.children:
            yield from subtree

In [20]:
def construct_chunk_tree(chunks: list[BaseChunk], tree_title: str = 'root'):
    """Build a heading tree from the heading paths of the given chunks.

    Each chunk contributes the path ``[tree_title, *headings]``; consecutive
    titles on that path are linked as parent -> child. Nodes are keyed by
    title, so identical titles anywhere in the document share one node.

    Args:
        chunks: chunks whose ``export_json_dict()['meta']`` may carry a
            ``'headings'`` list.
        tree_title: title of the synthetic root node.

    Returns:
        The root ``Node`` of the tree.
    """
    tree = Node(tree_title)
    node_dict = {tree_title: tree}

    for chunk in chunks:
        meta = chunk.export_json_dict().get('meta') or {}
        # A chunk that sits under no heading may have no 'headings' entry at
        # all (or a null one); treat that as an empty path instead of failing.
        headings = [tree_title, *(meta.get('headings') or [])]

        for heading in headings:
            if heading not in node_dict:
                node_dict[heading] = Node(heading)

        # Link each title on the path to the one that follows it.
        for parent_title, child_title in zip(headings, headings[1:]):
            parent = node_dict[parent_title]
            child = node_dict[child_title]
            if child is parent:
                # A node that is its own child would make iteration recurse forever.
                continue
            if child not in parent.children:
                parent.children.append(child)

    return tree

In [21]:
def save_as_html(result: ConversionResult, filename: str, image_mode: ImageRefMode, save_dir: str = 'saves'):
    """Export the converted document as HTML next to an artifacts directory.

    Args:
        result: conversion result whose ``document`` is exported.
        filename: output file name; its suffix is replaced with ``.html``.
        image_mode: image reference mode forwarded to the exporter.
        save_dir: directory to write into; created (with parents) if missing.

    Returns:
        ``(html_path, artifacts_dir)`` where ``artifacts_dir`` is the sibling
        ``<stem>_artifacts`` path passed to the exporter.
    """
    output_dir = Path(save_dir)
    # parents=True so a nested save_dir such as 'out/run1' works on first use.
    output_dir.mkdir(parents=True, exist_ok=True)

    html_path = Path(output_dir, filename).with_suffix(".html")
    # filename may itself contain sub-directories; make sure they exist too.
    html_path.parent.mkdir(parents=True, exist_ok=True)

    artifacts_dir = html_path.with_name(html_path.stem + "_artifacts")

    result.document.save_as_html(filename=html_path, artifacts_dir=artifacts_dir, image_mode=image_mode)

    return html_path, artifacts_dir

In [22]:
def extract_exported_content(soup: BeautifulSoup):
    """Return the direct element children of the single wrapper inside <body>.

    Raises:
        Exception: if <body> does not have exactly one direct element child.
    """
    top_level = soup.body.find_all(name=True, recursive=False)
    count = len(top_level)

    if count == 1:
        wrapper = top_level[0]
        return wrapper.find_all(name=True, recursive=False)

    raise Exception(f'eyyo {count} children in body. Expected only body_content')

In [23]:
async def upload_image(artifact: Path, imgbb_api_key: str, timeout: float = 30.0):
    """Upload one image file to imgbb and return its hosted URL.

    Failures are reported with ``print`` and signalled by returning ``None``
    (best effort), they are not raised.

    Args:
        artifact: path of the image file to upload.
        imgbb_api_key: API key sent as the ``key`` query parameter.
        timeout: seconds to wait for the HTTP request before giving up.

    Returns:
        The ``data.url`` value of the JSON response, or ``None`` on failure.
    """
    url = "https://api.imgbb.com/1/upload"
    params = {
        # "expiration": 600,  # optional, currently unused
        "key": imgbb_api_key,
    }

    try:
        with open(artifact, 'rb') as f:
            # NOTE: requests.post is a blocking call inside a coroutine.
            response = requests.post(url, params=params, files={'image': f}, timeout=timeout)
    except OSError as e:
        # Covers an unreadable file as well as network failures:
        # requests' exceptions derive from IOError/OSError. No response
        # object exists on this path, so only the error itself is reported.
        print(e)
        return None

    if response.status_code != 200:
        print(f"Upload failed with status code: {response.status_code}", response.text)
        return None

    try:
        payload = response.json()
        image_url = payload['data']['url']
    except (ValueError, KeyError, TypeError) as e:
        # Body was not JSON or did not have the expected data.url shape.
        print(e, response.text)
        return None

    print("Upload successful!")
    print(payload)
    return image_url

In [24]:
async def upload_images(body_content: list[Tag], artifacts_dir: Path, config: dict):
    """Upload every figure image and point its <img> at the hosted URL.

    Only top-level ``<figure>`` elements whose sole element child is an
    ``<img>`` are considered. Each image file is resolved from the tag's own
    ``src`` (relative to the parent of ``artifacts_dir``) rather than by
    position in a directory listing, whose order is not guaranteed.

    Args:
        body_content: top-level elements of the exported HTML body; the
            ``<img>`` tags are modified in place.
        artifacts_dir: directory holding the exported ``*.png`` files.
        config: mapping providing ``'IMGBB_API_KEY'``.

    Raises:
        Exception: if an image's ``src`` does not resolve to a PNG found in
            ``artifacts_dir``. All images are validated before any upload.
    """
    images = []
    for child in body_content:
        if child.name != 'figure':
            continue
        inner = child.find_all(name=True, recursive=False)
        if len(inner) == 1 and inner[0].name == 'img':
            images.append(inner[0])

    available = set(artifacts_dir.glob('*.png'))
    base_dir = artifacts_dir.parent

    # Validate everything up front so a bad reference aborts before any upload.
    pending = []
    for img in images:
        img_path = Path(base_dir, unquote(img['src']))
        if img_path not in available:
            raise Exception(f'paths unmatched. \n{img_path}\nnot found in {artifacts_dir}')
        pending.append((img, img_path))

    for img, img_path in pending:
        img_url = await upload_image(img_path, imgbb_api_key=config['IMGBB_API_KEY'])
        if img_url is None:
            # Upload failed: keep the working local reference instead of
            # overwriting src with None.
            continue
        img['src'] = img_url

In [29]:
def generate_output(body_content: list[Tag], chunk_tree: Node):
    """Split the exported HTML elements into one HTML string per heading.

    A new segment starts at every heading element (``h`` followed by digits)
    whose text equals a title still pending from ``chunk_tree``; each title is
    consumed once. Elements before the first matched heading are not part of
    any segment.

    Args:
        body_content: top-level elements of the exported HTML body.
        chunk_tree: heading tree; its root title is ignored.

    Returns:
        List of HTML strings, one per matched heading, in document order.
    """
    chunks_headings = [str(node) for node in chunk_tree]
    chunks_headings.pop(0)  # the first node yielded is the synthetic root

    indexes_chunk_begin = []
    for idx, child in enumerate(body_content):
        tag_name = child.name
        # Only real heading tags; a plain "'h' in name" test would also
        # accept th, thead, ...
        if not (len(tag_name) > 1 and tag_name[0] == 'h' and tag_name[1:].isdigit()):
            continue
        # get_text() also works for headings with nested markup, where
        # .string is None.
        heading_text = child.get_text()
        if heading_text in chunks_headings:
            indexes_chunk_begin.append(idx)
            # Consume the heading that actually matched, not whichever is first.
            chunks_headings.remove(heading_text)
    indexes_chunk_begin.append(len(body_content))
    print(chunks_headings)
    print(indexes_chunk_begin)

    html_chunks = []
    for begin, end in zip(indexes_chunk_begin, indexes_chunk_begin[1:]):
        segment = body_content[begin:end]
        print(segment)
        html_chunks.append(''.join(str(child) for child in segment))

    return html_chunks  # TODO: output tree also

##### test

In [30]:
# End-to-end tryout: convert a page, chunk it, export it as HTML and split the
# exported HTML into per-heading chunks. All o_* names are module-level and
# are reused by later cells.
o_source = "https://docling-project.github.io/docling/"
o_converter = create_converter()
o_result = o_converter.convert(o_source)
o_chunker = create_chunker()

# Materialize the chunk iterator so the chunks can be reused below.
o_chunk_iter = o_chunker.chunk(o_result.document)
o_chunks = list(o_chunk_iter)

o_chunk_tree = construct_chunk_tree(o_chunks)

# NOTE: o_filename starts as a str but is rebound two lines below to the Path
# returned by save_as_html.
o_filename: str = 'o_my_raw' #todo
o_save_results: tuple[Path, Path,] = save_as_html(result=o_result, filename=o_filename,
                                                image_mode=ImageRefMode.REFERENCED)
o_filename, o_artifacts_dir = o_save_results

# Re-read the exported HTML, stripping each line and joining without
# separators before parsing.
with open(o_filename, encoding='utf-8') as o_f:
    o_soup = BeautifulSoup(''.join([o_line.strip() for o_line in o_f.readlines()]), 'html.parser')

o_body_content = extract_exported_content(o_soup)

# upload_images reads 'IMGBB_API_KEY' from this mapping.
o_config = dotenv_values()

# Image upload is currently disabled:
# await upload_images(body_content=o_body_content, artifacts_dir=o_artifacts_dir, config=o_config)

o_html_chunks = generate_output(body_content=o_body_content, chunk_tree=o_chunk_tree)

[]
[0, 17, 19, 21, 30, 34, 36, 43]
[<h1>Docling</h1>, <figure><figcaption><div class="caption">Docling</div></figcaption></figure>, <figure><figcaption><div class="caption">DS4SD%2Fdocling | Trendshift</div></figcaption></figure>, <figure><figcaption><div class="caption">arXiv</div></figcaption></figure>, <figure><figcaption><div class="caption">PyPI version</div></figcaption></figure>, <figure><figcaption><div class="caption">PyPI - Python Version</div></figcaption></figure>, <figure><figcaption><div class="caption">uv</div></figcaption></figure>, <figure><figcaption><div class="caption">Ruff</div></figcaption></figure>, <figure><figcaption><div class="caption">Pydantic v2</div></figcaption></figure>, <figure><figcaption><div class="caption">pre-commit</div></figcaption></figure>, <figure><figcaption><div class="caption">License MIT</div></figcaption></figure>, <figure><figcaption><div class="caption">PyPI Downloads</div></figcaption></figure>, <figure><figcaption><div class="caption"

In [27]:
def print_list(l: list):
    """Pretty-print each element of ``l`` under a dashed, numbered separator."""
    rule = "-" * 30
    for position, item in enumerate(l):
        print(f'{rule}{position}{rule}')
        pprint.pprint(item)

In [28]:
# Dump every element of o_chunks, one numbered section per element.
print_list(o_chunks)

------------------------------0------------------------------
DocChunk(text='Docling\nDS4SD%2Fdocling | Trendshift\narXiv\nPyPI version\nPyPI - Python Version\nuv\nRuff\nPydantic v2\npre-commit\nLicense MIT\nPyPI Downloads\nDocling Actor\nChat with Dosu\nOpenSSF Best Practices\nLF AI & Data\nDocling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.', meta=DocMeta(schema_name='docling_core.transforms.chunker.DocMeta', version='1.0.0', doc_items=[DocItem(self_ref='#/texts/24', parent=RefItem(cref='#/body'), children=[], content_layer=<ContentLayer.BODY: 'body'>, label=<DocItemLabel.CAPTION: 'caption'>, prov=[]), DocItem(self_ref='#/texts/25', parent=RefItem(cref='#/body'), children=[], content_layer=<ContentLayer.BODY: 'body'>, label=<DocItemLabel.CAPTION: 'caption'>, prov=[]), DocItem(self_ref='#/texts/26', parent=RefItem(cref='#/body'), children=[], content_layer=<ContentLayer.