### Python API tryout

##### base

In [1]:
import pprint
from pathlib import Path
from urllib.parse import unquote

import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.transforms.chunker import BaseChunk, DocChunk
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from docling_core.types.doc import ImageRefMode
from dotenv import dotenv_values

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def create_converter():
    """Build a ``DocumentConverter`` with custom PDF pipeline options.

    The PDF pipeline is configured with ``generate_picture_images=True`` and
    ``images_scale=3.0``. Only the PDF format option is overridden here.
    """
    # Picture classification (``do_picture_classification``) is left at its default.
    pdf_pipeline = PdfPipelineOptions(generate_picture_images=True, images_scale=3.0)
    pdf_format = PdfFormatOption(pipeline_options=pdf_pipeline)
    return DocumentConverter(format_options={InputFormat.PDF: pdf_format})

In [3]:
def create_chunker():
    """Return a ``HybridChunker`` constructed with its default arguments."""
    return HybridChunker()

In [4]:
class Node:
    """Tree node that carries a title and an ordered list of child nodes."""

    def __init__(self, title, children=None):
        self.title = title
        # A falsy ``children`` argument (None or empty) gives the node a fresh list.
        self.children = children if children else []

    def __str__(self):
        return self.title

    def __iter__(self):
        """Yield this node first, then every descendant, depth first."""
        yield self
        for subtree in self.children:
            yield from subtree

In [5]:
def construct_chunk_tree(chunks: list[BaseChunk], tree_title: str = 'root'):
    """Build a tree of ``Node`` objects from the heading paths of ``chunks``.

    For every chunk, the ``meta.headings`` list of its exported JSON dict is
    read as a path below the root: each heading becomes a child of the
    heading that precedes it, and the first heading becomes a child of the
    root.

    Nodes are keyed by heading text, so the same heading text always maps to
    one shared node, wherever it appears.

    :param chunks: chunks whose ``export_json_dict()`` result has a ``meta`` dict.
    :param tree_title: title of the root node.
    :return: the root ``Node``.
    """
    tree = Node(tree_title)
    node_dict = {tree_title: tree}

    for chunk in chunks:
        chunk_json_dict = chunk.export_json_dict()
        # A chunk that sits before any heading may have no 'headings' entry
        # (or a None one); treat that as an empty path instead of raising KeyError.
        headings = [tree_title, *(chunk_json_dict['meta'].get('headings') or [])]

        for heading in headings:
            if heading not in node_dict:
                node_dict[heading] = Node(heading)

        # Link each consecutive (parent, child) pair along the path once.
        for parent_title, child_title in zip(headings, headings[1:]):
            parent = node_dict[parent_title]
            child = node_dict[child_title]
            if child not in parent.children:
                parent.children.append(child)

    return tree

In [6]:
def save_as_html(result: ConversionResult, filename: str, image_mode: ImageRefMode, save_dir: str = 'saves'):
    """Export ``result.document`` as HTML under ``save_dir``.

    The HTML file is ``<save_dir>/<filename>.html`` (any suffix already on
    ``filename`` is replaced). The artifacts directory passed to the exporter
    is its sibling ``<save_dir>/<filename stem>_artifacts``.

    :param result: conversion result whose ``document`` provides ``save_as_html``.
    :param filename: base name of the output file.
    :param image_mode: forwarded unchanged to ``document.save_as_html``.
    :param save_dir: output directory; created if missing, including parents.
    :return: ``(html_path, artifacts_dir)`` as ``Path`` objects.
    """
    out_dir = Path(save_dir)
    # parents=True so that a nested save_dir such as 'out/run1' works on first use.
    out_dir.mkdir(parents=True, exist_ok=True)

    html_path = Path(out_dir, filename).with_suffix(".html")
    artifacts_dir = html_path.with_name(html_path.stem + "_artifacts")

    result.document.save_as_html(filename=html_path, artifacts_dir=artifacts_dir, image_mode=image_mode)

    return html_path, artifacts_dir

In [7]:
def extract_exported_content(soup: BeautifulSoup):
    """Return the direct child tags of the single wrapper element inside ``<body>``.

    :param soup: parsed HTML document with a ``body``.
    :return: the wrapper element's direct child tags.
    :raises Exception: if ``<body>`` does not have exactly one direct child tag.
    """
    top_level = soup.body.find_all(name=True, recursive=False)
    count = len(top_level)

    if count != 1:
        raise Exception(f'eyyo {count} children in body. Expected only body_content')

    (wrapper,) = top_level
    return wrapper.find_all(name=True, recursive=False)

In [8]:
async def upload_image(artifact: Path, imgbb_api_key: str):
    """
    Upload one image file to imgbb and return the hosted image URL.

    This is best-effort: every failure (unreadable file, network error,
    non-200 status, unexpected response body) is printed and reported by
    returning ``None`` instead of raising.

    NOTE: ``requests.post`` is a blocking call, so awaiting this coroutine
    does not yield to the event loop while the upload is in flight.

    :param artifact: path of the image file to upload.
    :param imgbb_api_key: API key sent as the ``key`` query parameter.
    :return: the value of ``data.url`` in the JSON response, or ``None`` on failure.
    """
    url = "https://api.imgbb.com/1/upload"
    params = {
        # "expiration": 600,  # optional imgbb parameter, currently unused - TODO confirm semantics
        "key": imgbb_api_key,
    }

    # Open the file in its own try block so a missing or unreadable file is
    # reported as such; no response object exists yet at this point.
    try:
        image_file = open(artifact, 'rb')
    except OSError as e:
        print(f"Upload failed, cannot read {artifact}: {e}")
        return None

    with image_file:
        try:
            # The timeout keeps a stalled connection from hanging the caller forever.
            response = requests.post(url, params=params, files={'image': image_file}, timeout=60)
        except requests.RequestException as e:
            print(f"Upload failed for {artifact}: {e}")
            return None

    if response.status_code != 200:
        print(f"Upload failed with status code: {response.status_code}", response.text)
        return None

    try:
        payload = response.json()
        image_url = payload['data']['url']
    except (ValueError, KeyError, TypeError) as e:
        print(f"Upload returned an unexpected response: {e!r}", response.text)
        return None

    print("Upload successful!")
    print(payload)
    return image_url

In [9]:
async def upload_images(body_content: list[Tag], artifacts_dir: Path, config: dict):
    """Upload every figure image and point its ``<img src>`` at the hosted URL.

    Only a ``<figure>`` whose single direct child is an ``<img>`` is
    considered. Each image's local file is resolved from its own ``src``
    (URL-decoded, relative to the parent of ``artifacts_dir``) and must be one
    of the ``*.png`` files found in ``artifacts_dir``. Pairing therefore does
    not depend on the order in which ``glob`` returns files.

    All images are validated before the first upload starts. Uploads then run
    one after another. An image whose upload returns ``None`` keeps its local
    ``src``.

    :param body_content: tags to scan for figures; matching ``<img>`` tags are
        modified in place.
    :param artifacts_dir: directory holding the exported ``*.png`` files.
    :param config: mapping that provides ``'IMGBB_API_KEY'``.
    :raises Exception: if an image's ``src`` does not resolve to a PNG in
        ``artifacts_dir``.
    """
    images = []
    for child in body_content:
        if child.name != 'figure':
            continue
        inner = child.find_all(name=True, recursive=False)
        if len(inner) == 1 and inner[0].name == 'img':
            images.append(inner[0])

    known_artifacts = set(artifacts_dir.glob('*.png'))

    # Phase 1: resolve and validate every reference so nothing is uploaded
    # if any image is inconsistent with the artifacts on disk.
    resolved = []
    for img in images:
        img_path = Path(artifacts_dir.parent, unquote(img['src']))
        if img_path not in known_artifacts:
            raise Exception(f'paths unmatched. \n{img_path}\n{artifacts_dir}')
        resolved.append((img, img_path))

    # Phase 2: upload and rewrite the references.
    for img, img_path in resolved:
        img_url = await upload_image(img_path, imgbb_api_key=config['IMGBB_API_KEY'])
        if img_url is not None:
            img['src'] = img_url

In [10]:
def generate_output(body_content: list[Tag], chunk_tree: Node):
    """Split ``body_content`` into HTML strings, one per matched heading.

    The heading titles are taken from iterating ``chunk_tree`` (the first
    yielded item, the root, is dropped). A new chunk starts at each element
    whose tag name contains ``'h'`` and whose ``.string`` equals a title not
    matched so far. Each chunk runs up to the next chunk start, or to the end
    of ``body_content``. Elements before the first matched heading are not
    part of any chunk.

    The unmatched titles, the chunk start indexes and every chunk's elements
    are printed for inspection.

    :param body_content: sequence of tags in document order.
    :param chunk_tree: iterable of nodes whose ``str()`` is the heading title.
    :return: list of HTML strings, each the concatenation of ``str(tag)`` for
        one chunk.
    """
    chunks_headings = [str(chunk) for chunk in chunk_tree]
    chunks_headings.pop(0)  # drop the root title

    indexes_chunk_begin = []
    for idx, child in enumerate(body_content):
        # NOTE(review): this substring test also lets through tags such as
        # 'th' or 'header'; the title comparison below is the real filter.
        if 'h' not in child.name:
            continue
        if child.string in chunks_headings:
            indexes_chunk_begin.append(idx)
            # Consume the title that actually matched. Popping index 0 would
            # drop a different title whenever document order and tree order
            # disagree, so that later heading would never be recognised.
            chunks_headings.remove(child.string)
    indexes_chunk_begin.append(len(body_content))
    print(chunks_headings)
    print(indexes_chunk_begin)

    html_chunks = []
    for begin, end in zip(indexes_chunk_begin, indexes_chunk_begin[1:]):
        segment = body_content[begin:end]
        print(segment)
        html_chunks.append(''.join(str(child) for child in segment))

    return html_chunks

##### test

In [16]:
# Input for this tryout run: a hard-coded local .docx file. The converter from
# create_converter() only customises the PDF format option.
o_source = "C:\\Users\\At Nguyen\\Downloads\\2025 ICT Bachelor thesis template (1).docx"
o_converter = create_converter()
o_result = o_converter.convert(o_source)
o_chunker = create_chunker()

# Materialise the chunks into a list so the later cells can reuse them.
o_chunk_iter = o_chunker.chunk(o_result.document)
o_chunks = list(o_chunk_iter)

Token indices sequence length is longer than the specified maximum sequence length for this model (2092 > 512). Running this sequence through the model will result in indexing errors


In [None]:
# Build the heading tree from the chunks' heading metadata.
o_chunk_tree = construct_chunk_tree(o_chunks)

# Export the converted document to HTML with image_mode=ImageRefMode.REFERENCED.
o_filename: str = 'o_my_raw' #todo
o_save_results: tuple[Path, Path,] = save_as_html(result=o_result, filename=o_filename,
                                                image_mode=ImageRefMode.REFERENCED)
# NOTE: o_filename is rebound here from the base name (str) to the Path of the
# written .html file returned by save_as_html.
o_filename, o_artifacts_dir = o_save_results

# Re-read the exported file, strip leading/trailing whitespace from every line
# and join the lines without a separator before parsing.
with open(o_filename, encoding='utf-8') as o_f:
    o_soup = BeautifulSoup(''.join([o_line.strip() for o_line in o_f.readlines()]), 'html.parser')

# Direct child tags of the single wrapper element inside <body>.
o_body_content = extract_exported_content(o_soup)

# Settings loaded via python-dotenv; upload_images reads 'IMGBB_API_KEY' from it.
o_config = dotenv_values()

# Image upload is currently disabled for this run.
# await upload_images(body_content=o_body_content, artifacts_dir=o_artifacts_dir, config=o_config)

# Split the body into one HTML string per matched heading.
o_html_chunks = generate_output(body_content=o_body_content, chunk_tree=o_chunk_tree)

In [13]:
def print_list(l: list):
    """Pretty-print ``export_json_dict()`` of every element under a numbered rule.

    Each element is preceded by a line made of 30 dashes, its index and 30
    more dashes. Dict keys are printed in their original order.
    """
    rule = "-" * 30
    for position, element in enumerate(l):
        print(f'{rule}{position}{rule}')
        exported = element.export_json_dict()
        pprint.pprint(exported, sort_dicts=False)

# Dump the exported JSON dict of every chunk produced above.
print_list(o_chunks)

------------------------------0------------------------------
{'text': 'UNIVERSITY OF SCIENCE AND TECHNOLOGY OF HANOI\n'
         '**DEPARTMENT OF INFORMATION AND COMMUNICATION TECHNOLOGY**\n'
         '**BACHELOR THESIS**\n'
         'By\n'
         'At Nguyen\n'
         '22BI13047\n'
         'Title:\n'
         'Automated Inspection of Fiber Optic Components\n'
         'External Supervisor:     B.E. Minh-Giang Chu\n'
         'Internal Supervisor:      Dr. Nhat-Quang Doan\n'
         '**Hanoi, <Month – yyyy>**\n'
         'Hanoi, September 2013\n'
         '**HàNội,  Sept. 2012**\n'
         'To whom it may concern,\n'
         'I, …………………….……………………………… certify that the thesis/ internship report '
         'of Mr ………………………………………… is qualified to be presented in the '
         'Internship Jury 2024-2025.\n'
         'Hanoi, July 14, 2025\n'
         '**Supervisor’s signature**',
 'meta': {'schema_name': 'docling_core.transforms.chunker.DocMeta',
          'version': '1.0.0',
      

In [18]:
def contextualize_chunks(chunks, chunker):
    """Print a preview of each chunk's text next to its contextualized text.

    For every chunk, this prints the ``repr`` of the first 300 characters of
    ``chunk.text`` and of ``chunker.contextualize(chunk=chunk)``. Each preview
    has an ellipsis appended. A blank line follows each chunk.
    """
    for position, chunk in enumerate(chunks):
        print(f"=== {position} ===")
        raw_preview = repr(f'{chunk.text[:300]}…')
        print(f"chunk.text:\n{raw_preview}")

        enriched_text = chunker.contextualize(chunk=chunk)
        enriched_preview = repr(f'{enriched_text[:300]}…')
        print(f"chunker.contextualize(chunk):\n{enriched_preview}")

        print()

# Compare raw and contextualized text for every chunk, using the chunker that produced them.
contextualize_chunks(o_chunks, o_chunker)

=== 0 ===
chunk.text:
'UNIVERSITY OF SCIENCE AND TECHNOLOGY OF HANOI\n**DEPARTMENT OF INFORMATION AND COMMUNICATION TECHNOLOGY**\n**BACHELOR THESIS**\nBy\nAt Nguyen\n22BI13047\nTitle:\nAutomated Inspection of Fiber Optic Components\nExternal Supervisor:     B.E. Minh-Giang Chu\nInternal Supervisor:      Dr. Nhat-Quang Doan\n**Hanoi…'
chunker.contextualize(chunk):
'UNIVERSITY OF SCIENCE AND TECHNOLOGY OF HANOI\n**DEPARTMENT OF INFORMATION AND COMMUNICATION TECHNOLOGY**\n**BACHELOR THESIS**\nBy\nAt Nguyen\n22BI13047\nTitle:\nAutomated Inspection of Fiber Optic Components\nExternal Supervisor:     B.E. Minh-Giang Chu\nInternal Supervisor:      Dr. Nhat-Quang Doan\n**Hanoi…'

=== 1 ===
chunk.text:
'[ACKNOWLEDGEMENTS\t0](.)\n[LIST OF ABBREVIATIONS\t1](.)\n[LIST OF TABLES\t2](.)\n[LIST OF FIGURES\t3](.)\n[ABSTRACT\t4](.)\n[I/ INTRODUCTION\t1](.)\n[II/ OBJECTIVES\t2](.)\n[III/ MATERIALS AND METHODS\t3](.)\n[IV/ RESULTS AND DISCUSSION\t4](.)\n[V/ CONCLUSION & PERSPECTIVE\t7](.)\n[REFERENCES\t8