In [1]:
import json
import pprint

import requests
from bs4 import BeautifulSoup
from bs4.element import NavigableString
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from docling_core.types import DoclingDocument
from docling_core.types.doc import ImageRefMode, ContentLayer, GroupItem, TextItem, DocItem
from dotenv import dotenv_values

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# PDF pipeline options handed to the converter built at the end of this cell.
pipeline_options = PdfPipelineOptions(
    generate_picture_images=True,  # Generate base64-encoded images
    # do_picture_classification=True,  # Classify images (optional, but aligns with CLI)
    images_scale=3.0,  # presumably the scale factor for generated images — TODO confirm against PdfPipelineOptions docs
)

# Input document for the conversion; the commented-out lines are alternative sources.
# source = "D:/App/IDM Downloads/Documents/image-based-pdf-sample.pdf"
source = "Flower-Parts-Diagrams.pdf"
# source = "https://arxiv.org/pdf/2408.09869"

# Converter with the options above registered for the PDF input format.
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)

In [3]:
%%time
# Convert the configured source; `result.document` is what later cells chunk and export.
result = converter.convert(source)



CPU times: total: 13.9 s
Wall time: 44.2 s


In [4]:
# Chunker constructed with no arguments, i.e. whatever defaults HybridChunker defines.
chunker = HybridChunker()

In [5]:
# Chunk the converted document and materialise the chunks into a list,
# because later cells index into `chunks` and iterate over it more than once.
chunk_iter = chunker.chunk(result.document)
chunks = list(chunk_iter)

In [6]:
# Collect the single heading attached to each chunk, in chunk order.
# Later cells look these headings up in the exported HTML to find chunk boundaries.
chunks_headings = []
for chunk_idx, chunk in enumerate(chunks):
    # Use .get(): the exported dict may omit 'headings' for a chunk that has none.
    # NOTE(review): confirm against docling_core's export_json_dict behaviour.
    headings = chunk.export_json_dict()['meta'].get('headings') or []
    if len(headings) != 1:
        # Exactly one heading per chunk is assumed by the boundary detection downstream.
        raise ValueError(
            f'chunk {chunk_idx}: expected exactly 1 heading, '
            f'got {len(headings)}: {headings!r}'
        )
    chunks_headings.append(headings[0])
chunks_headings

['Parts of a Simple Flower',
 'Definitions:',
 'Parts of a Composite Flower',
 'Notes:']

In [7]:
def save_as_files(name: str, image_mode: ImageRefMode):
    """Write the converted document as `<name>.html`, `<name>.md` and `<name>.json`.

    Reads the notebook-level `result`; `image_mode` is forwarded unchanged to
    each of the three save calls.
    """
    document = result.document
    document.save_as_html(f'{name}.html', image_mode=image_mode)
    document.save_as_markdown(f'{name}.md', image_mode=image_mode)
    document.save_as_json(f'{name}.json', image_mode=image_mode)


save_as_files('raw', ImageRefMode.REFERENCED)

In [8]:
# Parse the HTML export written as 'raw.html'.
with open('raw.html', encoding='utf-8') as f:
    # Name the parser explicitly: without it Beautiful Soup picks whichever parser
    # happens to be installed (and warns), so the tree could differ between machines.
    soup = BeautifulSoup(f, 'html.parser')

In [9]:
# Inspect the type of the first direct child of <body>.
type(list(soup.body.children)[0])

bs4.element.NavigableString

In [10]:
def _is_newline_string(node) -> bool:
    """Return True when `node` is a text node consisting of a single newline."""
    return isinstance(node, NavigableString) and node == '\n'


# Unwrap <body> down to the list of content elements that later cells slice into chunks.
body_nodes = list(soup.body.children)
has_single_wrapper = (
    len(body_nodes) == 3
    and _is_newline_string(body_nodes[0])
    and _is_newline_string(body_nodes[2])
)
if not has_single_wrapper:
    # Previously this fell through and failed with an opaque AttributeError
    # ('list' object has no attribute 'children'); fail with a clear message instead.
    raise ValueError(
        'expected <body> to contain exactly one wrapper element surrounded by '
        f'newline text nodes, found {len(body_nodes)} direct children'
    )
print(True)
wrapper = body_nodes[1]
# Keep the wrapper's children, dropping the newline-only text nodes between tags.
children_of_body = [child for child in wrapper.children if not _is_newline_string(child)]
children_of_body

True


[<h2>Parts of a Simple Flower</h2>,
 <p>Wild Geranium, Geranium maculatum</p>,
 <figure><img src="raw_artifacts%5Cimage_000000_661e1bf93aedc8c56f2bf4146c3654d628feeae6f38d3991d268d67780835183.png"/></figure>,
 <h2>Definitions:</h2>,
 <ul>
 <li>Sepals -The outermost part of the flower, often green but sometimes looking like petals, that usually enclose the flower bud before it opens.</li>
 <li>Petals - The inner ring of the flower that tend to be brightly colored and often function to attract pollinators.</li>
 <li>Stamen - The male part of the flower that serves to produce pollen; it is composed of the pollen-bearing anther and the stalk-like filament.</li>
 <li>Pistil The female part of the flower that typically consists of the stigma at the top which receives the pollen, the narrow style , and the ovary which contains ovules that will later develop into seeds.</li>
 </ul>,
 <h2>Parts of a Composite Flower</h2>,
 <p>Green and Gold, Chrysogonum virginianum</p>,
 <figure><img src="raw_a

In [11]:
# Positions in `children_of_body` whose text equals one of the chunk headings,
# followed by a final sentinel equal to the list length so that consecutive
# pairs of entries delimit each chunk.
indexes_chunk_begin = [
    position
    for position, element in enumerate(children_of_body)
    if element.string in chunks_headings
]
indexes_chunk_begin.append(len(children_of_body))
indexes_chunk_begin

[0, 3, 5, 8, 10]

In [12]:
# Load settings through python-dotenv; the upload cell reads 'IMGBB_API_KEY' from this mapping.
config = dotenv_values()

In [13]:
# Confirm the key is configured WITHOUT echoing the secret into the saved notebook output.
# NOTE(review): an earlier revision of this cell displayed the key value in its output;
# treat that key as exposed and rotate it.
'IMGBB_API_KEY' in config

True

In [20]:

# Local image to upload; the file name matches the first picture referenced by the exported HTML.
file_path = 'raw_artifacts/image_000000_661e1bf93aedc8c56f2bf4146c3654d628feeae6f38d3991d268d67780835183.png'
# Endpoint the upload request is posted to.
url = "https://api.imgbb.com/1/upload"
# Sent as the request's `params`; the API key is read from the loaded config.
params = {
    # "expiration": 600,
    "key": config['IMGBB_API_KEY'],
}

In [21]:
# Upload the image as multipart form data; the file handle stays open only for the request.
with open(file_path, 'rb') as f:
    files = {
        'image': f
    }
    # requests has no default timeout, so without one a stalled connection
    # would block this cell indefinitely.
    response = requests.post(url, params=params, files=files, timeout=60)

# Report the outcome: parsed JSON on HTTP 200, otherwise the status code and raw body.
if response.status_code == 200:
    print("Upload successful!")
    print(response.json())
else:
    print(f"Upload failed with status code: {response.status_code}")
    print(response.text)

Upload successful!
{'data': {'id': 'yBpBfRsy', 'title': 'image-000000-661e1bf93aedc8c56f2bf4146c3654d628feeae6f38d3991d268d67780835183', 'url_viewer': 'https://ibb.co/yBpBfRsy', 'url': 'https://i.ibb.co/h1c17M8W/image-000000-661e1bf93aedc8c56f2bf4146c3654d628feeae6f38d3991d268d67780835183.png', 'display_url': 'https://i.ibb.co/bRFRW3dB/image-000000-661e1bf93aedc8c56f2bf4146c3654d628feeae6f38d3991d268d67780835183.png', 'width': 1405, 'height': 1151, 'size': 964288, 'time': 1754369443, 'expiration': 0, 'image': {'filename': 'image-000000-661e1bf93aedc8c56f2bf4146c3654d628feeae6f38d3991d268d67780835183.png', 'name': 'image-000000-661e1bf93aedc8c56f2bf4146c3654d628feeae6f38d3991d268d67780835183', 'mime': 'image/png', 'extension': 'png', 'url': 'https://i.ibb.co/h1c17M8W/image-000000-661e1bf93aedc8c56f2bf4146c3654d628feeae6f38d3991d268d67780835183.png'}, 'thumb': {'filename': 'image-000000-661e1bf93aedc8c56f2bf4146c3654d628feeae6f38d3991d268d67780835183.png', 'name': 'image-000000-661e1bf93

In [24]:
# Pretty-print the upload response, keeping the keys in the order the server returned them.
pprint.pprint(response.json(), sort_dicts=False)

{'data': {'id': 'yBpBfRsy',
          'title': 'image-000000-661e1bf93aedc8c56f2bf4146c3654d628feeae6f38d3991d268d67780835183',
          'url_viewer': 'https://ibb.co/yBpBfRsy',
          'url': 'https://i.ibb.co/h1c17M8W/image-000000-661e1bf93aedc8c56f2bf4146c3654d628feeae6f38d3991d268d67780835183.png',
          'display_url': 'https://i.ibb.co/bRFRW3dB/image-000000-661e1bf93aedc8c56f2bf4146c3654d628feeae6f38d3991d268d67780835183.png',
          'width': 1405,
          'height': 1151,
          'size': 964288,
          'time': 1754369443,
          'expiration': 0,
          'image': {'filename': 'image-000000-661e1bf93aedc8c56f2bf4146c3654d628feeae6f38d3991d268d67780835183.png',
                    'name': 'image-000000-661e1bf93aedc8c56f2bf4146c3654d628feeae6f38d3991d268d67780835183',
                    'mime': 'image/png',
                    'extension': 'png',
                    'url': 'https://i.ibb.co/h1c17M8W/image-000000-661e1bf93aedc8c56f2bf4146c3654d628feeae6f38d3991d

In [26]:
# Same response as above, printed as a single-line dict.
print(response.json())

{'data': {'id': 'yBpBfRsy', 'title': 'image-000000-661e1bf93aedc8c56f2bf4146c3654d628feeae6f38d3991d268d67780835183', 'url_viewer': 'https://ibb.co/yBpBfRsy', 'url': 'https://i.ibb.co/h1c17M8W/image-000000-661e1bf93aedc8c56f2bf4146c3654d628feeae6f38d3991d268d67780835183.png', 'display_url': 'https://i.ibb.co/bRFRW3dB/image-000000-661e1bf93aedc8c56f2bf4146c3654d628feeae6f38d3991d268d67780835183.png', 'width': 1405, 'height': 1151, 'size': 964288, 'time': 1754369443, 'expiration': 0, 'image': {'filename': 'image-000000-661e1bf93aedc8c56f2bf4146c3654d628feeae6f38d3991d268d67780835183.png', 'name': 'image-000000-661e1bf93aedc8c56f2bf4146c3654d628feeae6f38d3991d268d67780835183', 'mime': 'image/png', 'extension': 'png', 'url': 'https://i.ibb.co/h1c17M8W/image-000000-661e1bf93aedc8c56f2bf4146c3654d628feeae6f38d3991d268d67780835183.png'}, 'thumb': {'filename': 'image-000000-661e1bf93aedc8c56f2bf4146c3654d628feeae6f38d3991d268d67780835183.png', 'name': 'image-000000-661e1bf93aedc8c56f2bf4146c36

In [14]:
# Print the image source of every <figure> among the body elements.
for child in children_of_body:
    if child.name != 'figure':
        continue
    # Look the <img> up by tag name rather than assuming it is the figure's first
    # child, which would break on a leading whitespace node or caption element.
    img = child.find('img')
    if img is not None and img.has_attr('src'):
        print(img['src'])

raw_artifacts%5Cimage_000000_661e1bf93aedc8c56f2bf4146c3654d628feeae6f38d3991d268d67780835183.png
raw_artifacts%5Cimage_000001_f8a9d12fc9e4159f4588e9bcdfc2967d14b2cb5869b30f1bb69739417d53f5f1.png


In [143]:
# One HTML string per chunk: concatenate the serialised elements that lie between
# each pair of consecutive boundary indexes.
html_chunks = [
    ''.join(str(node) for node in children_of_body[start:stop])
    for start, stop in zip(indexes_chunk_begin, indexes_chunk_begin[1:])
]

<h2>Parts of a Simple Flower</h2><p>Wild Geranium, Geranium maculatum</p><figure><img src="raw_artifacts%5Cimage_000000_661e1bf93aedc8c56f2bf4146c3654d628feeae6f38d3991d268d67780835183.png"/></figure> 


<h2>Definitions:</h2><ul>
<li>Sepals -The outermost part of the flower, often green but sometimes looking like petals, that usually enclose the flower bud before it opens.</li>
<li>Petals - The inner ring of the flower that tend to be brightly colored and often function to attract pollinators.</li>
<li>Stamen - The male part of the flower that serves to produce pollen; it is composed of the pollen-bearing anther and the stalk-like filament.</li>
<li>Pistil The female part of the flower that typically consists of the stigma at the top which receives the pollen, the narrow style , and the ovary which contains ovules that will later develop into seeds.</li>
</ul> 


<h2>Parts of a Composite Flower</h2><p>Green and Gold, Chrysogonum virginianum</p><figure><img src="raw_artifacts%5Cimage_00

In [144]:
# Wrap the per-chunk HTML strings in a dict under the 'chunks' key.
chunking_output = {
    'chunks': html_chunks,
}

In [156]:
# Re-parse the first HTML chunk and save it to disk for inspection.
# NOTE(review): no parser is named here, so Beautiful Soup picks whichever one is
# installed; parsers differ in how they wrap fragments, so pin one once the
# desired file shape is decided.
soup_chunk = BeautifulSoup(html_chunks[0])
with open('soup_chunk.html', 'w', encoding='utf-8') as f:
    # write(), not writelines(): writelines() on a str iterates it character by character.
    f.write(str(soup_chunk))

In [8]:
# Keep the result of the document's model_dump() for inspection.
dump = result.document.model_dump()

In [1]:
# Pretty-print the document's built-in element-tree export.
# Needs the import cell to have run first: the recorded NameError below comes from
# executing this cell while `pprint` was not defined.
pprint.pprint(result.document.export_to_element_tree())

NameError: name 'pprint' is not defined

In [12]:
def custom_export_to_element_tree(document: DoclingDocument):
    """Flatten a document into a list of per-item summary dicts.

    Iterates `document.iterate_items(...)` with groups included, pictures
    traversed and every `ContentLayer` enabled, and emits one dict per item
    that is a `GroupItem`, `TextItem` or `DocItem`; other items are skipped.

    Every dict has "index" (position in the iteration), "level",
    "item_class_name" and "item_label". Group items additionally carry
    "item_name"; text items carry "item_content", the first 100 characters
    of the item's text.

    Returns:
        list[dict]: the summaries, in iteration order.
    """
    e_tree = []
    items = document.iterate_items(
        with_groups=True,
        traverse_pictures=True,
        included_content_layers=set(ContentLayer),
    )
    for ix, (item, level) in enumerate(items):
        if not isinstance(item, (GroupItem, TextItem, DocItem)):
            continue
        # Fields shared by every kind of item.
        entry = {
            "index": ix,
            "level": level,
            "item_class_name": item.__class__.__name__,
            "item_label": item.label.value,
        }
        # Same precedence as before: the group check comes first, then text.
        if isinstance(item, GroupItem):
            entry["item_name"] = item.name
        elif isinstance(item, TextItem):
            # Slicing already clamps to the string length, so no min() is needed.
            entry["item_content"] = item.text[:100]
        e_tree.append(entry)
    return e_tree

In [13]:
# Display the flattened element summaries for the converted document.
custom_export_to_element_tree(result.document)

[{'index': 0,
  'level': 0,
  'item_class_name': 'GroupItem',
  'item_label': 'unspecified',
  'item_name': '_root_'},
 {'index': 1,
  'level': 1,
  'item_class_name': 'SectionHeaderItem',
  'item_label': 'section_header',
  'item_content': 'Parts of a Simple Flower'},
 {'index': 2,
  'level': 1,
  'item_class_name': 'TextItem',
  'item_label': 'text',
  'item_content': 'Wild Geranium, Geranium maculatum'},
 {'index': 3,
  'level': 1,
  'item_class_name': 'PictureItem',
  'item_label': 'picture'},
 {'index': 4,
  'level': 2,
  'item_class_name': 'TextItem',
  'item_label': 'text',
  'item_content': 'Petal'},
 {'index': 5,
  'level': 2,
  'item_class_name': 'TextItem',
  'item_label': 'text',
  'item_content': 'Sepal'},
 {'index': 6,
  'level': 2,
  'item_class_name': 'TextItem',
  'item_label': 'text',
  'item_content': 'Pistil'},
 {'index': 7,
  'level': 2,
  'item_class_name': 'TextItem',
  'item_label': 'text',
  'item_content': 'Stamen'},
 {'index': 8,
  'level': 2,
  'item_class_n

In [16]:
# Dump every chunk's exported dict, each preceded by a dashed separator line carrying its index.
for idx, chunk in enumerate(chunks):
    print(f'{"-"*30}{idx}{"-"*30}')
    pprint.pprint(chunk.export_json_dict(), sort_dicts=False)

------------------------------0------------------------------
{'text': 'Wild Geranium, Geranium maculatum',
 'meta': {'schema_name': 'docling_core.transforms.chunker.DocMeta',
          'version': '1.0.0',
          'doc_items': [{'self_ref': '#/texts/1',
                         'parent': {'$ref': '#/body'},
                         'children': [],
                         'content_layer': 'body',
                         'label': 'text',
                         'prov': [{'page_no': 1,
                                   'bbox': {'l': 133.81,
                                            't': 652.428,
                                            'r': 412.182,
                                            'b': 632.897,
                                            'coord_origin': 'BOTTOMLEFT'},
                                   'charspan': [0, 33]}]}],
          'headings': ['Parts of a Simple Flower'],
          'origin': {'mimetype': 'application/pdf',
                     'binary_hash': 4

In [12]:
# Display the exported dict of the second chunk (its 'text' and 'meta' entries).
# chunks[0].model_dump()
chunks[1].export_json_dict()

{'text': '- Sepals -The outermost part of the flower, often green but sometimes looking like petals, that usually enclose the flower bud before it opens.\n- Petals - The inner ring of the flower that tend to be brightly colored and often function to attract pollinators.\n- Stamen - The male part of the flower that serves to produce pollen; it is composed of the pollen-bearing anther and the stalk-like filament.\n- Pistil The female part of the flower that typically consists of the stigma at the top which receives the pollen, the narrow style , and the ovary which contains ovules that will later develop into seeds.',
 'meta': {'schema_name': 'docling_core.transforms.chunker.DocMeta',
  'version': '1.0.0',
  'doc_items': [{'self_ref': '#/texts/14',
    'parent': {'$ref': '#/groups/0'},
    'children': [],
    'content_layer': 'body',
    'label': 'list_item',
    'prov': [{'page_no': 1,
      'bbox': {'l': 46.792,
       't': 187.764,
       'r': 493.257,
       'b': 155.53300000000002,


In [29]:
# Pretty-print the first chunk's exported dict without re-sorting its keys.
pprint.pprint(chunks[0].export_json_dict(), sort_dicts=False)

{'text': 'Wild Geranium, Geranium maculatum',
 'meta': {'schema_name': 'docling_core.transforms.chunker.DocMeta',
          'version': '1.0.0',
          'doc_items': [{'self_ref': '#/texts/1',
                         'parent': {'$ref': '#/body'},
                         'children': [],
                         'content_layer': 'body',
                         'label': 'text',
                         'prov': [{'page_no': 1,
                                   'bbox': {'l': 133.81,
                                            't': 652.428,
                                            'r': 412.182,
                                            'b': 632.897,
                                            'coord_origin': 'BOTTOMLEFT'},
                                   'charspan': [0, 33]}]}],
          'headings': ['Parts of a Simple Flower'],
          'origin': {'mimetype': 'application/pdf',
                     'binary_hash': 4392405294199241303,
                     'filename': 'Flower-P

In [79]:
# For each chunk index, map every parent reference to the list of item references
# that name it as parent, in the order the items appear in the chunk metadata.
graph = {}
for j, current_chunk in enumerate(chunks):
    meta = current_chunk.export_json_dict()['meta']
    children_by_parent = graph[j] = {}
    for doc_item in meta['doc_items']:
        self_ref = doc_item['self_ref']
        # Sanity checks on the expected shape: one parent entry, no children.
        if len(doc_item['parent']) != 1:
            print('warning len parent != 1')
        if len(doc_item['children']) != 0:
            print('warning len children != 0')
        parent_ref = doc_item['parent']['$ref']
        children_by_parent.setdefault(parent_ref, []).append(self_ref)
pprint.pprint(graph, sort_dicts=False)

{0: {'#/body': ['#/texts/1']},
 1: {'#/groups/0': ['#/texts/14', '#/texts/15', '#/texts/16', '#/texts/17']},
 2: {'#/body': ['#/texts/19']},
 3: {'#/body': ['#/texts/30']}}


In [None]:
class Node:
    """Minimal tree node holding a payload and an ordered list of children."""

    def __init__(self, data):
        # Each node gets its own fresh child list.
        self.data, self.children = data, []

    def add_child(self, child):
        """Append `child` to this node's children."""
        self.children += [child]

In [68]:
# Pretty-print the second chunk's exported dict without re-sorting its keys.
pprint.pprint(chunks[1].export_json_dict(), sort_dicts=False)

{'text': '1. Select two datasets with labels.\n'
         '2. Run k-nn on these two datasets. Calculate the classification '
         'error (by comparing the class labels obtained with the prediction '
         'and the original labels of test data).\n'
         '3. Vary the value of k, comment on the results\n'
         '4. Try to normalize the input dataset, is the performance better?\n'
         '5. Apply PCA and SVD on the dataset, then what is the performance of '
         'k-nn on the new projected data? Justify the answer.\n'
         '6. Propose an approach to improve the performance of k-nn with the '
         'use of k-cross validation.\n'
         '7. Apply leave-one-out and calculate the error of classification.',
 'meta': {'schema_name': 'docling_core.transforms.chunker.DocMeta',
          'version': '1.0.0',
          'doc_items': [{'self_ref': '#/texts/4',
                         'parent': {'$ref': '#/groups/0'},
                         'children': [],
               

In [56]:

# Serialise the exported dict directly. Round-tripping through str() with quote
# replacement breaks on any apostrophe in the text and on None/True/False values.
print(json.dumps(chunks[4].export_json_dict(), indent=4))

{
    "text": "- \u277c Set up the SVM parameters suitable to the selected datasets.\n- \u277c What is the performance of SVMs?\n- \u277c How can SVM handle multi-class datasets?",
    "meta": {
        "schema_name": "docling_core.transforms.chunker.DocMeta",
        "version": "1.0.0",
        "doc_items": [
            {
                "self_ref": "#/texts/16",
                "parent": {
                    "$ref": "#/groups/2"
                },
                "children": [],
                "content_layer": "body",
                "label": "list_item",
                "prov": [
                    {
                        "page_no": 2,
                        "bbox": {
                            "l": 71.56,
                            "t": 733.348,
                            "r": 397.696,
                            "b": 722.9,
                            "coord_origin": "BOTTOMLEFT"
                        },
                        "charspan": [
                           