In [17]:
from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import InputFormat
from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
    WordFormatOption,
)
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    EasyOcrOptions,
    PdfPipelineOptions,
    TableFormerMode
)

from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend

from docling_core.types.doc.document import (
    TextItem,
    TableItem,
    PictureItem,
    ImageRefMode
)

# --- Accelerator options ---------------------------------------------------
# Run model inference on the CPU using 8 threads.
accelerator_options = AcceleratorOptions(
    num_threads=8, device=AcceleratorDevice.CPU
)

# --- OCR options (testing) -------------------------------------------------
# EasyOCR language list: Japanese and English.
ocr_options = EasyOcrOptions(lang=["ja", "en"])

# --- PDF pipeline options --------------------------------------------------
pipeline_options = PdfPipelineOptions()
pipeline_options.accelerator_options = accelerator_options

# OCR is switched off for now. The OCR options are attached anyway so that
# flipping `do_ocr` to True picks up the language list above instead of
# silently ignoring it.
pipeline_options.do_ocr = False
pipeline_options.ocr_options = ocr_options

# Table structure recognition: ACCURATE TableFormer mode with cell matching.
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
pipeline_options.table_structure_options.do_cell_matching = True

# Picture extraction and picture description are both disabled.
pipeline_options.generate_picture_images = False
pipeline_options.do_picture_description = False

# --- Converter -------------------------------------------------------------
# Custom options are defined per input format. Everything passed below is
# optional; the converter has internal defaults.
doc_converter = DocumentConverter(
    # Whitelist of formats; non-matching files are ignored.
    allowed_formats=[
        InputFormat.PDF,
        InputFormat.IMAGE,
        InputFormat.DOCX,
        InputFormat.HTML,
        InputFormat.PPTX,
    ],
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=pipeline_options,  # pipeline options go here
            backend=PyPdfiumDocumentBackend,  # optional: alternative backend
        ),
        InputFormat.DOCX: WordFormatOption(
            pipeline_cls=SimplePipeline  # default for office formats and HTML
        ),
    },
)

In [18]:
from docling.datamodel.document import ConversionResult

# Convert a single file. `convert` (previously `convert_single`) takes a URL
# or a local path.
source_path = "./sample_pdf/test.pdf"
# source_path = "./sample_pdf/docling_technical_report.pdf"
conv_result: ConversionResult = doc_converter.convert(source_path)

# To convert several files at once, pass a list of files or streams
# directly to `convert_all` (previously `convert`):
#
# input_files = [
#     "tests/data/html/wiki_duck.html",
#     "tests/data/docx/word_sample.docx",
#     "tests/data/docx/lorem_ipsum.docx",
#     "tests/data/pptx/powerpoint_sample.pptx",
#     "tests/data/2305.03393v1-pg9-img.png",
#     "tests/data/pdf/2206.01062.pdf",
# ]
# conv_results_iter = doc_converter.convert_all(input_files)

### 実行時間
- OCR無し・`TableFormerMode.ACCURATE`無し・`generate_picture_images`無し：1分程度
- OCR有り・`TableFormerMode.ACCURATE`有り・`generate_picture_images`有り：66分程度

In [19]:
import pandas as pd

# API reference for the document object:
# https://ds4sd.github.io/docling/reference/docling_document/#docling_core.types.doc.DoclingDocument
docling_document = conv_result.document

# Other ways to inspect the document contents:
# print(docling_document.texts)
# print([t.text for t in docling_document.texts])
# print(docling_document.tables)
# print(docling_document.pictures)
# print(docling_document.print_element_tree())

# Iterate the elements in reading order, including hierarchy level.
# Each entry is an (item, level) pair. The loop variable is deliberately not
# called `tuple`: that would shadow the builtin for every later cell in this
# kernel session.
for item_and_level in docling_document.iterate_items():
    print(item_and_level)

# Per-type handling example (kept for reference):
# for item, level in docling_document.iterate_items():
#     print(item, level)
#     if isinstance(item, TextItem):
#         print("<<< TEXT >>>")
#         print(str(item.parent.get_ref()))
#         print(f".text: \n{item.text}")
#         # print(f".children: \n{item.children}")
#     elif isinstance(item, TableItem):
#         table_df: pd.DataFrame = item.export_to_dataframe()
#         print("<<< TABLE >>>")
#         print(table_df.head())
#     if isinstance(item, PictureItem):
#         for child in item.children:
#             if "texts" in str(child.get_ref()):
#                 print(child.resolve())

(SectionHeaderItem(self_ref='#/texts/0', parent=RefItem(cref='#/body'), children=[], content_layer=<ContentLayer.BODY: 'body'>, label=<DocItemLabel.SECTION_HEADER: 'section_header'>, prov=[ProvenanceItem(page_no=1, bbox=BoundingBox(l=86.2850341796875, t=718.45849609375, r=143.92855834960938, b=704.094482421875, coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>), charspan=(0, 4))], orig='見出し１', text='見出し１', level=1), 1)
(TextItem(self_ref='#/texts/1', parent=RefItem(cref='#/body'), children=[], content_layer=<ContentLayer.BODY: 'body'>, label=<DocItemLabel.TEXT: 'text'>, prov=[ProvenanceItem(page_no=1, bbox=BoundingBox(l=128.1389617919922, t=673.531494140625, r=179.1884002685547, b=660.8955078125, coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>), charspan=(0, 4))], orig='見出し２', text='見出し２'), 1)
(TextItem(self_ref='#/texts/2', parent=RefItem(cref='#/body'), children=[], content_layer=<ContentLayer.BODY: 'body'>, label=<DocItemLabel.TEXT: 'text'>, prov=[ProvenanceItem(page_no=1, bbo

* ##### SAVE AS JSON/MD/HTML FILE

In [20]:
from pathlib import Path

# Make sure the output directory exists before writing; on a fresh checkout
# "./output" is absent and the save calls below would otherwise fail with
# FileNotFoundError.
output_dir = Path("./output")
output_dir.mkdir(parents=True, exist_ok=True)

# JSON
json_output_file = output_dir / "test.json"
# json_output_file = output_dir / "docling_paper.json"
docling_document.save_as_json(json_output_file, image_mode=ImageRefMode.EMBEDDED)

# NOTE: Japanese text in the JSON came out garbled (written as \uXXXX Unicode
# escapes). This was worked around by editing `save_as_json` in the installed
# library:
#   .conda/envs/pdf_ext/Lib/site-packages/docling_core/types/doc/document.py
#
#   BEFORE: json.dump(out, fw, indent=indent)
#   AFTER:  json.dump(out, fw, ensure_ascii=False, indent=indent)
#
# NOTE(review): that patch lives only in the local environment and is lost on
# reinstall; consider dumping `docling_document.export_to_dict()` with
# `ensure_ascii=False` from the notebook instead -- confirm output parity.

# HTML
# html_output_file = output_dir / "trm_docling.html"
# docling_document.save_as_html(html_output_file, image_mode=ImageRefMode.EMBEDDED)

# Markdown
md_output_file = output_dir / "test.md"
docling_document.save_as_markdown(md_output_file, image_mode=ImageRefMode.EMBEDDED)

export_to_dict OUT:
{'schema_name': 'DoclingDocument', 'version': '1.2.0', 'name': 'test', 'origin': {'mimetype': 'application/pdf', 'binary_hash': 9727528158719599069, 'filename': 'test.pdf'}, 'furniture': {'self_ref': '#/furniture', 'children': [], 'content_layer': 'furniture', 'name': '_root_', 'label': 'unspecified'}, 'body': {'self_ref': '#/body', 'children': [{'$ref': '#/texts/0'}, {'$ref': '#/texts/1'}, {'$ref': '#/texts/2'}, {'$ref': '#/texts/3'}, {'$ref': '#/texts/4'}, {'$ref': '#/texts/5'}, {'$ref': '#/tables/0'}, {'$ref': '#/texts/6'}, {'$ref': '#/texts/7'}], 'content_layer': 'body', 'name': '_root_', 'label': 'unspecified'}, 'groups': [], 'texts': [{'self_ref': '#/texts/0', 'parent': {'$ref': '#/body'}, 'children': [], 'content_layer': 'body', 'label': 'section_header', 'prov': [{'page_no': 1, 'bbox': {'l': 86.2850341796875, 't': 718.45849609375, 'r': 143.92855834960938, 'b': 704.094482421875, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 4]}], 'orig': '見出し１', 'text': '見出し

In [9]:
from docling.chunking import HybridChunker

# Chunk the converted document with the default HybridChunker settings.
chunker = HybridChunker()

# `chunker.chunk` yields its chunks lazily. Materialize them into a list so
# that cells consuming `chunk_iter` can be re-run without first re-running
# this one (a one-shot iterator would be empty on the second pass).
chunk_iter = list(chunker.chunk(dl_doc=docling_document))

In [10]:
# Print every chunk produced by `chunker.chunk(...)`, one per line.
# NOTE(review): if `chunk_iter` is a one-shot iterator, re-running this cell
# without re-creating it prints nothing -- confirm.
for chunk in chunk_iter:
    print(chunk)

text='見出し１\n見出し２\n見出し３\n3333333\n見出し４\n4444444\na, B = b. a, C = c. a, B = b. a, C = c\n見出し 5\n555555555555' meta=DocMeta(schema_name='docling_core.transforms.chunker.DocMeta', version='1.0.0', doc_items=[DocItem(self_ref='#/texts/0', parent=RefItem(cref='#/body'), children=[], content_layer=<ContentLayer.BODY: 'body'>, label=<DocItemLabel.TEXT: 'text'>, prov=[ProvenanceItem(page_no=1, bbox=BoundingBox(l=86.2850341796875, t=718.45849609375, r=143.92855834960938, b=704.094482421875, coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>), charspan=(0, 4))]), DocItem(self_ref='#/texts/1', parent=RefItem(cref='#/body'), children=[], content_layer=<ContentLayer.BODY: 'body'>, label=<DocItemLabel.TEXT: 'text'>, prov=[ProvenanceItem(page_no=1, bbox=BoundingBox(l=86.14295959472656, t=673.531494140625, r=137.1884002685547, b=660.8955078125, coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>), charspan=(0, 4))]), DocItem(self_ref='#/texts/2', parent=RefItem(cref='#/body'), children=[], content_la

* ##### SAVE IMAGES

In [None]:
# Reference:
# https://ds4sd.github.io/docling/examples/export_figures/
from pathlib import Path

# Only export images when the pipeline was configured to generate them.
if pipeline_options.generate_picture_images:
    # Destination directory and filename prefix (stem of the converted input).
    output_dir = Path("./output")
    output_dir.mkdir(parents=True, exist_ok=True)
    doc_filename = conv_result.input.file.stem

    # Save images of figures and tables, numbered separately per kind in
    # reading order.
    counters = {"table": 0, "picture": 0}
    for element, _level in conv_result.document.iterate_items():
        if isinstance(element, TableItem):
            kind = "table"
        elif isinstance(element, PictureItem):
            kind = "picture"
        else:
            continue

        counters[kind] += 1
        element_image = element.get_image(conv_result.document)
        if element_image is None:
            # No image data available for this element (presumably the
            # required image generation option was not enabled -- TODO
            # confirm); skip it instead of crashing on `.save`.
            continue

        element_image_filename = (
            output_dir / f"{doc_filename}-{kind}-{counters[kind]}.png"
        )
        with element_image_filename.open("wb") as fp:
            element_image.save(fp, "PNG")