# Docling Class Structures and Example Code

In [32]:
''' General Utility Functions '''
def get_obj_methods(obj: object) -> list[str]:
    """Return the names of *obj*'s callable attributes, skipping dunder names.

    Names come from ``dir(obj)`` and keep that order. Anything starting with
    a double underscore is excluded; single-underscore names are kept.
    """
    method_names: list[str] = []
    for attr_name in dir(obj):
        if attr_name.startswith("__"):
            continue
        if callable(getattr(obj, attr_name)):
            method_names.append(attr_name)
    return method_names

import os
from pathlib import Path

# Sample PDF converted by the cells below, and the directory that receives
# the exported files (created up front if it does not exist yet).
DOC_SOURCE = "data/TestPDF.pdf"
output_dir = Path("scratch")
output_dir.mkdir(parents=True, exist_ok=True)

## Docling Classes

### DoclingDocument - [Official docs](https://docling-project.github.io/docling/reference/docling_document/#docling_core.types.doc.DoclingDocument)
`<class 'docling_core.types.doc.document.DoclingDocument'>`

**Attributes**: ['schema_name', 'version', 'name', 'origin', 'furniture', 'body', 'groups', 'texts', 'pictures', 'tables', 'key_value_items', 'form_items', 'pages']

**Notable Methods**: ['export_to_dict', 'export_to_doctags', 'export_to_document_tokens', 'export_to_element_tree', 'export_to_html', 'export_to_markdown', 'export_to_text']

In [None]:
'''Basic conversion from .pdf -> DoclingDocument'''
from docling.document_converter import DocumentConverter
# Convert the PDF with the default pipeline, then print the document
# (text and tables) as Markdown.
conv_result = DocumentConverter().convert(DOC_SOURCE)
# The conversion result is bound to `conv_result`; there is no `result` name.
print(conv_result.document.export_to_markdown())
# 0m52s for test document

2025-12-21 19:59:25,343 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-21 19:59:25,642 - INFO - Going to convert document batch...
2025-12-21 19:59:25,645 - INFO - Initializing pipeline for StandardPdfPipeline with options hash e15bc6f248154cc62f8db15ef18a8ab7
2025-12-21 19:59:25,656 - INFO - rapidocr cannot be used because onnxruntime is not installed.
2025-12-21 19:59:25,688 - INFO - easyocr cannot be used because it is not installed.
2025-12-21 19:59:25,691 - INFO - Accelerator device: 'cpu'
[32m[INFO] 2025-12-21 19:59:25,993 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2025-12-21 19:59:25,996 [RapidOCR] device_config.py:50: Using CPU device[0m
[32m[INFO] 2025-12-21 19:59:26,672 [RapidOCR] download_file.py:60: File exists and is valid: /home/mkp/python_code/data-cleaning/.venv/lib/python3.12/site-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2025-12-21 19:59:26,674 [RapidOCR] main.py:50: Using /home/mkp/python_code/data-clean

<!-- image -->

<!-- image -->

## Optic flow is used to control human walking

William H. Warren, Jr., Bruce A. Kay, Wendy D. Zosh, Andrew P. Duchon and Stephanie Sahuc

Department of Cognitive and Linguistic Sciences, Brown University, Providence, Rhode Island 02912, USA Correspondence should be addressed to W.W. (bill\_warren@brown.edu)

How is human locomotion visually controlled? Fifty years ago, it was proposed that we steer to a goal using optic flow, the pattern of motion at the eye that specifies the direction of locomotion. However, we might also simply walk in the perceived direction of a goal. These two hypotheses normally predict the same behavior, but we tested them in an immersive virtual environment by displacing the optic flow from the direction of walking, violating the laws of optics. We found that people walked in the visual direction of a lone target, but increasingly relied on optic flow as it was added to the display. The visual control law for steering toward a 

In [None]:
'''Exporting results to different formats'''
import json
# Name the exports after the source file's stem ("TestPDF") instead of the
# whole DOC_SOURCE string. DOC_SOURCE contains a directory component
# ("data/"), so embedding it would target scratch/data/..., a directory that
# is never created, and opening the file would fail.
export_stem = Path(DOC_SOURCE).stem

# Export Docling document JSON format:
with (output_dir / f"{export_stem}.json").open("w", encoding="utf-8") as fp:
    fp.write(json.dumps(conv_result.document.export_to_dict()))

# Export Markdown format:
with (output_dir / f"{export_stem}.md").open("w", encoding="utf-8") as fp:
    fp.write(conv_result.document.export_to_markdown())

# Export Document Tags format:
with (output_dir / f"{export_stem}.doctags").open("w", encoding="utf-8") as fp:
    fp.write(conv_result.document.export_to_doctags())

In [38]:
'''Custom Conversion .pdf -> Docling Document'''
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

# Start from the default PDF pipeline options and switch on page images,
# OCR, and table-structure extraction with cell matching.
pipeline_options = PdfPipelineOptions()
pipeline_options.generate_page_images = True
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True

# Attach the options to the PDF input format, then convert the sample file.
pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
custom_converter = DocumentConverter(
    format_options={InputFormat.PDF: pdf_format_option}
)
conv_result = custom_converter.convert(DOC_SOURCE)
# 0m52s for test document

2025-12-21 20:11:01,500 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-21 20:11:01,520 - INFO - Going to convert document batch...
2025-12-21 20:11:01,522 - INFO - Initializing pipeline for StandardPdfPipeline with options hash e784f57468f152386a904df3a8b24919


2025-12-21 20:11:01,573 - INFO - rapidocr cannot be used because onnxruntime is not installed.
2025-12-21 20:11:01,576 - INFO - easyocr cannot be used because it is not installed.
2025-12-21 20:11:01,579 - INFO - Accelerator device: 'cpu'
[32m[INFO] 2025-12-21 20:11:01,753 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2025-12-21 20:11:01,755 [RapidOCR] device_config.py:50: Using CPU device[0m
[32m[INFO] 2025-12-21 20:11:01,928 [RapidOCR] download_file.py:60: File exists and is valid: /home/mkp/python_code/data-cleaning/.venv/lib/python3.12/site-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2025-12-21 20:11:01,931 [RapidOCR] main.py:50: Using /home/mkp/python_code/data-cleaning/.venv/lib/python3.12/site-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2025-12-21 20:11:02,406 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2025-12-21 20:11:02,408 [RapidOCR] device_config.py:50: Using CPU device[0m
[32m[INFO] 20

In [None]:
import json
import time
from collections.abc import Iterable
from pathlib import Path

import yaml
from docling_core.types.doc import ImageRefMode

from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

def export_documents(
    conv_results: Iterable[ConversionResult],
    output_dir: Path,
):
    """Write each successfully converted document to Markdown and tally outcomes.

    Args:
        conv_results: Conversion results to export; iterated exactly once.
        output_dir: Directory that receives one ``<input stem>.md`` file per
            successful conversion. Created (including parents) if missing.

    Returns:
        A ``(success_count, partial_success_count, failure_count)`` tuple.
        Partially converted and failed documents are only logged, not written.
    """
    # Imported and created locally so the function does not rely on a
    # module-level `_log` being defined elsewhere; without it the summary
    # log at the end raised NameError on every call.
    import logging

    _log = logging.getLogger(__name__)

    output_dir.mkdir(parents=True, exist_ok=True)

    success_count = 0
    failure_count = 0
    partial_success_count = 0

    for conv_res in conv_results:
        if conv_res.status == ConversionStatus.SUCCESS:
            success_count += 1
            doc_filename = conv_res.input.file.stem

            # A single write is enough: the original followed this call with
            # a second, encoding-less write of the same Markdown to the same
            # path, which only overwrote the file just produced.
            conv_res.document.save_as_markdown(
                output_dir / f"{doc_filename}.md",
                image_mode=ImageRefMode.PLACEHOLDER,
            )

        elif conv_res.status == ConversionStatus.PARTIAL_SUCCESS:
            _log.info(
                "Document %s was partially converted with the following errors:",
                conv_res.input.file,
            )
            for item in conv_res.errors:
                _log.info("\t%s", item.error_message)
            partial_success_count += 1
        else:
            _log.info("Document %s failed to convert.", conv_res.input.file)
            failure_count += 1

    _log.info(
        "Processed %d docs, of which %d failed and %d were partially converted.",
        success_count + partial_success_count + failure_count,
        failure_count,
        partial_success_count,
    )
    return success_count, partial_success_count, failure_count

In [None]:
# Sample PDFs used by this example. If your checkout does not include the
# test data, point `data_folder` or `input_doc_paths` at your own files.
data_folder = Path("data")
input_doc_paths = [
    data_folder / pdf_name for pdf_name in ("TestPDF.pdf", "ResearchPaper.pdf")
]

# PDF pipeline settings. Generating page images improves HTML previews
# (embedded images) at the cost of extra processing time.
pipeline_options = PdfPipelineOptions()
pipeline_options.generate_page_images = True

pdf_format_option = PdfFormatOption(
    pipeline_options=pipeline_options,
    backend=DoclingParseV4DocumentBackend,
)
doc_converter = DocumentConverter(
    format_options={InputFormat.PDF: pdf_format_option}
)

start_time = time.time()

# Convert every input. `raises_on_error=False` keeps the run going when one
# file fails, so results can be examined and summarized at the end.
conv_results = doc_converter.convert_all(input_doc_paths, raises_on_error=False)

# Write outputs to ./scratch and log a summary.
export_counts = export_documents(conv_results, output_dir=Path("scratch"))
_success_count, _partial_success_count, failure_count = export_counts

# NOTE: despite its name, `end_time` holds the elapsed seconds since
# `start_time`, not a timestamp.
end_time = time.time() - start_time

print(f"Document conversion complete in {end_time:.2f} seconds.")

if failure_count > 0:
    raise RuntimeError(
        f"The example failed converting {failure_count} on {len(input_doc_paths)}."
    )

2025-12-21 20:26:43,612 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-21 20:26:43,835 - INFO - Going to convert document batch...
2025-12-21 20:26:43,838 - INFO - Initializing pipeline for StandardPdfPipeline with options hash e784f57468f152386a904df3a8b24919
2025-12-21 20:26:43,844 - INFO - rapidocr cannot be used because onnxruntime is not installed.
2025-12-21 20:26:43,847 - INFO - easyocr cannot be used because it is not installed.
2025-12-21 20:26:43,850 - INFO - Accelerator device: 'cpu'
[32m[INFO] 2025-12-21 20:26:43,938 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2025-12-21 20:26:43,940 [RapidOCR] device_config.py:50: Using CPU device[0m
[32m[INFO] 2025-12-21 20:26:44,086 [RapidOCR] download_file.py:60: File exists and is valid: /home/mkp/python_code/data-cleaning/.venv/lib/python3.12/site-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2025-12-21 20:26:44,088 [RapidOCR] main.py:50: Using /home/mkp/python_code/data-clean

## Text Processing & Chunking

### Hybrid Chunker - [Official Docs](https://docling-project.github.io/docling/concepts/chunking/#hybrid-chunker)
Hybrid chunking applies tokenization-aware refinements on top of document-based hierarchical chunking. It starts from the output of the `HierarchicalChunker`, which creates one chunk for each individual document element, and then, using a user-provided tokenizer, completes two passes:
1) Splits chunks only when needed.
2) Merges chunks when possible (can turn this step off using `merge_peers=False` in constructor).
The `HierarchicalChunker` attaches relevant metadata like headers and captions by default.

In [None]:
from docling.chunking import HybridChunker
from docling.document_converter import DocumentConverter

# Convert the source PDF and keep only the resulting document.
converter = DocumentConverter()
doc = converter.convert(source=DOC_SOURCE).document

# Build a HybridChunker with its default settings and start chunking `doc`.
chunker = HybridChunker()
chunk_iter = chunker.chunk(dl_doc=doc)

# usually want to embed context rich versions (includes metadata in each chunk)
for i, chunk in enumerate(chunk_iter):
    enriched_text = chunker.contextualize(chunk=chunk)