### Docling 


---
#### Library Installation

In [1]:
%pip install docling transformers pypdf pymupdf

Note: you may need to restart the kernel to use updated packages.


#### Basic Docling pdf to markdown conversion

In [1]:
from docling.document_converter import DocumentConverter

# Absolute path of the PDF to convert (specific to the author's machine; adjust before re-running).
source = r"/home/qb-admin/Desktop/ap/BIOS.pdf"
# Convert the PDF with a DocumentConverter built with no custom options.
converter = DocumentConverter()
result = converter.convert(source)
# Render the converted document as one Markdown string; `md` is also written to disk in a later cell.
md = result.document.export_to_markdown()

print(md)  # prints the full Markdown text extracted from the PDF

<!-- image -->

## PRODUCT MANUAL

<!-- image -->

## Dear Biosian,

As we all know…

## ' Knowledge is power '

If  we  have  to  stand  successfully  in  this  competitive  scenario,  we  must  have  to  be  a knowledgeable. In Pharma sales market you are dealing with most knowledgeable persons. For  that first you  have  to  be  a knowledgeable  about  your  products,  science  and competitors.

Think about what is gained by learning more about your product. By knowing more about it  than  your  customers,  competitors  and  other  salespeople  you  will  gain  a  tremendous advantage that will lead to increased sales.

Sales don t always have to be about the prospect ' - being knowledgeable in your product can make you feel accomplished as well.

 Product knowledge builds enthusiasm, gives you courage and the satisfaction of being experts.

 Product  knowledge  also  allows  you  to  speak  with  confidence  around  other experts and helps you respond to any objections properly.


In [2]:
# Save the extracted Markdown (`md`, produced by the conversion cell) to BIOS_1.md,
# resolved against the current working directory and encoded as UTF-8.
# Mode "w" overwrites any existing file of that name.
with open("BIOS_1.md", "w", encoding="utf-8") as md_file:
    md_file.write(md)

---

#### Extract Image and Tables with Markdown

In [3]:
%pip install rapidocr_onnxruntime

Collecting rapidocr_onnxruntime
  Downloading rapidocr_onnxruntime-1.4.4-py3-none-any.whl.metadata (1.3 kB)
Collecting opencv-python>=4.5.1.48 (from rapidocr_onnxruntime)
  Downloading opencv_python-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Collecting onnxruntime>=1.7.0 (from rapidocr_onnxruntime)
  Downloading onnxruntime-1.21.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting coloredlogs (from onnxruntime>=1.7.0->rapidocr_onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting flatbuffers (from onnxruntime>=1.7.0->rapidocr_onnxruntime)
  Downloading flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting protobuf (from onnxruntime>=1.7.0->rapidocr_onnxruntime)
  Downloading protobuf-6.30.2-cp39-abi3-manylinux2014_x86_64.whl.metadata (593 bytes)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime>=1.7.0->rapidocr_onnxruntime)
  Downloading 

In [1]:
import logging
import time
from pathlib import Path

from docling_core.types.doc import ImageRefMode, PictureItem, TableItem

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend

from docling.datamodel.base_models import FigureElement, InputFormat, Table
from docling.datamodel.pipeline_options import PdfPipelineOptions, EasyOcrOptions, RapidOcrOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

# Module-level logger; main() uses it to report the total conversion/export time.
_log = logging.getLogger(__name__)

# Scale factor that main() assigns to PdfPipelineOptions.images_scale.
# Per the note inside main(), a scale of 1 corresponds to a standard 72 DPI image.
IMAGE_RESOLUTION_SCALE = 1.0

#### Any of the OCR options can be used: EasyOcrOptions, TesseractOcrOptions, TesseractCliOcrOptions, OcrMacOptions (Mac only), RapidOcrOptions
##### ocr_options = EasyOcrOptions(force_full_page_ocr=True)
##### ocr_options = TesseractOcrOptions(force_full_page_ocr=True)
##### ocr_options = OcrMacOptions(force_full_page_ocr=True)
##### ocr_options = RapidOcrOptions(force_full_page_ocr=True)
##### ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True)

In [2]:
def main(
    input_doc_path=Path(r"/home/qb-admin/Desktop/ap/budget_speech_2025.pdf"),
    output_dir=Path(r"/home/qb-admin/Desktop/ap/extracted_images"),
):
    """Convert a PDF with Docling and export page, table and picture images plus Markdown.

    The PDF is converted with OCR (EasyOCR, forced full-page) and table-structure
    recognition enabled. The following files are then written to ``output_dir``:

    * ``<stem>-<page_no>.png``   - one image per page
    * ``<stem>-table-<n>.png``   - one image per table, numbered in document order
    * ``<stem>-picture-<n>.png`` - one image per picture, numbered in document order
    * ``<stem>-with-image-refs.md`` - Markdown with externally referenced pictures

    Args:
        input_doc_path: Path (``str`` or ``Path``) of the PDF to convert. Defaults
            to the path that was previously hard-coded in this function.
        output_dir: Directory (``str`` or ``Path``) that receives the exported
            files; created if missing. Defaults to the previously hard-coded
            directory.

    Returns:
        None. The total elapsed time is reported through ``_log``.
    """
    logging.basicConfig(level=logging.INFO)

    # Accept plain strings as well as Path objects.
    input_doc_path = Path(input_doc_path)
    output_dir = Path(output_dir)

    # Important: For operating with page images, we must keep them, otherwise the
    # DocumentConverter will destroy them for cleaning up memory.
    # This is done by setting PdfPipelineOptions.images_scale, which also defines
    # the scale of images. scale=1 corresponds to a standard 72 DPI image.
    # The PdfPipelineOptions.generate_* flags select the document elements which
    # will be enriched with the image field.
    pipeline_options = PdfPipelineOptions()
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
    pipeline_options.generate_page_images = True
    pipeline_options.generate_picture_images = True
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True
    pipeline_options.ocr_options = EasyOcrOptions(force_full_page_ocr=True)

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    start_time = time.time()

    conv_res = doc_converter.convert(input_doc_path)

    output_dir.mkdir(parents=True, exist_ok=True)
    doc_filename = conv_res.input.file.stem

    # Save page images. The file name uses the page's own page_no attribute.
    for page in conv_res.document.pages.values():
        if page.image is None:
            # Check before opening the file so no empty PNG is left behind.
            _log.warning(f"No image available for page {page.page_no}; skipping.")
            continue
        page_image_filename = output_dir / f"{doc_filename}-{page.page_no}.png"
        with page_image_filename.open("wb") as fp:
            page.image.pil_image.save(fp, format="PNG")

    # Save images of figures and tables.
    counters = {"table": 0, "picture": 0}
    for element, _level in conv_res.document.iterate_items():
        if isinstance(element, TableItem):
            kind = "table"
        elif isinstance(element, PictureItem):
            kind = "picture"
        else:
            continue

        # Count every table/picture, even one without an image, so that the
        # number in the file name always matches the element's position.
        counters[kind] += 1

        # get_image() may return None; fetch it *before* opening the output file
        # so a missing image does not crash the export or leave an empty file.
        element_image = element.get_image(conv_res.document)
        if element_image is None:
            _log.warning(f"No image available for {kind} {counters[kind]}; skipping.")
            continue

        element_image_filename = (
            output_dir / f"{doc_filename}-{kind}-{counters[kind]}.png"
        )
        with element_image_filename.open("wb") as fp:
            element_image.save(fp, "PNG")

    # Save markdown with embedded pictures
    # md_filename = output_dir / f"{doc_filename}-with-images.md"
    # conv_res.document.save_as_markdown(md_filename, image_mode=ImageRefMode.EMBEDDED)

    # Save markdown with externally referenced pictures
    md_filename = output_dir / f"{doc_filename}-with-image-refs.md"
    conv_res.document.save_as_markdown(md_filename, image_mode=ImageRefMode.REFERENCED)

    # Duration of conversion plus all exports, in seconds.
    elapsed = time.time() - start_time

    _log.info(f"Document converted and figures exported in {elapsed:.2f} seconds.")

In [3]:
# Run the image/Markdown export defined above. Inside the notebook kernel
# __name__ is "__main__" (the log output shows "INFO:__main__"), so the guard passes.
if __name__ == "__main__":
    main()

INFO:docling.document_converter:Going to convert document batch...
INFO:docling.document_converter:Initializing pipeline for StandardPdfPipeline with options hash 3dc0cf0ba9d0223f58b4ea49c0396c77
INFO:docling.models.factories.base_factory:Loading plugin 'docling_defaults'
INFO:docling.models.factories:Registered ocr engines: ['easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
INFO:docling.utils.accelerator_utils:Accelerator device: 'cuda:0'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cuda:0'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cuda:0'
INFO:docling.models.factories.base_factory:Loading plugin 'docling_defaults'
INFO:docling.models.factories:Registered picture descriptions: ['vlm', 'api']
INFO:docling.pipeline.base_pipeline:Processing document budget_speech_2025.pdf
INFO:docling.document_converter:Finished converting document budget_speech_2025.pdf in 246.25 sec.
INFO:__main__:Document converted and figures exported in 250.11 seconds.


#### Another custom conversion approach: convert PDF to Markdown without storing images

In [1]:
import json
import logging
import time
from pathlib import Path

In [2]:
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    PdfPipelineOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.models.ocr_mac_model import OcrMacOptions
from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
from docling.models.tesseract_ocr_model import TesseractOcrOptions

In [3]:
# Module-level logger; main() uses it to report how long the conversion took.
_log = logging.getLogger(__name__)

In [4]:
def main():
    """Convert the budget-speech PDF with Docling and export text and Markdown.

    The converter runs with OCR and table-structure recognition (with cell
    matching) enabled, using 4 threads and automatic accelerator selection.
    The conversion time is logged, then ``<stem>.txt`` and ``<stem>.md`` are
    written to a ``scratch`` directory relative to the current working
    directory (created if missing). No images are stored.
    """
    logging.basicConfig(level=logging.INFO)

    pdf_path = Path("/home/qb-admin/Desktop/ap/budget_speech_2025.pdf")

    # ------------------------------------------------------------------
    # Pipeline configuration.
    #
    # Active setup: default PDF backend with OCR enabled (the notebook labels
    # this variant "Docling Parse with EasyOCR").
    #
    # Alternatives that can be swapped in to compare the output:
    #   * PyPdfium with EasyOCR - same options, but build the format option as
    #       PdfFormatOption(pipeline_options=..., backend=PyPdfiumDocumentBackend)
    #   * Docling Parse without OCR - same options with do_ocr = False
    #   * A specific OCR language - e.g. ocr_options.lang = ["es"]
    # ------------------------------------------------------------------
    options = PdfPipelineOptions()
    options.do_ocr = True
    options.do_table_structure = True
    options.table_structure_options.do_cell_matching = True
    options.accelerator_options = AcceleratorOptions(
        num_threads=4,
        device=AcceleratorDevice.AUTO,
    )

    pdf_format = PdfFormatOption(pipeline_options=options)
    converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format})

    # Time only the conversion itself; the exports below are not included.
    started = time.time()
    conv_result = converter.convert(pdf_path)
    elapsed = time.time() - started

    _log.info(f"Document converted in {elapsed:.2f} seconds.")

    # ------------------------------------------------------------------
    # Export results.
    #
    # Further formats can be added the same way, e.g.:
    #   * Deep Search JSON: json.dumps(document.export_to_dict()) -> "<stem>.json"
    #   * Document tags:    document.export_to_document_tokens()  -> "<stem>.doctags"
    # ------------------------------------------------------------------
    scratch_dir = Path("scratch")
    scratch_dir.mkdir(parents=True, exist_ok=True)
    stem = conv_result.input.file.stem
    document = conv_result.document

    # (file suffix, exporter) pairs, written in this order: plain text, then Markdown.
    exporters = (
        (".txt", document.export_to_text),
        (".md", document.export_to_markdown),
    )
    for suffix, export in exporters:
        target = scratch_dir / f"{stem}{suffix}"
        with target.open("w", encoding="utf-8") as fp:
            fp.write(export())

In [5]:
# Run the text/Markdown export defined above. Inside the notebook kernel
# __name__ is "__main__" (the log output shows "INFO:__main__"), so the guard passes.
if __name__ == "__main__":
    main()

INFO:docling.document_converter:Going to convert document batch...
INFO:docling.document_converter:Initializing pipeline for StandardPdfPipeline with options hash b6c066811298ccb7b6ca58fb8cfe4c84
INFO:docling.models.factories.base_factory:Loading plugin 'docling_defaults'
INFO:docling.models.factories:Registered ocr engines: ['easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
INFO:docling.utils.accelerator_utils:Accelerator device: 'cuda:0'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cuda:0'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cuda:0'
INFO:docling.models.factories.base_factory:Loading plugin 'docling_defaults'
INFO:docling.models.factories:Registered picture descriptions: ['vlm', 'api']
INFO:docling.pipeline.base_pipeline:Processing document budget_speech_2025.pdf
INFO:docling.document_converter:Finished converting document budget_speech_2025.pdf in 46.40 sec.
INFO:__main__:Document converted in 46.40 seconds.
