In [None]:
from logging import INFO, basicConfig, getLogger
from pathlib import Path
from time import perf_counter, time

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

In [None]:
# Logger for this notebook/module, named after ``__name__``; used by the
# conversion cell to report how long the conversion took.
_log = getLogger(__name__)

In [None]:
# Convert one PDF with Docling, log how long the conversion took, and write
# the converted document to ``scratch/<input stem>.md`` as Markdown.

# Root logger at INFO so the timing message logged below is emitted.
basicConfig(level=INFO)

# Placeholder: replace with the path of the PDF to convert.
input_doc_path = Path('PATH_TO_DOC')

# PDF pipeline settings: OCR disabled, table-structure processing enabled
# with cell matching turned on.
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True

# Apply the options above to PDF inputs only.
doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)

# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by system clock adjustments the way time() can.
start_time = perf_counter()
conv_result = doc_converter.convert(input_doc_path)
# NOTE: despite its name, ``end_time`` holds the elapsed duration in seconds;
# the name is kept so that any later cell reading it keeps working.
end_time = perf_counter() - start_time

# Lazy %-style arguments: the message is only formatted if the record is
# actually emitted. The rendered text is unchanged.
_log.info("Document converted in %.2f seconds.", end_time)

# Create the output directory if needed; an existing directory is fine.
output_dir = Path("scratch")
output_dir.mkdir(parents=True, exist_ok=True)
# Output file is named after the input file's stem.
doc_filename = conv_result.input.file.stem

# Write the Markdown export as UTF-8 (overwrites any existing file).
(output_dir / f"{doc_filename}.md").write_text(
    conv_result.document.export_to_markdown(), encoding="utf-8"
)