### Docling Document Conversion Example

This example demonstrates how to convert a PDF file into a Docling Document and explores other Docling features, such as exporting to different formats and chunking.

In [None]:
# install required libraries
%pip install -q docling pdfplumber

In [None]:
# import docling and python libs into the notebook
try:
    import mimetypes as mt
    from pathlib import Path
    # docling
    from docling_core.types.doc import DoclingDocument
    from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
    from docling.document_converter import DocumentConverter, PdfFormatOption
    from docling.datamodel.base_models import InputFormat, DocumentStream
    from docling.datamodel.pipeline_options import PdfPipelineOptions
    from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
    from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend      
    from docling.chunking import HybridChunker
except Exception as e:
    print(f"Caught fatal exception: {e}")

#### Declare some helper functions to handle file conversion and other features

In the following cell we define some helper functions that will handle the document load/conversion process

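- `createDoclingConverter`: function that returns a `DocumentConverter` instance configured for PDF conversion (HTML, Markdown, DOCX and XLSX inputs are also allowed).
- `prepareDocuments`: function that takes a converter and a list of file descriptors (dicts with a `name` key holding the file path) and returns a list of dicts, each holding the conversion result under `doc` and its `metadata`.
- `chunkFiles`: function that takes the list returned by `prepareDocuments` and returns a single flat list of chunk dicts (content plus metadata) covering all documents.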

In [None]:
def createDoclingConverter(
    do_ocr: bool = False,
    do_table_structure: bool = True,
    pdf_backend: type[PyPdfiumDocumentBackend | DoclingParseV4DocumentBackend] = PyPdfiumDocumentBackend,
    threads: int = 4,
) -> DocumentConverter:
    """Build a DocumentConverter with a configurable PDF pipeline.

    The converter accepts PDF, HTML, Markdown, DOCX and XLSX inputs; only the
    PDF format gets custom options here.

    Args:
        do_ocr: Value assigned to the PDF pipeline's ``do_ocr`` option.
        do_table_structure: Value assigned to the PDF pipeline's
            ``do_table_structure`` option.
        pdf_backend: Backend *class* (not an instance) handed to
            ``PdfFormatOption`` as ``backend``.
        threads: Thread count passed to ``AcceleratorOptions``.

    Returns:
        The configured ``DocumentConverter``.
    """
    # PDF pipeline settings; the accelerator device is left on AUTO.
    pdf_options = PdfPipelineOptions()
    pdf_options.do_ocr = do_ocr
    pdf_options.do_table_structure = do_table_structure
    pdf_options.accelerator_options = AcceleratorOptions(
        num_threads=threads,
        device=AcceleratorDevice.AUTO,
    )

    # Only PDF gets explicit format options; the other allowed formats rely on
    # whatever DocumentConverter does when no option is supplied.
    converter = DocumentConverter(
        allowed_formats=[
            InputFormat.PDF,
            InputFormat.HTML,
            InputFormat.MD,
            InputFormat.DOCX,
            InputFormat.XLSX,
        ],
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pdf_options,
                backend=pdf_backend,
            )
        },
    )

    return converter

In [None]:
def prepareDocuments(converter: DocumentConverter, uploaded_files: list) -> list:
    """Convert each uploaded file and pair the result with basic metadata.

    Args:
        converter: Object whose ``convert(source=...)`` method performs the
            conversion.
        uploaded_files: List of dicts; each dict's ``"name"`` entry is used as
            the conversion source.

    Returns:
        One dict per input file, in input order, with the conversion result
        under ``"doc"`` and a ``"metadata"`` dict holding the stringified
        name, the stringified guessed MIME type and a positional
        ``document_id``.
    """

    def _convert_entry(position, entry):
        # Convert first, then derive the metadata from the same source name.
        source = entry.get("name")
        converted = converter.convert(source=source)
        guessed_type, _encoding = mt.guess_type(source)
        return {
            "doc": converted,
            "metadata": {
                "name": str(source),
                # str() on purpose: an unknown type is stored as the text "None"
                "mimetype": str(guessed_type),
                "document_id": f"document_id_{position}",
            },
        }

    return [
        _convert_entry(position, entry)
        for position, entry in enumerate(uploaded_files)
    ]

In [None]:
def chunkFiles(converted_docs: list) -> list:
    """Chunk every converted document and return one flat list of chunks.

    Args:
        converted_docs: List of dicts whose ``"doc"`` entry exposes the
            Docling document as ``.document`` (the shape produced by
            ``prepareDocuments``).

    Returns:
        A list with one dict per chunk, across all documents, holding the
        contextualized ``"content"``, the ``"mime_type"``, the origin
        ``"metadata"`` and the ``"chunk_metadata"`` identifiers. An empty
        input yields an empty list.
    """
    # Nothing to chunk: return before building a chunker.
    if not converted_docs:
        return []

    # One chunker is enough for the whole batch; every document is chunked
    # exactly once.
    chunker = HybridChunker()
    docs = []

    for entry in converted_docs:
        for chunk_index, chunk in enumerate(chunker.chunk(dl_doc=entry["doc"].document)):
            origin = chunk.meta.origin

            # contextualize chunk for content storage
            chunk_content = chunker.contextualize(chunk=chunk)

            metadata = {
                "name": origin.filename,
                "uri": origin.uri,
                "headings": chunk.meta.headings,
                "captions": chunk.meta.captions,
                "mimetype": origin.mimetype,
            }

            # Identifiers are derived from the origin file name and its hash.
            document_id = f"{origin.filename}_{origin.binary_hash}"
            chunk_metadata = {
                "document_id": document_id,
                "chunk_id": f"{document_id}_chunk_{chunk_index}",
                # Prefer the origin URI when there is one, else the file name.
                # The key is "uri"; looking up "url" never matched anything.
                "source": metadata.get("uri") or metadata.get("name"),
            }

            docs.append({
                "content": chunk_content,
                "mime_type": origin.mimetype,
                "metadata": metadata,
                "chunk_metadata": chunk_metadata,
            })

    return docs

#### Load and Prepare a document with Docling

Let's load a PDF file and process it with Docling

In [None]:
# Input PDF, given as a relative path (resolved against the working directory).
pdf_file = Path("./").parent.joinpath("rfc2104.pdf")
print(f"Working on file {pdf_file}")

# Upload-style descriptor: one dict per file to be processed.
pdf_entry = {"name": pdf_file, "content_type": "application/pdf"}
uploaded_files = [pdf_entry]

In [None]:
# Build the converter, then convert the files listed in `uploaded_files`.
try:
    doclingConverter = createDoclingConverter()

    # prepare documents
    docs = prepareDocuments(doclingConverter, uploaded_files)
except Exception as e:
    # Later cells read `docs`. Re-raise so a failed conversion stops here with
    # the real error instead of surfacing afterwards as an undefined name.
    print(f"{e}")
    raise

In [None]:
# Show the stored metadata and the source file stem of each converted document.
for d in docs:
    doc_metadata = d.get('metadata')
    doc_stem = d.get('doc').input.file.stem
    print(f"Document Metadata: {doc_metadata}\nDoc Filename: {doc_stem}")
    

#### Convert File into different Formats

Now that the document has been converted to Docling Format, we can export it in different output formats.

For that purpose, we define a function:

- `exportToFormat`: takes the converted document, the desired output format and an optional output directory, and writes the exported file to that directory.

Supported formats:

- JSON
- MD
- TXT
- DOCTAGS (pass `doctag` as the format name; the file is written with a `.doctags` extension)

In [None]:
# define a conversion function
def exportToFormat(doc: DoclingDocument,
                   output_format: str = "json",
                   output_dir: str = "conversion_output") -> None:
    """Export a converted document to a file in the requested format.

    The output file is named after the stem of ``doc.input.file`` and written
    as UTF-8 into ``output_dir``, which is created if missing.

    NOTE(review): the annotation says ``DoclingDocument``, but the body reads
    ``doc.input`` and ``doc.document``, so the argument is expected to be the
    object returned by the converter's ``convert`` call - confirm and adjust
    the annotation.

    Args:
        doc: Converted document exposing ``input.file`` and ``document``.
        output_format: One of ``"json"``, ``"md"``, ``"txt"`` or ``"doctag"``.
        output_dir: Directory (relative or absolute) receiving the file.

    Raises:
        ValueError: If ``output_format`` is not supported. Nothing is created
            on disk in that case.
        OSError: If the directory or the file cannot be written.
    """
    import json

    # format name -> (file suffix, function producing the text to write)
    exporters = {
        "json": (".json", lambda: json.dumps(doc.document.export_to_dict())),
        "txt": (".txt", lambda: doc.document.export_to_text()),
        "md": (".md", lambda: doc.document.export_to_markdown()),
        "doctag": (".doctags", lambda: doc.document.export_to_doctags()),
    }

    # Validate before touching the filesystem so a bad format has no side effects.
    if output_format not in exporters:
        raise ValueError(f"{output_format}: Unsupported Conversion Format")
    suffix, render = exporters[output_format]

    # output file root
    outfile = doc.input.file.stem

    # Directory errors propagate: without the directory the write below
    # could not succeed anyway.
    target_dir = Path("./").parent / output_dir
    target_dir.mkdir(parents=True, exist_ok=True)

    # Render first, then write, so a failed export does not leave an empty file.
    content = render()
    (target_dir / f"{outfile}{suffix}").write_text(content, encoding="utf-8")

In [None]:
# Write every converted document once per target format.
out_formats = ["json", "md", "txt", "doctag"]
try:
    for fmt in out_formats:
        for doc in docs:
            # the conversion result sits under the "doc" key of each entry
            d = doc.get('doc')
            source_stem = d.input.file.stem
            print(f"Exporting {source_stem} to {fmt}...")
            exportToFormat(d, output_format=fmt, output_dir="converted_files")
except Exception as e:
    # the first failure ends the loop; remaining exports are skipped
    print(f"Conversion Error: {e}")


#### Explore Chunking Capabilities

Use the HybridChunker() class from docling to successfully chunk text for further processing (e.g. RAG ingestion)

In [None]:
import pprint

# Chunk all converted documents into one flat list.
chunked_docs = chunkFiles(docs)

# Report how many chunks came back, then dump them for inspection.
chunk_total = len(chunked_docs)
print(f"Generated {chunk_total} chunks")
pprint.pprint(chunked_docs)