<a href="https://colab.research.google.com/github/melrahmtz/purple-box/blob/main/hands-on-practice/multiformat_conversion_docling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
import logging
import time
import warnings

# Configure root logging at INFO and create the logger that later cells use
# for their progress messages.
logging.basicConfig(level=logging.INFO)
_log = logging.getLogger(__name__)

# Hide every warning for the rest of the session. Comment this out while
# debugging, because it hides deprecation notices as well.
warnings.filterwarnings(action="ignore")

In [None]:
import torch

# Choose the accelerator backend. CUDA is probed first; MPS is only probed
# when CUDA is unavailable. The notebook deliberately stops here when neither
# accelerator is present.
_backend = (
    "cuda"
    if torch.cuda.is_available()
    else ("mps" if torch.backends.mps.is_available() else None)
)

if _backend is None:
    raise EnvironmentError(
        "No GPU or MPS device found. Please check your environment and ensure GPU or MPS support is configured."
    )

device = torch.device(_backend)
if _backend == "cuda":
    print(f"CUDA GPU is enabled: {torch.cuda.get_device_name(0)}")
else:
    print("MPS GPU is enabled.")

# Install Dependencies

In [2]:
!pip install llama-index>=0.12.8 llama-index-core>=0.12.8
!pip install llama-index-node-parser-docling>=0.3.0 llama-index-readers-docling>=0.3.0
!pip install pypdf2>=3.0.1
!pip install easyocr>=1.7.2

#!pip install ipykernel>=6.29.5
# !pip install llama-index-embeddings-ollama>=0.5.0 llama-index-embeddings-huggingface>=0.4.0
# !pip install llama-index-llms-huggingface-api>=0.3.0 llama-index-llms-ollama>=0.5.0
# !pip install llama-index-readers-file>=0.4.1
# !pip install llama-index-vector-stores-milvus>=0.4.0
# !pip install python-dotenv>=1.0.1
# !pip install rich>=13.9.4
# !pip install pillow>=10.4.0
# !pip install pyarrow>=18.1.0
# !pip install fastparquet>=2024.11.0
# !pip install datasets>=3.2.0
# #!pip install ocrmac>=1.0.0
# !pip install matplotlib>=3.10.0
# !pip install toml>=0.10.2

# Get Data

In [18]:
# English Files
!mkdir -p data

# !mv 17.pdf data/17.pdf
# !mv 2014_monarch_plus_service_manual.pdf data/2014_monarch_plus_service_manual.pdf
# !mv 2024_11_05_Ferrari_Q3_2024_Results_Press_Release.pdf data/2024_11_05_Ferrari_Q3_2024_Results_Press_Release.pdf
# !mv 231161_OperationsMaintenanceManual.docx data/231161_OperationsMaintenanceManual.docx
# !mv PDF1.pdf data/PDF1.pdf
# !mv SUPO-744_REV_A.pdf data/SUPO-744_REV_A.pdf
# !mv VVS005s_030s_AHU_EN.pdf data/VVS005s_030s_AHU_EN.pdf
# !mv ai-in-america-oai-economic-blueprint-20250113.pdf data/ai-in-america-oai-economic-blueprint-20250113.pdf
# !mv creatingsystem.pdf data/creatingsystem.pdf
# !mv Ducati\ Panigale\ V4\ Engine.txt data/Ducati\ Panigale\ V4\ Engine.txt
# !mv monarch_exploded_view.png data/monarch_exploded_view.png
!mv image1.png data/image1.png
!mv Test-OCR-Handwritten.jpg data/Test-OCR-Handwritten.jpg

# Docling Conversion

In [13]:
import time
from docling.datamodel.base_models import InputFormat
from docling_core.types.doc import ImageRefMode
from docling.document_converter import DocumentConverter, PdfFormatOption, WordFormatOption
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend  # Import the backend
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline # Import the Pipeline
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode, EasyOcrOptions, TesseractOcrOptions, OcrMacOptions
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.datamodel.settings import settings

# Scale factor applied to rendered page/picture images (1.0 is the native size).
IMAGE_RESOLUTION_SCALE = 2.0

def create_pipeline_options(input_format):
    """Build the per-format option object for the document converter.

    Args:
        input_format: An ``InputFormat`` member.

    Returns:
        A ``PdfFormatOption`` for PDF, a ``WordFormatOption`` for DOCX, and
        ``None`` for IMAGE, HTML, PPTX, ASCIIDOC and MD, meaning no custom
        options are supplied for that format.

    Raises:
        ValueError: If ``input_format`` is none of the formats listed above.
    """
    if input_format == InputFormat.PDF:
        # OCR, table and image settings are fields of PdfPipelineOptions, not
        # of PdfFormatOption. They are set on a pipeline-options object and
        # handed over through `pipeline_options=` so they actually take effect.
        pipeline_options = PdfPipelineOptions()
        pipeline_options.do_ocr = True
        pipeline_options.ocr_options = EasyOcrOptions(
            force_full_page_ocr=True, lang=["en"]
        )
        pipeline_options.do_table_structure = True
        pipeline_options.table_structure_options.do_cell_matching = True
        pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
        pipeline_options.generate_page_images = True
        pipeline_options.generate_picture_images = True
        pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE

        return PdfFormatOption(
            pipeline_cls=StandardPdfPipeline,
            backend=PyPdfiumDocumentBackend,
            pipeline_options=pipeline_options,
        )

    if input_format == InputFormat.DOCX:
        return WordFormatOption(pipeline_cls=SimplePipeline)

    # Formats that are accepted but have no custom configuration yet.
    if input_format in (
        InputFormat.IMAGE,
        InputFormat.HTML,
        InputFormat.PPTX,
        InputFormat.ASCIIDOC,
        InputFormat.MD,
    ):
        return None

    raise ValueError(f"Unsupported input format: {input_format}")

In [14]:
from docling.document_converter import DocumentConverter

def initialize_converter():
    """Create a ``DocumentConverter`` that accepts all formats used here.

    Formats for which ``create_pipeline_options`` returns ``None`` are still
    listed in ``allowed_formats`` but are left out of ``format_options``.

    Returns:
        The configured ``DocumentConverter`` instance.
    """
    allowed_formats = [
        InputFormat.PDF,
        InputFormat.IMAGE,
        InputFormat.DOCX,
        InputFormat.HTML,
        InputFormat.PPTX,
        InputFormat.ASCIIDOC,
        InputFormat.MD,
    ]

    # Build each option object exactly once; the previous comprehension called
    # the factory twice per format and threw one of the results away.
    format_options = {}
    for input_format in allowed_formats:
        options = create_pipeline_options(input_format)
        if options is not None:
            format_options[input_format] = options

    return DocumentConverter(
        allowed_formats=allowed_formats,
        format_options=format_options,
    )


In [15]:
from pathlib import Path
import json
import yaml

def convert_and_save(input_paths, output_dir, image_mode=ImageRefMode.REFERENCED):
    """Convert documents and write Markdown, JSON and YAML exports for each.

    Args:
        input_paths: Document paths passed to ``DocumentConverter.convert_all``.
        output_dir: Directory that receives the exports; created together with
            any missing parents.
        image_mode: ``ImageRefMode`` forwarded to ``save_as_markdown``.

    Returns:
        None. For every conversion result, ``<stem>.md``, ``<stem>.json`` and
        ``<stem>.yaml`` are written to ``output_dir``, where ``<stem>`` is the
        input file name without its extension. Inputs that share a stem
        overwrite each other's exports.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    doc_converter = initialize_converter()

    for res in doc_converter.convert_all(input_paths):
        file_name = res.input.file.stem
        markdown_path = output_dir / f"{file_name}.md"
        json_path = output_dir / f"{file_name}.json"
        yaml_path = output_dir / f"{file_name}.yaml"

        # Markdown export is written by the document itself.
        res.document.save_as_markdown(markdown_path, image_mode=image_mode)
        _log.info(f"Markdown content saved to {markdown_path}")

        # Serialise the document once and reuse it for both structured exports.
        doc_dict = res.document.export_to_dict()

        with json_path.open("w", encoding="utf-8") as fp:
            json.dump(doc_dict, fp, indent=4)
        _log.info(f"JSON content saved to {json_path}")

        with yaml_path.open("w", encoding="utf-8") as fp:
            yaml.safe_dump(doc_dict, fp)
        _log.info(f"YAML content saved to {yaml_path}")


In [28]:
def main():
    """Convert the currently selected sample inputs into ``output-docs``.

    Sets ``settings.debug.profile_pipeline_timings`` to True and then hands
    the selected paths to ``convert_and_save`` with referenced images.
    """
    settings.debug.profile_pipeline_timings = True

    # Only the uncommented entries are converted; the others are kept as a
    # menu of sample documents to switch on.
    selected_inputs = [
        # Path("data/17.pdf"),
        # Path("data/2024_11_05_Ferrari_Q3_2024_Results_Press_Release.pdf"),
        # Path("data/231161_OperationsMaintenanceManual.docx"),
        # Path("data/PDF1.pdf"),
        # Path("data/SUPO-744_REV_A.pdf"),
        # Path("data/ai-in-america-oai-economic-blueprint-20250113.pdf"),
        # Path("data/creatingsystem.pdf"),
        # Path("data/image1.png"),
        # Path("data/Test-OCR-Handwritten.jpg"),
        Path("/content/sample_data/README.md"),
    ]

    convert_and_save(
        input_paths=selected_inputs,
        output_dir="output-docs",
        image_mode=ImageRefMode.REFERENCED,
    )


In [29]:
# Run the conversion. Inside a notebook kernel __name__ is "__main__", so
# main() is called whenever this cell is executed.
if __name__ == "__main__":
    main()

# Nodes Extraction

In [None]:
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.core.node_parser import MarkdownNodeParser
from llama_index.readers.docling import DoclingReader

from pathlib import Path

# Reuse the converter configuration defined earlier in this notebook. The
# keyword is `doc_converter`; the previous call used a non-existent keyword
# and an undefined variable, so this cell could not run.
# DoclingReader() without arguments is the alternative if the library's
# default converter is sufficient.
reader = DoclingReader(doc_converter=initialize_converter())
node_parser = MarkdownNodeParser()

# Document to load. This is the same file that main() converts above;
# change it to any path the converter accepts.
file_path = Path("/content/sample_data/README.md")
documents = reader.load_data(file_path)

# Split the loaded documents into nodes and report how many were produced.
nodes = node_parser.get_nodes_from_documents(documents)
print(f"Number of nodes extracted: {len(nodes)}")

# Print the text and metadata of every node, numbered from 1.
for index, node in enumerate(nodes, start=1):
    print(f"\nNode {index}:")
    print("Text:")
    print(node.text)
    print("Metadata:")
    print(node.metadata)


# Embedding and Chunking

# Trial from RAW Code

In [None]:
!pip install llama-index-node-parser-docling>=0.3.0 llama-index-readers-docling>=0.3.0

In [None]:
import json
import logging
from pathlib import Path

import yaml

from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
    WordFormatOption,
)
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline

# Module-level logger; main() below uses it for debug output.
_log = logging.getLogger(__name__)

In [None]:
# English sample files: create data/ if needed, then move each document from
# the current working directory into it.
# NOTE(review): every `mv` expects its file to be present in the working
# directory (presumably uploaded beforehand; confirm). A second run of this
# cell reports "cannot stat" for files that were already moved.
!mkdir -p data

!mv 17.pdf data/17.pdf
!mv 2014_monarch_plus_service_manual.pdf data/2014_monarch_plus_service_manual.pdf
!mv 2024_11_05_Ferrari_Q3_2024_Results_Press_Release.pdf data/2024_11_05_Ferrari_Q3_2024_Results_Press_Release.pdf
!mv 231161_OperationsMaintenanceManual.docx data/231161_OperationsMaintenanceManual.docx
!mv PDF1.pdf data/PDF1.pdf
!mv SUPO-744_REV_A.pdf data/SUPO-744_REV_A.pdf
!mv VVS005s_030s_AHU_EN.pdf data/VVS005s_030s_AHU_EN.pdf
!mv ai-in-america-oai-economic-blueprint-20250113.pdf data/ai-in-america-oai-economic-blueprint-20250113.pdf
!mv creatingsystem.pdf data/creatingsystem.pdf
!mv Ducati\ Panigale\ V4\ Engine.txt data/Ducati\ Panigale\ V4\ Engine.txt
!mv monarch_exploded_view.png data/monarch_exploded_view.png

In [10]:
def main():
    """Convert the listed sample documents and export each into ``scratch``.

    For every conversion result, ``<stem>.md``, ``<stem>.json`` and
    ``<stem>.yaml`` are written into the ``scratch`` directory, where
    ``<stem>`` is the input file name without its extension. A confirmation
    is printed once a document's files have been written.

    Note: this definition rebinds the name ``main`` that an earlier cell of
    this notebook also defines.
    """
    input_paths = [
        # Path("README.md"),
        Path("data/17.pdf"),
        # Path("data/2014_monarch_plus_service_manual.pdf"),
        Path("data/2024_11_05_Ferrari_Q3_2024_Results_Press_Release.pdf"),
        Path("data/231161_OperationsMaintenanceManual.docx"),
        Path("data/PDF1.pdf"),
        Path("data/SUPO-744_REV_A.pdf"),
        # Path("data/VVS005s_030s_AHU_EN.pdf"),
        Path("data/ai-in-america-oai-economic-blueprint-20250113.pdf"),
        Path("data/creatingsystem.pdf"),
        # Path("data/Ducati Panigale V4 Engine.txt"),
        Path("data/monarch_exploded_view.png"),
        # Path("tests/data/pptx/powerpoint_sample.pptx"),
    ]

    doc_converter = (
        DocumentConverter(  # all of the below is optional, has internal defaults.
            allowed_formats=[
                InputFormat.PDF,
                InputFormat.IMAGE,
                InputFormat.DOCX,
                InputFormat.HTML,
                InputFormat.PPTX,
                InputFormat.ASCIIDOC,
                InputFormat.MD,
            ],  # whitelist formats, non-matching files are ignored.
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend
                ),
                InputFormat.DOCX: WordFormatOption(
                    pipeline_cls=SimplePipeline  # , backend=MsWordDocumentBackend
                ),
            },
        )
    )

    # The output directory does not depend on the document, so create it once.
    out_path = Path("scratch")
    out_path.mkdir(parents=True, exist_ok=True)

    conv_results = doc_converter.convert_all(input_paths)

    for res in conv_results:
        # NOTE(review): _export_to_indented_text is a private method of the
        # document object and may change between library versions.
        _log.debug(res.document._export_to_indented_text(max_text_len=16))

        stem = res.input.file.stem
        # Serialise the document once and reuse it for the JSON and YAML files.
        doc_dict = res.document.export_to_dict()

        # An explicit UTF-8 encoding keeps the writes independent of the
        # platform's default locale; Markdown text may contain non-ASCII
        # characters.
        with (out_path / f"{stem}.md").open("w", encoding="utf-8") as fp:
            fp.write(res.document.export_to_markdown())

        with (out_path / f"{stem}.json").open("w", encoding="utf-8") as fp:
            fp.write(json.dumps(doc_dict))

        with (out_path / f"{stem}.yaml").open("w", encoding="utf-8") as fp:
            fp.write(yaml.safe_dump(doc_dict))

        # Report success only after all three files have been written.
        print(
            f"Document {res.input.file.name} converted."
            f"\nSaved markdown output to: {str(out_path)}"
        )

In [11]:
# Run the most recently defined main(). Inside a notebook kernel __name__ is
# "__main__", so the call happens whenever this cell is executed.
if __name__ == "__main__":
    main()

Document 17.pdf converted.
Saved markdown output to: scratch
Document 2024_11_05_Ferrari_Q3_2024_Results_Press_Release.pdf converted.
Saved markdown output to: scratch
Document 231161_OperationsMaintenanceManual.docx converted.
Saved markdown output to: scratch
Document PDF1.pdf converted.
Saved markdown output to: scratch
Document SUPO-744_REV_A.pdf converted.
Saved markdown output to: scratch
Document ai-in-america-oai-economic-blueprint-20250113.pdf converted.
Saved markdown output to: scratch
Document creatingsystem.pdf converted.
Saved markdown output to: scratch
Document monarch_exploded_view.png converted.
Saved markdown output to: scratch
