In [19]:
import json
import logging
from pathlib import Path
from typing import Tuple, List

import yaml
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
    WordFormatOption,
)
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.document import TableItem, PictureItem

In [20]:
import json
import logging
from pathlib import Path
from typing import Tuple, List

import yaml
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
    WordFormatOption,
)
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
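# NOTE: PdfPipelineOptions, TableItem and PictureItem are used by later cells but are NOT
# imported in this cell; they only resolve if the preceding import cell (which imports them
# from docling.datamodel.pipeline_options and docling.datamodel.document) has been run.
# The two import lines below are disabled.
#from docling.pipeline.pipeline_options import PdfPipelineOptions
#from docling.types.doc import ImageRefMode, PictureItem, TableItem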

In [21]:


def get_document_files(input_dir: Path) -> list[Path]:
    """
    Recursively scan a directory for supported document files.

    Args:
        input_dir: Root directory to walk; every nested sub-directory is
            visited.

    Returns:
        Paths of regular files whose suffix, compared case-insensitively,
        is ``.pdf``, ``.docx`` or ``.pptx``. The list is sorted so repeated
        runs handle the documents in the same order; an empty list is
        returned when nothing matches.
    """
    supported_extensions = {".pdf", ".docx", ".pptx"}

    # rglob yields entries in whatever order the filesystem reports them;
    # sorting pins the order so runs are reproducible.
    return sorted(
        path
        for path in input_dir.rglob("*")
        # is_file() filters out directories that merely look like documents.
        if path.is_file() and path.suffix.lower() in supported_extensions
    )

In [22]:


def save_images(res, output_subdir: Path, doc_filename: str) -> List[Tuple[str, Path]]:
    """
    Write the page, table and figure images of a converted document as PNG files.

    Files are placed in ``output_subdir / "images"`` (created on demand) and named
    ``<doc_filename>-page-<page_no>.png``, ``<doc_filename>-table-<n>.png`` and
    ``<doc_filename>-figure-<n>.png``. Table and figure numbers start at 1 and
    advance only for items that actually carry an image.

    Args:
        res: Conversion result whose ``document`` exposes ``pages`` (a mapping of
            page number to page) and ``iterate_items()``.
        output_subdir: Directory under which the ``images`` folder is created.
        doc_filename: Prefix used for every image file name.

    Returns:
        ``(image_type, image_path)`` tuples in the order the files were written,
        where ``image_type`` is ``'page'``, ``'table'`` or ``'figure'``.
    """
    target_dir = output_subdir / "images"
    target_dir.mkdir(parents=True, exist_ok=True)
    written: List[Tuple[str, Path]] = []

    def carries_image(obj) -> bool:
        # Guards against both a missing `image` attribute and an empty one.
        return bool(hasattr(obj, 'image') and obj.image)

    def store(kind: str, label, holder) -> None:
        # The kind doubles as the file-name infix and as the tag in the result list.
        destination = target_dir / f"{doc_filename}-{kind}-{label}.png"
        with destination.open("wb") as stream:
            holder.image.pil_image.save(stream, format="PNG")
        written.append((kind, destination))

    # Whole-page images, labelled by their page number.
    for page_no, page in res.document.pages.items():
        if carries_image(page):
            store('page', page_no, page)

    # Tables and figures are numbered independently of each other.
    numbering = {'table': 0, 'figure': 0}
    for item, _depth in res.document.iterate_items():
        # Both type checks are applied to every item, table first.
        for item_type, kind in ((TableItem, 'table'), (PictureItem, 'figure')):
            if isinstance(item, item_type) and carries_image(item):
                numbering[kind] += 1
                store(kind, numbering[kind], item)

    return written

def main(input_dir: Path = Path("data/input"), output_dir: Path = Path("data/output")):
    """
    Convert every supported document under ``input_dir`` and write the results to ``output_dir``.

    For each converted document a Markdown file ``<stem>.md`` is written to the
    sub-directory of ``output_dir`` that mirrors the document's location below
    ``input_dir``, and ``save_images`` is called to store the document's images
    alongside it. Progress and a final image count are printed.

    Args:
        input_dir: Directory scanned recursively by ``get_document_files``.
            Defaults to ``data/input`` relative to the current working directory.
        output_dir: Root directory for generated files; created if missing.
            Defaults to ``data/output``.

    Returns:
        None. Returns early, after printing a notice, when no documents are found.
    """
    # Accept plain strings as well as Path objects.
    input_dir = Path(input_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Get all document files recursively
    input_paths = get_document_files(input_dir)

    if not input_paths:
        print("No documents found in DATA directory!")
        return

    print(f"Found {len(input_paths)} documents to process:")
    for path in input_paths:
        print(f"- {path}")

    # Configure pipeline options
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = False  # Disable OCR
    pipeline_options.images_scale = 2.0  # Set image resolution
    # Only picture images are requested; page and table image generation is switched off.
    pipeline_options.generate_page_images = False
    pipeline_options.generate_table_images = False
    pipeline_options.generate_picture_images = True

    # Configure document converter. PPTX is allowed but gets no explicit
    # format option here, so the converter's own handling applies to it.
    doc_converter = DocumentConverter(
        allowed_formats=[
            InputFormat.PDF,
            InputFormat.DOCX,
            InputFormat.PPTX,
        ],
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=StandardPdfPipeline,
                backend=PyPdfiumDocumentBackend,
                pipeline_options=pipeline_options
            ),
            InputFormat.DOCX: WordFormatOption(pipeline_cls=SimplePipeline),
        },
    )

    # Process all documents
    conv_results = doc_converter.convert_all(input_paths)

    # Store all extracted images for later processing
    all_extracted_images = []

    # Save results
    for res in conv_results:
        source_file = res.input.file
        stem = source_file.stem

        # Preserve directory structure in output. relative_to() requires that
        # the converter reports the file path in the same form as input_dir.
        relative_path = source_file.relative_to(input_dir)
        output_subdir = output_dir / relative_path.parent
        output_subdir.mkdir(parents=True, exist_ok=True)

        md_path = output_subdir / f"{stem}.md"

        print(f"Converting: {source_file}\nSaving to: {md_path}")

        # Extract and save images
        all_extracted_images.extend(save_images(res, output_subdir, stem))

        # Save the Markdown export. export_to_markdown() is called with its
        # defaults, so no image-embedding mode is requested here; the image
        # files themselves are written separately by save_images above.
        with md_path.open("w", encoding="utf-8") as fp:
            fp.write(res.document.export_to_markdown())

    print(f"\nExtracted {len(all_extracted_images)} images in total")
    print("Ready for image captioning processing")

    # TODO: hook in vision-language-model captioning, e.g.
    # for image_type, image_path in all_extracted_images:
    #     caption = vision_model.generate_caption(image_path)
    #     # Store or process caption as needed

# Entry point: runs the full conversion when this code is executed directly
# (the captured output below shows main() running from this cell).
if __name__ == "__main__":
    main()


Found 3 documents to process:
- data/input/Employee-Handbook (1).pdf
- data/input/2023021721-1 (2).pdf
- data/input/Blue_Book_Rev_2024-0926 (2).pdf


Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

Converting: data/input/Employee-Handbook (1).pdf
Saving to: data/output/Employee-Handbook (1).md
Converting: data/input/2023021721-1 (2).pdf
Saving to: data/output/2023021721-1 (2).md


2024-11-18 11:27:06.423 (1072.170s) [          26AEE1]    doc_normalisation.h:448   WARN| found new `other` type: checkbox-unselected
2024-11-18 11:27:06.423 (1072.170s) [          26AEE1]    doc_normalisation.h:448   WARN| found new `other` type: checkbox-unselected
2024-11-18 11:27:06.423 (1072.170s) [          26AEE1]    doc_normalisation.h:448   WARN| found new `other` type: checkbox-unselected
2024-11-18 11:27:06.423 (1072.170s) [          26AEE1]    doc_normalisation.h:448   WARN| found new `other` type: checkbox-unselected
2024-11-18 11:27:06.423 (1072.170s) [          26AEE1]    doc_normalisation.h:448   WARN| found new `other` type: checkbox-unselected
2024-11-18 11:27:06.423 (1072.170s) [          26AEE1]    doc_normalisation.h:448   WARN| found new `other` type: checkbox-unselected
2024-11-18 11:27:06.423 (1072.170s) [          26AEE1]    doc_normalisation.h:448   WARN| found new `other` type: checkbox-unselected
2024-11-18 11:27:06.423 (1072.170s) [          26AEE1]    doc_

Converting: data/input/Blue_Book_Rev_2024-0926 (2).pdf
Saving to: data/output/Blue_Book_Rev_2024-0926 (2).md

Extracted 135 images in total
Ready for image captioning processing
