In [3]:
from langchain_core.documents import Document
from langchain_docling import DoclingLoader
from langchain_docling.loader import ExportType

# Docling Core Imports for Configuration
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    TableFormerMode,
)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def create_complex_pdf_loader(file_path: str) -> DoclingLoader:
    """
    Build a DoclingLoader configured for PDFs with complex layouts.

    The loader wraps a DocumentConverter whose PDF pipeline:
    - enables OCR for scanned pages/images,
    - enables table structure recognition in 'ACCURATE' mode,
    and it is told to export each document as Markdown.

    Args:
        file_path: Path of the PDF to load; handed to DoclingLoader unchanged.

    Returns:
        The configured DoclingLoader. Call ``load()`` on it to obtain the
        documents.
    """
    # 1. Configure the PDF pipeline.
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True  # OCR for scanned documents
    pipeline_options.do_table_structure = True  # table structure parsing

    # 'ACCURATE' is the slower table mode but copes better with merged
    # cells and complex grids.
    pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE

    # 2. Create the converter, applying these options to PDF input only.
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    # 3. Initialize the LangChain loader.
    # export_type must be passed explicitly: when it is omitted DoclingLoader
    # falls back to ExportType.DOC_CHUNKS and returns chunked documents rather
    # than Markdown. Markdown keeps table structure in a form LLMs read well.
    return DoclingLoader(
        file_path=file_path,
        converter=converter,
        export_type=ExportType.MARKDOWN,
    )

In [8]:
# --- Usage Example ---
# Loads one PDF through create_complex_pdf_loader and writes the combined
# text of all returned documents to a local Markdown file.
if __name__ == "__main__":
    # NOTE(review): machine-specific absolute path; it will not exist on
    # another machine, so point it at your own PDF before running.
    pdf_path = r"C:\Users\MahadevaA\OneDrive - CXIO Technologies Pvt Ltd\Documents\GitHub\ReSearch\PARSING-TECHNIQUES\CRPL-1N60001074-CADPO110494.pdf"  # Replace with your file

    # Initialize the loader and load the documents.
    loader = create_complex_pdf_loader(pdf_path)
    docs = loader.load()

    # Define output filename (relative to the current working directory).
    output_filename = "extracted_document.md"

    # Combine all document content into one string. Each document's content
    # is preceded by four newlines as a visual separator; no page number or
    # other marker text is inserted.
    full_text = "".join("\n\n\n\n" + doc.page_content for doc in docs)

    # Save to a local file; UTF-8 is explicit so the result does not depend
    # on the platform's default encoding.
    with open(output_filename, "w", encoding="utf-8") as f:
        f.write(full_text)

    print(f"✅ Extracted content saved to: {output_filename}")

2025-11-28 08:24:54,573 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-28 08:24:54,672 - INFO - Going to convert document batch...
2025-11-28 08:24:54,675 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 44ae89a68fc272bc7889292e9b5a1bad
2025-11-28 08:24:54,891 - INFO - Loading plugin 'docling_defaults'
2025-11-28 08:24:54,896 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-11-28 08:24:54,954 - INFO - Loading plugin 'docling_defaults'
2025-11-28 08:24:54,962 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2025-11-28 08:24:54,966 - INFO - rapidocr cannot be used because onnxruntime is not installed.
2025-11-28 08:24:54,966 - INFO - easyocr cannot be used because it is not installed.
2025-11-28 08:24:56,954 - INFO - Accelerator device: 'cpu'
[32m[INFO] 2025-11-28 08:24:56,975 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2025-11-28 08:24:56,989 [RapidOCR] download_f

✅ Extracted content saved to: extracted_document.md
