# Docling Pipeline with a PostgreSQL Database
Each chunk is embedded using `pgvector` and logs the following metadata:
```json
{
  "page_numbers": [5],
  "headings": ["Q3 Performance", "Revenue Breakdown"],
  "origin": "annual_report.pdf"
}
```
Unlike pure vector databases, this allows for hybrid searches using embeddings and SQL queries. For example we can select vectors similar to "quaterly earnings" but only where metadata.headings = "Q3". Here's the setup code for the database:
```sql
-- enable the pgvector extension
CREATE EXTENSION IF NOT EXISTS vector;

CREATE TABLE IF NOT EXISTS document_chunks (
  id SERIAL PRIMARY KEY,
  doc_filename TEXT NOT NULL,
  -- The actual content (text, table markdown, or image caption)
  chunk_content TEXT NOT NULL,
  -- Rich metadata (Page #, bounding box, section header context)
  metadata JSONB DEFAULT '{}'::jsonb,
  -- Vector embedding (assuming 384 dimensions for all-MiniLM-L6-v2)
  embedding vector(384),
  created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- create a specialized index for fast similarity search (HNSW)
CREATE INDEX ON document_chunks USING hnsw (embedding vector_l2_ops);
```

## sqlalchemy Setup
sqlalchemy provides type saftey through Pydantic and connection pooling.

In [None]:
import os
from dotenv import load_dotenv

from sqlalchemy import create_engine, Text, JSON, text
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, Session
from pgvector.sqlalchemy import Vector

load_dotenv()
DB_URL = os.getenv("DB_URL_DOC")

class Base(DeclarativeBase):
    pass
class DocumentChunk(Base):
    __tablename__ = "document_chunks"
    id: Mapped[int] = mapped_column(primary_key=True)
    doc_filename: Mapped[str] = mapped_column(Text)
    metadata_: Mapped[dict] = mapped_column("metadata", JSON)
    embedding: Mapped[list[float]] = mapped_column(Vector(384))

def init_db():
    '''Set up table. '''
    engine = create_engine(DB_URL)
    with engine.connect() as conn:
        conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector"))
        conn.commit()
    Base.metadata.create_all(engine)
    return engine

## Docling Document Processing

In [1]:
# for handling graphs & images, enable VLM pipeline in Docling
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, PictureDescriptionVlmOptions

def get_visual_converter():
    '''
    Creates a Docling converter with vision capabilities enabled for graphs and images.
    '''
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
    pipeline_options.do_picture_description = True

    # select and prompt model for image to text
    pipeline_options.picture_description_options = PictureDescriptionVlmOptions(
        repo_id="HuggingFaceTB/SmolVLM-256M-Instruct", 
        prompt="Analyze this image. If it is a graph or chart, detail the trends, X/Y axes, and key data points. If it is a photo, describe the scene."
    )

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )
    return converter


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from docling.chunking import HybridChunker
from sentence_transformers import SentenceTransformer

embed_model = SentenceTransformer('all-MiniLM-L6-v2')

def process_and_chunk(doc_result):
    '''
    Chunks a coverted Docling result and generates embeddings.
    '''
    doc = doc_result.document
    filename = doc_result.input.file.name

    chunker = HybridChunker(
        tokenizer="sentence-transformers/all-MiniLM-L6-v2",
        max_tokens=512, # target chunk size
        merge_peers=True # merges small items like bulleted lists
    )
    chunk_iter = chunker.chunk(dl_doc=doc)
    processed_chunks = []
    for chunk in chunk_iter:
        text_content = chunk.text
        # metadata stored as json in psql
        meta = {
            "page_numbers": sorted(list(set(
                prov.page_no 
                for item in chunk.meta.doc_items 
                for prov in item.prov
            ))),
            "headings": chunk.metadata.headings,
            "source": filename
        }
        # generate vector embedding
        vector = embed_model.encode(text_content).tolist()
        
        processed_chunks.append({
            "filename": filename,
            "content": text_content,
            "metadata": meta,
            "vector": vector
        })

    return processed_chunks

2025-12-21 17:11:27,001 - INFO - Use pytorch device_name: cpu
2025-12-21 17:11:27,005 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2


In [4]:
def rag_ingestion(pdf_paths):

    engine = init_db()
    converter = get_visual_converter()

    conversion_results = converter.convert_all(pdf_paths)

    with Session(engine) as session:
        for result in conversion_results:
            try:
                filename = result.input.file.name
                print(f"Attempting to convert {filename}...")
                # chunk file content
                chunks_data = process_and_chunk(result)

                orm_objs = []
                for chunk in chunks_data:
                    doc_chunk = DocumentChunk(
                        doc_filename=chunk["filename"],
                        chunk_content=chunk["content"],
                        metadata_=chunk["metadata"], 
                        embedding=chunk["vector"]
                    )
                    orm_objs.append(doc_chunk)

                session.add_all(orm_objs)
                session.flush()
                print(f"Staged {len(orm_objs)} chunks.")
                
            except Exception as e:
                print(f"[Error] Failed to process {filename}: {e}")
                session.rollback() # undo changes for this specific file
        session.commit()
        print("[S] Data ingestion complete.")

## Running the Full Docling + PostgreSQL RAG Pipleline

In [5]:
# running full ingestion pipline
from pathlib import Path
data_fp = Path("../custom-RAG-models/corpuses/pdfs")
files_to_process = [
    str(p) for p in data_fp.glob("*.pdf")
]
print(f"Found {len(files_to_process)} files: {files_to_process}")
print(f"First file: {files_to_process[0]}")

Found 6 files: ['../custom-RAG-models/corpuses/pdfs/Layton2022EstimatingHeading.pdf', '../custom-RAG-models/corpuses/pdfs/Layton2022DistributedEncoding.pdf', '../custom-RAG-models/corpuses/pdfs/Warren2021OpticFlowWalking.pdf', '../custom-RAG-models/corpuses/pdfs/Layton2016NeuralModelofMST.pdf', '../custom-RAG-models/corpuses/pdfs/Perrone2018VisualVestibularEstimation.pdf', '../custom-RAG-models/corpuses/pdfs/Layton2012MotionPoolingModel.pdf']
First file: ../custom-RAG-models/corpuses/pdfs/Layton2022EstimatingHeading.pdf


In [6]:
rag_ingestion([files_to_process[0]])

2025-12-21 17:11:29,425 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]


2025-12-21 17:11:29,681 - INFO - Going to convert document batch...
2025-12-21 17:11:29,685 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 5dd2eb934328b25705639acdb714a5a7
2025-12-21 17:11:29,723 - INFO - Loading plugin 'docling_defaults'
2025-12-21 17:11:29,738 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-12-21 17:11:29,848 - INFO - Accelerator device: 'cpu'
2025-12-21 17:11:33,837 - INFO - Loading plugin 'docling_defaults'
2025-12-21 17:11:33,850 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2025-12-21 17:11:33,853 - INFO - rapidocr cannot be used because onnxruntime is not installed.
2025-12-21 17:11:33,857 - INFO - easyocr cannot be used because it is not installed.
2025-12-21 17:11:34,643 - INFO - Accelerator device: 'cpu'
[32m[INFO] 2025-12-21 17:11:34,707 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2025-12-21 17:11:34,730 [RapidOCR] device_config.py:50: Using CP

Attempting to convert Layton2022EstimatingHeading.pdf...


Token indices sequence length is longer than the specified maximum sequence length for this model (530 > 512). Running this sequence through the model will result in indexing errors


[Error] Failed to process Layton2022EstimatingHeading.pdf: 'DocChunk' object has no attribute 'metadata'
[S] Data ingestion complete.
