**Load the PDF content**

This notebook sets the path of the PDF, loads the document in chunks, and then builds a vector database from those chunks. Loading may take some time. Run the cells one at a time, in order, to create the vector database in a local folder.

In [None]:
from langchain_docling import DoclingLoader

# Location of the PDF to ingest; here it is a URL on arxiv.org.
FILE_PATH = "https://arxiv.org/pdf/2408.09869"

# Build the loader for that PDF.
loader = DoclingLoader(file_path=FILE_PATH)

# Load the documents (per the note above, in chunks; this may take time).
# `docs` is reused by the cells below.
docs = loader.load()

In [None]:
# Preview the first three loaded documents. The `=` specifier in the f-strings
# prints the expression text (e.g. "d.metadata=") followed by the value's repr.
for d in docs[:3]:
    print(f"- {d.page_content=}")
    print(f"- {d.metadata=}")

This step keeps only the useful metadata fields for each document.

In [None]:
from langchain.schema import Document

def _collect_page_nos(dl_meta):
    """Return the set of page numbers referenced by the items in *dl_meta*."""
    page_nos = set()
    for item in dl_meta.get("doc_items") or []:
        # An item may carry zero, one, or several provenance entries; visit
        # all of them so items without provenance do not raise and items
        # that span pages report every page.
        for prov in item.get("prov") or []:
            page_no = prov.get("page_no")
            if page_no is not None:
                page_nos.add(page_no)
    return page_nos


def flatten_docling_metadata(docs):
    """Return new documents whose metadata is reduced to flat string fields.

    For each input document the nested ``dl_meta`` entry is condensed into:

    * ``source``   - the original ``source`` value ("unknown" if absent),
    * ``filename`` - ``dl_meta["origin"]["filename"]`` ("unknown" if absent),
    * ``headings`` - the headings joined with ", " ("" if there are none),
    * ``page_nos`` - the distinct page numbers of the document's items,
      sorted ascending and joined with ", " ("" if there are none).

    Missing or ``None`` intermediate values are treated as empty instead of
    raising, so a chunk with incomplete metadata is still kept.

    Args:
        docs: Iterable of documents exposing ``page_content`` and a
            ``metadata`` dict.

    Returns:
        A list of ``Document`` objects, in input order, with the same
        ``page_content`` and the simplified metadata. The inputs are not
        modified.
    """
    flattened_docs = []
    for doc in docs:
        # Extract critical fields from dl_meta; `or {}` / `or []` also cover
        # keys that are present but set to None.
        dl_meta = doc.metadata.get("dl_meta") or {}
        headings = ", ".join(dl_meta.get("headings") or [])
        filename = (dl_meta.get("origin") or {}).get("filename", "unknown")
        page_nos = _collect_page_nos(dl_meta)

        # Build simplified metadata
        simple_metadata = {
            # Keep original source; same fallback as filename when missing.
            "source": doc.metadata.get("source", "unknown"),
            "filename": filename,
            "headings": headings,
            "page_nos": ", ".join(map(str, sorted(page_nos))),
        }

        flattened_docs.append(Document(
            page_content=doc.page_content,
            metadata=simple_metadata,
        ))
    return flattened_docs

# Flatten the metadata of every loaded document; `flattened_docs` is what the
# following cells preview and embed.
flattened_docs = flatten_docling_metadata(docs)

In [None]:
# Preview the first three flattened documents, in the same format as the
# earlier preview of `docs`.
for d in flattened_docs[:3]:
    print(f"- {d.page_content=}")
    print(f"- {d.metadata=}")

# Compare with the earlier output: the metadata is now a small flat dict
# (source, filename, headings, page_nos).

**Step to create embeddings**

In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

# Initialize the embedding model (a HuggingFace sentence-transformers model,
# selected by name).
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)

# Create the ChromaDB vector store from the flattened documents.
# NOTE(review): re-running this cell against an existing ./chroma_db presumably
# adds the same documents again - confirm, and clear the folder first if so.
vector_db = Chroma.from_documents(
    documents=flattened_docs,           # Documents produced by flatten_docling_metadata
    embedding=embedding_model,
    persist_directory="./chroma_db"  # Save to disk to use it later
)

In [None]:
# Now you can query it! Run a similarity search for the question, requesting
# k=3 results, and print whatever the store returns.
results = vector_db.similarity_search("What is docling?", k=3)
print(results)