# LangChain MongoDB Integration - Parent Document Retrieval

This notebook is a companion to the [Parent Document Retrieval](https://www.mongodb.com/docs/atlas/ai-integrations/langchain/parent-document-retrieval/) page. Refer to the page for set-up instructions and detailed explanations.

<a target="_blank" href="https://colab.research.google.com/github/mongodb/docs-notebooks/blob/main/ai-integrations/langchain-parent-document-retrieval.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [None]:
pip install --quiet --upgrade langchain langchain-community langchain-core langchain-mongodb langchain-voyageai langchain-openai pymongo pypdf

In [None]:
import os

# Register both API keys as environment variables in one call.
# Replace the <...> placeholders with your own values before running.
os.environ.update(
    {
        "VOYAGE_API_KEY": "<voyage-api-key>",
        "OPENAI_API_KEY": "<openai-api-key>",
    }
)

# Connection string later handed to the retriever's from_connection_string().
MONGODB_URI = "<connection-string>"

In [None]:
from langchain_community.document_loaders import PyPDFLoader
# RecursiveCharacterTextSplitter is provided by the langchain_text_splitters package;
# the legacy `langchain.text_splitter` path is deprecated and absent from newer langchain releases.
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load the PDF
loader = PyPDFLoader("https://investors.mongodb.com/node/12881/pdf")
data = loader.load()

# Chunk into parent documents (2000-character chunks with a 20-character overlap)
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=20)
docs = parent_splitter.split_documents(data)

# Display the first parent document as the cell output
docs[0]

In [None]:
from langchain_mongodb.retrievers import MongoDBAtlasParentDocumentRetriever
from langchain_voyageai import VoyageAIEmbeddings

# Database and collection the retriever works against
database_name = "langchain_db"
collection_name = "parent_document"

# Embedding model handed to the retriever
embedding_model = VoyageAIEmbeddings(model="voyage-3-large")

# Splitter that breaks the parent documents into smaller child documents
child_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)

# Build the parent document retriever from the connection string
parent_doc_retriever = MongoDBAtlasParentDocumentRetriever.from_connection_string(
    connection_string=MONGODB_URI,
    child_splitter=child_splitter,
    embedding_model=embedding_model,
    database_name=database_name,
    collection_name=collection_name,
    text_key="page_content",
    relevance_score_fn="dotProduct",
    search_kwargs={"k": 10},
)

In [None]:
# Add the parent documents to the retriever, which ingests them into Atlas
parent_doc_retriever.add_documents(documents=docs)

In [None]:
# Pull the underlying vector store out of the retriever
vector_store = parent_doc_retriever.vectorstore

# Create the vector search index with the vector store's helper method:
#   dimensions          - the dimensions of the vector embeddings to be indexed
#   wait_until_complete - number of seconds to wait for the index to build
#                         (can take around a minute)
vector_store.create_vector_search_index(dimensions=1024, wait_until_complete=60)


In [None]:
# Query the retriever with a sample search string; the result is shown as the cell output
parent_doc_retriever.invoke(input="AI technology")

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import  RunnablePassthrough
from langchain_openai import ChatOpenAI

def format_docs(documents):
    """Join the text of the retrieved documents into one context string.

    Args:
        documents: Iterable of retrieved documents, each exposing ``page_content``.

    Returns:
        The documents' ``page_content`` values separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in documents)


# Define a prompt template.
# No "?" is appended after {query}: the question supplies its own punctuation,
# so adding one would produce "??" in the rendered prompt.
template = """
   Use the following pieces of context to answer the question at the end.
   {context}
   Question: {query}
"""
prompt = PromptTemplate.from_template(template)

# ChatOpenAI() is called without arguments, so the library's default settings apply
model = ChatOpenAI()

# Construct a chain to answer questions on your data.
# The retriever output is piped through format_docs so the prompt receives plain
# text rather than the repr of a list of Document objects.
chain = (
    {"context": parent_doc_retriever | format_docs, "query": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

# Prompt the chain
query = "In a list, what are MongoDB's latest AI announcements?"
answer = chain.invoke(query)
print(answer)