# Libraries

In [None]:
# pip install langchain langchain_community langchain_core langchain_openai langchain_mongodb pymongo pypdf PyPDFLoader create_metadata_tagger

# Credentials

In [204]:
# ======== Create config.ini

# [OpenAI]
# api_key = TBC

# [MongoDB]
# uri = TBC

In [206]:
from configparser import ConfigParser
config = ConfigParser()
config.read('config.ini')

['config.ini']

In [79]:
my_openai_key = config['OpenAI']['api_key']

In [114]:
mongodb_uri = config['MongoDB']['uri']

In [120]:
db_name = "langchain_demo"
collection_name = "chunked_data"
index = "vector_index"

# Preparing Data

#### Load a PDF file using LangChain and split it into multiple pages.

In [6]:
# TODO import the PyPDFLoader class from langchain_community.document_loaders
from langchain_community.document_loaders import PyPDFLoader

# TODO: Create an instance of the PyPDFLoader class with the path to the mongodb.pdf file
loader = PyPDFLoader('./mongodb.pdf')

# TODO: Load the pages of the PDF file
pages = loader.load()

# TODO: Print the first page of the PDF file
print(pages[0])

page_content='' metadata={'source': './mongodb.pdf', 'page': 0}


#### Clean and chunk data pulled from a PDF using a recursive text splitter

In [9]:
# TODO: Import the RecursiveCharacterTextSplitter from the langchain.text_splitter module
from langchain.text_splitter import RecursiveCharacterTextSplitter

# TODO: Create an empty list called cleaned_pages
cleaned_pages = []

# TODO: Loop through the pages and only append the pages with more than 20 words to the cleaned_pages list
for page in pages:
    if len(page.page_content.split(" ")) > 20:
        cleaned_pages.append(page)
        
# TODO: Create a variable called text_splitter and set it to an instance of RecursiveCharacterTextSplitter 
# with a chunk_size set to 500 and the chunk_overlap set to 150
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500,
  chunk_overlap=150)

# TODO: Call the split_documents method on the text_splitter object and pass in the cleaned_pages list
split_docs = text_splitter.split_documents(cleaned_pages)

# Print the 21st element in the split_docs list
print(split_docs[21])

page_content='familiar (or a schema for you Oracle folks). Within a MongoDB instance you can
have zero or more databases, each acting as high-level containers for everything
else.
2.A database can have zero or more collections . A collection shares enough in
common with a traditional table that you can safely think of the two as the same
thing.
3.Collections are made up of zero or more documents . Again, a document can safely
be thought of as analogous to a row.' metadata={'source': './mongodb.pdf', 'page': 5}


#### Create metadata for the chunks of data extracted from a PDF using LangChain and a large language model (LLM).

In [12]:
# TODO: Import create_metadata_tagger from langchain_community.document_transformers.openai_functions
from langchain_community.document_transformers.openai_functions import create_metadata_tagger

# TODO: Import ChatOpenAI from langchain_openai
from langchain_openai import ChatOpenAI

# TODO: Define schema for metadata tagging
schema = {
    "properties": {
        "title": {"type": "string"},
        "keywords": {"type": "array", "items": {"type": "string"}},
        "hasCode": {"type": "boolean"},
    },
    "required": ["title", "keywords", "hasCode"],
}

# TODO: Create an instance of ChatOpenAI, called `llm`, using the mock_openai_key, a temperature of 0, and the model "gpt-4o-mini"
llm = ChatOpenAI(openai_api_key=my_openai_key, temperature=0,
    model="gpt-4o-mini")

# TODO: Create a document transformer using the create_metadata_tagger function, passing in the schema and llm
document_transformer = create_metadata_tagger(schema, llm)

# TODO: Create a `docs` variable by calling the transform_documents method on the document_transformer, passing in cleaned_pages
docs = document_transformer.transform_documents(cleaned_pages)

# TODO: Replace cleaned_pages with docs
split_docs = text_splitter.split_documents(docs)

print(split_docs[0])

page_content='About This Book
License
The Little MongoDB Book is licensed under the Attribution-NonCommercial 3.0 Unported
license. Y ou should not have paid for this book.
You are basically free to copy, distribute, modify or display the book. However, please
always attribute the book to its original author - Karl Seguin - and do not use it for com-
mercial purposes. You can see the full text of the license at: http://creativecommons.
org/licenses/by-nc/3.0/legalcode
About The Original Author' metadata={'title': 'The Little MongoDB Book', 'keywords': ['license', 'author', 'company', 'latest version'], 'hasCode': False, 'source': './mongodb.pdf', 'page': 1}


#### Generate vector embeddings for the chunks and store them in MongoDB using a OpenAI

In [57]:
# TODO: Import MongoDBAtlasVectorSearch, ChatOpenAI and OpenAIEmbeddings
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

In [53]:
# TODO: Create a variable called `embeddings` that is an instance of OpenAIEmbeddings with the `openai_api_key` set to `my_openai_key`
print("Generating vector embeddings for the documents")
embeddings = OpenAIEmbeddings(openai_api_key=my_openai_key)

Generating vector embeddings for the documents


In [85]:
# Initialize the MongoDB client for storing the chunked data
from pymongo import MongoClient
client = MongoClient(mongodb_uri)

In [102]:
collection = client[db_name][collection_name]

In [89]:
# Drop the database before adding new data
print("Deleting the collection before adding new data")
collection.delete_many({})

Deleting the collection before adding new data


DeleteResult({'n': 0, 'electionId': ObjectId('7fffffff0000000000000105'), 'opTime': {'ts': Timestamp(1729745564, 6), 't': 261}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1729745564, 6), 'signature': {'hash': b'^\x1f10~0\xdf\xc6\x06A\xfc\xaf\xf5<]\xf9&\x83AW', 'keyId': 7374128216354586628}}, 'operationTime': Timestamp(1729745564, 6)}, acknowledged=True)

In [91]:
# TODO: Create a variable called `vector_store`. 
# Set it to an instance of MongoDBAtlasVectorSearch.from_documents and pass in the `split_docs`, `embeddings`, and `collection`
print("Storing the vectors in MongoDB Atlas")
vector_store = MongoDBAtlasVectorSearch.from_documents(split_docs, embeddings, collection=collection)

Storing the vectors in MongoDB Atlas


In [93]:
document_count = collection.count_documents({})
print(f"Successfully stored {document_count} documents in MongoDB Atlas")

Successfully stored 173 documents in MongoDB Atlas


# Retrieval

In [164]:
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_openai import OpenAIEmbeddings

#### Create the following vector search index on the chunked_data collection in your Atlas Cluster

In [166]:
# {
#   "fields": [
#     {
#       "numDimensions": 1536,
#       "path": "embedding",
#       "similarity": "cosine",
#       "type": "vector"
#     },
#     {
#       "path": "hasCode",
#       "type": "filter"
#     }
#   ]
# }

In [148]:
# TODO: Create the vector store
vector_store = MongoDBAtlasVectorSearch.from_connection_string(
    connection_string=mongodb_uri,
    namespace=f"{db_name}.{collection_name}",
    embedding=OpenAIEmbeddings(disallowed_special=(),
        openai_api_key=my_openai_key),
    index_name=index,
)

In [150]:
# Define the query_data function
def query_data(query):
    retriever = vector_store.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 3},
    )
    results = retriever.invoke(query)
    print(results)

In [152]:
# Add a prefilter to the retriever by updating the query_data
def query_data(query):
    retriever = vector_store.as_retriever(
        search_type="similarity",
        search_kwargs={
            "k": 3,
            "pre_filter": { "hasCode": { "$eq": False } },
            "score_threshold": 0.01
        },
    )

    results = retriever.invoke(query)
    print(results)

In [154]:
# Query the data
query_data("When did MongoDB begin supporting multi-document transactions?")

[Document(metadata={'_id': '6719d2c776b49b966151d21f', 'title': 'MongoDB Overview', 'keywords': ['MongoDB', 'relational database', 'transactions', 'data storage'], 'hasCode': False, 'source': './mongodb.pdf', 'page': 34}, page_content='restrictions on application developers. The addition of transactions addressed a legiti-\nmate and serious concern. So, when people ask where does MongoDB sit with respect\nto the new data storage landscape? the answer is simple: right in the middle .\n34'), Document(metadata={'_id': '6719d2c776b49b966151d210', 'title': 'MongoDB Features', 'keywords': ['capped collections', 'TTL Indexes', 'Durability', 'journaling', 'full text search', 'ACID transactions'], 'hasCode': False, 'source': './mongodb.pdf', 'page': 31}, page_content='engine. With MongoDB’s support for arrays and full text search, you will only need to\nlook to other solutions if you need a more powerful and full-featured full text search\nengine.\nTransactions\nMongoDB added full support for A

In [155]:
query_data("What is the difference between a database and collection in MongoDB?")

[Document(metadata={'_id': '6719d2c776b49b966151d18a', 'title': 'Chapter 1 - The Basics', 'keywords': ['MongoDB', 'database', 'collection', 'document', 'field', 'index', 'cursor'], 'hasCode': False, 'source': './mongodb.pdf', 'page': 5}, page_content='familiar (or a schema for you Oracle folks). Within a MongoDB instance you can\nhave zero or more databases, each acting as high-level containers for everything\nelse.\n2.A database can have zero or more collections . A collection shares enough in\ncommon with a traditional table that you can safely think of the two as the same\nthing.\n3.Collections are made up of zero or more documents . Again, a document can safely\nbe thought of as analogous to a row.'), Document(metadata={'_id': '6719d2c776b49b966151d18b', 'title': 'Chapter 1 - The Basics', 'keywords': ['MongoDB', 'database', 'collection', 'document', 'field', 'index', 'cursor'], 'hasCode': False, 'source': './mongodb.pdf', 'page': 5}, page_content='thing.\n3.Collections are made up 

# Answer Generation

In [160]:
# build the answer generation component of your RAG application. 
# This involves updating the query_data function to generate answers based on specific prompts 
# using a custom template and a series of steps in LangChain.

In [178]:
import os
from dotenv import load_dotenv
from langchain_core.output_parsers import StrOutputParser # New import
from langchain_core.runnables import RunnablePassthrough # New import
from langchain.prompts import PromptTemplate # New import
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

In [190]:
def query_data(query):
    retriever = vector_store.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 3},
    )

    # TODO: Define the template string
    template = """
    Use the following pieces of context to answer the question at the end.
    If you don't know the answer, just say that you don't know, don't try to make up an answer.
    Do not answer the question if there is no given context.
    Do not answer the question if it is not related to the context.
    Do not give recommendations to anything other than MongoDB.
    Context:
    {context}
    Question: {question}
    """

    # TODO: Instatiate a custom PromptTemplate object
    # Instantiate the PromptTemplate using PromptTemplate.from_template()
    # and assign the result to custom_rag_prompt, passing the template as an argument.
    custom_rag_prompt = PromptTemplate.from_template(template)

    # TODO: Add context and question to the retrieve dictionary
    # Update the dictionary called retrieve and provide values for context and question. 
    # Use the retriever object for the context and RunnablePassthrough() for the question.
    retrieve = {
        "context": retriever
        | (lambda docs: "\n\n".join([d.page_content for d in docs])),
        "question": RunnablePassthrough(), 
    }

    llm = ChatOpenAI(openai_api_key=my_openai_key, temperature=0)

    # TODO: Instatiate a custom StrOutputParser object
    response_parser = StrOutputParser()

    # TODO: Create a RAG chain with the retrieve, custom_rag_prompt, llm, and response_parser objects (separated by | )
    rag_chain = retrieve | custom_rag_prompt | llm | response_parser
    answer = rag_chain.invoke(query)
    print(answer)

In [192]:
# Test with a relevant query
question = "When did MongoDB begin supporting multi-document transactions?"
print(f"Running query: {question}")
query_data(question)

Running query: When did MongoDB begin supporting multi-document transactions?
MongoDB began supporting multi-document transactions in version 4.0.


In [194]:
# Test with an irrelevant query
question = "Why is the sky blue?"
print(f"Running query: {question}")
query_data(question)

Running query: Why is the sky blue?
I don't know.


In [196]:
question = "What is the difference between a database and collection in MongoDB?"
print(f"Running query: {question}")
query_data(question)

Running query: What is the difference between a database and collection in MongoDB?
In MongoDB, a database can have zero or more collections. A collection is similar to a traditional table in a relational database, while a database acts as a high-level container for collections. So, the main difference is that a database contains collections, which in turn contain documents.
