[![Lab Documentation and Solutions](https://img.shields.io/badge/Lab%20Documentation%20and%20Solutions-purple)](https://mongodb-developer.github.io/ai-agents-lab/)


# Step 1: Set up prerequisites

In [15]:
import os
from pymongo import MongoClient

In [16]:
# Connection string for the MongoDB deployment.
# The `MONGODB_URI` environment variable takes precedence when it is set to a
# non-empty value; otherwise fall back to the workshop's local deployment, so
# the notebook behaves exactly as before in the default environment.
MONGODB_URI = os.environ.get("MONGODB_URI") or "mongodb://admin:mongodb@mongodb:27017/"
# Initialize a MongoDB Python client
mongodb_client = MongoClient(MONGODB_URI, appname="devrel.workshop.agents")
# Check the connection to the server
mongodb_client.admin.command("ping")

{'ok': 1.0,
 '$clusterTime': {'clusterTime': Timestamp(1745421886, 1),
  'signature': {'hash': b'\xb2Lm)\xfcG\x12\x92b-6r\xe2\x1f)\x8c\xd0V\xbc\x19',
   'keyId': 7496241214685970439}},
 'operationTime': Timestamp(1745421886, 1)}

### **Do not change the values assigned to the variables below**

In [None]:
# These names are referenced by later cells in this lab; keep the values unchanged.
# Database name
DB_NAME = "mongodb_genai_devday-agents"
# Name of the collection with full documents (used for summarization)
FULL_COLLECTION_NAME = "mongodb-docs"
# Name of the collection queried with vector search (used for Q&A)
VS_COLLECTION_NAME = "mongodb-docs-with-embeddings"
# Name of the vector search index
VS_INDEX_NAME = "vector_index"

📚 https://pymongo.readthedocs.io/en/stable/tutorial.html#getting-a-database

In [None]:
# Connect to the `DB_NAME` database.
# Use the `mongodb_client` object instantiated above.
db = <CODE_BLOCK_1>

📚 https://pymongo.readthedocs.io/en/stable/tutorial.html#getting-a-collection

In [None]:
# Connect to the `VS_COLLECTION_NAME` collection.
# Use the `db` and collection name defined above.
vs_collection = <CODE_BLOCK_2>

In [None]:
# Connect to the `FULL_COLLECTION_NAME` collection.
# Use the `db` and collection name defined above.
full_collection = <CODE_BLOCK_3>

### **Pick an LLM provider of your choice**

In [32]:
# Can be one of "aws", "google" or "microsoft"
# This value is sent as the `data` field of the `get_llm` request in Step 6
LLM_PROVIDER = "aws"

# Step 2: Import data into MongoDB

In [33]:
import json

In [34]:
# Load the full documentation pages.
# The file is decoded as UTF-8 explicitly so the result does not depend on the
# platform's default encoding, and parsed straight from the file handle.
with open('./../../sources/mongodb-docs.json', 'r', encoding='utf-8') as data_file:
    data = json.load(data_file)

# Empty the collection first so re-running this cell does not insert duplicates
full_collection.delete_many({})
full_collection.insert_many(data)

InsertManyResult([ObjectId('6809068eba575960aed5af4f'), ObjectId('6809068eba575960aed5af50'), ObjectId('6809068eba575960aed5af51'), ObjectId('6809068eba575960aed5af52'), ObjectId('6809068eba575960aed5af53'), ObjectId('6809068eba575960aed5af54'), ObjectId('6809068eba575960aed5af55'), ObjectId('6809068eba575960aed5af56'), ObjectId('6809068eba575960aed5af57'), ObjectId('6809068eba575960aed5af58'), ObjectId('6809068eba575960aed5af59'), ObjectId('6809068eba575960aed5af5a'), ObjectId('6809068eba575960aed5af5b'), ObjectId('6809068eba575960aed5af5c'), ObjectId('6809068eba575960aed5af5d'), ObjectId('6809068eba575960aed5af5e'), ObjectId('6809068eba575960aed5af5f'), ObjectId('6809068eba575960aed5af60'), ObjectId('6809068eba575960aed5af61'), ObjectId('6809068eba575960aed5af62')], acknowledged=True)

In [35]:
# Load the documents that already carry embeddings.
# The file is decoded as UTF-8 explicitly so the result does not depend on the
# platform's default encoding, and parsed straight from the file handle.
with open('./../../sources/mongodb-docs-with-embeddings.json', 'r', encoding='utf-8') as data_file:
    data = json.load(data_file)

# Empty the collection first so re-running this cell does not insert duplicates
vs_collection.delete_many({})
vs_collection.insert_many(data)

InsertManyResult([ObjectId('68090692ba575960aed5af63'), ObjectId('68090692ba575960aed5af64'), ObjectId('68090692ba575960aed5af65'), ObjectId('68090692ba575960aed5af66'), ObjectId('68090692ba575960aed5af67'), ObjectId('68090692ba575960aed5af68'), ObjectId('68090692ba575960aed5af69'), ObjectId('68090692ba575960aed5af6a'), ObjectId('68090692ba575960aed5af6b'), ObjectId('68090692ba575960aed5af6c'), ObjectId('68090692ba575960aed5af6d'), ObjectId('68090692ba575960aed5af6e'), ObjectId('68090692ba575960aed5af6f'), ObjectId('68090692ba575960aed5af70'), ObjectId('68090692ba575960aed5af71'), ObjectId('68090692ba575960aed5af72'), ObjectId('68090692ba575960aed5af73'), ObjectId('68090692ba575960aed5af74'), ObjectId('68090692ba575960aed5af75'), ObjectId('68090692ba575960aed5af76'), ObjectId('68090692ba575960aed5af77'), ObjectId('68090692ba575960aed5af78'), ObjectId('68090692ba575960aed5af79'), ObjectId('68090692ba575960aed5af7a'), ObjectId('68090692ba575960aed5af7b'), ObjectId('68090692ba575960aed5af

# Step 3: Create a vector search index

In [None]:
# Create vector index definition specifying:
# path: Path to the embeddings field
# numDimensions: Number of embedding dimensions (depends on the embedding model used)
# similarity: Similarity metric. One of cosine, euclidean, dotProduct.
model = {
    # Index name, taken from the `VS_INDEX_NAME` constant defined in Step 1
    "name": VS_INDEX_NAME,
    "type": "vectorSearch",
    "definition": {
        "fields": [
            {
                "type": "vector",
                # Document field that holds the embedding vector
                "path": "embedding",
                # NOTE(review): presumably the output size of the embedding model
                # loaded in Step 4 — confirm if the model is ever changed
                "numDimensions": 384,
                "similarity": "cosine",
            }
        ]
    },
}

📚 https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.create_search_index

In [None]:
# Create a vector search index with the above `model` for the `vs_collection` collection
<CODE_BLOCK_4>

# Step 4: Create agent tools


In [None]:
# You may see a warning upon running this cell. You can ignore it.
from langchain.agents import tool
from sentence_transformers import SentenceTransformer
from typing import List

### Vector Search

In [None]:
# Load the `gte-small` model using the Sentence Transformers library
embedding_model = SentenceTransformer("thenlper/gte-small")

📚 https://huggingface.co/thenlper/gte-small#usage (See "Use with sentence-transformers" under Usage)

In [None]:
# Define a function that takes a piece of text (`text`) as input, embeds it using the `embedding_model` instantiated above and returns the embedding as a list
# An array can be converted to a list using the `tolist()` method
def get_embedding(text: str) -> List[float]:
    """
    Generate the embedding for a piece of text.

    Args:
        text (str): Text to embed.

    Returns:
        List[float]: Embedding of the text as a list.
    """
    # Exercise placeholder: embed `text` with the `embedding_model` loaded above
    embedding = <CODE_BLOCK_5>
    # Convert the model output to a plain Python list to match the declared return type
    return embedding.tolist()

📚 https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-stage/#ann-examples (Refer to the "Basic Example")

In [None]:
# Define a tool to retrieve relevant documents for a user query using vector search
@tool
def get_information_for_question_answering(user_query: str) -> str:
    """
    Retrieve information using vector search to answer a user query.

    Args:
    user_query (str): The user's query string.

    Returns:
    str: The retrieved information formatted as a string.
    """

    # Generate embedding for the `user_query` using the `get_embedding` function defined above
    query_embedding = <CODE_BLOCK_6>

    # Define an aggregation pipeline consisting of a $vectorSearch stage, followed by a $project stage
    # Set the number of candidates to 150 and only return the top 5 documents from the vector search
    # In the $project stage, exclude the `_id` field and include only the `body` field and `vectorSearchScore`
    # NOTE: Use variables defined previously for the `index`, `queryVector` and `path` fields in the $vectorSearch stage
    pipeline = <CODE_BLOCK_7>

    # Execute the aggregation `pipeline` against the `vs_collection` collection and store the results in `results`
    results = <CODE_BLOCK_8>
    # Concatenate the results into a string
    # Only the `body` of each result is used; bodies are separated by a blank line.
    # NOTE(review): `doc.get("body")` is None for a result without a `body` field, which
    # would make `join` raise TypeError — confirm the $project stage always returns `body`.
    context = "\n\n".join([doc.get("body") for doc in results])
    return context

### Get page content

📚 https://www.mongodb.com/docs/manual/reference/method/db.collection.findOne/#return-all-but-the-excluded-fields

In [None]:
# Define a tool to retrieve the content of a documentation page for summarization
@tool
def get_page_content_for_summarization(user_query: str) -> str:
    """
    Retrieve page content based on provided title.

    Args:
    user_query (str): The user's query string i.e. title of the documentation page.

    Returns:
    str: The content of the page.
    """
    # Query the documents where the `title` field is equal to the `user_query`
    query = <CODE_BLOCK_9>
    # Only return the `body` field from the retrieved documents.
    # NOTE: Set fields to include to 1, those to exclude to 0. `_id` is included by default, so exclude that.
    projection = <CODE_BLOCK_10>
    # Use the `query` and `projection` with the `find_one` method
    # to get the `body` of the document with `title` equal to the `user_query` from the `full_collection` collection
    document = <CODE_BLOCK_11>
    # A falsy `document` means no page matched the title; return a readable
    # fallback string to the caller instead of raising
    if document:
        return document["body"]
    else:
        return "Document not found"

In [None]:
# Create the list of tools
tools = [
    get_information_for_question_answering,
    get_page_content_for_summarization,
]

### Test out the tools


In [None]:
# Test out the `get_information_for_question_answering` tool with the query "What are some best practices for data backups in MongoDB?"
# You should see a non-empty response
get_information_for_question_answering.invoke(
    "What are some best practices for data backups in MongoDB?"
)

In [None]:
# Test out the `get_page_content_for_summarization` tool with page title "Create a MongoDB Deployment"
# You should see a non-empty response
get_page_content_for_summarization.invoke("Create a MongoDB Deployment")

# Step 5: Define graph state

In [None]:
from typing import Annotated
from langgraph.graph.message import add_messages
from typing_extensions import TypedDict

In [None]:
# Define the graph state
# We are only tracking chat messages but you can track other attributes as well
class GraphState(TypedDict):
    """State passed to the graph's node and routing functions."""

    # Chat messages. Annotated with `add_messages` from `langgraph.graph.message`
    # (presumably the reducer that merges node updates into this list — confirm
    # against the LangGraph documentation).
    messages: Annotated[list, add_messages]

# Step 6: Instantiate the LLM

In [None]:
from langchain_core.load import load
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

In [None]:
# `requests` is used below but is not imported by any earlier cell
import requests

# Obtain the Langchain LLM object from our serverless endpoint
# NOTE(review): `SERVERLESS_URL` is not defined in any cell shown in this notebook —
# confirm it is provided (e.g. by the workshop environment) before running this cell.
response = requests.post(
    url=SERVERLESS_URL, json={"task": "get_llm", "data": LLM_PROVIDER}
)
# Surface HTTP failures as a clear error here rather than as a confusing
# JSON-decoding or KeyError a few lines further down
response.raise_for_status()
llm_dict = response.json()
# Rebuild the LLM object from its serialized form, supplying the secrets it needs
llm = load(llm_dict["llm"], secrets_map=llm_dict["secrets_map"])

In [None]:
# Create a Chain-of-Thought (CoT) prompt template for the agent.
# This includes a system prompt with a placeholder for tool names, and a placeholder for messages i.e. user queries and assistant responses
prompt = ChatPromptTemplate.from_messages(
    [
        # (role, template) pair. The explicit "system" role is required: without it the
        # parenthesized string literals collapse into one bare string, which is not
        # registered as a system message.
        (
            "system",
            "You are a helpful AI assistant."
            " You are provided with tools to answer questions and summarize technical documentation related to MongoDB."
            " Think step-by-step and use these tools to get the information required to answer the user query."
            " Do not re-run tools unless absolutely necessary."
            " If you are not able to get enough information using the tools, reply with I DON'T KNOW."
            " You have access to the following tools: {tool_names}."
        ),
        # Filled at invocation time with the conversation so far
        MessagesPlaceholder(variable_name="messages"),
    ]
)

In [None]:
# Fill in the prompt template with the tool names
prompt = prompt.partial(tool_names=", ".join([tool.name for tool in tools]))

📚 https://python.langchain.com/v0.1/docs/modules/model_io/chat/function_calling/#binding-tool-schemas

In [None]:
# Bind the `tools` to the `llm` instantiated above
bind_tools = <CODE_BLOCK_12>

📚 https://python.langchain.com/v0.1/docs/expression_language/primitives/sequence/#the-pipe-operator

In [None]:
# Chain the `prompt` with the tool-bound llm using the `|` operator
llm_with_tools = <CODE_BLOCK_13>

In [None]:
# Test that the LLM is making the right tool calls
llm_with_tools.invoke(
    ["Give me a summary of the page titled Create a MongoDB Deployment."]
).tool_calls

In [None]:
# Test that the LLM is making the right tool calls
llm_with_tools.invoke(
    ["What are some best practices for data backups in MongoDB?"]
).tool_calls

# Step 7: Define graph nodes

In [None]:
from langchain_core.messages import ToolMessage
from typing import Dict
from pprint import pprint

In [None]:
# Define the agent node
def agent(state: GraphState) -> Dict[str, List]:
    """
    Agent node

    Args:
        state (GraphState): Graph state

    Returns:
        Dict[str, List]: Updates to messages
    """
    # Get the messages from the graph `state`
    messages = <CODE_BLOCK_14>
    # Invoke `llm_with_tools` with `messages` using the `invoke` method
    # HINT: See Step 6 for how to invoke `llm_with_tools`
    result = <CODE_BLOCK_15>
    # Write `result` to the `messages` attribute of the graph state
    # The single response is wrapped in a list to match the `Dict[str, List]` return type
    return {"messages": [result]}

In [None]:
# Create a map of tool name to tool call
tools_by_name = {tool.name: tool for tool in tools}
pprint(tools_by_name)

In [None]:
# Define tool node
def tool_node(state: GraphState) -> Dict[str, List]:
    """
    Tool node

    Args:
        state (GraphState): Graph state

    Returns:
        Dict[str, List]: Updates to messages
    """
    # One ToolMessage is collected here per tool call of the last message
    result = []
    # Get the list of tool calls from messages
    tool_calls = state["messages"][-1].tool_calls
    # A tool_call looks as follows:
    # {
    #     "name": "get_information_for_question_answering",
    #     "args": {"user_query": "What are Atlas Triggers"},
    #     "id": "call_H5TttXb423JfoulF1qVfPN3m",
    #     "type": "tool_call",
    # }
    # Iterate through `tool_calls`
    for tool_call in tool_calls:
        # Get the tool from `tools_by_name` using the `name` attribute of the `tool_call`
        # NOTE(review): a tool name that is missing from `tools_by_name` raises KeyError here
        tool = tools_by_name[tool_call["name"]]
        # Invoke the `tool` using the `args` attribute of the `tool_call`
        # HINT: See previous line to see how to extract attributes from `tool_call`
        observation = <CODE_BLOCK_16>
        # Append the result of executing the tool to the `result` list as a ToolMessage
        # The `content` of the message is `observation` i.e. result of the tool call
        # The `tool_call_id` can be obtained from the `tool_call`
        result.append(ToolMessage(content=observation, tool_call_id=tool_call["id"]))
    # Write `result` to the `messages` attribute of the graph state
    return {"messages": result}

# Step 8: Define conditional edges

In [None]:
from langgraph.graph import END

In [None]:
# Define conditional routing function
def route_tools(state: GraphState):
    """
    Routing function for the conditional edge that follows the agent node.

    Returns "tools" when the most recent message carries at least one tool
    call, and END otherwise. Raises ValueError when the state has no messages.
    """
    history = state.get("messages", [])
    # Guard clause: without at least one message there is nothing to route on
    if len(history) == 0:
        raise ValueError(f"No messages found in input state to tool_edge: {state}")
    # The most recent message is the one to inspect for tool calls
    latest = history[-1]
    wants_tool = hasattr(latest, "tool_calls") and len(latest.tool_calls) > 0
    return "tools" if wants_tool else END

# Step 9: Build the graph

In [None]:
from langgraph.graph import StateGraph, START
from IPython.display import Image, display

In [None]:
# Instantiate the graph
graph = StateGraph(GraphState)

📚 https://blog.langchain.dev/langgraph/#nodes

In [None]:
# Add nodes to the `graph` using the `add_node` function
# Add an `agent` node. The `agent` node should run the `agent` function
<CODE_BLOCK_17>
# Add a `tools` node. The `tools` node should run the `tool_node` function
<CODE_BLOCK_18>

📚 https://langchain-ai.github.io/langgraph/concepts/low_level/#normal-edges

In [None]:
# Add fixed edges to the `graph` using the `add_edge` method
# Add an edge from the START node to the `agent` node
<CODE_BLOCK_19>
# Add an edge from the `tools` node to the `agent` node
<CODE_BLOCK_20>

📚 https://langchain-ai.github.io/langgraph/concepts/low_level/#conditional-edges

In [None]:
# Use the `add_conditional_edges` method to add a conditional edge from the `agent` node to the `tools` node
# based on the output of the `route_tools` function
<CODE_BLOCK_21>

In [None]:
# Compile the `graph`
app = graph.compile()

In [None]:
# Visualize the graph
try:
    display(Image(app.get_graph().draw_mermaid_png()))
except Exception as exc:
    # Rendering requires some extra dependencies and is optional, so a failure
    # must not stop the notebook — but say why the diagram is missing
    print(f"Skipping graph visualization: {exc}")

# Step 10: Execute the graph

In [None]:
# Stream outputs from the graph as they pass through its nodes
def execute_graph(user_input: str) -> None:
    """
    Stream outputs from the graph

    Prints the update emitted by each node as it arrives, followed by the
    content of the last message of the last update.

    Args:
        user_input (str): User query string
    """
    # Add user input to the messages attribute of the graph state
    # The role of the message should be "user" and content should be `user_input`
    # Named `graph_input` rather than `input` to avoid shadowing the builtin
    graph_input = {"messages": [("user", user_input)]}
    # Most recent node update; stays None if the graph streams nothing
    last_update = None
    # Pass input to the graph and stream the outputs
    for output in app.stream(graph_input):
        for key, value in output.items():
            print(f"Node {key}:")
            print(value)
            last_update = value
    # Without any update there is no final answer to print
    if last_update is None:
        print("The graph produced no output.")
        return
    print("---FINAL ANSWER---")
    print(last_update["messages"][-1].content)

In [None]:
# Test the graph execution to view end-to-end flow
execute_graph("What are some best practices for data backups in MongoDB?")

In [None]:
# Test the graph execution to view end-to-end flow
execute_graph("Give me a summary of the page titled Create a MongoDB Deployment")

# Step 11: Add memory to the agent

In [None]:
from langgraph.checkpoint.mongodb import MongoDBSaver

In [None]:
# Initialize a MongoDB checkpointer
checkpointer = MongoDBSaver(mongodb_client)

In [None]:
# Instantiate the graph with the checkpointer
app = graph.compile(checkpointer=checkpointer)

📚 https://langchain-ai.github.io/langgraph/concepts/persistence/#threads

In [None]:
def execute_graph(thread_id: str, user_input: str) -> None:
    """
    Stream outputs from the graph

    Args:
        thread_id (str): Thread ID for the checkpointer
        user_input (str): User query string
    """
    # Add user input to the messages attribute of the graph state
    # The role of the message should be "user" and content should be `user_input`
    input = {"messages": [("user", user_input)]}
    # Define a config containing the thread ID
    config = <CODE_BLOCK_22>
    # Pass `input` and `config` to the graph and stream outputs
    for output in app.stream(input, config):
        for key, value in output.items():
            print(f"Node {key}:")
            print(value)
    # `value` still holds the last node update seen in the loop above.
    # NOTE(review): if the stream yields nothing, `value` is never bound and the
    # final print raises — confirm the graph always emits at least one update.
    print("---FINAL ANSWER---")
    print(value["messages"][-1].content)

In [None]:
# Test graph execution with thread ID
execute_graph(
    "1",
    "What are some best practices for data backups in MongoDB?",
)

In [None]:
# Follow-up question to ensure message history works
execute_graph(
    "1",
    "What did I just ask you?",
)