# LangChain MongoDB Integration - Memory and Semantic Caching for RAG

This notebook is a companion to the [Memory and Semantic Caching](https://www.mongodb.com/docs/atlas/atlas-vector-search/ai-integrations/langchain/memory-semantic-cache/) tutorial. Refer to the page for set-up instructions and detailed explanations.

<a target="_blank" href="https://colab.research.google.com/github/mongodb/docs-notebooks/blob/main/ai-integrations/langchain-memory-and-semantic-caching.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [None]:
pip install --quiet --upgrade langchain langchain-community langchain-core langchain-mongodb langchain-openai pymongo

In [None]:
import os

# Placeholders: substitute your own Atlas connection string and OpenAI API key
ATLAS_CONNECTION_STRING = "<connection-string>"
os.environ.update({"OPENAI_API_KEY": "<api-key>"})

## Configure the Vector Store

In [None]:
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_openai import OpenAIEmbeddings

# The movies dataset's embeddings were created with text-embedding-ada-002,
# so the same model is used here
embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002")

# Instantiate the vector store on the sample movies collection
vector_store = MongoDBAtlasVectorSearch.from_connection_string(
    connection_string=ATLAS_CONNECTION_STRING,
    embedding=embedding_model,
    namespace="sample_mflix.embedded_movies",
    text_key="plot",
    embedding_key="plot_embedding",
    relevance_score_fn="dotProduct",
)

In [None]:
import time

# Use LangChain helper method to create the vector search index.
# wait_until_complete makes the helper wait for the index to become ready
# (up to the given number of seconds) instead of sleeping for a fixed period
# that may be too short or needlessly long.
vector_store.create_vector_search_index(
    dimensions=1536,  # The dimensions of the vector embeddings to be indexed
    wait_until_complete=120,  # Maximum time in seconds to wait for the index build
)

## Implement RAG with Memory

In [None]:
from langchain_openai import ChatOpenAI

# Chat completion model used by the chains in this notebook
model = ChatOpenAI(model="gpt-4o")

In [None]:
from langchain_mongodb.chat_message_histories import MongoDBChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_core.prompts import MessagesPlaceholder
         
# Define a function that gets the chat message history
def get_session_history(session_id: str) -> MongoDBChatMessageHistory:
    """Return the MongoDB-backed chat message history for one session.

    Args:
        session_id: Identifier of the conversation whose history is requested.

    Returns:
        A MongoDBChatMessageHistory bound to ``session_id``.
    """
    return MongoDBChatMessageHistory(
        connection_string=ATLAS_CONNECTION_STRING,
        session_id=session_id,
        database_name="sample_mflix",
        # Keep chat messages in their own collection rather than mixing them
        # into the embedded_movies collection that backs the vector store
        collection_name="chat_history",
    )

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# System instructions for rewriting a follow-up question as a standalone question
standalone_system_prompt = """
  Given a chat history and a follow-up question, rephrase the follow-up question to be a standalone question.
  Do NOT answer the question, just reformulate it if needed, otherwise return it as is.
  Only return the final standalone question.
"""

# Output parser that returns the model's reply as a string
parse_output = StrOutputParser()

# Prompt layout: system instructions, then the chat history, then the new question
standalone_question_prompt = ChatPromptTemplate.from_messages([
    ("system", standalone_system_prompt),
    MessagesPlaceholder(variable_name="history"),
    ("human", "{question}"),
])

# Chain: prompt -> chat model -> string
question_chain = standalone_question_prompt | model | parse_output

In [None]:
from langchain_core.runnables import RunnablePassthrough

def _join_page_content(docs):
    """Concatenate the page content of the given documents, separated by blank lines."""
    return "\n\n".join(doc.page_content for doc in docs)

# Similarity retriever that returns the top 3 matches
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

# Rewrite the question using the history, retrieve documents for it, and
# attach their joined text to the input under the "context" key
standalone_context = question_chain | retriever | _join_page_content
retriever_chain = RunnablePassthrough.assign(context=standalone_context)

In [None]:
# System message that restricts the answer to the retrieved context
rag_system_prompt = """Answer the question based only on the following context:
{context}
"""

# Prompt layout: context instructions, then the chat history, then the question
rag_prompt = ChatPromptTemplate.from_messages([
    ("system", rag_system_prompt),
    MessagesPlaceholder(variable_name="history"),
    ("human", "{question}"),
])

In [None]:
# RAG pipeline: add context -> fill the prompt -> call the model -> parse to string
rag_chain = retriever_chain | rag_prompt | model | parse_output

# Add message history to the chain; the history for each call comes from
# get_session_history
rag_with_memory = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    history_messages_key="history",
    input_messages_key="question",
)

In [None]:
# Opening question, asked in session "user123"
first_question = {"question": "What are some good science fiction movies?"}
session_config = {"configurable": {"session_id": "user123"}}

response1 = rag_with_memory.invoke(first_question, session_config)
print(response1)

In [None]:
# Follow-up in the same session ("user123"); it refers back to the previous question
followup_question = {"question": "Which one has the best special effects?"}
session_config = {"configurable": {"session_id": "user123"}}

response2 = rag_with_memory.invoke(followup_question, session_config)
print(response2)

## Add Semantic Caching

In [None]:
from langchain_mongodb.cache import MongoDBAtlasSemanticCache
from langchain_core.globals import set_llm_cache

# Configure the semantic cache
set_llm_cache(MongoDBAtlasSemanticCache(
    connection_string=ATLAS_CONNECTION_STRING,
    database_name="sample_mflix",
    collection_name="semantic_cache",
    embedding=embedding_model,
    index_name="vector_index",
    # The constructor's threshold parameter is score_threshold; an unrecognized
    # keyword such as similarity_threshold is not applied as a threshold
    score_threshold=0.85,  # Adjust based on your requirements
))

In [None]:
# First query (not cached)
# RunnableWithMessageHistory requires a session_id in the config. A dedicated
# session is used so that messages from other sessions are not added to the prompt.
start_time = time.perf_counter()  # perf_counter is intended for measuring elapsed time
result1 = rag_with_memory.invoke(
    {"question": "What are some movies about time travel?"},
    {"configurable": {"session_id": "cache-demo-1"}},
)
end_time = time.perf_counter()
print(f"First query time: {end_time - start_time:.2f} seconds")
print(result1)

In [None]:
# Semantically similar query (cached)
# RunnableWithMessageHistory requires a session_id in the config. A dedicated
# session is used so that messages from other sessions are not added to the prompt.
start_time = time.perf_counter()  # perf_counter is intended for measuring elapsed time
result2 = rag_with_memory.invoke(
    {"question": "Can you recommend films that involve time travel?"},
    {"configurable": {"session_id": "cache-demo-2"}},
)
end_time = time.perf_counter()
print(f"Second query time: {end_time - start_time:.2f} seconds")
print(result2)