MMR (Maximal Marginal Relevance) retrieval in LangChain 

In [1]:
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

In [2]:
# Step 1: Prepare sample documents
docs = [
    'Green tea contains antioxidants that improve health.',
    'Black tea is rich in caffeine and boosts energy.',
    'Catechins in green tea help reduce cholesterol.',
    'Tea ceremonies are an important cultural tradition in Japan.',
    'Green tea may improve metabolism and aid in weight loss.'
]

In [3]:
# Step 2: Create embeddings

# HuggingFaceEmbeddings converts text into dense vector representations. 
# These embeddings allow us to measure semantic similarity between query and documents.

embeddings = HuggingFaceBgeEmbeddings(model_name='all-MiniLM-L6-v2')
#embeddings

  embeddings = HuggingFaceBgeEmbeddings(model_name='all-MiniLM-L6-v2')


In [4]:
# Step 3: Build FAISS vector store

# FAISS is a fast similarity search library. 
# # Here we create an index of our documents using their embeddings.

vectorstore = FAISS.from_texts(docs, embeddings)
vectorstore

<langchain_community.vectorstores.faiss.FAISS at 0x164f6aebad0>

In [None]:
# Step 4: Configure retriever with MMR 
# LangChain retrievers support different search strategies. 
# By setting search_type="mmr", we enable Maximal Marginal Relevance. 
# MMR balances relevance to the query and diversity among selected documents.

retriever = vectorstore.as_retriever(
    search_type='mmr', # Enable MMR reranking
    search_kwargs={
        'k':3, # Final number of documents to return
        'fetch_k': 10, # Number of initial candidates to consider
        'lambda_mult': 0.7 # Balance factor: closer to 1 = more relevance, closer to 0 = more diversity
    }
)

In [None]:
# Step 5: Run a query
# Retrieve top-k documents using MMR

query = 'health benefits of green tea'

results = retriever.get_relevant_documents(query)



# Step 6: Display results
# The selected documents will be diverse yet relevant to the query.

for i, doc in enumerate(results, 1):
    print(f'Doc {i}: {doc.page_content}')

Doc 1: Green tea contains antioxidants that improve health.
Doc 2: Green tea may improve metabolism and aid in weight loss.
Doc 3: Black tea is rich in caffeine and boosts energy.
