# MongoDB Vector Search - LlamaIndex Integration

This notebook is a companion to the [LlamaIndex Get Started](https://www.mongodb.com/docs/atlas/ai-integrations/llamaindex/) page. Refer to the page for set-up instructions and detailed explanations.

<a target="_blank" href="https://colab.research.google.com/github/mongodb/docs-notebooks/blob/main/ai-integrations/llamaindex.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [None]:
pip install --quiet --upgrade llama-index llama-index-vector-stores-mongodb llama-index-llms-openai llama-index-embeddings-voyageai pymongo

In [None]:
import os

# Placeholder credentials: replace each "<...>" value with your own before
# running the remaining cells. The API keys are exported as environment
# variables; the connection string is kept in a module-level constant.
os.environ.update(
    {
        "VOYAGEAI_API_KEY": "<voyageai-api-key>",
        "OPENAI_API_KEY": "<openai-api-key>",
    }
)
MONGODB_URI = "<connection-string>"

In [None]:
from llama_index.embeddings.voyageai import VoyageEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core.settings import Settings

# Build the Voyage AI embedding model, reading its API key from the
# environment variable assigned in the configuration cell.
embed_model = VoyageEmbedding(
    model_name="voyage-3-large",
    voyage_api_key=os.environ["VOYAGEAI_API_KEY"],
)

# Register the LLM and the embedding model on the shared Settings object.
Settings.llm = OpenAI()
Settings.embed_model = embed_model

# Chunking parameters applied when documents are split
# (chunk size first, then overlap).
Settings.chunk_size, Settings.chunk_overlap = 100, 10

In [None]:
from llama_index.core import SimpleDirectoryReader

# Download the sample PDF into the working directory, then load it.
from urllib.request import urlretrieve

pdf_url = "https://investors.mongodb.com/node/13176/pdf"
pdf_path = "mongodb-earnings-report.pdf"
urlretrieve(pdf_url, pdf_path)

pdf_reader = SimpleDirectoryReader(input_files=[pdf_path])
sample_data = pdf_reader.load_data()

# Show the first loaded document as the cell's output.
sample_data[0]

In [None]:
import pymongo
from llama_index.core import StorageContext
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch

# Open a client for the cluster identified by MONGODB_URI.
mongo_client = pymongo.MongoClient(MONGODB_URI)

# Namespace and search-index name the vector store is pointed at.
vector_store_options = {
    "db_name": "llamaindex_db",
    "collection_name": "test",
    "vector_index_name": "vector_index",
}
atlas_vector_store = MongoDBAtlasVectorSearch(mongo_client, **vector_store_options)

# Wrap the vector store in a storage context so an index can be built on it.
vector_store_context = StorageContext.from_defaults(
    vector_store=atlas_vector_store,
)

In [None]:
from llama_index.core import VectorStoreIndex

# Build the index from the loaded documents, routing storage through the
# MongoDB-backed storage context and showing a progress bar while it runs.
index_build_options = dict(
    storage_context=vector_store_context,
    show_progress=True,
)
vector_store_index = VectorStoreIndex.from_documents(
    sample_data, **index_build_options
)

In [None]:
from pymongo.operations import SearchIndexModel
import time

# Specify the collection for which to create the index
collection = mongo_client["llamaindex_db"]["test"]

# Describe the vector search index: one vector field plus one filter field.
# numDimensions is expected to match the size of the vectors stored under
# "embedding" -- NOTE(review): confirm against the embedding model in use.
search_index_model = SearchIndexModel(
  definition={
    "fields": [
      {
        "type": "vector",
        "path": "embedding",
        "numDimensions": 1024,
        "similarity": "cosine"
      },
      {
        "type": "filter",
        "path": "metadata.page_label"
      }
    ]
  },
  name="vector_index",
  type="vectorSearch",
)

# Only create the index when it does not exist yet, so that re-running this
# cell (for example after a kernel restart) does not fail on a duplicate name.
# An existing index with this name is reused as-is; drop it first if its
# definition needs to change.
existing_indexes = list(collection.list_search_indexes("vector_index"))
if existing_indexes:
  result = "vector_index"
else:
  result = collection.create_search_index(model=search_index_model)

# Wait for initial sync to complete, but give up after a bounded time instead
# of polling forever if the index never becomes queryable.
print("Polling to check if the index is ready. This may take up to a minute.")
poll_interval_seconds = 5
deadline = time.monotonic() + 300

while True:
  indices = list(collection.list_search_indexes(result))
  if indices and indices[0].get("queryable") is True:
    break
  if time.monotonic() >= deadline:
    raise TimeoutError(
      "Search index " + result + " did not become queryable within 300 seconds."
    )
  time.sleep(poll_interval_seconds)
print(result + " is ready for querying.")

## Semantic Search Query

In [None]:
# Fetch the three closest matches for the query and print each one.
search_query = "MongoDB acquisition"
retriever = vector_store_index.as_retriever(similarity_top_k=3)
nodes = retriever.retrieve(search_query)

for match in nodes:
    print(match)

## Semantic Search with Filtering

In [None]:
from llama_index.core.vector_stores import MetadataFilter, MetadataFilters, ExactMatchFilter, FilterOperator

# Restrict the search to entries whose page_label metadata equals "2".
page_filter = ExactMatchFilter(key="metadata.page_label", value="2")
metadata_filters = MetadataFilters(filters=[page_filter])

# Same query as the unfiltered search, now with the filter applied.
retriever = vector_store_index.as_retriever(
    filters=metadata_filters,
    similarity_top_k=3,
)
nodes = retriever.retrieve("MongoDB acquisition")

for match in nodes:
    print(match)

## Basic RAG

In [None]:
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
import pprint

# Retriever over the vector index, returning the top five matches.
vector_store_retriever = VectorIndexRetriever(
    index=vector_store_index,
    similarity_top_k=5,
)

# Query engine that answers questions using the retriever's results.
query_engine = RetrieverQueryEngine(retriever=vector_store_retriever)

# Ask the question and keep the full response object.
question = "What was MongoDB's latest acquisition?"
response = query_engine.query(question)

# Print the answer, then the source nodes attached to the response.
print(response)
print("\nSource documents: ")
pprint.pprint(response.source_nodes)

## RAG with Filters

In [None]:
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
import pprint

# Limit retrieval to entries whose page_label metadata equals "2".
page_filter = ExactMatchFilter(key="metadata.page_label", value="2")
metadata_filters = MetadataFilters(filters=[page_filter])

# Retriever over the vector index: top five matches, with the filter applied.
vector_store_retriever = VectorIndexRetriever(
    index=vector_store_index,
    filters=metadata_filters,
    similarity_top_k=5,
)

# Query engine that answers questions using the filtered retriever.
query_engine = RetrieverQueryEngine(retriever=vector_store_retriever)

# Ask the question and keep the full response object.
question = "What was MongoDB's latest acquisition?"
response = query_engine.query(question)

# Print the answer, then the source nodes attached to the response.
print(response)
print("\nSource documents: ")
pprint.pprint(response.source_nodes)