# MongoDB Vector Search - Vector Quantization - Existing Data

This notebook is a companion to the [Vector Quantization](https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-quantization/#how-to-ingest-pre-quantized-vectors) tutorial. Refer to the page for set-up instructions and detailed explanations.

This notebook takes you through how to pre-quantize and ingest your vectors for vector search from **existing data in Atlas** by using the `voyage-3-large` model from [Voyage AI](https://www.voyageai.com).

<a target="_blank" href="https://colab.research.google.com/github/mongodb/docs-notebooks/blob/main/quantization/existing-data.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [None]:
pip install --quiet --upgrade voyageai pymongo

In [None]:
import os
import voyageai
from bson.binary import Binary, BinaryVectorDtype

# Publish the API key through the process environment, then build the
# Voyage AI client (constructed without arguments, so it is presumably
# configured from VOYAGE_API_KEY).
os.environ.update({"VOYAGE_API_KEY": "<VOYAGEAI-API-KEY>"})
vo = voyageai.Client()

# Define a function to generate embeddings for all strings in `texts`
def generate_embeddings(texts, model: str, dtype: str, output_dimension: int, input_type=None):
    """Generate one embedding per string in `texts` with the Voyage AI client.

    Args:
        texts: A list of strings to embed. A single bare string is also
            accepted and treated as a one-item list.
        model: Name of the Voyage AI embedding model.
        dtype: Output data type to request (for example "float", "int8",
            or "ubinary").
        output_dimension: Number of dimensions to request for each embedding.
        input_type: Optional input type forwarded to the embedding call
            (for example "document" or "query"). Defaults to None.

    Returns:
        A list with one embedding per input string, in input order.
    """
    # A bare string would otherwise be iterated character by character,
    # issuing one embedding request per character.
    if isinstance(texts, str):
        texts = [texts]

    embeddings = []
    for text in texts:  # Process each string in the data list
        response = vo.embed(
            texts=[text],  # Pass each string as a list with a single item
            model=model,
            input_type=input_type,
            output_dtype=dtype,
            output_dimension=output_dimension,
        )
        embeddings.append(response.embeddings[0])  # Collect the embedding for the current text
    return embeddings

# Convert embeddings to BSON vectors
def generate_bson_vector(vector, vector_dtype):
    """Wrap an embedding in a BSON binary vector.

    Args:
        vector: The embedding values to encode.
        vector_dtype: The BinaryVectorDtype member to encode the values as.

    Returns:
        The result of Binary.from_vector for the given vector and dtype.
    """
    bson_vector = Binary.from_vector(vector, vector_dtype)
    return bson_vector

In [None]:
import pymongo  

# Connect to your MongoDB cluster
mongo_client = pymongo.MongoClient("<CONNECTION-STRING>")
db = mongo_client["sample_airbnb"]
collection = db["listingsAndReviews"]

# Filter to exclude null or empty summary fields
filter = { "summary": {"$nin": [None, ""]} }

# Get a subset of documents in the collection.
# Materialize the cursor into a list: a cursor can only be iterated once,
# so without this a re-run of the update cell would find no documents.
documents = list(collection.find(filter).limit(50))

# Initialize the count of updated documents
updated_doc_count = 0

In [None]:
model_name = "voyage-3-large"
output_dimension = 1024
float32_field = "float32_embedding"
int8_field = "int8_embedding"
int1_field = "int1_embedding"

# For each target field: the Voyage AI output dtype to request and the
# BSON vector dtype used to store the result
embedding_specs = {
    float32_field: ("float", BinaryVectorDtype.FLOAT32),
    int8_field: ("int8", BinaryVectorDtype.INT8),
    int1_field: ("ubinary", BinaryVectorDtype.PACKED_BIT),
}

# Embed the summary of every document and write the vectors back
updated_doc_count = 0
for document in documents:
    summary = document.get("summary")
    if summary:
        # Request all embeddings for this summary first (one per dtype)...
        raw_embeddings = {
            field: generate_embeddings(
                [summary],
                model=model_name,
                dtype=voyage_dtype,
                output_dimension=output_dimension,
            )
            for field, (voyage_dtype, _) in embedding_specs.items()
        }

        # ...then convert each one to its BSON vector representation
        updated_fields = {
            field: generate_bson_vector(raw_embeddings[field][0], bson_dtype)
            for field, (_, bson_dtype) in embedding_specs.items()
        }

        # Store the new fields on the document in MongoDB
        result = collection.update_one(
            {"_id": document["_id"]},
            {"$set": updated_fields},
        )
        if result.modified_count > 0:
            updated_doc_count += 1

# Report how many documents were modified
print(f"Number of documents updated: {updated_doc_count}")

In [None]:
from pymongo.operations import SearchIndexModel
import time

# Define and create the vector search index
index_name = "vector_index"

# All three embedding fields were generated with the same `output_dimension`,
# so the index reuses that value instead of repeating a literal.
search_index_model = SearchIndexModel(
  definition={
    "fields": [
      {
        "type": "vector",
        "path": float32_field,
        "similarity": "dotProduct",
        "numDimensions": output_dimension
      },
      {
        "type": "vector",
        "path": int8_field,
        "similarity": "dotProduct",
        "numDimensions": output_dimension
      },
      {
        # The binary (int1) field is indexed with euclidean similarity
        "type": "vector",
        "path": int1_field,
        "similarity": "euclidean",
        "numDimensions": output_dimension
      }
    ]
  },
  name=index_name,
  type="vectorSearch"
)

# Only create the index when no index with this name exists yet, so the
# cell can be re-run without failing on a duplicate index name.
if list(collection.list_search_indexes(index_name)):
  result = index_name
  print("Search index named " + result + " already exists.")
else:
  result = collection.create_search_index(model=search_index_model)
  print("New search index named " + result + " is building.")

# Wait for initial sync to complete
print("Polling to check if the index is ready. This may take up to a minute.")
while True:
  indices = list(collection.list_search_indexes(index_name))
  # The index can be queried once it reports queryable == True
  if indices and indices[0].get("queryable") is True:
    break
  time.sleep(5)
print(result + " is ready for querying.")

In [None]:
import voyageai
from bson.binary import Binary, BinaryVectorDtype

# Define a function to run a vector search query
def run_vector_search(query_text, collection, path, num_candidates=20, limit=5):
    """Run a vector search query against one of the embedding fields.

    The query text is embedded with the same model and output dimension
    used for the stored embeddings (`model_name`, `output_dimension`), using
    the output dtype that matches the field being searched.

    Args:
        query_text: The text to search for.
        collection: The collection to run the aggregation on.
        path: The embedding field to search; one of `float32_field`,
            `int8_field`, or `int1_field`.
        num_candidates: Value for the `numCandidates` option of the
            `$vectorSearch` stage. Defaults to 20.
        limit: Value for the `limit` option of the `$vectorSearch` stage.
            Defaults to 5.

    Returns:
        The result of `collection.aggregate` for the search pipeline; each
        projected document holds `summary` and `score`.

    Raises:
        ValueError: If `path` is not one of the known embedding fields.
    """
    # Map path to output dtype and BSON vector type
    path_to_dtype = {
        float32_field: ("float", BinaryVectorDtype.FLOAT32),
        int8_field: ("int8", BinaryVectorDtype.INT8),
        int1_field: ("ubinary", BinaryVectorDtype.PACKED_BIT),
    }

    if path not in path_to_dtype:
        raise ValueError("Invalid path. Must be one of float32_field, int8_field, int1_field.")

    # Get Voyage AI output dtype and BSON vector type based on the path
    output_dtype, bson_dtype = path_to_dtype[path]

    # Generate query embeddings using Voyage AI. The model and dimension come
    # from the shared configuration so the query vector always matches the
    # vectors that were stored and indexed.
    query_vector = vo.embed(
        texts=[query_text],
        model=model_name,
        input_type="query",
        output_dtype=output_dtype,
        output_dimension=output_dimension,
    ).embeddings[0]

    # Convert the query vector to BSON format
    bson_query_vector = Binary.from_vector(query_vector, bson_dtype)

    # Define the aggregation pipeline for vector search
    pipeline = [
        {
            "$vectorSearch": {
                "index": index_name,  # Replace with your index name
                "path": path,         # Path to the embedding field
                "queryVector": bson_query_vector,  # BSON-encoded query vector
                "numCandidates": num_candidates,
                "limit": limit
            }
        },
        {
            "$project": {
                "_id": 0,
                "summary": 1,
                "score": { "$meta": "vectorSearchScore" }  # Include the similarity score
            }
        }
    ]

    # Run the aggregation pipeline and return results
    return collection.aggregate(pipeline)

In [None]:
from pprint import pprint

# Embedding fields to query, one search per field
embedding_fields = [float32_field, int8_field, int1_field]
query_text = "ocean view"

# Run the same query against every embedding type and collect the hits by field
results = {
    field: list(run_vector_search(query_text, collection, field))
    for field in embedding_fields
}

# Show the hits for each field
for field, field_results in results.items():
    print(f"Results from {field}")
    pprint(field_results)