# Astrabolt - Load Products

This notebook uses **ragstack-ai** and **google-cloud-aiplatform** to connect to Astra DB, create the required collections, and insert the product catalog together with its vector embeddings, which are generated with Gemini multimodal embedding.

## Install Dependencies, Authenticate, and Create Collections

In [None]:
!pip install google-cloud-aiplatform ragstack-ai --upgrade

In [None]:
import getpass, os, requests

# Collect the settings used by the following cells. getpass hides what is typed so the
# values do not end up in the notebook output.
# A value that is already present in the environment is kept, so re-running this cell does
# not prompt again. Set FORCE_PROMPT = True to re-enter every value (e.g. after a typo).
FORCE_PROMPT = False

for _env_name, _prompt in (
    ("GCP_PROJECT_ID", "Provide your GCP Project ID"),
    ("ASTRA_DB_ENDPOINT", "Provide your Astra DB Endpoint"),
    ("ASTRA_DB_TOKEN", "Provide your Astra DB Token"),
):
  # An empty string is treated the same as a missing variable.
  if FORCE_PROMPT or not os.environ.get(_env_name):
    os.environ[_env_name] = getpass.getpass(_prompt)

In [None]:
from google.colab import auth
from google.cloud import aiplatform

# Point the gcloud CLI at the project id stored in GCP_PROJECT_ID; IPython expands the
# {...} expression before handing the line to the shell.
!gcloud config set project {os.getenv("GCP_PROJECT_ID")}

# Sign the notebook user in through the google.colab auth helper imported above.
auth.authenticate_user()


In [None]:
from astrapy.db import AstraDB
# Connect to the vector database with the token and endpoint read from the environment.
astra_db = AstraDB(
    token=os.getenv("ASTRA_DB_TOKEN"),
    api_endpoint=os.getenv("ASTRA_DB_ENDPOINT"),
)

# One collection per embedding kind (description text / product image); both are
# created for 1408-dimensional vectors, descriptions first and images second.
collection_descriptions, collection_images = (
    astra_db.create_collection(collection_name=name, dimension=1408)
    for name in ("product_catalog_descriptions", "product_catalog_images")
)

## Download Product Catalog

In [None]:
# URL of the products.json catalog (BestBuyAPIs open-data-set repository on GitHub)
# that the next cells download and load.
PRODUCT_CATALOG_URI="https://raw.githubusercontent.com/BestBuyAPIs/open-data-set/master/products.json"

In [None]:
!wget $PRODUCT_CATALOG_URI

In [None]:
import json
# Number of catalog entries to embed and load; feel free to modify the size of the
# dataset as needed.
PRODUCT_SAMPLE_SIZE = 1000

# Read the downloaded catalog. The encoding is explicit so the file is decoded the same
# way on every platform instead of depending on the locale default.
with open('products.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Keep only the first PRODUCT_SAMPLE_SIZE entries of the catalog.
products_slice=data[:PRODUCT_SAMPLE_SIZE]

## Create and Load Vector Embeddings
We will first create some helper functions for tasks such as downloading images and inserting records into Astra DB. Finally, we will load the data from `products_slice` in batches.

In [None]:
import requests
from PIL import Image
def download_image(image_url, timeout=30):
  """Download an image from a public URL into the local ``product_images`` folder.

  Args:
    image_url: URL of the image. The text after the last "/" becomes the local file name.
    timeout: Seconds to wait for the server before giving up, so a stalled
      connection cannot hang the notebook indefinitely.

  Returns:
    The relative path of the saved file ("product_images/<file name>"), or None when
    the URL is empty or the download fails for any reason.
  """
  folder_path = "product_images"

  # Nothing to download for products without an image URL.
  if not image_url:
    return None

  try:
    response = requests.get(image_url, timeout=timeout)
    response.raise_for_status()  # Raise an exception for error status codes
  except requests.exceptions.HTTPError as err:
    if err.response is not None and err.response.status_code == 404:
      print("File not found at the specified URL.")
    else:
      print("An error occurred:", err)
    return None
  except requests.exceptions.RequestException as err:
    # Connection failures, timeouts, malformed URLs, ...: report and let the caller
    # carry on without an image, exactly as for HTTP errors.
    print("An error occurred:", err)
    return None

  filename = image_url.rsplit("/", 1)[-1]
  # Create the folder if it doesn't exist (exist_ok avoids a check-then-create race)
  os.makedirs(folder_path, exist_ok=True)
  file_path = f"{folder_path}/{filename}"
  with open(file_path, "wb") as f:
    f.write(response.content)
  return file_path

In [None]:
def insert_document(collection,document, verbose=0):
  """Insert ``document`` into ``collection``, tolerating records that already exist.

  Args:
    collection: Object exposing ``insert_one(document)``.
    document: The record to insert.
    verbose: When greater than 0, a message is printed for skipped duplicates.

  Raises:
    Exception: The original insertion error is re-raised when its text is not a JSON
      error list (for example a connection failure), so the real cause stays visible.
  """
  try:
    # add to the AstraDB Vector Database
    collection.insert_one(document)
  except Exception as error:
    # The handling below expects str(error) to be a JSON list whose first entry
    # carries an 'errorCode'; anything else is not an API error we know how to skip.
    try:
      error_code = json.loads(str(error))[0]['errorCode']
    except (ValueError, LookupError, TypeError):
      error_code = None

    if error_code is None:
      # Re-raise the original failure instead of a misleading JSON parsing error.
      raise

    if error_code == "DOCUMENT_ALREADY_EXISTS":
      # if you've already added this record, skip the error message
      if verbose>0:
        print("Document already exists in the database.  Skipping.")
    else:
      # Keep the best-effort behaviour (do not abort the load) but make the failure visible.
      print("An error occurred while inserting the document:", error)

In [None]:
from langchain.chat_models import ChatVertexAI

# Chat model handle for "gemini-pro-vision", configured with the project id stored in GCP_PROJECT_ID.
# NOTE(review): "uswest-1" does not follow the usual GCP region spelling (e.g. "us-west1"),
# and LangChain's Vertex AI classes document a `location` argument rather than `region` —
# confirm that this setting actually takes effect.
llm = ChatVertexAI(project=os.getenv("GCP_PROJECT_ID"), model_name="gemini-pro-vision", region="uswest-1")


In [None]:
import tqdm, time
from vertexai.preview.vision_models import MultiModalEmbeddingModel, Image

model = MultiModalEmbeddingModel.from_pretrained("multimodalembedding@001")
multimodalembedding_requests_per_minute=120
batch_size=multimodalembedding_requests_per_minute
# Set to True to pause after each batch so that no more than
# `multimodalembedding_requests_per_minute` products are embedded per minute.
# Off by default, which matches the previous behaviour of this cell.
throttle_batches=False

# Process the product catalog in batches of `batch_size` products
for batch_start in tqdm.tqdm(range(0, len(products_slice), batch_size), desc="Processing product catalog"):
    batch_products = products_slice[batch_start:batch_start + batch_size]

    # Create embeddings for the batch of products
    batch_started_at = time.perf_counter()
    for product in batch_products:
      image_url = product.get('image')
      image_name = image_url.rsplit("/", 1)[-1] if image_url else None

      # A previously downloaded image marks the product as already loaded.
      if image_name and os.path.exists(f"product_images/{image_name}"):
        continue

      # Build the record on a copy: mutating `product` in place would prepend the name
      # to the description again every time the cell is re-run for the same product.
      document = dict(product)
      document["_id"] = product["sku"]
      document["description"] = f'{product["name"]}. {product["description"]}'
      categories = product.get("category") or []
      document["main_category"] = categories[0] if categories else None

      # Products without an image URL (or whose download fails) get a text-only embedding.
      image_path = download_image(image_url) if image_url else None
      if image_path is not None:
        img = Image.load_from_file(image_path)
        embeddings = model.get_embeddings(image=img, contextual_text=document["description"])
      else:
        embeddings = model.get_embeddings(contextual_text=document["description"])

      # The same record is stored twice: once with its text vector and, when an image
      # is available, once with its image vector.
      insert_document(collection_descriptions, {**document, "$vector": embeddings.text_embedding})
      if image_path is not None:
        insert_document(collection_images, {**document, "$vector": embeddings.image_embedding})

    if throttle_batches:
      elapsed = time.perf_counter() - batch_started_at
      if elapsed < 60:
        time.sleep(60 - elapsed) #to stay under GCP's API requests quota


## Experiment Querying the Database
We'll run a couple of queries, first using an image and then using text as input.

In [None]:
import json
from vertexai.preview.vision_models import MultiModalEmbeddingModel, Image
from langchain.schema.messages import HumanMessage

# Load the multimodal embedding model used to embed the query.
model = MultiModalEmbeddingModel.from_pretrained("multimodalembedding@001")

# Embed the picture of the item we want to find similar products for.
# NOTE(review): this path is relative to the working directory, while download_image
# saves files under product_images/ — confirm the file exists at this location.
img = Image.load_from_file('2877554_sa.jpg')
embeddings = model.get_embeddings(image=img, contextual_text="")

# Vector search against the images collection, limited to three results.
documents = collection_images.vector_find(embeddings.image_embedding, limit=3)

# Render the matches as comma-separated lines under a header row.
csv_lines = ["name, image, price, url\n"]
for doc in documents:
  csv_lines.append(f"{doc['name']}, {doc['image']}, {doc['price']}, {doc['url']},\n")
related_products_csv = "".join(csv_lines)
print(related_products_csv)

In [None]:
# Embed a free-text product description to use as the query.
embeddings = model.get_embeddings(contextual_text="AudioQuest - Niagara 1200 Low-Z Power Conditioner")

# Vector search against the descriptions collection, limited to three results.
documents = collection_descriptions.vector_find(embeddings.text_embedding, limit=3)

# Render the matches as comma-separated lines under a header row.
csv_lines = ["name, image, price, url\n"]
for doc in documents:
  csv_lines.append(f"{doc['name']}, {doc['image']}, {doc['price']}, {doc['url']},\n")
related_products_csv = "".join(csv_lines)
print(related_products_csv)