In [2]:
import csv

# Load sample data (a restaurant menu of items)
with open('menu_items.csv') as file:
    lines = csv.reader(file)

    # Store the name of the menu items in this array. In Chroma, a "document" is a string i.e. name, sentence, paragraph, etc.
    documents = []

    # Store the corresponding menu item IDs in this array.
    metadatas = []

    # Each "document" needs a unique ID. This is like the primary key of a relational database. We'll start at 1 and increment from there.
    ids = []
    id = 1

    # Loop thru each line and populate the 3 arrays.
    for i, line in enumerate(lines):
        if i==0:
            # Skip the first row (the column headers)
            continue

        documents.append(line[1])
        metadatas.append({"item_id": line[0]})
        ids.append(str(id))
        id+=1

In [3]:
# Install chromadb
%pip install chromadb

# Install sentence transformers
# This is used to convert text to vector embeddings. In other words, it converts text to a bunch of numbers that represent the 'meaning' of the text.
%pip install -U sentence-transformers

Defaulting to user installation because normal site-packages is not writeable
Collecting chromadb
  Downloading chromadb-0.5.0-py3-none-any.whl.metadata (7.3 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.1-py3-none-any.whl.metadata (4.3 kB)
Collecting requests>=2.28 (from chromadb)
  Downloading requests-2.31.0-py3-none-any.whl.metadata (4.6 kB)
Collecting pydantic>=1.9 (from chromadb)
  Downloading pydantic-2.7.1-py3-none-any.whl.metadata (107 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.3/107.3 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting chroma-hnswlib==0.7.3 (from chromadb)
  Downloading chroma_hnswlib-0.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.111.0-py3-none-any.whl.metadata (25 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.29.0-py3-none-a

In [4]:
# Install Gemini Pro
%pip install google-generativeai

Defaulting to user installation because normal site-packages is not writeable
Collecting google-generativeai
  Downloading google_generativeai-0.5.2-py3-none-any.whl.metadata (3.9 kB)
Collecting google-ai-generativelanguage==0.6.2 (from google-generativeai)
  Downloading google_ai_generativelanguage-0.6.2-py3-none-any.whl.metadata (5.6 kB)
Collecting google-api-core (from google-generativeai)
  Downloading google_api_core-2.19.0-py3-none-any.whl.metadata (2.7 kB)
Collecting google-api-python-client (from google-generativeai)
  Downloading google_api_python_client-2.127.0-py2.py3-none-any.whl.metadata (6.7 kB)
Collecting proto-plus<2.0.0dev,>=1.22.3 (from google-ai-generativelanguage==0.6.2->google-generativeai)
  Downloading proto_plus-1.23.0-py3-none-any.whl.metadata (2.2 kB)
Collecting google-auth-httplib2<1.0.0,>=0.2.0 (from google-api-python-client->google-generativeai)
  Downloading google_auth_httplib2-0.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting uritemplate<5,>=3.0.1 

In [5]:
# Reference: https://docs.trychroma.com/getting-started

import chromadb
from chromadb.utils import embedding_functions

# Instantiate chromadb instance. Data is stored in memory only.
# chroma_client = chromadb.Client()

# Instantiate chromadb instance. Data is stored on disk (a folder named 'my_vectordb' will be created in the same folder as this file).
chroma_client = chromadb.PersistentClient(path="my_vectordb")

In [6]:
# Get your API Key at https://console.cloud.google.com/ or https://aistudio.google.com/app/apikey
google_ef  = embedding_functions.GoogleGenerativeAiEmbeddingFunction(api_key="AIzaSyACW3E53wRCmYB8T0Ege-od9BomT0-TGbY")

# Use this to delete the database
# chroma_client.delete_collection(name="my_collection")

# Create the collection, aka vector database. Or, if database already exist, then use it. Specify the model that we want to use to do the embedding.
collection = chroma_client.get_or_create_collection(name="my_collection", embedding_function=google_ef)

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Add all the data to the vector database. ChromaDB automatically converts and stores the text as vector embeddings. This may take a few minutes.
collection.update(
    documents=documents,
    metadatas=metadatas,
    ids=ids
)

Update of nonexisting embedding ID: 1
Update of nonexisting embedding ID: 2
Update of nonexisting embedding ID: 3
Update of nonexisting embedding ID: 4
Update of nonexisting embedding ID: 5
Update of nonexisting embedding ID: 6
Update of nonexisting embedding ID: 7
Update of nonexisting embedding ID: 8
Update of nonexisting embedding ID: 9
Update of nonexisting embedding ID: 10
Update of nonexisting embedding ID: 11
Update of nonexisting embedding ID: 12
Update of nonexisting embedding ID: 13
Update of nonexisting embedding ID: 14
Update of nonexisting embedding ID: 15
Update of nonexisting embedding ID: 16
Update of nonexisting embedding ID: 17
Update of nonexisting embedding ID: 18
Update of nonexisting embedding ID: 19
Update of nonexisting embedding ID: 20
Update of nonexisting embedding ID: 21
Update of nonexisting embedding ID: 22
Update of nonexisting embedding ID: 23
Update of nonexisting embedding ID: 24
Update of nonexisting embedding ID: 25
Update of nonexisting embedding ID

In [11]:
# Query the vector database

# Query mispelled word: 'vermiceli'. Expect to find the correctly spelled 'vermicelli' item
results = collection.query(
    query_texts=["vermiceli"],
    n_results=5,
    include=['documents', 'distances', 'metadatas']
)
print(results['documents'])

# Query word variation: 'donut'. Expect to find the 'doughnut' item
results = collection.query(
    query_texts=["donut"],
    n_results=5,
    include=['documents', 'distances', 'metadatas']
)
print(results['documents'])

# Query similar meaning: 'shrimp'. Expect to find the 'prawn' items
results = collection.query(
    query_texts=["shrimp"],
    n_results=5,
    include=['documents', 'distances', 'metadatas']
)
print(results['documents'])


Number of requested results 5 is greater than number of elements in index 0, updating n_results = 0


[[]]


Number of requested results 5 is greater than number of elements in index 0, updating n_results = 0


[[]]


Number of requested results 5 is greater than number of elements in index 0, updating n_results = 0


[[]]
