## Chroma DB Introduction

Import necessary library and model

In [1]:
import chromadb
from chromadb.utils import embedding_functions

CHROMA_DATA_PATH = "chroma/data"
EMBED_MODEL = "all-MiniLM-L6-v2"
COLLECTION_NAME = "demo_docs"

client = chromadb.PersistentClient(path=CHROMA_DATA_PATH)

Create new collection to chromadb

In [None]:
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=EMBED_MODEL)

collection = client.create_collection(
    name = 'COLLECTION_NAME',
    embedding_function = embedding_func,
    metadata = {"hnsw : space": "cosinse"},
)

In [None]:
documents = [
    "The latest iPhone model comes with impressive features and a powerful camera.",
    "Exploring the beautiful beaches and vibrant culture of Bali is a dream for many travelers.",
    "Einstein's theory of relativity revolutionized our understanding of space and time.",
    "Traditional Italian pizza is famous for its thin crust, fresh ingredients, and wood-fired ovens.",
    "The American Revolution had a profound impact on the birth of the United States as a nation.",
    "Regular exercise and a balanced diet are essential for maintaining good physical health.",
    "Leonardo da Vinci's Mona Lisa is considered one of the most iconic paintings in art history.",
    "Climate change poses a significant threat to the planet's ecosystems and biodiversity.",
    "Startup companies often face challenges in securing funding and scaling their operations.",
    "Beethoven's Symphony No. 9 is celebrated for its powerful choral finale, 'Ode to Joy.'",
]

In [None]:
genres = [
    "technology",
    "travel",
    "science",
    "food",
    "history",
    "fitness",
    "art",
    "climate change",
    "business",
    "music",
]

Add some documents to collection

In [None]:
collection.add(
    documents=documents,
    ids=[f"id{i}" for i in range(len(documents))],
    metadatas=[{"genre": g} for g in genres]
)

### Querying semantic search to chroma db

In [None]:
query_results = collection.query(
    query_texts = ["Find me some delicious food!"],
    n_results = 1,
)

In [None]:
query_results.keys()

dict_keys(['ids', 'embeddings', 'documents', 'uris', 'data', 'metadatas', 'distances', 'included'])

In [None]:
query_results["documents"]

[['Traditional Italian pizza is famous for its thin crust, fresh ingredients, and wood-fired ovens.']]

In [None]:
query_results['distances']

[[1.5276529838086401]]

In [None]:
query_results['metadatas']

[[{'genre': 'food'}]]

In [None]:
next_query = collection.query(
    query_texts=["Teach me about history",
                 "What's going on in the world?"],
    include=["documents", "distances"],
    n_results=2
)

In [None]:
next_query['documents'][0]

["Einstein's theory of relativity revolutionized our understanding of space and time.",
 'The American Revolution had a profound impact on the birth of the United States as a nation.']

In [None]:
collection.query(
    query_texts=["Teach me about music history"],
    n_results=1
)

{'ids': [['id2']],
 'embeddings': None,
 'documents': [["Einstein's theory of relativity revolutionized our understanding of space and time."]],
 'uris': None,
 'data': None,
 'metadatas': [[{'genre': 'science'}]],
 'distances': [[1.5251639967225434]],
 'included': [<IncludeEnum.distances: 'distances'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [None]:
collection.query(
    query_texts=["Teach me about music history"],
    where={"genre": {"$eq": "music"}},
    n_results=1,
)

{'ids': [['id9']],
 'embeddings': None,
 'documents': [["Beethoven's Symphony No. 9 is celebrated for its powerful choral finale, 'Ode to Joy.'"]],
 'uris': None,
 'data': None,
 'metadatas': [[{'genre': 'music'}]],
 'distances': [[1.6372656028108206]],
 'included': [<IncludeEnum.distances: 'distances'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [None]:
query_results = collection.query(
    query_texts=["Teach me about music history"],
    where={"genre": {"$in": ["music", "history"]}},
    n_results=2,
)

query_results["documents"]

[["Beethoven's Symphony No. 9 is celebrated for its powerful choral finale, 'Ode to Joy.'",
  'The American Revolution had a profound impact on the birth of the United States as a nation.']]

In [None]:
collection.update(
    ids=["id1", "id2"],
    documents=["The new iPhone is awesome!",
               "Bali has beautiful beaches"],
    metadatas=[{"genre": "tech"}, {"genre": "beaches"}]
)

query_results = collection.get(ids=["id1", "id2"])

query_results["documents"]


query_results["metadatas"]

[{'genre': 'tech'}, {'genre': 'beaches'}]

In [None]:
collection.delete(ids=["id1", "id2"])

collection.count()


collection.get(["id1", "id2"])

{'ids': [],
 'embeddings': None,
 'documents': [],
 'uris': None,
 'data': None,
 'metadatas': [],
 'included': [<IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

## Create collection to ChromaDB

In [4]:
import chromadb
from chromadb.utils import embedding_functions
from car_data_etl import prepare_car_reviews_data
from chrome_utils import build_chroma_collection

DATA_PATH = "data/archive/*.csv"
CHROMA_PATH = "car_review_embeddings"
EMBEDDING_FUNC_NAME = "multi-qa-MiniLM-L6-cos-v1"
COLLECTION_NAME = "car_reviews"

chroma_car_reviews_dict = prepare_car_reviews_data(DATA_PATH)

build_chroma_collection(
    CHROMA_PATH,
    COLLECTION_NAME,
    EMBEDDING_FUNC_NAME,
    chroma_car_reviews_dict["ids"],
    chroma_car_reviews_dict["documents"],
    chroma_car_reviews_dict["metadatas"]
)

  from tqdm.autonotebook import tqdm, trange


In [7]:
client = chromadb.PersistentClient(CHROMA_PATH)
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=EMBEDDING_FUNC_NAME
    )
collection = client.get_collection(name=COLLECTION_NAME, embedding_function=embedding_func)

great_reviews = collection.query(
    query_texts=["Find me some positive reviews that discuss the car's performance"],
    n_results=5,
    include=["documents", "distances", "metadatas"]
)

great_reviews["documents"][0][0]

' Excellent car'