# An intro to vector databases with ChromaDB
In this notebook, we'll learn how to use the ChromaDB vector database

In [2]:
import chromadb
import uuid
import pprint
import chromadb.utils.embedding_functions as ef
import csv

In [22]:
client = chromadb.PersistentClient(path="vector.db")
client.reset()

True

In [15]:
collection = client.create_collection(
    name="Wimbledon",
    embedding_function=ef.DefaultEmbeddingFunction() # Default if no function provided
)

## Storing Documents

In [10]:
with open("sentences.csv", "r") as sentences_file:
    reader = csv.reader(sentences_file, delimiter=",")
    documents = [row[0] for row in reader]

documents

['The Championships, commonly known simply as Wimbledon,[c] is the oldest tennis tournament in the world and is regarded by many as the most prestigious.',
 'It has been held at the All England Lawn Tennis and Croquet Club in Wimbledon, London, since 1877 and is played on outdoor grass courts',
 'Due to the COVID-19 pandemic, 2020 Wimbledon was cancelled, the first cancellation of the tournament since World War II.',
 'The 2023 Wimbledon Championships was the 136th staging and ran from 3 July 2023 to 16 July 2023.',
 "Roger Federer has won the men's singles title the most times, with 8 wins.",
 "Martina Navratilova has won the women's singles title the most times, with 9 wins",
 "Each year the tournament begins on the last Monday in June or first Monday in July, two weeks after the Queen's Club Championships.",
 'Since 2009 all matches have to finish before 11:00pm. Wimbledon is the only Grand Slam that retains a night-time curfew.',
 'Due to the possibility of rain during Wimbledon, a

In [16]:
collection.add(
    documents=documents,
    metadatas=[{"source": "Wikipedia"} for _ in documents],
    ids=[f"id-{uuid.uuid4()}" for _ in documents]
)

So what actually happens when we add those documents?

In [33]:
embedding_fn = ef.DefaultEmbeddingFunction()

In [18]:
embeddings = embedding_fn(documents)
pprint.pprint(embeddings, width=150, compact=True)

[[0.002460714429616928, -0.04395738244056702, -0.027920441702008247, -0.07969953119754791, -0.06942763179540634, 0.027065809816122055,
  -0.021303363144397736, 0.03268865868449211, 0.02657260186970234, 0.009070895612239838, -0.10297709703445435, -6.979482714086771e-05,
  0.015810325741767883, 0.028591390699148178, -0.11295675486326218, -0.012612123042345047, 0.058572810143232346, -0.08381988853216171,
  -0.018081383779644966, 0.0067652082070708275, -0.013807314448058605, 0.0030130241066217422, 0.0593797042965889, -0.0071263802237808704,
  0.029255781322717667, 0.051927946507930756, -0.050490181893110275, 0.06612182408571243, -0.03607283905148506, 0.006332795601338148,
  -0.07900933176279068, -0.053228285163640976, 0.07309940457344055, 0.05366131663322449, -0.09177856892347336, 0.002326398389413953,
  -0.05120716989040375, -0.04248801991343498, 0.03273676708340645, 0.029550859704613686, -0.005472926422953606, -0.07551101595163345,
  0.010976092889904976, 0.060927651822566986, -0.0355716

## Querying documents

In [19]:
results = collection.query(
    query_texts=["Clothing"],
    n_results=3
)
pprint.pprint(results)

{'distances': [[1.4334193135753135, 1.81285520637129, 2.019870306995956]],
 'documents': [['All tennis players participating in the tournament are '
                'required to wear all-white or at least almost all-white '
                'clothing, a long-time tradition at Wimbledon.',
                'Due to the possibility of rain during Wimbledon, a '
                'retractable roof was installed prior to the 2009 '
                'Championship.',
                'It has been held at the All England Lawn Tennis and Croquet '
                'Club in Wimbledon, London, since 1877 and is played on '
                'outdoor grass courts']],
 'embeddings': None,
 'ids': [['id-735125be-2f9e-4624-bf30-600bc1aff5e4',
          'id-e1ae437a-ff27-4b21-a762-753fd9ecca30',
          'id-eb980f67-c680-46e3-b97b-f653472c30ff']],
 'metadatas': [[{'source': 'Wikipedia'},
                {'source': 'Wikipedia'},
                {'source': 'Wikipedia'}]]}


In [40]:
query_embeddings = embedding_fn(["2023 dates"])
results = collection.query(
    query_embeddings=query_embeddings,
    n_results=3
)
pprint.pprint(results)

{'distances': [[0.9667476872893191, 0.9667476872893191, 1.3860115586052497]],
 'documents': [['The 2023 Wimbledon Championships was the 136th staging and '
                'ran from 3 July 2023 to 16 July 2023.',
                'The 2023 Wimbledon Championships was the 136th staging and '
                'ran from 3 July 2023 to 16 July 2023.',
                'Due to the COVID-19 pandemic, 2020 Wimbledon was cancelled, '
                'the first cancellation of the tournament since World War '
                'II.']],
 'embeddings': None,
 'ids': [['id-53f1caac-4109-4196-8fd0-d22826cb5dbe',
          'id-6cce4e85-7b1b-4a1a-98f7-30c0e3eb5e0e',
          'id-a60a2309-8264-4dc4-9404-4c3fc13f1a3f']],
 'metadatas': [[{'source': 'Wikipedia'},
                {'source': 'Wikipedia'},
                {'source': 'Wikipedia'}]]}


## Using a different embedding algorithm

In [86]:
import inspect

[
    cls_name
    for cls_name in dir(ef) 
    if inspect.isclass(getattr(ef,cls_name)) and ef.EmbeddingFunction in getattr(ef,cls_name).__bases__
]

['CohereEmbeddingFunction',
 'GooglePalmEmbeddingFunction',
 'GoogleVertexEmbeddingFunction',
 'HuggingFaceEmbeddingFunction',
 'InstructorEmbeddingFunction',
 'ONNXMiniLM_L6_V2',
 'OpenAIEmbeddingFunction',
 'SentenceTransformerEmbeddingFunction',
 'Text2VecEmbeddingFunction']

In [23]:
collection2 = client.create_collection(
    name="Wimbledon-DistiRoberta",
    embedding_function=ef.SentenceTransformerEmbeddingFunction(model_name="sentence-transformers/all-distilroberta-v1")
)

In [24]:
collection2.add(
    documents=documents,
    metadatas=[{"source": "Wikipedia"} for _ in documents],
    ids=[f"id-{uuid.uuid4()}" for _ in documents]
)

In [37]:
results = collection2.query(
    query_texts=["Clothing"],
    n_results=3,
    include=["documents", "distances"]
)
pprint.pprint(results)

{'distances': [[1.5321013945588962, 1.8215139849772641, 1.8973226623646495]],
 'documents': [['All tennis players participating in the tournament are '
                'required to wear all-white or at least almost all-white '
                'clothing, a long-time tradition at Wimbledon.',
                'The Championships, commonly known simply as Wimbledon,[c] is '
                'the oldest tennis tournament in the world and is regarded by '
                'many as the most prestigious.',
                'Due to the possibility of rain during Wimbledon, a '
                'retractable roof was installed prior to the 2009 '
                'Championship.']],
 'embeddings': None,
 'ids': [['id-f2727219-31de-4fb9-815b-4c781130c9e8',
          'id-ab2feda3-136c-4b30-827d-88ec84eeecf4',
          'id-30c3b633-6f25-4e0a-98b0-e36e2ffcb8cb']],
 'metadatas': None}


In [38]:
query_embeddings = ef.DefaultEmbeddingFunction()(["2023 dates"])
results = collection2.query(
    query_embeddings=query_embeddings,
    n_results=3
)
pprint.pprint(results)

InvalidDimensionException: Embedding dimension 384 does not match collection dimensionality 768