In [1]:
import chromadb

## Default Chroma client (in-memory unless configured otherwise — confirm
## against the Chroma docs if persistence is needed).
client = chromadb.Client()

## A collection is where Chroma keeps documents together with their ids,
## embeddings and metadata (roughly like a table inside a database).
## get_or_create_collection is used instead of create_collection so that
## re-running this cell does not fail when "my_collection" already exists.
collection = client.get_or_create_collection(name="my_collection")

In [2]:
## Insert records into the collection.
## Chroma stores the raw text and takes care of embedding and indexing it
## automatically; every document must also be given its own string id
## (ids and documents are paired up by position).
collection.add(
    ids=["id1", "id2"],
    documents=[
        "This document is about New York",
        "This document is about Delhi",
    ],
)

In [3]:
# Fetch every record in the collection: get() is called with no ids or filters.
# The snapshot is kept in `all_docs`; in the output below it carries the ids,
# documents and metadatas, while embeddings are not included (None).
all_docs = collection.get()
all_docs

{'ids': ['id1', 'id2'],
 'embeddings': None,
 'documents': ['This document is about New York',
  'This document is about Delhi'],
 'uris': None,
 'data': None,
 'metadatas': [None, None],
 'included': [<IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [4]:
## Look up a single record by its id.
## The result has the same dictionary layout as a full get(), just with
## one entry in each list.
document = collection.get(ids=["id1"])
document

{'ids': ['id1'],
 'embeddings': None,
 'documents': ['This document is about New York'],
 'uris': None,
 'data': None,
 'metadatas': [None],
 'included': [<IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [7]:
## Semantic search over the collection.
## Chroma embeds each query text and searches the vector database for the
## stored documents that are most similar to it, returning the top n.
results = collection.query(
    n_results=2,  # how many matches to return
    query_texts=["Query is about India"],
)

In [8]:
results
## A smaller distance means the document is more similar to the query.
## In the recorded output the Delhi document (distance ~0.78) is listed
## before the New York document (distance ~1.60) for the India query.

{'ids': [['id2', 'id1']],
 'embeddings': None,
 'documents': [['This document is about Delhi',
   'This document is about New York']],
 'uris': None,
 'data': None,
 'metadatas': [[None, None]],
 'distances': [[0.7835131287574768, 1.596900224685669]],
 'included': [<IncludeEnum.distances: 'distances'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [9]:
## Delete every document currently stored in the collection.
## The ids are fetched fresh here rather than taken from the `all_docs`
## snapshot of an earlier cell, so documents added (or cells re-run) in
## between are still removed.
existing_ids = collection.get()["ids"]

## Skip the call when the collection is already empty, which keeps this cell
## safe to re-run. NOTE(review): delete() with an empty id list appears to be
## rejected by some Chroma versions — confirm against the installed version.
if existing_ids:
    collection.delete(ids=existing_ids)

## Show the collection contents to confirm that nothing is left.
collection.get()

{'ids': [],
 'embeddings': None,
 'documents': [],
 'uris': None,
 'data': None,
 'metadatas': [],
 'included': [<IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [10]:
# Records can also carry metadata: additional information about each document.
# Here every document gets a dictionary holding a reference URL; the
# metadatas list lines up with ids and documents by position.
collection.add(
    ids=["id1", "id2"],
    documents=[
        "This document is about New York",
        "This document is about Delhi",
    ],
    metadatas=[
        {"url": "https://en.wikipedia.org/wiki/New_York_City"},
        {"url": "https://en.wikipedia.org/wiki/New_Delhi"},
    ],
)

In [11]:
## Run another semantic search, this time against the documents that were
## re-added with metadata. Chroma embeds the query text and returns the
## n most similar stored documents.
results = collection.query(
    n_results=2,  # how many matches to return
    query_texts=["Query is about Chhole Bahture"],
)

In [12]:
## Inspect the matches; in the recorded output each hit now carries the
## metadata (url) that was stored alongside its document.
results

{'ids': [['id2', 'id1']],
 'embeddings': None,
 'documents': [['This document is about Delhi',
   'This document is about New York']],
 'uris': None,
 'data': None,
 'metadatas': [[{'url': 'https://en.wikipedia.org/wiki/New_Delhi'},
   {'url': 'https://en.wikipedia.org/wiki/New_York_City'}]],
 'distances': [[1.5424985885620117, 1.8458157777786255]],
 'included': [<IncludeEnum.distances: 'distances'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}