In [1]:
import chromadb

In [56]:
client = chromadb.PersistentClient(path="./db/") # automatically create db folder

In [3]:
client.heartbeat() # returns a nanosecond heartbeat. Useful for making sure the client remains connected.
#client.reset() # Empties and completely resets the database. ⚠️ This is destructive and not reversible.

1710769664747044460

In [None]:
!chroma run --path ./db/
chroma_client = chromadb.HttpClient(host='localhost', port=8000)

In [57]:
#collection = client.create_collection(name="my_collection") # if you want to use a different embedding as the default (SentenceTransformer) you need to espeficy
collection = client.get_collection(name="my_collection") # if you create a collection using a different embedding, you need to especify in order the get the collection

In [58]:
collection.add(
    documents=["doc1", "doc2", "doc3"],
    embeddings=[[1.1, 2.3, 3.2], [4.5, 6.9, 4.4], [1.1, 2.3, 3.2]],
    metadatas=[{"chapter": "3", "verse": "16"}, {"chapter": "3", "verse": "5"}, {"chapter": "29", "verse": "11"}],
    ids=["id1", "id2", "id3"]
)

Add of existing embedding ID: id1
Add of existing embedding ID: id2
Add of existing embedding ID: id3
Insert of existing embedding ID: id1
Insert of existing embedding ID: id2
Insert of existing embedding ID: id3


### query a collection

In [18]:
collection.query(
    query_embeddings=[[11.1, 12.1, 13.1],[1.1, 2.3, 3.2]],
    n_results=10,
    where={"chapter":"3"}, # filter by metadata
    where_document={"$contains":"doc"} # filter by document content ($contains)
)

Number of requested results 10 is greater than number of elements in index 3, updating n_results = 3


{'ids': [['id2', 'id1'], ['id1', 'id2']],
 'distances': [[146.28999999999994, 294.05], [0.0, 34.16]],
 'metadatas': [[{'chapter': '3', 'verse': '5'},
   {'chapter': '3', 'verse': '16'}],
  [{'chapter': '3', 'verse': '16'}, {'chapter': '3', 'verse': '5'}]],
 'embeddings': None,
 'documents': [['doc2', 'doc1'], ['doc1', 'doc2']],
 'uris': None,
 'data': None}

In [28]:
collection.get(
    ids=["id1","id2","id3"], # ids
    where={"chapter": {"$eq":""}} # metadata
)

{'ids': [],
 'embeddings': None,
 'metadatas': [],
 'documents': [],
 'uris': None,
 'data': None}

In [37]:
# Only get documents and ids
collection.get(
    include = ["documents"]
)

collection.query(
    query_embeddings= [[11.1, 12.1, 13.1], [1.1, 2.3, 3.2]],
    include=["documents"]
)

Number of requested results 10 is greater than number of elements in index 3, updating n_results = 3


{'ids': [['id2', 'id1', 'id3'], ['id1', 'id3', 'id2']],
 'distances': None,
 'metadatas': None,
 'embeddings': None,
 'documents': [['doc2', 'doc1', 'doc3'], ['doc1', 'doc3', 'doc2']],
 'uris': None,
 'data': None}

In [39]:
collection.query(
    query_embeddings=[[1.1, 2.2, 3.3]], # it's always required when we are using the QUERY function
    where={
        "$and": [{ # for use $and/$or is necesary a list of two or more elements
            "chapter": "3"
        },
        {
            "version": "1"
        }
        ]
    }
)

Number of requested results 10 is greater than number of elements in index 3, updating n_results = 3


{'ids': [[]],
 'distances': [[]],
 'metadatas': [[]],
 'embeddings': None,
 'documents': [[]],
 'uris': None,
 'data': None}

In [40]:
from chromadb.utils import embedding_functions

sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")


  from .autonotebook import tqdm as notebook_tqdm


In [41]:
client = chromadb.Client()

In [52]:
collection = client.get_or_create_collection("test-where-list", embedding_function=sentence_transformer_ef)
collection.add(
    documents=["Article by john", "Article by jack", "Article by Jill"],
    metadatas=[{"author": "john"}, {"author": "jack"}, {"author": "jill"}],
    ids=["1", "2", "3"])

Add of existing embedding ID: 1
Add of existing embedding ID: 2
Add of existing embedding ID: 3
Insert of existing embedding ID: 1
Insert of existing embedding ID: 2
Insert of existing embedding ID: 3


In [44]:
query = ["Give me articles by john"]
res = collection.query(query_texts=query, where={"author": {"$in": ["john", "jill"]}}, n_results=10)
print(res)

Number of requested results 10 is greater than number of elements in index 3, updating n_results = 3


{'ids': [['1', '3']], 'distances': [[0.2882419228553772, 1.0175081491470337]], 'metadatas': [[{'author': 'john'}, {'author': 'jill'}]], 'embeddings': None, 'documents': [['Article by john', 'Article by Jill']], 'uris': None, 'data': None}


In [46]:
res_get = collection.get(where={"author": {'$in': ["john", "jill"]}})
print(res_get)

{'ids': ['1', '3'], 'embeddings': None, 'metadatas': [{'author': 'john'}, {'author': 'jill'}], 'documents': ['Article by john', 'Article by Jill'], 'uris': None, 'data': None}


### Interaction with existing Where operators

In [47]:
collection.upsert( # update items using the ids and create new items if id not exist
    documents=["Article by john", "Article by Jack", "Article by Jill"],
    metadatas=[{"author": "john","article_type":"blog"}, {"author": "jack","article_type":"social"}, {"author": "jill","article_type":"paper"}],
    ids=["1","2","3"])


In [49]:
collection.query(
    query_texts=query,
    where={"$and": [{"author" : {"$in" : ["john", "jill"]}}, {"article_type": {"$in" : ["paper"]}}]},
    n_results=3
    )

{'ids': [['3']],
 'distances': [[1.0175081491470337]],
 'metadatas': [[{'article_type': 'paper', 'author': 'jill'}]],
 'embeddings': None,
 'documents': [['Article by Jill']],
 'uris': None,
 'data': None}

In [50]:
collection.query(
    query_texts=query,
    where={"$and":[{"author": {'$in': ['john', 'jill']}},{"article_type":{"$eq":"blog"}}]}, 
    n_results=3)

{'ids': [['1']],
 'distances': [[0.2882419228553772]],
 'metadatas': [[{'article_type': 'blog', 'author': 'john'}]],
 'embeddings': None,
 'documents': [['Article by john']],
 'uris': None,
 'data': None}

In [59]:
collection.update(
    ids=["doc1", "doc2", "doc3"],
    embeddings=[[1.1, 2.2, 3.3],[1.1, 2.2, 3.3],[1.1, 2.2, 3.3]],
    metadatas=[{"chapter": "3", "verse": "16"}, {"chapter": "3", "verse": "5"}, {"chapter": "29", "verse": "11"}],
    documents=["doc1", "doc2", "doc3"]
)

Update of nonexisting embedding ID: doc1
Update of nonexisting embedding ID: doc2
Update of nonexisting embedding ID: doc3
Update of nonexisting embedding ID: doc1
Update of nonexisting embedding ID: doc2
Update of nonexisting embedding ID: doc3


In [61]:
collection.upsert(
    ids=["doc3","doc4"],
    embeddings=[[1.1, 2.2, 3.3], [4.4, 5.5, 6.6]],
    metadatas=[{"chapter": "10", "verse": "5"}, {"chapter": "1", "verse": "4"}],
    documents=["doc 1", "doc4"]
)

In [62]:
collection.delete(
    ids=["id1", "id2", "id3"],
    where={"chapter": "20"}
)

In [89]:
from langchain.vectorstores import chroma
from langchain.embeddings import SentenceTransformerEmbeddings

In [91]:
persistent_client = chromadb.PersistentClient("./db/")
vectorstore = chroma.Chroma(
    client=persistent_client,
    collection_name="b3b83639-1b29-4b00-8c20-fcc73bb182a3",
    embedding_function=SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2"),
    persist_directory="./db/",
)
vectorstore

<langchain_community.vectorstores.chroma.Chroma at 0x778317cd6680>

In [78]:
retriever = vectorstore.as_retriever()

In [96]:
#vectorstore.__query_collection(query_texts=["all the documents"])
vectorstore.get_collections()

AttributeError: 'Chroma' object has no attribute 'get_collections'