In [1]:
import chromadb

chroma_client = chromadb.Client()

In [2]:
collection = chroma_client.create_collection(name="my_collection_1")

In [3]:
collection.add(
    documents=[
        "This is a document about pineapple",
        "This is a document about oranges",
    ],
    ids=["id1", "id2"],
)

/home/ubuntu/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:00<00:00, 103MiB/s] 


In [4]:
results = collection.query(
    query_texts=[
        "This is a query document about hawaii"
    ],  # Chroma will embed this for you
    n_results=2,  # how many results to return
)
print(results)

{'ids': [['id1', 'id2']], 'distances': [[1.0404009819030762, 1.2430799007415771]], 'metadatas': [[None, None]], 'embeddings': None, 'documents': [['This is a document about pineapple', 'This is a document about oranges']], 'uris': None, 'data': None, 'included': ['metadatas', 'documents', 'distances']}


In [1]:
import chromadb

In [42]:
client = client = chromadb.HttpClient(host='localhost', port=8000)


In [4]:
collection = client.get_collection(
    name="test"
)  # Get a collection object from an existing collection, by name. Will raise an exception if it's not found.

InvalidCollectionException: Collection test does not exist.

In [5]:
collection = client.get_or_create_collection(
    name="test"
)  # Get a collection object from an existing collection, by name. Will raise an exception if it's not found.

In [6]:
collection.peek()  # returns a list of the first 10 items in the collection

{'ids': [],
 'embeddings': array([], dtype=float64),
 'metadatas': [],
 'documents': [],
 'uris': None,
 'data': None,
 'included': ['embeddings', 'metadatas', 'documents']}

In [None]:
# collection.modify(name="new_name") 

In [7]:
from src.my_rag.components.embeddings.huggingface_embedding import HuggingFaceEmbedding 

In [9]:
embedding_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    load_in_8bit=False,
    trust_remote_code=False,
    device_map="auto",
    max_memory={0: "18000MB", "cpu": "18000MB"},
)

In [11]:
embedding_model.embed(
    [
        "lorem ipsum...",
        "doc2",
        "doc3",
    ]
).cpu().numpy()

array([[-9.04761925e-02,  4.04396132e-02,  2.39056572e-02,
         5.89479990e-02, -2.28823405e-02, -4.72201072e-02,
         4.50475514e-02,  1.57863349e-02, -4.81995530e-02,
        -3.77941206e-02, -1.90776531e-02,  2.13088430e-02,
        -4.68305172e-03, -4.33081612e-02,  5.99147864e-02,
         5.91033697e-02, -2.80367490e-02, -5.92183471e-02,
        -1.24403104e-01, -3.56000178e-02, -6.08058181e-03,
         3.24291028e-02, -3.78007405e-02,  2.47109868e-02,
        -4.27243076e-02, -4.24539112e-02,  4.59356755e-02,
         9.86255482e-02, -4.99980114e-02, -3.52358632e-02,
         7.08397701e-02,  3.31632085e-02,  2.65883636e-02,
         1.73202774e-04,  3.88165796e-03,  3.04672439e-02,
        -7.82026127e-02, -1.20379560e-01,  1.80415157e-02,
         2.28290595e-02, -1.77502877e-03, -2.34498531e-02,
         3.05810804e-03,  2.43557282e-02,  4.41539697e-02,
        -4.01097238e-02,  2.01923642e-02,  1.08881490e-02,
         2.87315156e-02,  1.23677244e-02, -9.13190544e-0

In [12]:
collection.add(
    documents=["lorem ipsum...", "doc2", "doc3", ],
    metadatas=[
        {"chapter": "3", "verse": "16"},
        {"chapter": "3", "verse": "5"},
        {"chapter": "29", "verse": "11"},
    ],
    ids=["id1", "id2", "id3",],
    embeddings=embedding_model.embed(
    [
        "lorem ipsum...",
        "doc2",
        "doc3",
    ]
).cpu().numpy()
)

In [13]:
collection.peek()

{'ids': ['id1', 'id2', 'id3'],
 'embeddings': array([[ 0.03684679,  0.02051687,  0.07883837, ..., -0.10914503,
          0.02406924, -0.00106997],
        [-0.05694772,  0.00664129,  0.07214396, ...,  0.04491494,
          0.03988827,  0.01339069],
        [-0.07116625, -0.00156549,  0.02162257, ...,  0.02111645,
          0.03343027,  0.01777468]]),
 'metadatas': [{'chapter': '3', 'verse': '16'},
  {'chapter': '3', 'verse': '5'},
  {'chapter': '29', 'verse': '11'}],
 'documents': ['lorem ipsum...', 'doc2', 'doc3'],
 'uris': None,
 'data': None,
 'included': ['embeddings', 'metadatas', 'documents']}

In [16]:
query_em = embeddings = (
    embedding_model.embed(
        [
            "document",
        ]
    )
    .cpu()
    .numpy()
)

In [31]:
collection.query(
    query_embeddings=query_em,
    n_results=10,
    where={"chapter":  "29"},
    # where={"metadatas__chapter": "3"},
    where_document={"$contains": "doc"},
)

Number of requested results 10 is greater than number of elements in index 3, updating n_results = 3


{'ids': [['id3']],
 'distances': [[0.9319435853838086]],
 'metadatas': [[{'chapter': '29', 'verse': '11'}]],
 'embeddings': None,
 'documents': [['doc3']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}

In [33]:
collection.modify(metadata={"hnsw:space": "cosine"})

ValueError: Changing the distance function of a collection once it is created is not supported currently.

In [36]:
%env ALLOW_RESET = TRUE

env: ALLOW_RESET=TRUE


In [37]:
client.reset()  # Empties and completely resets the database. ⚠️ This is destructive and not reversible.

ValueError: Resetting is not allowed by this configuration (to enable it, set `allow_reset` to `True` in your Settings() or include `ALLOW_RESET=TRUE` in your environment variables)

In [35]:
client.heartbeat()  # returns a nanosecond heartbeat. Useful for making sure the client remains connected.

1730224338311327325

In [38]:
collection.delete()

ValueError: 
                You must provide either ids, where, or where_document to delete. If
                you want to delete all data in a collection you can delete the
                collection itself using the delete_collection method. Or alternatively,
                you can get() all the relevant ids and then delete them.
                

In [39]:
client.delete_collection(name="test")