# ChromaDB

In [1]:
import os

# REPLICATE_API_TOKEN = os.getenv("REPLICATE_API_TOKEN")
HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")
TEXT_FILES_FOLDER = "../resources/Wissensquellen"
SPLITS_PATH = "../resources/splits/"

## 1. Get Chunks

In [2]:
from langchain_core import documents
import pandas as pd

def create_document(chunk):
    page_content = chunk
    # metadata = {
    #     "splitter": ,
    #     "",}
    doc = documents.Document(
        page_content=page_content,
        # metadata=metadata,
    )
    return doc
# len(df)

## 2. Embedding Methods

In [24]:
import chromadb.utils.embedding_functions as embedding_functions
from chromadb import Documents, EmbeddingFunction, Embeddings
import replicate
import spacy

# USABLE BUT DEPENDING ON HF IT TAKES VERY LONG TO GET EMBEDDINGS
huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction(
    api_key=HUGGINGFACE_API_KEY,
    # model_name="sentence-transformers/all-MiniLM-L6-v2"  # das ist der default
)
 
class ReplicateEmbeddingsFunction(EmbeddingFunction):
    def __call__(self, input: Documents) -> Embeddings:
        embeddings = replicate.run(
            "replicate/all-mpnet-base-v2:b6b7585c9640cd7a9572c6e129c9549d79c9c31f0d3fdce7baac7c67ca38f305",
            input={"text": input},
        )
        embeddings = embeddings[0]['embedding']
        return embeddings
replicate_ef = ReplicateEmbeddingsFunction()

class SpacyEmbeddingsFunction(EmbeddingFunction):
    def __call__(self, input: Documents) -> Embeddings:
        nlp = spacy.load("de_core_news_sm") # md , lg , trf (bert-base-german-cased)
        embeddings = [nlp(document).vector.tolist() for document in input]
        return embeddings
spacy_ef = SpacyEmbeddingsFunction()

In [25]:
repl_embeddings = replicate_ef("Das ist ein Test")
repl_embeddings

[[-0.022632569074630737,
  -0.0031665959395468235,
  -0.02212567999958992,
  0.0414096862077713,
  -0.06377043575048447,
  0.019593222066760063,
  0.025529034435749054,
  0.06069811433553696,
  0.04385733976960182,
  0.01583465188741684,
  0.04469894617795944,
  -0.046648938208818436,
  -0.014321427792310715,
  0.015423296019434929,
  -0.0179049763828516,
  0.03806186094880104,
  0.002526492578908801,
  0.016210880130529404,
  -0.037858325988054276,
  -0.008996553719043732,
  0.004195093642920256,
  0.008516496978700161,
  0.003603643737733364,
  -0.028662297874689102,
  0.0025116370525211096,
  0.04290294647216797,
  0.02536693401634693,
  -0.03861628845334053,
  0.010844996199011803,
  -0.03573733940720558,
  0.007829226553440094,
  -0.008949585258960724,
  0.02353186532855034,
  0.0020613728556782007,
  1.513085408078041e-06,
  -0.022479120641946793,
  0.0018581890035420656,
  -0.021055229008197784,
  -0.05647820234298706,
  -0.05642903968691826,
  -0.023762525990605354,
  0.0202670

## 3. Chroma Client and Collection

In [12]:
import chromadb

chroma_client = chromadb.PersistentClient(path="../resources/chromadb")

def create_collection(split_folder: str, embedding_function: EmbeddingFunction):
    file_names = os.listdir(SPLITS_PATH + split_folder)
    
    collection = chroma_client.get_or_create_collection(
        name=split_folder[:-1],
        embedding_function=embedding_function,
        # metadata={"hnsw:space": "cosine"} # "l2" (default: squared L2 norm), "ip" or "cosine"
        )

    for file in file_names:
        file_path = SPLITS_PATH + split_folder + file
        df = pd.read_json(file_path)
        documents_ = df['splits'].to_list()
        ids = [file[:-4] + "#" + str(i) for i in range(0, len(documents_))]
        metadata = [{"file": file[:-4]} for i in range(0, len(documents_))] # doesn`t work because of filename
        if len(documents_) == 0:
            print(r'This document {filepath} has no splits. It won`t be added to the collection')
            break
        number_documents = len(documents_)
        batchsize = 1000 # need to add in batches as batchsize cap is 5461 
        batch_indexes = [i * batchsize for i in range (1, number_documents//batchsize + 1)]
        batch_indexes.append(number_documents)
        current = 0
        print(f"Adding {len(documents_)} Chunks from {file} to collection {collection.name} in {len(batch_indexes)} batches.")
        for index in batch_indexes:
            documents_batch = documents_[current:index]
            ids_batch = ids[current:index]
            metadata_batch = metadata[current:index]
            collection.add(
                ids=ids_batch,
                documents=documents_batch,
                metadatas=metadata_batch
                )
            print(f"    Batch '{current} - {index}' complete")
            current = index

        print(f"Success - length of collection {collection.name} is {collection.count()}")

split_folder = SPLITS_PATH + "char_splitter_128_o0/"

In [11]:
# create_collection("char_splitter_128_o0/")
create_collection("article_regex_splitter/", spacy_ef)

Adding 171 Chunks from CELEX_02013L0036-20220101_DE_TXT.json to collection article_regex_splitter in 1 batches.


Add of existing embedding ID: CELEX_02013L0036-20220101_DE_TXT.#0
Add of existing embedding ID: CELEX_02013L0036-20220101_DE_TXT.#1
Add of existing embedding ID: CELEX_02013L0036-20220101_DE_TXT.#2
Add of existing embedding ID: CELEX_02013L0036-20220101_DE_TXT.#3
Add of existing embedding ID: CELEX_02013L0036-20220101_DE_TXT.#4
Add of existing embedding ID: CELEX_02013L0036-20220101_DE_TXT.#5
Add of existing embedding ID: CELEX_02013L0036-20220101_DE_TXT.#6
Add of existing embedding ID: CELEX_02013L0036-20220101_DE_TXT.#7
Add of existing embedding ID: CELEX_02013L0036-20220101_DE_TXT.#8
Add of existing embedding ID: CELEX_02013L0036-20220101_DE_TXT.#9
Add of existing embedding ID: CELEX_02013L0036-20220101_DE_TXT.#10
Add of existing embedding ID: CELEX_02013L0036-20220101_DE_TXT.#11
Add of existing embedding ID: CELEX_02013L0036-20220101_DE_TXT.#12
Add of existing embedding ID: CELEX_02013L0036-20220101_DE_TXT.#13
Add of existing embedding ID: CELEX_02013L0036-20220101_DE_TXT.#14
Add o

    Batch '0 - 171' complete
Success - length of collection article_regex_splitter is 720
Adding 549 Chunks from CELEX_02013R0575-20230628_DE_TXT.json to collection article_regex_splitter in 1 batches.


Add of existing embedding ID: CELEX_02013R0575-20230628_DE_TXT.#0
Add of existing embedding ID: CELEX_02013R0575-20230628_DE_TXT.#1
Add of existing embedding ID: CELEX_02013R0575-20230628_DE_TXT.#2
Add of existing embedding ID: CELEX_02013R0575-20230628_DE_TXT.#3
Add of existing embedding ID: CELEX_02013R0575-20230628_DE_TXT.#4
Add of existing embedding ID: CELEX_02013R0575-20230628_DE_TXT.#5
Add of existing embedding ID: CELEX_02013R0575-20230628_DE_TXT.#6
Add of existing embedding ID: CELEX_02013R0575-20230628_DE_TXT.#7
Add of existing embedding ID: CELEX_02013R0575-20230628_DE_TXT.#8
Add of existing embedding ID: CELEX_02013R0575-20230628_DE_TXT.#9
Add of existing embedding ID: CELEX_02013R0575-20230628_DE_TXT.#10
Add of existing embedding ID: CELEX_02013R0575-20230628_DE_TXT.#11
Add of existing embedding ID: CELEX_02013R0575-20230628_DE_TXT.#12
Add of existing embedding ID: CELEX_02013R0575-20230628_DE_TXT.#13
Add of existing embedding ID: CELEX_02013R0575-20230628_DE_TXT.#14
Add o

    Batch '0 - 549' complete
Success - length of collection article_regex_splitter is 720
This document has no splits. It won`t be added to the collection


# SCRIPT FINISH, HERE ONLY TESTING FOR RETRIEVAL

In [16]:
# collection = chroma_client.get_or_create_collection(
#     name="test",
#     embedding_function=SpacyEmbeddingsFunction(),
#     # metadata={"hnsw:space": "cosine"} # "l2" (default: squared L2 norm), "ip" or "cosine"
#     )

# collection = chroma_client.get_or_create_collection(
#     name="first_collection",
#     embedding_function=huggingface_ef,
#     # metadata={"hnsw:space": "cosine"} # "l2" (default: squared L2 norm), "ip" or "cosine"
#     )

# collection.add(
#     ids=["1", "2", "3"],
#     documents=["Apfel", "Fahrzeug", "Rechtssprechung"],
#     # metadatas=[{},{}]
#     )

Add of existing embedding ID: 1
Add of existing embedding ID: 2
Add of existing embedding ID: 3
Insert of existing embedding ID: 1
Insert of existing embedding ID: 2
Insert of existing embedding ID: 3


In [8]:
collection = chroma_client.get_collection("article_regex_splitter")


In [14]:
# collection.peek(0)
# collection.count()
# collection.modify(name="new_name")

In [24]:
# collection.query(
#     query_embeddings=[], # embedded question / part of question # HERE: PREFORMULATE ANSWER, EMBED ANSWER, RETRIEVE REAL KNOWLEDGE ?!? # needs to be the same dimension as embedded vectors in db
#     query_texts=["Obst"], # ALTERNATIVE THAN QUERYING WITH EMBEDDINGS -> CHROMA WILL AUTOMATICALLY EMBED USING EMBEDDING FUNCTION OF COLLECTION
#     n_results=1, # number of docs to retrieve
#     where={"metadata_field": "is_equal_to_this"}, # filter metadata
#     where_document={"$contains": "search_string"} # filter for hard words / regexes etc.
#     include=["documents"], specify which data to return (embeddings is excluded by default)
# )

In [None]:
# collection.get( # when wanting to not query with embeddings but only retrieve by id or so
#     # ids=[], 
#     # where={,}
# )

In [None]:
# # more info: https://docs.trychroma.com/usage-guide