# ChromaDB

In [1]:
import os

# Credentials are read from the environment.
# REPLICATE_API_TOKEN = os.environ.get("REPLICATE_API_TOKEN")
HUGGINGFACE_API_KEY = os.environ.get("HUGGINGFACE_API_KEY")

# Input locations, relative to the notebook's working directory.
SPLITS_PATH = "../resources/splits/"
TEXT_FILES_FOLDER = "../resources/Wissensquellen"

## 1. Get Chunks

In [2]:
from langchain_core import documents
import pandas as pd

def create_document(chunk):
    """Wrap a single text chunk in a LangChain ``Document``.

    Args:
        chunk: The text that becomes the document's ``page_content``.

    Returns:
        A ``documents.Document`` holding ``chunk``; no metadata is attached.
    """
    # Metadata (e.g. which splitter produced the chunk) is intentionally not set yet.
    return documents.Document(page_content=chunk)
# len(df)

## 2. Embedding Methods

In [3]:
import chromadb.utils.embedding_functions as embedding_functions
from chromadb import Documents, EmbeddingFunction, Embeddings
from replicate import Client
import spacy

# Embedding function backed by the Hugging Face API, authenticated with
# HUGGINGFACE_API_KEY from the environment.
# USABLE BUT DEPENDING ON HF IT TAKES VERY LONG TO GET EMBEDDINGS
huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction(
    api_key=HUGGINGFACE_API_KEY,
    # model_name="sentence-transformers/all-MiniLM-L6-v2"  # this is the default
)
 
# SECURITY: the Replicate token must never be written into the notebook.
# It is read from the REPLICATE_API_TOKEN environment variable instead; any
# token that was previously committed here should be revoked and rotated.
replicateSession = Client(api_token=os.getenv("REPLICATE_API_TOKEN"))


class ReplicateEmbeddingsFunction(EmbeddingFunction):
    """Chroma embedding function that embeds documents via a Replicate-hosted model."""

    # Pinned model version so that all embeddings in a collection come from the same model.
    MODEL = "replicate/all-mpnet-base-v2:b6b7585c9640cd7a9572c6e129c9549d79c9c31f0d3fdce7baac7c67ca38f305"

    def __call__(self, input: Documents) -> Embeddings:
        """Embed every document in ``input`` with one Replicate run per document.

        Args:
            input: The documents (texts) to embed.

        Returns:
            One embedding per document, in input order.
        """
        embeddings = []
        for document in input:
            prediction = replicateSession.run(self.MODEL, input={"text": document})
            # The run result is indexed as a sequence whose first element carries the vector.
            embeddings.append(prediction[0]['embedding'])
        return embeddings
replicate_ef = ReplicateEmbeddingsFunction()

class SpacyEmbeddingsFunction(EmbeddingFunction):
    """Chroma embedding function that uses spaCy document vectors.

    The spaCy pipeline is loaded lazily on the first call and then reused, so
    embedding many batches does not reload the model for every batch.
    """

    # Name of the spaCy pipeline to load ("md" and "lg" variants exist).
    model_name = "de_core_news_lg"
    # Cached pipeline; stays None until the first call.
    _nlp = None

    def __call__(self, input: Documents) -> Embeddings:
        """Embed every document in ``input`` with the spaCy pipeline.

        Args:
            input: The documents (texts) to embed.

        Returns:
            One vector (as a plain list) per document, in input order.
        """
        if self._nlp is None:
            # Loading the pipeline is expensive; do it once per instance.
            self._nlp = spacy.load(self.model_name)
        nlp = self._nlp
        embeddings = [nlp(document).vector.tolist() for document in input]
        return embeddings
spacy_ef = SpacyEmbeddingsFunction()

## 3. Chroma Client and Collection

In [4]:
import chromadb

# Persistent Chroma client; its data lives on disk under ../resources/chromadb.
chroma_client = chromadb.PersistentClient(path="../resources/chromadb")

def create_collection(split_folder: str, embedding_function: EmbeddingFunction, ef_name: str, batch_size: int = 1000):
    """Embed all split files of one splitter folder into a Chroma collection.

    Every ``*.json`` file in ``SPLITS_PATH + split_folder`` is read with pandas,
    its ``splits`` column is taken as the list of chunks, and the chunks are
    added to the collection ``<split_folder>_<ef_name>`` in batches.

    Args:
        split_folder: Folder name below ``SPLITS_PATH`` (a trailing "/" is optional).
        embedding_function: Embedding function attached to the collection.
        ef_name: Short name of the embedding function; used as collection-name suffix.
        batch_size: Maximum number of chunks per ``collection.add`` call. Chroma
            caps the size of a single add, so large files are split up.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    NOTE: ids and the ``file`` metadata use the file name without its ``.json``
    extension (e.g. ``KWG#0``). Earlier runs truncated only four characters
    (``KWG.#0``); collections built that way should be rebuilt before
    re-ingesting, otherwise the chunks are stored twice under different ids.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    folder_path = os.path.join(SPLITS_PATH, split_folder)
    # Ignore anything that is not a JSON split file (checkpoints, OS metadata, ...)
    # and sort for a deterministic ingestion order.
    file_names = sorted(name for name in os.listdir(folder_path) if name.endswith(".json"))

    collection_name = split_folder.rstrip("/") + "_" + ef_name
    collection = chroma_client.get_or_create_collection(
        name=collection_name,
        embedding_function=embedding_function,
        # metadata={"hnsw:space": "cosine"} # "l2" (default: squared L2 norm), "ip" or "cosine"
        )

    for file in file_names:
        file_path = os.path.join(folder_path, file)
        df = pd.read_json(file_path)
        documents_ = df['splits'].to_list()
        number_documents = len(documents_)
        if number_documents == 0:
            print(f'This document {file_path} has no splits. It won`t be added to the collection')
            # Skip only this file; the remaining files must still be ingested.
            continue

        file_stem = os.path.splitext(file)[0]
        ids = [f"{file_stem}#{i}" for i in range(number_documents)]
        metadata = [{"file": file_stem} for _ in range(number_documents)]

        # Start offsets of the batches; a range never yields an empty trailing
        # batch, even when the chunk count is an exact multiple of batch_size.
        batch_starts = range(0, number_documents, batch_size)
        print(f"Adding {number_documents} Chunks from {file} to collection {collection.name} in {len(batch_starts)} batches.")
        for current in batch_starts:
            index = min(current + batch_size, number_documents)
            collection.add(
                ids=ids[current:index],
                documents=documents_[current:index],
                metadatas=metadata[current:index]
                )
            print(f"    Batch '{current} - {index}' complete")

        print(f"Success - length of collection {collection.name} is {collection.count()}")

# NOTE(review): not read by any code visible in this notebook; the `split_folder`
# parameter of create_collection is a separate, local name. Candidate for removal.
split_folder = SPLITS_PATH + "char_splitter_128_o0/"

In [5]:
# Build one collection per (embedding backend, splitter folder) pair:
# first all spaCy collections, then the same folders again with Replicate.
splitter_folders = (
    "char_splitter_1024_o128/",
    "article_regex_splitter/",
    "semantic_splitter/",
)
for backend_ef, backend_name in ((spacy_ef, "spacy"), (replicate_ef, "replicate")):
    for folder in splitter_folders:
        create_collection(folder, backend_ef, backend_name)

Adding 532 Chunks from CELEX_02013L0036-20220101_DE_TXT.json to collection char_splitter_1024_o128_spacy in 1 batches.
    Batch '0 - 532' complete
Success - length of collection char_splitter_1024_o128_spacy is 532
Adding 2292 Chunks from CELEX_02013R0575-20230628_DE_TXT.json to collection char_splitter_1024_o128_spacy in 3 batches.
    Batch '0 - 1000' complete
    Batch '1000 - 2000' complete
    Batch '2000 - 2292' complete
Success - length of collection char_splitter_1024_o128_spacy is 2824
Adding 988 Chunks from KWG.json to collection char_splitter_1024_o128_spacy in 1 batches.
    Batch '0 - 988' complete
Success - length of collection char_splitter_1024_o128_spacy is 3812
Adding 171 Chunks from CELEX_02013L0036-20220101_DE_TXT.json to collection article_regex_splitter_spacy in 1 batches.
    Batch '0 - 171' complete
Success - length of collection article_regex_splitter_spacy is 171
Adding 549 Chunks from CELEX_02013R0575-20230628_DE_TXT.json to collection article_regex_splitter

# End of the ingestion script — the cells below only test retrieval

In [None]:
# collection = chroma_client.get_or_create_collection(
#     name="test",
#     embedding_function=SpacyEmbeddingsFunction(),
#     # metadata={"hnsw:space": "cosine"} # "l2" (default: squared L2 norm), "ip" or "cosine"
#     )

# collection = chroma_client.get_or_create_collection(
#     name="first_collection",
#     embedding_function=huggingface_ef,
#     # metadata={"hnsw:space": "cosine"} # "l2" (default: squared L2 norm), "ip" or "cosine"
#     )

# collection.add(
#     ids=["1", "2", "3"],
#     documents=["Apfel", "Fahrzeug", "Rechtssprechung"],
#     # metadatas=[{},{}]
#     )

In [None]:
import chromadb 
chroma_client = chromadb.PersistentClient(path="../resources/chromadb")
# `query_texts` are embedded with the embedding function attached to this
# collection handle, so it must be the one used at ingestion time (spacy_ef,
# defined in section 2). Without it, Chroma versions that do not persist the
# embedding function fall back to their default model, whose vectors do not
# match the stored spaCy vectors.
collection = chroma_client.get_collection(
    name="semantic_splitter_spacy",
    embedding_function=spacy_ef,
)


In [None]:
# collection.peek(0)
# Show how many items the selected collection currently holds.
collection.count()
# collection.modify(name="new_name")

In [None]:
# collection.query(
#     query_embeddings=[], # embedded question / part of question # HERE: PREFORMULATE ANSWER, EMBED ANSWER, RETRIEVE REAL KNOWLEDGE ?!? # needs to be the same dimension as embedded vectors in db
#     query_texts=["Obst"], # ALTERNATIVE THAN QUERYING WITH EMBEDDINGS -> CHROMA WILL AUTOMATICALLY EMBED USING EMBEDDING FUNCTION OF COLLECTION
#     n_results=1, # number of docs to retrieve
#     where={"metadata_field": "is_equal_to_this"}, # filter metadata
#     where_document={"$contains": "search_string"}, # filter for hard words / regexes etc.
#     include=["documents"], # specify which data to return (embeddings are excluded by default)
# )

In [None]:
# collection.get( # when wanting to not query with embeddings but only retrieve by id or so
#     # ids=[], 
#     # where={,}
# )

In [None]:
# # more info: https://docs.trychroma.com/usage-guide