# ChromaDB

In [1]:
import os

# Credentials come from the environment so that no secret is stored in the notebook.
# REPLICATE_API_TOKEN = os.environ.get("REPLICATE_API_TOKEN")
HUGGINGFACE_API_KEY = os.environ.get("HUGGINGFACE_API_KEY")

# Folder with the raw knowledge sources and folder with the pre-computed splits.
TEXT_FILES_FOLDER = "../resources/Wissensquellen"
SPLITS_PATH = "../resources/splits/"

## 1. Get Chunks

In [2]:
from langchain_core import documents
import pandas as pd

def create_document(chunk):
    """Wrap a single text chunk in a LangChain ``Document``.

    Args:
        chunk: Text used as the document's ``page_content``.

    Returns:
        A ``documents.Document`` holding ``chunk``; no metadata is passed.
    """
    # TODO: pass metadata (e.g. the splitter that produced the chunk) once its fields are defined.
    return documents.Document(page_content=chunk)
# len(df)

## 2. Embedding Methods

In [3]:
import chromadb.utils.embedding_functions as embedding_functions
from chromadb import Documents, EmbeddingFunction, Embeddings
import replicate
import spacy

# Hugging Face embedding function authenticated with the key read from the environment.
# No model_name is passed, so the library default is used; per the original note that
# default is "sentence-transformers/all-MiniLM-L6-v2".
huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction(api_key=HUGGINGFACE_API_KEY)

# openai_embeddings = 

# replicate_embeddings = 

class SpacyEmbeddingsFunction(EmbeddingFunction):
    """Chroma embedding function that embeds each document with a spaCy pipeline.

    The pipeline is loaded once, on the first call, and then reused. Loading it
    inside every ``__call__`` would repeat the load for each batch of documents
    that is embedded.

    Args:
        model_name: Name of the spaCy pipeline to load. Alternatives to the
            default noted by the author: md, lg, trf (bert-base-german-cased).
    """

    def __init__(self, model_name: str = "de_core_news_sm") -> None:
        self._model_name = model_name
        # Loaded lazily so that creating the function stays cheap.
        self._nlp = None

    def __call__(self, input: Documents) -> Embeddings:
        """Return one embedding (the spaCy ``Doc.vector`` as a list) per document."""
        if self._nlp is None:
            self._nlp = spacy.load(self._model_name)
        return [self._nlp(document).vector.tolist() for document in input]


spacy_ef = SpacyEmbeddingsFunction()

## 3. Chroma Client and Collection

In [6]:
import chromadb

# Chroma client that persists its data in the given directory.
chroma_client = chromadb.PersistentClient(
    path="../resources/chromadb",
)

def create_collection(split_folder, batch_size=1000):
    """Load every split file of one splitter run into a Chroma collection.

    Each file in ``SPLITS_PATH/split_folder`` is read with ``pd.read_json`` and the
    entries of its ``splits`` column are added to the collection named after the
    folder (the collection is created if it does not exist yet). Documents are
    embedded with ``spacy_ef`` and stored under the id ``<file stem>#<index>``
    with the metadata ``{"file": <file stem>}``.

    NOTE: ids use the file name without its extension. Collections that were
    filled with differently formed ids should be rebuilt, otherwise the same
    chunks end up stored under two ids.

    Args:
        split_folder: Folder name below ``SPLITS_PATH``; a trailing slash is optional.
        batch_size: Maximum number of documents per ``collection.add`` call.
            Documents are added in batches because a single add call is capped
            (5461 according to the original note).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    # os.path.join works with and without a trailing slash on either part.
    folder_path = os.path.join(SPLITS_PATH, split_folder)

    collection = chroma_client.get_or_create_collection(
        name=split_folder.rstrip("/\\"),
        embedding_function=spacy_ef,
        # metadata={"hnsw:space": "cosine"} # "l2" (default: squared L2 norm), "ip" or "cosine"
    )

    # Sorted so that files are processed in the same order on every run.
    for file in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file)
        if not os.path.isfile(file_path):
            continue  # skip sub-directories, they cannot be read as JSON

        df = pd.read_json(file_path)
        documents_ = df["splits"].to_list()
        number_documents = len(documents_)

        # splitext removes the complete extension, whatever its length.
        file_stem = os.path.splitext(file)[0]
        ids = [f"{file_stem}#{i}" for i in range(number_documents)]
        metadata = [{"file": file_stem} for _ in range(number_documents)]

        # One start offset per batch; no offsets (and no add call) for an empty
        # file, and no empty trailing batch when the count is a multiple of batch_size.
        batch_starts = range(0, number_documents, batch_size)
        print(f"Adding {number_documents} Chunks from {file} to collection {collection.name} in {len(batch_starts)} batches.")
        for current in batch_starts:
            index = min(current + batch_size, number_documents)
            collection.add(
                ids=ids[current:index],
                documents=documents_[current:index],
                metadatas=metadata[current:index],
            )
            print(f"    Batch '{current} - {index}' complete")

        print(f"Success - length of collection {collection.name} is {collection.count()}")

# Fill the collection for the "char_splitter_128_o0" splits.
# create_collection takes the folder name relative to SPLITS_PATH, so the full
# path kept in split_folder is not what gets passed to it.
split_folder = f"{SPLITS_PATH}char_splitter_128_o0/"
create_collection("char_splitter_128_o0/")

Adding 3724 Chunks from CELEX_02013L0036-20220101_DE_TXT.json to collection char_splitter_128_o0 in 4 batches.
Batch '0 - 1000' complete
Batch '1000 - 2000' complete
Batch '2000 - 3000' complete
Batch '3000 - 3724' complete
    Success - length of collection char_splitter_128_o0 is 3724
Adding 16041 Chunks from CELEX_02013R0575-20230628_DE_TXT.json to collection char_splitter_128_o0 in 17 batches.
Batch '0 - 1000' complete
Batch '1000 - 2000' complete
Batch '2000 - 3000' complete
Batch '3000 - 4000' complete
Batch '4000 - 5000' complete
Batch '5000 - 6000' complete
Batch '6000 - 7000' complete
Batch '7000 - 8000' complete
Batch '8000 - 9000' complete
Batch '9000 - 10000' complete


KeyboardInterrupt: 

# END OF SCRIPT — THE CELLS BELOW ONLY TEST RETRIEVAL

In [16]:
# collection = chroma_client.get_or_create_collection(
#     name="test",
#     embedding_function=SpacyEmbeddingsFunction(),
#     # metadata={"hnsw:space": "cosine"} # "l2" (default: squared L2 norm), "ip" or "cosine"
#     )

# collection = chroma_client.get_or_create_collection(
#     name="first_collection",
#     embedding_function=huggingface_ef,
#     # metadata={"hnsw:space": "cosine"} # "l2" (default: squared L2 norm), "ip" or "cosine"
#     )

# collection.add(
#     ids=["1", "2", "3"],
#     documents=["Apfel", "Fahrzeug", "Rechtssprechung"],
#     # metadatas=[{},{}]
#     )

Add of existing embedding ID: 1
Add of existing embedding ID: 2
Add of existing embedding ID: 3
Insert of existing embedding ID: 1
Insert of existing embedding ID: 2
Insert of existing embedding ID: 3


In [19]:
# collection.peek(0)
# collection.count()
# collection.modify(name="new_name")

In [24]:
# collection.query(
#     query_embeddings=[], # embedded question / part of question # HERE: PREFORMULATE ANSWER, EMBED ANSWER, RETRIEVE REAL KNOWLEDGE ?!? # needs to be the same dimension as embedded vectors in db
#     query_texts=["Obst"], # ALTERNATIVE TO QUERYING WITH EMBEDDINGS -> CHROMA WILL AUTOMATICALLY EMBED USING EMBEDDING FUNCTION OF COLLECTION
#     n_results=1, # number of docs to retrieve
#     where={"metadata_field": "is_equal_to_this"}, # filter metadata
#     where_document={"$contains": "search_string"}, # filter for hard words / regexes etc.
#     include=["documents"], # specify which data to return (embeddings are excluded by default)
# )

In [None]:
# collection.get( # use this to retrieve entries by id or metadata filter instead of querying with embeddings
#     # ids=[], 
#     # where={,}
# )

In [None]:
# # more info: https://docs.trychroma.com/usage-guide