In [2]:
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma


import chromadb.utils.embedding_functions as embedding_functions
import chromadb

from dotenv import load_dotenv, find_dotenv
import random
import os

In [3]:
# Locate the .env file and load the variables it defines
# (OPENAI_API_KEY is read from the environment further down in the notebook)
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

# Name of the OpenAI embedding model, and the LangChain embeddings object built on it
EMBEDDING_MODEL_NAME = "text-embedding-3-small"
embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL_NAME)

In [4]:
### Open the persisted Chroma vector store

# The "lyrics" collection is stored in the ./chroma_db folder; the OpenAI
# embeddings object created earlier is used as the embedding function.
chroma_settings = {
    "collection_name": "lyrics",
    "persist_directory": "./chroma_db",
    "embedding_function": embeddings,
}
vectorstore = Chroma(**chroma_settings)

In [None]:
### Fetch the stored documents with their metadata and report the collection size

# NOTE: `_collection` is a private attribute of the vectorstore wrapper; it is
# used here to reach the underlying collection's get() / count() / name.
lyrics_collection = vectorstore._collection

# Pull the metadata and the document texts out of the loaded collection
vectorstore_elements_dict = lyrics_collection.get(include=["metadatas", "documents"])

documents_count = lyrics_collection.count()
print(f"Number of documents in the vectorstore: {documents_count}")
print(f"Name of the loaded collection: {lyrics_collection.name}")

In [None]:
### Split the "lyrics" collection into several smaller databases to avoid memory issues

# OpenAI embedding function; the API key is taken from the environment
openai_api_key = os.environ.get("OPENAI_API_KEY")
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    model_name=EMBEDDING_MODEL_NAME,
    api_key=openai_api_key,
)

# Open the original database and get the collection holding all the documents
client = chromadb.PersistentClient(path="./chroma_db")
original_collection = client.get_collection("lyrics")

# Ask for no extra fields (include=[]) and keep just the ids;
# they are shuffled and split into parts in the next step
ids_only_result = original_collection.get(include=[])
ids = ids_only_result["ids"]

# Shuffle the ids and split them into (at most) `parts_number` parts
parts_number = 5

# A dedicated, seeded generator makes the split reproducible across kernel
# restarts without touching the global `random` state.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(ids)

# Never create more parts than there are ids. This keeps part_size >= 1 whenever
# there is at least one id (a part_size of 0 would make range() raise ValueError)
# and yields no parts at all for an empty collection.
effective_parts = min(parts_number, len(ids))
part_size = len(ids) // effective_parts if effective_parts else 0

# Every part except the last holds exactly `part_size` ids ...
ids_parts = [
    ids[k * part_size : (k + 1) * part_size] for k in range(effective_parts - 1)
]
# ... and the last part also absorbs the remainder of the integer division,
# so the number of parts can never exceed `parts_number`.
if effective_parts:
    ids_parts.append(ids[(effective_parts - 1) * part_size :])

# Sanity checks: nothing lost, nothing duplicated, no more parts than requested
assert len(ids_parts) <= parts_number
assert sum(len(part) for part in ids_parts) == len(ids)

# Create a new collection for each part and save each one in its own database
# (./chroma_db_part1, ./chroma_db_part2, ...).
# Documents are copied in chunks of `chunk_size` ids to avoid memory issues.
# NOTE(review): create_collection presumably fails if a part database already
# contains a "lyrics" collection, so re-running this cell likely requires
# removing the ./chroma_db_part* folders first — confirm.
new_collection_name = "lyrics"
chunk_size = 1000

for i, ids_part in enumerate(ids_parts):
    part_client = chromadb.PersistentClient(path=f"./chroma_db_part{i+1}")
    new_collection = part_client.create_collection(
        name=new_collection_name, embedding_function=openai_ef
    )

    ids_part_chunks = [
        ids_part[start : start + chunk_size]
        for start in range(0, len(ids_part), chunk_size)
    ]
    chunks_num = len(ids_part_chunks)

    # `chunk_ids` (rather than `ids`) so the notebook-level `ids` list is not clobbered
    for j, chunk_ids in enumerate(ids_part_chunks):
        item = original_collection.get(
            ids=chunk_ids, include=["metadatas", "documents", "embeddings"]
        )
        # Pass only the fields that were requested instead of popping the extra
        # keys of the get() result: which bookkeeping keys are present depends on
        # the chromadb version, and popping a missing key raises KeyError.
        # The stored embeddings are reused, so nothing is re-embedded here.
        new_collection.add(
            ids=item["ids"],
            embeddings=item["embeddings"],
            metadatas=item["metadatas"],
            documents=item["documents"],
        )
        print(f"Added chunk {j+1}/{chunks_num} to part {i+1}")

    print(f"Created collection {new_collection_name} with {len(ids_part)} documents")