In [None]:
import chromadb
from chromadb.utils import embedding_functions
from sentence_transformers import SentenceTransformer
import numpy as np
from typing import List, Union

# Method 1: Create a custom embedding function class
class BGEEmbeddingFunction(embedding_functions.EmbeddingFunction):
    """Embedding function that encodes text with a SentenceTransformer model.

    Args:
        model_name: Name passed to ``SentenceTransformer``; defaults to
            ``'BAAI/bge-base-en-v1.5'``.
    """

    def __init__(self, model_name: str = 'BAAI/bge-base-en-v1.5'):
        # The model is loaded once here and reused for every call.
        self.model = SentenceTransformer(model_name)

    def __call__(self, input: List[str]) -> List[List[float]]:
        """Encode ``input`` and return one embedding per text.

        A NumPy result from ``encode`` is converted to nested Python lists;
        any other result type is returned unchanged.
        """
        vectors = self.model.encode(input, convert_to_tensor=False)
        return vectors.tolist() if isinstance(vectors, np.ndarray) else vectors

# Method 2: Using ChromaDB's SentenceTransformerEmbeddingFunction (simpler).
# The helper below builds one collection per method so both can be compared.
def create_chromadb_with_bge_embeddings():
    """Return two in-memory collections that embed with BAAI/bge-base-en-v1.5.

    One collection uses the custom ``BGEEmbeddingFunction`` class and the
    other uses ChromaDB's built-in ``SentenceTransformerEmbeddingFunction``.

    The function is safe to call more than once in the same process (for
    example when a notebook cell is re-executed): existing collections are
    reused instead of being created again.

    Returns:
        tuple: ``(collection_custom, collection_builtin)``.
    """
    # Initialize ChromaDB client
    client = chromadb.Client()

    # Method 1: Using custom embedding function
    custom_ef = BGEEmbeddingFunction()

    # Method 2: Using built-in SentenceTransformerEmbeddingFunction
    sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="BAAI/bge-base-en-v1.5"
    )

    # get_or_create_collection rather than create_collection: a second call
    # with create_collection fails with
    # "Collection [my_collection_custom] already exists".
    collection_custom = client.get_or_create_collection(
        name="my_collection_custom",
        embedding_function=custom_ef,
    )

    collection_builtin = client.get_or_create_collection(
        name="my_collection_builtin",
        embedding_function=sentence_transformer_ef,
    )

    return collection_custom, collection_builtin

# Example usage
def example_usage():
    """Index three sample documents and print the two best matches for a query.

    Documents are added to the custom-embedding collection, which computes
    their embeddings through its embedding function, and the collection is
    then queried with a single text.

    Returns:
        The result mapping returned by ``collection_custom.query``.
    """
    # Both collections are created, but only the custom one is used below.
    collection_custom, collection_builtin = create_chromadb_with_bge_embeddings()

    # Each sample row is (id, document text, metadata).
    samples = [
        ("doc1", "The quick brown fox jumps over the lazy dog", {"source": "example1"}),
        ("doc2", "Machine learning is a subset of artificial intelligence", {"source": "example2"}),
        ("doc3", "ChromaDB is a vector database for AI applications", {"source": "example3"}),
    ]
    ids, documents, metadatas = (list(column) for column in zip(*samples))

    # No embeddings are passed; the collection generates them itself.
    collection_custom.add(documents=documents, metadatas=metadatas, ids=ids)

    results = collection_custom.query(
        query_texts=["What is machine learning?"],
        n_results=2,
    )

    print("Query results:")
    # Index [0] selects the hits for the first (and only) query text.
    top_documents = results['documents'][0]
    for rank, doc in enumerate(top_documents, start=1):
        position = rank - 1
        print(f"{rank}. {doc}")
        print(f"   Distance: {results['distances'][0][position]}")
        print(f"   Metadata: {results['metadatas'][0][position]}")

    return results

# Advanced: Custom embedding function with preprocessing
class AdvancedBGEEmbeddingFunction(embedding_functions.EmbeddingFunction):
    """SentenceTransformer-backed embedding function with a preprocessing hook.

    Args:
        model_name: Name passed to ``SentenceTransformer``; defaults to
            ``'BAAI/bge-base-en-v1.5'``.
        normalize: Forwarded to ``encode`` as ``normalize_embeddings``.
    """

    def __init__(self, model_name: str = 'BAAI/bge-base-en-v1.5', normalize: bool = True):
        self.model = SentenceTransformer(model_name)
        self.normalize = normalize

    def __call__(self, input: List[str]) -> List[List[float]]:
        """Encode ``input`` and return one embedding per text.

        A NumPy result from ``encode`` is converted to nested Python lists;
        any other result type is returned unchanged.
        """
        # Preprocessing hook: texts are currently copied through unchanged.
        # A retrieval prefix such as "Represent this sentence for searching
        # relevant passages:" could be prepended here.
        texts = list(input)

        vectors = self.model.encode(
            texts,
            convert_to_tensor=False,
            normalize_embeddings=self.normalize,
        )

        if not isinstance(vectors, np.ndarray):
            return vectors
        return vectors.tolist()

# Example with persistent storage
def create_persistent_collection():
    """Open (or create) the on-disk collection ``persistent_bge_collection``.

    A ``PersistentClient`` rooted at ``./chroma_db`` is used, and the
    collection embeds documents with ``BGEEmbeddingFunction``.

    Returns:
        The collection, newly created on first use and reused afterwards.

    Raises:
        Any error raised by ChromaDB or by loading the embedding model is
        propagated unchanged.
    """
    # Create persistent client
    client = chromadb.PersistentClient(path="./chroma_db")

    # Create embedding function
    bge_ef = BGEEmbeddingFunction()

    # get_or_create_collection replaces a create/except/get fallback: a broad
    # `except Exception` would treat every failure as "collection already
    # exists" and hide the real cause (bad path, permissions, ...).
    collection = client.get_or_create_collection(
        name="persistent_bge_collection",
        embedding_function=bge_ef,
    )

    return collection

if __name__ == "__main__":
    # Script entry point: demonstrate the in-memory add/query flow first.
    example_usage()

    # Then open the on-disk collection and report its name.
    persistent_collection = create_persistent_collection()
    print(f"Persistent collection created: {persistent_collection.name}")

InternalError: Collection [my_collection_custom] already exists