# QuicKB Integration - ChromaDB

This example notebook shows you how to use your knowledge base and fine-tuned embedding model with ChromaDB.

In [1]:
# Install required packages if needed:
# !pip install chromadb datasets sentence-transformers

import chromadb
from chromadb.utils import embedding_functions
from datasets import load_dataset

from sentence_transformers import SentenceTransformer

In [2]:
# Fetch the fine-tuned embedding model from Hugging Face.
# Point model_id at your own model to use a different one.
model_id = "AdamLucek/modernbert-embed-quickb"
model = SentenceTransformer(model_name_or_path=model_id)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/205 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/30.9k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/596M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.58M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [3]:
# Create the embedding function that gets attached to the Chroma collection.
# It is built from the same model ID as `model` and is placed on the same
# device type that model was loaded onto (e.g. "cuda", "mps" or "cpu"),
# rather than collapsing every non-CUDA device to "cpu".
ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=model_id,
    device=model.device.type
)

In [4]:
# Start a ChromaDB client that persists its data under a local directory
db_path = "./chroma_quickb"
client = chromadb.PersistentClient(path=db_path)

In [5]:
# Fetch the collection if it already exists, otherwise create it,
# with the embedding function attached
collection_name = "quickb_collection"
collection = client.get_or_create_collection(collection_name, embedding_function=ef)

In [6]:
# Download the knowledge base from Hugging Face and take its train split
dataset_id = "AdamLucek/quickb-kb"  # Replace with your dataset ID
dataset = load_dataset(dataset_id)
chunks = dataset["train"]

README.md:   0%|          | 0.00/1.13k [00:00<?, ?B/s]

In [7]:
# Write the chunks to ChromaDB in batches.
# upsert (rather than add) keeps this cell safe to re-run against the
# persistent store: records whose ID already exists are refreshed with the
# current text and metadata instead of being silently skipped.
batch_size = 500
for start in range(0, len(chunks), batch_size):
    # A slice of the dataset is indexed by column name below
    batch = chunks[start:start + batch_size]

    collection.upsert(
        documents=batch['text'],
        metadatas=[{'source': source} for source in batch['source']],
        ids=[str(chunk_id) for chunk_id in batch['id']]  # IDs are passed as strings
    )

# Reports the total number of records now held by the collection
print(f"Added {collection.count()} documents")

Compiling the model with `torch.compile` and using a `torch.cpu` device is not supported. Falling back to non-compiled mode.


Added 2807 documents


In [8]:
# Run a sample question against the collection and ask for the top 3 hits
results = collection.query(
    query_texts=["Who has relied on them to draft court filings?"],
    n_results=3
)

# Only one query text was sent, so its hits sit at index 0 of each field
docs = results['documents'][0]
distances = results['distances'][0]
metadatas = results['metadatas'][0]

# Show each hit with its rank, distance, source file and text
for rank, (doc, distance, metadata) in enumerate(zip(docs, distances, metadatas), start=1):
    print(f"\nResult {rank}")
    print(f"Distance: {distance:.4f}")
    print(f"Source: {metadata['source']}")
    print(f"Text: {doc}")


Result 1
Distance: 0.3171
Source: Al-Hamim_v_Star_2024-12-26.txt
Text: self-represented litigants alike have relied on them to draft court filings

Result 2
Distance: 1.0947
Source: Al-Hamim_v_Star_2024-12-26.txt
Text: . Some self-represented litigants, including plaintiff, Alim Al-Hamim, have relied on GAI tools to draft court filings, only to discover later to their chagrin that their filings contained hallucinations. Al-Hamim’s opening brief in this appeal contained hallucinations, as well as bona fide legal citations

Result 3
Distance: 1.1034
Source: Al-Hamim_v_Star_2024-12-26.txt
Text: .) For these reasons, individuals using the current generation of general-purpose GAI tools to assist with legal research and drafting must be aware of the tools’ propensity to generate outputs 18 containing fictitious legal authorities and must ensure that such fictitious citations do not appear in any court filing
