In [1]:
!pip install tiktoken openai -q
!pip install chromadb==0.4.15 -q

In [1]:
import chromadb
from chromadb.config import DEFAULT_TENANT, DEFAULT_DATABASE, Settings
from dotenv import load_dotenv
from openai import OpenAI
import os
import json
from tqdm import tqdm

In [29]:
# Load environment variables (e.g. from a local .env file) before reading the key.
load_dotenv()

# OpenAI API client; the key is taken from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key = os.getenv("OPENAI_API_KEY"))

In [27]:
def read_jsonl(filename):
    """Lazily read a JSON Lines file, yielding one decoded value per line.

    Blank or whitespace-only lines are skipped silently. Lines that are not
    valid JSON are reported on stdout and skipped, so one bad record does not
    abort the whole read.

    Args:
        filename: Path of the UTF-8 encoded JSONL file to read.

    Yields:
        The decoded JSON value of each valid line (whatever ``json.loads``
        returns for it; not necessarily a dict).
    """
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            # A trailing newline or spacer line is not a malformed record.
            if not line.strip():
                continue
            try:
                yield json.loads(line)
            except json.JSONDecodeError:
                print(f"Skipping invalid JSON object: {line}")

In [30]:
# Materialize the records in a list: a bare generator can only be consumed once,
# and `documents` is iterated by more than one ingestion loop below.
documents = list(read_jsonl("chunks.jsonl"))

First version: using a custom embedding function (based on OpenAI)

In [5]:
# On-disk Chroma client for the first version; data is persisted under ./chromadb.
chroma_client = chromadb.PersistentClient(
    path="chromadb",
    settings=Settings(),
    tenant=DEFAULT_TENANT,
    database=DEFAULT_DATABASE,
)

In [6]:
# Uncomment to drop the existing collection and start from scratch.
#chroma_client.delete_collection(name="documents")

# get_or_create_collection keeps this cell re-runnable: create_collection fails
# when "documents" already exists in the persisted store.
collection = chroma_client.get_or_create_collection(name="documents")

In [7]:
def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector for ``text`` from the OpenAI embeddings API.

    Newlines in ``text`` are replaced with spaces before the request is sent.

    Args:
        text: The string to embed.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` of the first (and only) item in the API response.
    """
    # Send the newline-free text; previously the cleaned string was computed
    # but the raw input was passed to the API.
    cleaned_text = text.replace("\n", " ")
    response = client.embeddings.create(input=[cleaned_text], model=model)
    return response.data[0].embedding
    
#get_embedding("I love Kitty")

In [8]:
# Smoke-test ingestion: embed and store documents one at a time, stopping early.
counter = 0

for counter, doc in enumerate(tqdm(documents, desc = "Processing documents: "), start=1):
    page_text = doc['page_content']
    collection.add(
        documents=[page_text],
        metadatas=[{'source': doc['source']}],
        embeddings=[get_embedding(page_text)],
        ids=[str(doc['id'])],
    )
    # Stop once more than 10 documents have been added (i.e. right after the 11th).
    if counter > 10:
        break

Processing documents: : 10it [00:02,  4.60it/s]


Second version: using the OpenAI embeddings through LangChain's `OpenAIEmbeddings`

In [8]:
!pip install -U langchain-openai -q

In [24]:
import chromadb
#from langchain.embeddings import OpenAIEmbeddings
#from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings

# NOTE(review): this rebinds `client`, which earlier in the notebook holds the OpenAI
# client that get_embedding() relies on. After this cell runs, get_embedding() would
# look up `embeddings` on the Chroma client instead. Consider a distinct name.
client = chromadb.PersistentClient(
    path="chromadb2",  # Specify the disk storage path
    settings=Settings(),
    tenant=DEFAULT_TENANT,
    database=DEFAULT_DATABASE,
)

# Create OpenAI embeddings
embedding_function = OpenAIEmbeddings()

# Get or create a collection
# This also rebinds `collection`, replacing the first version's collection object.
collection = client.get_or_create_collection(
    name="documents",
    # The bound embed_documents method is handed over as the collection's embedding function.
    embedding_function=embedding_function.embed_documents,
    #metadata={"hnsw:space": "cosine", "hnsw:dim": 1536}
)

In [25]:
# Exploratory: display the settings of the Chroma client created above.
client.get_settings()

Settings(environment='', chroma_db_impl=None, chroma_api_impl='chromadb.api.segment.SegmentAPI', chroma_product_telemetry_impl='chromadb.telemetry.product.posthog.Posthog', chroma_telemetry_impl='chromadb.telemetry.product.posthog.Posthog', chroma_sysdb_impl='chromadb.db.impl.sqlite.SqliteDB', chroma_producer_impl='chromadb.db.impl.sqlite.SqliteDB', chroma_consumer_impl='chromadb.db.impl.sqlite.SqliteDB', chroma_segment_manager_impl='chromadb.segment.impl.manager.local.LocalSegmentManager', chroma_segment_directory_impl='chromadb.segment.impl.distributed.segment_directory.RendezvousHashSegmentDirectory', chroma_memberlist_provider_impl='chromadb.segment.impl.distributed.segment_directory.CustomResourceMemberlistProvider', chroma_collection_assignment_policy_impl='chromadb.ingest.impl.simple_policy.SimpleAssignmentPolicy', worker_memberlist_name='worker-memberlist', tenant_id='default', topic_namespace='default', is_persistent=True, persist_directory='chromadb2', chroma_server_host=None

In [26]:
# Exploratory: list the attributes and methods exposed by the Chroma client.
print(dir(client))

['__abstractmethods__', '__annotations__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_add', '_admin_client', '_count', '_create_system_if_not_exists', '_delete', '_get', '_get_identifier_from_settings', '_identifer_to_system', '_identifier', '_modify', '_peek', '_populate_data_from_system', '_query', '_server', '_system', '_update', '_upsert', '_validate_tenant_database', 'clear_system_cache', 'create_collection', 'database', 'delete_collection', 'from_system', 'get_collection', 'get_or_create_collection', 'get_settings', 'get_version', 'heartbeat', 'list_collections', 'max_batch_size', 'reset', 'set_database', 'set_tenant', 'tenant']


In [31]:
def chunked_generator(generator, chunk_size):
    """Group the items of an iterable into lists of at most ``chunk_size`` items.

    Items are consumed lazily, so this works on one-shot generators as well as
    on lists. Every chunk has exactly ``chunk_size`` items except possibly the
    last one, which holds the remainder. An empty input yields nothing.

    Args:
        generator: Any iterable of items.
        chunk_size: Maximum number of items per chunk; must be at least 1.

    Yields:
        Lists of consecutive items, in input order.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1. Raised when iteration
            starts, because this is a generator function.
    """
    # Without this guard a size of 0 (or a negative one) never matches
    # len(chunk) and the whole input ends up in one oversized chunk.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size!r}")

    chunk = []
    for item in generator:
        chunk.append(item)
        if len(chunk) == chunk_size:
            yield chunk
            chunk = []
    if chunk:  # Yield the trailing partial chunk, if any
        yield chunk

chunk_size = 100

# Hard-coded record count, used only to size the progress bar.
# NOTE(review): presumably the number of records in chunks.jsonl; confirm.
n_documents = 10162

# Ceiling division: a final partial chunk still counts as one iteration.
# Floor division under-reports the total by one whenever there is a remainder.
n_chunks = (n_documents + chunk_size - 1) // chunk_size

for chunk in tqdm(chunked_generator(documents, chunk_size), total = n_chunks):
    collection.add(
        documents = [d['page_content'] for d in chunk],
        metadatas = [{'source': d['source']} for d in chunk],
        # No explicit `embeddings` are passed; computing them is left to the collection.
        ids = [str(d['id']) for d in chunk]
    )

102it [03:38,  2.15s/it]                         
