In [None]:
#| hide
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# load chromadb

> A notebook that loads documents into a persistent ChromaDB store for testing.

In [None]:
import jarvis.secrets

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, TokenTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader

In [None]:
from functools import partial
from pathlib import Path
import os
from tqdm.notebook import tqdm

In [None]:
import chromadb
import chromadb.config

In [None]:
# Relative path of the directory that holds the persisted Chroma database;
# it is handed to both the chromadb settings and the LangChain wrapper.
CHROMA_DB_PATH: str = "../.chromadb"

In [None]:
# Client settings shared by every collection opened in this notebook:
# anonymized telemetry switched off, the DuckDB+Parquet backend selected,
# and data persisted under CHROMA_DB_PATH.
CHROMA_DB_NOTRACK_SETTING = chromadb.config.Settings(
    persist_directory=CHROMA_DB_PATH,
    chroma_db_impl='duckdb+parquet',
    anonymized_telemetry=False,
)
# Reference only (not executed) - a bare client with just telemetry disabled:
# client = chromadb.Client(chromadb.config.Settings(anonymized_telemetry=False))

Example of adding documents to a collection with the native chromadb client. Doing this through LangChain is slightly different. <br>
```python
collection.add(
    documents=["This is a document", "This is another document"],
    metadatas=[{"source": "my_source"}, {"source": "my_source"}],
    ids=["id1", "id2"]
)
```

Example of how to set the `persist_directory` for Chroma.<br>
Documentation Reference: https://python.langchain.com/en/latest/modules/indexes/vectorstores/examples/chroma.html <br>
Always remember to persist after adding content! <br>
```python
# Embed and store the texts
# Supplying a persist_directory will store the embeddings on disk
persist_directory = '../.chromadb'
embedding_function = OpenAIEmbeddings()
db = get_chromadb_collection(collection_name='...')
db.persist() #Always remember to persist after adding content!
```

```python
loader = TextLoader('../examples/data/pg_essay_beyond_smart.txt')
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
docs = text_splitter.split_documents(documents)#[:10] #limit length to 10 for this example.
print(len(docs))
embedding_function = OpenAIEmbeddings()
db = get_chromadb_collection(collection_name='...')
```

https://python.langchain.com/en/latest/modules/chains/index_examples/chat_vector_db.html

In [None]:
# Factory for LangChain `Chroma` stores. Every store it builds shares the same
# embedding function, persist directory and client settings; callers normally
# override only `collection_name` (the bound default is 'langchain').
_chroma_defaults = dict(
    collection_name='langchain',
    embedding_function=OpenAIEmbeddings(),
    persist_directory=CHROMA_DB_PATH,
    client_settings=CHROMA_DB_NOTRACK_SETTING,
    collection_metadata=None,
)
get_chromadb_collection = partial(Chroma, **_chroma_defaults)

In [None]:
# Example (not executed): reopen a persisted collection from disk by name and use it as normal.
# No separate `embedding_function = OpenAIEmbeddings()` is needed; one is already bound in `get_chromadb_collection`.
# db_paulg = get_chromadb_collection(collection_name='paulg_essays')

## Paul G Essay

In [None]:
# Bind a Chroma store to the 'paulg_essays' collection in the persisted database.
db_paulg = get_chromadb_collection(collection_name='paulg_essays')

Using embedded DuckDB with persistence: data will be stored in: ../.chromadb


In [None]:
# Load one Paul Graham essay and cut it into overlapping character-based chunks.
loader = TextLoader('../examples/data/pg_essay_beyond_smart.txt')
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
docs = text_splitter.split_documents(documents)  # slice with [:10] to cap the chunk count while experimenting

# Tag every chunk with a namespace so that searches can be narrowed with a
# metadata filter. The chunks are tagged in place; `docs2` holds the same objects.
for doc in docs:
    doc.metadata['namespace'] = 'paulg_essays'
docs2 = list(docs)
print(len(docs2))
embeddings = OpenAIEmbeddings()

11


In [None]:
# Embed the tagged chunks and append them to the collection `db_paulg` is bound to.
# No `collection_name` is passed here: the target collection is fixed when the
# store is constructed, so the extra keyword was a misleading no-op (and the
# equivalent `db_kwa.add_documents` call does not pass one either).
# NOTE(review): no explicit ids are supplied, so re-running this cell presumably
# stores the same chunks a second time - confirm before re-running.
# One-shot alternative that builds a new store: Chroma.from_documents(docs, embeddings)
db_paulg.add_documents(documents=docs2)
db_paulg.persist()  # write the newly added content to disk

In [None]:
# Retrieve the two closest chunks for the question; each result is a
# (Document, score) pair.
db_paulg.similarity_search_with_score('What is the most important thing?', k=2)

[(Document(page_content="I grew up thinking that being smart was the thing most to be desired. Perhaps you did too. But I bet it's not what you really want. Imagine you had a choice between being really smart but discovering nothing new, and being less smart but discovering lots of new ideas. Surely you'd take the latter. I would. The choice makes me uncomfortable, but when you see the two options laid out explicitly like that, it's obvious which is better.\n\nThe reason the choice makes me uncomfortable is that being smart still feels like the thing that matters, even though I know intellectually that it isn't. I spent so many years thinking it was. The circumstances of childhood are a perfect storm for fostering this illusion. Intelligence is much easier to measure than the value of new ideas, and you're constantly being judged by it. Whereas even the kids who will ultimately discover new things aren't usually discovering them yet. For kids that way inclined, intelligence is the only

In [None]:
# A metadata filter that matches none of the stored chunks is expected to make
# the search raise NoDatapointsException rather than return an empty list.
unmatched_filter = {'namespace': 'paulg_essays2'}
try:
    db_paulg.similarity_search_with_score('test', filter=unmatched_filter, k=2)
except chromadb.errors.NoDatapointsException:
    print('got NoDatapointsException which is expected because the namespace filter did not match')

got NoDatapointsException which is expected because the namespace filter did not match


In [None]:
# Bind a store to a collection name that nothing has been added to in this
# notebook; it is used to show what searching a missing collection does.
db_paulg2 = get_chromadb_collection(collection_name='paulg_essays2')

Using embedded DuckDB with persistence: data will be stored in: ../.chromadb


In [None]:
# Searching a collection that has no index is expected to raise
# NoIndexException; the message below is printed when that happens.
missing_index_note = 'got NoIndexException which is expected because the collection_name "paulg_essays2" does not exist.'
try:
    db_paulg2.similarity_search_with_score('test', k=2)
except chromadb.errors.NoIndexException:
    print(missing_index_note)

got NoIndexException which is expected because the collection_name "paulg_essays2" does not exist.


In [None]:
# db.delete_collection()

## Load KWA data

In [None]:
# Bind a Chroma store to the 'kwa' collection in the same persisted database.
db_kwa = get_chromadb_collection(collection_name='kwa')

Using embedded DuckDB with persistence: data will be stored in: ../.chromadb


In [None]:
# Split every .txt file directly under $KWA_PATH into token-based chunks
# (500 tokens with a 100-token overlap). Token-based splitting is used here
# instead of the CharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
# applied to the essay earlier in the notebook.
splitter = TokenTextSplitter(chunk_size=500, chunk_overlap=100)

# Materialise and sort the file list before looping:
#   * a bare `iterdir()` generator has no length, so tqdm cannot show a total;
#   * `iterdir()` yields entries in arbitrary order, which made the chunk order
#     (and the value left in `split_document_chunks`) vary between runs.
txt_paths = sorted(
    path for path in Path(os.environ["KWA_PATH"]).iterdir()
    if path.is_file() and path.suffix == '.txt'
)

all_documents = []
for filepath in tqdm(txt_paths, desc='Splitting KWA files'):
    loader = TextLoader(str(filepath))
    document = loader.load()
    # This name is read again by a later inspection cell, so after the loop it
    # holds the chunks of the last file processed.
    split_document_chunks = splitter.split_documents(document)
    all_documents += split_document_chunks

0it [00:00, ?it/s]

In [None]:
# Sanity check: total number of chunks, and the character length of the first
# chunk of the last file processed (`split_document_chunks` still holds the
# final value assigned inside the loading loop, so this fails if no file was loaded).
len(all_documents), len(split_document_chunks[0].page_content)

(638, 2509)

In [None]:
# for doc in all_documents:
#     print(doc.metadata)

In [None]:
# Add all KWA chunks to the store bound to the 'kwa' collection, then persist
# it (the notes earlier in this notebook say to always persist after adding content).
db_kwa.add_documents(documents=all_documents)
db_kwa.persist()

In [None]:
# Smoke test: for a fish-related question, report per returned chunk whether
# its text mentions "fish" (case-insensitive).
fish_question = 'What was the name of the project trying to save fish?'
fish_hits = db_kwa.similarity_search(fish_question)
['fish' in hit.page_content.lower() for hit in fish_hits]

[True, True, True, True]