In [1]:
from langchain_community.vectorstores import Chroma
from pathlib import Path
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from tqdm import tqdm
import chromadb

In [2]:
# Directory (relative to the working directory) holding the persistent Chroma store.
CHROMA_DATA_PATH = Path("test") / "chroma"

In [3]:
def open_db(
    name="default",
) -> Chroma:
    """Open the persistent Chroma collection ``name`` as a LangChain vector store.

    The on-disk store lives under ``CHROMA_DATA_PATH``; that directory (and any
    missing parents) is created before the client is opened.

    Args:
        name: Name of the Chroma collection to open.

    Returns:
        A ``Chroma`` vector store bound to the persistent client, using the
        ``all-MiniLM-L6-v2`` sentence-transformer model for embeddings.
    """
    CHROMA_DATA_PATH.mkdir(exist_ok=True, parents=True)
    persistent_client = chromadb.PersistentClient(str(CHROMA_DATA_PATH))
    embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    # The Chroma wrapper gets-or-creates the named collection on the client it is
    # given, so no separate get_or_create_collection() call is needed here.
    return Chroma(
        client=persistent_client,
        collection_name=name,
        embedding_function=embedding_model,
    )

In [9]:
def populate_db(
    name: str,
    docs_book: list,
    chunk_size: int = 1000,
    chunk_overlap: int = 50,
):
    """Split each document into chunks and add the chunks to collection ``name``.

    Args:
        name: Name of the collection to index into; opened with ``open_db``.
        docs_book: Documents to index. They are expected to be plain text
            converted from LaTeX (TODO: add a function that prepares them).
        chunk_size: Target chunk size passed to ``CharacterTextSplitter``.
        chunk_overlap: Overlap between consecutive chunks passed to
            ``CharacterTextSplitter``.

    NOTE(review): no ids are supplied to ``add_documents``, so re-running this
    on the same documents presumably stores the chunks again — confirm.
    """
    db = open_db(
        name=name,
    )
    splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    print(f"Indexing {len(docs_book)} documents")
    for doc in tqdm(docs_book):
        chunks = splitter.split_documents([doc])
        if not chunks:
            # A document that yields no chunks (e.g. empty text) has nothing to
            # index; skip it instead of sending an empty batch to the store.
            continue
        db.add_documents(chunks)

In [5]:
def search_for_documents(query: str, db_name: str):
    """Run a similarity search for ``query`` against collection ``db_name``.

    The collection is opened with ``open_db`` and the result of its
    ``similarity_search`` call is returned unchanged.
    """
    vector_store = open_db(name=db_name)
    return vector_store.similarity_search(query)

In [8]:
# !pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.6.1-py3-none-any.whl.metadata (11 kB)
Collecting transformers<5.0.0,>=4.32.0 (from sentence-transformers)
  Downloading transformers-4.39.3-py3-none-any.whl.metadata (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.2.2-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting scikit-learn (from sentence-transformers)
  Downloading scikit_learn-1.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting scipy (from sentence-transformers)
  Downloading scipy-1.13.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting Pillow (from sentence-transformers)
  Down

In [12]:
class Document:
    """Minimal document record: a text body plus a metadata mapping."""

    def __init__(self, page_content, metadata=None):
        """Store ``page_content`` and ``metadata``.

        When ``metadata`` is omitted (``None``) a fresh empty dict is used, so
        instances never share a default mapping.
        """
        self.metadata = {} if metadata is None else metadata
        self.page_content = page_content

In [14]:
def test_db(db_name: str):
    """Smoke-test indexing and search against collection ``db_name``.

    Reads ``book_1.txt`` and ``book_2.txt`` from the working directory, indexes
    them with ``populate_db``, runs one sample query through
    ``search_for_documents`` and prints every result.

    Args:
        db_name: Name of the collection to populate and then query.
    """
    book_paths = ("book_1.txt", "book_2.txt")

    # Decode explicitly as UTF-8: the books contain typographic quotes, and the
    # platform's default text encoding is not UTF-8 everywhere.
    docs = [
        Document(Path(book_path).read_text(encoding="utf-8"), {})
        for book_path in book_paths
    ]

    populate_db(name=db_name, docs_book=docs)

    search_query = "What was Jenny doing?"
    search_results = search_for_documents(query=search_query, db_name=db_name)

    print("\nSearch Results:")
    for result in search_results:
        print(result)

# Index the two sample books into the "book" collection and run the sample query.
test_db(db_name="book")

Indexing 2 documents


  0%|          | 0/2 [00:00<?, ?it/s]Created a chunk of size 1176, which is longer than the specified 1000
Created a chunk of size 1098, which is longer than the specified 1000
Created a chunk of size 1672, which is longer than the specified 1000
Created a chunk of size 1622, which is longer than the specified 1000
Created a chunk of size 1113, which is longer than the specified 1000
Created a chunk of size 2244, which is longer than the specified 1000
 50%|█████     | 1/2 [00:00<00:00,  3.52it/s]Created a chunk of size 1002, which is longer than the specified 1000
Created a chunk of size 1480, which is longer than the specified 1000
Created a chunk of size 1003, which is longer than the specified 1000
Created a chunk of size 1701, which is longer than the specified 1000
Created a chunk of size 1555, which is longer than the specified 1000
100%|██████████| 2/2 [00:00<00:00,  4.07it/s]



Search Results:
page_content='On her arrival in this place, she went immediately to the habitation\nof an elderly matron; to whom, as this matron had the good fortune to\nresemble herself in the comeliness of her person, as well as in her\nage, she had generally been more favourable than to any of the rest.\nTo this woman she imparted what had happened, and the design upon\nwhich she was come thither that morning. These two began presently to\nscrutinize the characters of the several young girls who lived in any\nof those houses, and at last fixed their strongest suspicion on one\nJenny Jones, who, they both agreed, was the likeliest person to have\ncommitted this fact.'
page_content='“As to your child, let no thoughts concerning it molest you; I will\nprovide for it in a better manner than you can ever hope. And now\nnothing remains but that you inform me who was the wicked man that\nseduced you; for my anger against him will be much greater than you\nhave experienced on this occasio