In [8]:
# export CHROMA_PATH="/Users/ongbt/Downloads/psdsrc"

In [14]:
from langchain.document_loaders import DirectoryLoader
from langchain_community.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_community.embeddings.ollama import OllamaEmbeddings
from langchain.vectorstores.chroma import Chroma
import os
import shutil

# Directory where the Chroma vector store is persisted. Note that
# save_to_chroma() deletes this directory before rebuilding the store.
CHROMA_PATH = "../chroma"
# Directory holding the source PDF files to ingest.
DATA_PATH = "../data/books"


def main():
    """Entry point: build the data store via generate_data_store()."""
    generate_data_store()


def generate_data_store():
    """Run the ingestion pipeline end to end.

    Loads the source documents, splits them into chunks, and writes the
    chunks to the Chroma store.
    """
    pages = load_documents()
    save_to_chroma(split_text(pages))


def load_documents():
    """Load the PDF files found under DATA_PATH.

    Returns:
        The list of documents returned by ``PyPDFDirectoryLoader.load()``.
    """
    # A single loader is enough: PyPDFDirectoryLoader both discovers the PDF
    # files in the directory and parses them.
    return PyPDFDirectoryLoader(DATA_PATH).load()


def split_text(documents: list[Document]):
    """Split documents into overlapping text chunks.

    Uses a RecursiveCharacterTextSplitter with a chunk size of 300 and an
    overlap of 100 (measured with ``len``), recording each chunk's start
    index in its metadata.

    Args:
        documents: The documents to split.

    Returns:
        The list of chunks produced by the splitter (possibly empty).
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    # Print the first chunk as a quick sanity check. Guarded because the
    # splitter can return no chunks at all (e.g. no documents were loaded),
    # in which case indexing chunks[0] would raise IndexError.
    if chunks:
        sample = chunks[0]
        print(sample.page_content)
        print(sample.metadata)

    return chunks


def save_to_chroma(chunks: list[Document]):
    """Rebuild the Chroma store at CHROMA_PATH from the given chunks.

    Any existing directory at CHROMA_PATH is removed first, then a new store
    is created from ``chunks`` using Ollama embeddings and persisted.

    Args:
        chunks: The document chunks to embed and store.
    """
    # Wipe any previous store on disk before rebuilding.
    store_exists = os.path.exists(CHROMA_PATH)
    if store_exists:
        shutil.rmtree(CHROMA_PATH)

    # Embed the chunks and write them to a fresh store.
    embedder = OllamaEmbeddings()
    vector_store = Chroma.from_documents(
        chunks, embedder, persist_directory=CHROMA_PATH
    )
    vector_store.persist()
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")


# Run the pipeline only when this module is executed directly, not on import.
if __name__ == "__main__":
    main()

Split 53 documents into 629 chunks.
PUBLIC SECTOR DATA SECURITY REVIEW COMMITTEE REPORT  
A-1 
 ANNEX A: EXISTING GO VERNMENT EFFORTS IN 
USING DATA SECURELY  
 
1 The Committee focused on data security, rather than cybersecurity, as there 
are ongoing work streams to strengthen the Government’s cybersecurity posture. The
{'source': '../data/books/annexes-to-the-psdsrc-final-report.pdf', 'page': 0, 'start_index': 0}
Saved 629 chunks to ../chroma.
