In [11]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import chromadb

In [12]:
# Notebook configuration: where the source PDFs live and where the
# Chroma database is persisted. Both paths are relative to the
# notebook's working directory.
DATA_PATH = "data"
CHROMA_PATH = "chroma_db"
COLLECTION_NAME = "vegetable_gardening"

# Persistent client backed by the CHROMA_PATH directory.
chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)

# Re-running this cell reuses the collection if it already exists
# instead of failing or creating a duplicate.
collection = chroma_client.get_or_create_collection(name=COLLECTION_NAME)

In [13]:
# Load the PDF files found under DATA_PATH.
# NOTE(review): the loader presumably yields one Document per PDF page,
# each with `page_content` and `metadata` - confirm against the installed
# langchain_community version.
loader = PyPDFDirectoryLoader(path=DATA_PATH)
raw_document = loader.load()

In [None]:
# Splitting the document
# Break the loaded documents into small overlapping chunks. Sizes are
# measured with `len`, i.e. in characters: at most 300 per chunk, with
# 100 characters shared between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=100,
    length_function=len,
    is_separator_regex=False,
)

chunks = text_splitter.split_documents(raw_document)

# Use f-strings: the previous `print("...", {len(chunks)})` passed a set
# literal, so the output showed `{123}` / a set repr with escaped newlines.
print(f"Number of document chunks: {len(chunks)}")
if chunks:
    print(f"Sample chunk 0:\n{chunks[0].page_content}")
else:
    # Avoid an IndexError on chunks[0] when nothing was loaded or split.
    print("No chunks were produced - check that the data directory contains readable PDFs.")

In [None]:
# Preparing the data to be added to chromadb
# Build three parallel lists (same length, same order) from the chunks:
# the chunk text, the chunk metadata, and a string id per chunk.
documents = [chunk.page_content for chunk in chunks]
metadata = [chunk.metadata for chunk in chunks]

# IDs are purely positional ("ID0", "ID1", ...), so a given id refers to
# the same text only as long as the input files and splitter settings
# are unchanged.
ids = [f"ID{index}" for index in range(len(chunks))]

# Show a count and a short preview instead of dumping every chunk of the
# corpus into the cell output.
PREVIEW_COUNT = 3
print(f"Prepared {len(ids)} chunks")
print("IDs: ", ids[:PREVIEW_COUNT])
print("Documents: ", documents[:PREVIEW_COUNT])
print("Metadata: ", metadata[:PREVIEW_COUNT])

In [16]:
# Adding to chromadb
# Upsert in fixed-size slices rather than one call with every chunk:
# Chroma enforces a maximum batch size per call, which a directory of
# PDFs split into 300-character chunks can easily exceed. Slicing also
# makes an empty input a no-op instead of an error from upsert().
# NOTE(review): 1000 is a conservative choice assumed to be below the
# client's limit - confirm against the installed chromadb version.
UPSERT_BATCH_SIZE = 1000

for start in range(0, len(ids), UPSERT_BATCH_SIZE):
    end = start + UPSERT_BATCH_SIZE
    collection.upsert(
        documents=documents[start:end],
        metadatas=metadata[start:end],
        ids=ids[start:end],
    )

print(f"Upserted {len(ids)} chunks into the collection.")