### Import required libraries for document loading, splitting, embedding, and vector database storage

In [2]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
import os

### Load PDF documents from the 'data' directory and extract each page as a separate document

In [3]:
documents = []
for filename in os.listdir("./data"):
    if filename.endswith(".pdf"):
        loader = PyPDFLoader(os.path.join("./data", filename))
        pdf_pages = loader.load()
        for i, page in enumerate(pdf_pages):
            page.metadata["source"] = filename
            page.metadata["page"] = i + 1
        documents.extend(pdf_pages)

### Split the loaded documents into smaller, overlapping chunks for better embedding performance

In [4]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)
docs = text_splitter.split_documents(documents)

### Initialize the HuggingFace embedding model to convert text chunks into vector representations

In [5]:
embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

  embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


### Create a new Chroma vector database (if not exists), or load the existing one and add embeddings

In [6]:
if os.path.exists("./chroma_db"):
    vectordb = Chroma(persist_directory="./chroma_db", embedding_function=embedding_function)
    print("📁 Loaded existing Chroma vector database.")
else:
    vectordb = Chroma.from_documents(docs, embedding_function, persist_directory="./chroma_db")
    vectordb.persist()
    print("✅ Created and saved new Chroma vector database.")

✅ Created and saved new Chroma vector database.


  vectordb.persist()
