## RAG (Retrieval Augmented Generation) model

Dependencies: langchain, chromadb, pypdf, pytest

In [1]:
from langchain_community.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document

# Importing ollama embedding model that we run Locally
from langchain_community.embeddings.ollama import OllamaEmbeddings


# Importing chroma database
from langchain_community.vectorstores.chroma import Chroma


import argparse
import os
import shutil

Global Variables

In [2]:
# Directory handed to Chroma as persist_directory; clear_database() removes it.
CHROMA_PATH = "chroma/"
# Directory that PyPDFDirectoryLoader reads the source PDFs from.
DATA_PATH = "data/"

# Loading the PDF data from our folder

In [3]:
def load_documents():
    """Load the PDFs found under DATA_PATH.

    Returns:
        The result of ``PyPDFDirectoryLoader(DATA_PATH).load()``.
    """
    return PyPDFDirectoryLoader(DATA_PATH).load()

# Splitting the documents

In [4]:
def split_documents(documents: list[Document]):
    """Split the given documents into overlapping text chunks.

    Args:
        documents: Documents to split.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``
        configured with a chunk size of 1000 and an overlap of 200, both
        measured with ``len``.
    """
    splitter_options = {
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
        "is_separator_regex": False,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(documents)

# Embedding function
Uses the `nomic-embed-text` model, served by a local Ollama server (http://localhost:11434).

In [5]:
def get_embedding_function():
    """Build the Ollama embedding object used for the vector store.

    Returns:
        An ``OllamaEmbeddings`` instance configured for "nomic-embed-text".
    """
    # Alternative model: OllamaEmbeddings(model="llama3")
    return OllamaEmbeddings(model="nomic-embed-text")

# Assigning chunk IDs in the form Source:Page:Chunk (e.g. data/invoice.pdf:6:2)

In [6]:
def calculate_chunk_ids(chunks):
    """Assign each chunk an ID of the form "source:page:index".

    The ID is stored in ``chunk.metadata["id"]``. ``source`` and ``page`` are
    read from the chunk's metadata, and ``index`` counts the chunks seen so
    far for that source/page pair, starting at 0.

    The index is tracked per page rather than only against the previous
    chunk, so chunks of one page that are not adjacent in ``chunks`` still
    receive distinct IDs. For input where each page's chunks are contiguous
    the IDs are the same as a simple "reset on page change" counter.

    Args:
        chunks: Iterable of objects exposing a mutable ``metadata`` dict.

    Returns:
        The same ``chunks`` object, with every chunk's metadata updated
        in place.
    """
    next_index_by_page = {}

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        page_id = f"{source}:{page}"

        # First chunk of a page gets index 0; later ones continue the count.
        chunk_index = next_index_by_page.get(page_id, 0)
        next_index_by_page[page_id] = chunk_index + 1

        chunk.metadata["id"] = f"{page_id}:{chunk_index}"

    return chunks

# Adding to ChromaDB

In [7]:
def add_to_chroma(chunks: list[Document], batch_size=5):
    """Add chunks that are not yet stored to the Chroma database, in batches.

    Each chunk is given an ID via ``calculate_chunk_ids``; chunks whose ID is
    already present in the database are skipped.

    Args:
        chunks: Chunks to store.
        batch_size: Number of chunks sent to ``db.add_documents`` per call.
            Must be at least 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. A negative step would
            otherwise make the batching loop empty and silently add nothing.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    db = Chroma(
        persist_directory=CHROMA_PATH, embedding_function=get_embedding_function()
    )

    chunks_with_ids = calculate_chunk_ids(chunks)

    # include=[] requests no extra fields; only the "ids" entry is used below.
    existing_items = db.get(include=[])
    existing_ids = set(existing_items["ids"])
    print(f"Number of existing documents in DB: {len(existing_ids)}")

    new_chunks = [
        chunk for chunk in chunks_with_ids
        if chunk.metadata["id"] not in existing_ids
    ]

    if not new_chunks:
        print("✅ No new documents to add")
        return

    print(f"👉 Adding new documents: {len(new_chunks)}")
    # Ceiling division, computed once instead of on every iteration.
    total_batches = (len(new_chunks) + batch_size - 1) // batch_size
    batch_starts = range(0, len(new_chunks), batch_size)
    for batch_number, start in enumerate(batch_starts, start=1):
        batch = new_chunks[start:start + batch_size]
        batch_ids = [chunk.metadata["id"] for chunk in batch]
        db.add_documents(batch, ids=batch_ids)
        print(f"Added batch {batch_number}/{total_batches}")

    # NOTE(review): newer Chroma releases may persist automatically and
    # deprecate this call — confirm against the installed version.
    db.persist()


# Clearing ChromaDB

In [8]:
def clear_database():
    """Remove the CHROMA_PATH directory tree if it exists.

    Prints a confirmation after removal; does nothing when the path is absent.
    """
    if not os.path.exists(CHROMA_PATH):
        return

    shutil.rmtree(CHROMA_PATH)
    print("✅ Database cleared")

# Main for testing

In [9]:
# End-to-end ingestion run: load the PDFs, split them into chunks, then add
# any chunks not already stored to the Chroma database.
documents = load_documents()
chunks = split_documents(documents)
# Prints the existing document count and per-batch progress.
add_to_chroma(chunks)


Number of existing documents in DB: 0
👉 Adding new documents: 82
Added batch 1/17
Added batch 2/17
Added batch 3/17
Added batch 4/17
Added batch 5/17
Added batch 6/17
Added batch 7/17
Added batch 8/17
Added batch 9/17
Added batch 10/17
Added batch 11/17
Added batch 12/17
Added batch 13/17
Added batch 14/17
Added batch 15/17
Added batch 16/17
Added batch 17/17


  warn_deprecated(
