## Simple RAG (HF + Chroma)

In [1]:
import warnings
# Suppress every FutureWarning from this point on so that deprecation notices
# do not clutter the cell outputs below.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
import os
from dotenv import load_dotenv
# Populate the process environment (python-dotenv) before the token lookup
# below, so the token can be supplied without hard-coding it in the notebook.
load_dotenv()
from huggingface_hub import login
# Authenticate against the Hugging Face Hub with the token taken from the
# HUGGINGFACEHUB_API_TOKEN environment variable. os.getenv returns None when
# the variable is unset.
# NOTE(review): confirm how login() behaves with token=None in this setup.
login(token=os.getenv("HUGGINGFACEHUB_API_TOKEN"))

In [3]:
# Import necessary libraries
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEndpoint
from langchain.chains import RetrievalQA
import os

# Step 1: Load the PDF document
def load_pdf(pdf_path):
    """Read the PDF at ``pdf_path`` and return the loader's documents.

    Args:
        pdf_path: Path of the PDF file handed to ``PyPDFLoader``.

    Returns:
        Whatever ``PyPDFLoader(pdf_path).load()`` returns.
    """
    print("Loading PDF...")
    return PyPDFLoader(pdf_path).load()

# Step 2: Split the document into chunks
def split_documents(documents):
    """Split ``documents`` into overlapping chunks for embedding.

    The splitter is configured for chunks of at most 1000 units with an
    overlap of 200 between neighbours; lengths are measured with ``len``.

    Args:
        documents: Documents to split, e.g. the result of ``load_pdf``.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter``.
    """
    print("Splitting PDF...")
    splitter_settings = {
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_settings)
    return splitter.split_documents(documents)

# Step 3: Create embeddings and vector store
def get_vector_store(pdf_path, persist_directory="./chroma_db"):
    """Return a Chroma vector store, reusing a persisted one when available.

    If ``persist_directory`` is an existing, non-empty directory, the store in
    it is opened as-is and ``pdf_path`` is NOT read. To index a different PDF,
    pass a different ``persist_directory`` or remove the old one first.
    Otherwise the PDF is loaded, split into chunks, embedded and persisted to
    ``persist_directory``.

    Args:
        pdf_path: Path of the PDF to index when no persisted store exists.
        persist_directory: Directory used both to look for an existing store
            and to persist a newly created one. Defaults to ``"./chroma_db"``.

    Returns:
        The ``Chroma`` vector store instance.

    Raises:
        ValueError: If a new store has to be built but the PDF produced no
            chunks.
    """
    # Initialize embeddings for vector store (CPU inference, normalized vectors).
    embeddings = HuggingFaceEmbeddings(
        model_name="BAAI/bge-small-en-v1.5",
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': True},
    )

    # A non-empty directory is treated as an existing store. isdir() avoids
    # calling listdir() on a path that exists but is a regular file.
    if os.path.isdir(persist_directory) and os.listdir(persist_directory):
        print("Loading existing Vector Store...")
        return Chroma(
            persist_directory=persist_directory,
            embedding_function=embeddings,
        )

    # Load and process the PDF
    documents = load_pdf(pdf_path)
    chunks = split_documents(documents)

    # Fail before anything is written to disk: an empty store left in
    # persist_directory could be picked up by the "existing store" branch on
    # later runs.
    if not chunks:
        raise ValueError(
            f"No text chunks could be extracted from {pdf_path!r}; "
            "refusing to create an empty vector store."
        )

    print("Creating Vector Store...")
    # Persist to the same directory that the existence check above inspects.
    return Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=persist_directory,
    )

# Step 4: Initialize the LLM using HuggingFaceEndpoint
def initialize_llm():
    """Build the ``HuggingFaceEndpoint`` LLM used to generate answers.

    The endpoint targets ``mistralai/Mistral-7B-Instruct-v0.2`` with sampling
    enabled at temperature 0.2, a repetition penalty of 1.1, at most 512 new
    tokens, and ``return_full_text=False``.

    Returns:
        The configured ``HuggingFaceEndpoint`` instance.
    """
    print("Initlializing LLM...")
    endpoint_options = dict(
        repo_id="mistralai/Mistral-7B-Instruct-v0.2",
        max_new_tokens=512,
        temperature=0.2,
        do_sample=True,
        repetition_penalty=1.1,
        return_full_text=False,
    )
    return HuggingFaceEndpoint(**endpoint_options)

# Step 5: Create the RAG pipeline
def create_rag_pipeline(vectorstore, llm):
    """Combine a vector store and an LLM into a ``RetrievalQA`` chain.

    Args:
        vectorstore: Store whose ``as_retriever`` supplies the retriever; it
            is asked for ``k=3`` results per query.
        llm: Language model passed to the chain.

    Returns:
        A ``RetrievalQA`` chain of type ``"stuff"`` configured to return the
        source documents along with the result.
    """
    print("Creating QA Chain...")
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
        return_source_documents=True,
    )

# Step 6: Main function to run the RAG system
def main(pdf_path, query):
    """Answer ``query`` against the PDF at ``pdf_path`` with the RAG pipeline.

    Builds (or reloads) the vector store, initializes the LLM, wires both into
    a QA chain and invokes it once.

    Args:
        pdf_path: Path of the PDF, forwarded to ``get_vector_store``.
        query: Question passed to the chain under the ``"query"`` key.

    Returns:
        A dict with ``"answer"`` (the chain's ``"result"``) and
        ``"source_documents"`` (the chain's ``"source_documents"``).
    """
    # Build the components in the same order as the progress messages appear.
    store = get_vector_store(pdf_path)
    model = initialize_llm()
    chain = create_rag_pipeline(store, model)

    # Run the single query and repackage the fields callers care about.
    chain_output = chain.invoke({"query": query})
    return dict(
        answer=chain_output["result"],
        source_documents=chain_output["source_documents"],
    )

# Example usage: run one sample question against a sample PDF and print the
# answer between two separator lines.
if __name__ == "__main__":
    # Relative path: the PDF is looked up in the current working directory.
    pdf_path = "Trigger_Developer_Guide.pdf"
    query = "What is the difference between PRE and POST operations in OpenPages Triggers?"
    
    # main() returns a dict with "answer" and "source_documents"; only the
    # answer is printed here.
    response = main(pdf_path, query)
    print("="*100)
    print("Questions:", query)
    print("Answer:", response["answer"])
    print("="*100)

Loading PDF...
Splitting PDF...
Creating Vector Store...
Initlializing LLM...
Creating QA Chain...
Questions: What is the difference between PRE and POST operations in OpenPages Triggers?
Answer:  In OpenPages Triggers, PRE operations refer to the phase before the execution of a method, while POST operations refer to the phase after the execution of a method. PRE triggers are used for validating data, setting new values, and performing calculations before the data is persisted, while POST triggers are used for updating related objects, creating, moving, copying, or locking other objects based on the current object in context. Additionally, due to the parallel processing capabilities introduced in OpenPages v7.1, it may be necessary to configure trigger executions to occur serially to ensure consistent behavior.
