In [7]:

from langchain_groq import ChatGroq 
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import os
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings
import glob
load_dotenv()

True

In [8]:
def check_credentials():
    """Print a notice for each expected API key missing from the environment.

    Only presence in ``os.environ`` is checked (an empty value still counts as
    present). Nothing is returned and nothing is raised.
    """
    # NOTE(review): the visible pipeline embeds with HuggingFaceEmbeddings, so
    # GOOGLE_API_KEY may no longer be needed - confirm before removing.
    missing_notices = {
        "GROQ_API_KEY": "GROQ_API_KEY not found",
        "GOOGLE_API_KEY": "GOOGLE_API_KEY not found",
    }
    for env_name, notice in missing_notices.items():
        if env_name in os.environ:
            continue
        print(notice)

check_credentials()

In [9]:

def load_pdf(path: str):
    """Load a single PDF file through ``PyPDFLoader``.

    Args:
        path: Location of the PDF file to read.

    Returns:
        Whatever ``PyPDFLoader(path).load()`` returns for that file.
    """
    print(f"Loading PDF from: {path}")
    return PyPDFLoader(path).load()
def load_pdf_from_dir(directory_path: str):
    """Load every ``*.pdf`` file found directly inside ``directory_path``.

    The scan is non-recursive and matches the lowercase ``.pdf`` pattern only.

    Args:
        directory_path: Directory to scan for PDF files.

    Returns:
        A list with one entry per PDF file, each entry being the value that
        ``load_pdf`` returned for that file. Files are processed in sorted
        path order. The list is empty when no PDF matches (a notice is
        printed in that case).
    """
    # glob.glob makes no ordering promise; sorting keeps the document order
    # (and therefore anything built from it) reproducible between runs.
    pdf_files = sorted(glob.glob(os.path.join(directory_path, "*.pdf")))

    if not pdf_files:
        # Surface the problem here rather than letting a later stage fail
        # on an empty document list with a less obvious error.
        print(f"No PDF files found in: {directory_path}")

    return [load_pdf(pdf_path) for pdf_path in pdf_files]
def split_documents(docs, chunk_size=1000, chunk_overlap=150):
    """Split loaded documents into overlapping text chunks.

    Args:
        docs: Either a list of per-file document lists (the shape produced by
            ``load_pdf_from_dir``), a flat list of documents, or a mix of both.
        chunk_size: Passed to ``RecursiveCharacterTextSplitter`` as ``chunk_size``.
        chunk_overlap: Passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``.

    Returns:
        The value returned by the splitter's ``split_documents`` for the
        flattened document list.
    """
    print(f"Splitting documents into chunks of size {chunk_size} with overlap {chunk_overlap}")

    flattened_docs = []
    for item in docs:
        if hasattr(item, "page_content"):
            # Already a single document: keep it as-is instead of iterating
            # over it (which would not yield documents).
            flattened_docs.append(item)
        else:
            # A per-file collection of documents: unpack one level.
            flattened_docs.extend(item)

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(flattened_docs)


def build_vectorstore(splits, model_name="ibm-granite/granite-embedding-30m-english"):
    """Embed document chunks and index them in a FAISS vector store.

    Args:
        splits: Non-empty collection of document chunks to index.
        model_name: Name of the embedding model handed to
            ``HuggingFaceEmbeddings``. Defaults to the model this notebook
            has always used.

    Returns:
        The store returned by ``FAISS.from_documents``.

    Raises:
        ValueError: If ``splits`` is empty, since there is nothing to index.
    """
    if not splits:
        # Fail early with an actionable message instead of letting the
        # index construction fail on an empty embedding list.
        raise ValueError(
            "No document chunks to index; check that the source directory contains readable PDFs."
        )

    print("Building vector store")
    emb = HuggingFaceEmbeddings(model_name=model_name)
    return FAISS.from_documents(splits, emb)

def setup_pipeline(dir_path: str, chunk_size=1000, chunk_overlap=150):
    """Run the indexing stages end to end and return the vector store.

    Stages, in order: load the PDFs in ``dir_path``, split them using
    ``chunk_size`` / ``chunk_overlap``, then build the vector store from the
    resulting chunks.
    """
    print("Setting up the RAG pipeline")
    chunks = split_documents(
        load_pdf_from_dir(dir_path),
        chunk_size,
        chunk_overlap,
    )
    return build_vectorstore(chunks)

def format_docs(docs):
    """Concatenate the ``page_content`` of each document, separated by a blank line."""
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)



In [10]:
# Chat model used to generate answers.
llm = ChatGroq(model="llama-3.1-8b-instant")

# Prompt that restricts the model to the retrieved context; it expects the
# template variables `question` and `context`.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "Answer ONLY from the provided context. If not found, say you don't know."),
        ("human", "Question: {question}\n\nContext:\n{context}"),
    ]
)

In [11]:

def query(vectorstore, question: str, k: int = 4):
    """Answer ``question`` from the chunks most similar to it in ``vectorstore``.

    Args:
        vectorstore: Store exposing ``as_retriever`` (e.g. the value returned
            by ``setup_pipeline``).
        question: The user's question; it is used both as the retrieval query
            and as the ``question`` prompt variable.
        k: Number of chunks to retrieve as context. Defaults to 4, the value
            this notebook previously hard-coded.

    Returns:
        The result of invoking the chain
        ``retrieval -> prompt -> llm -> StrOutputParser`` on ``question``.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")

    retriever = vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={"k": k},
    )

    # Fan the incoming question out into the two variables the prompt needs:
    # retrieved-and-formatted context, and the question itself unchanged.
    parallel = RunnableParallel({
        "context": retriever | RunnableLambda(format_docs),
        "question": RunnablePassthrough(),
    })

    chain = parallel | prompt | llm | StrOutputParser()

    return chain.invoke(question)


In [12]:
# Build the vector store once from the PDFs under ./docs; setup_pipeline
# loads, splits and indexes them, so this cell re-embeds everything on re-run.
dir_path = "./docs"  
vectorstore = setup_pipeline(dir_path)

Setting up the RAG pipeline
Loading PDF from: ./docs/bukhari.pdf
Loading PDF from: ./docs/book1.pdf
Splitting documents into chunks of size 1000 with overlap 150
Building vector store


In [13]:
# Example question: query() retrieves matching chunks from the vector store
# and has the LLM answer from that context.
ques= "What is Event Driven Architecture?"
ans = query(vectorstore, ques)
print(ans)

Event Driven Architecture (EDA) is an architecture style that reverses the tightly coupled synchronous calls between services, instead decoupling systems in time and allowing them to evolve independently of one another. This approach uses events to make the system pluggable and to transfer state between systems.


In [14]:
# Second example question against the same vector store.
# Note: this rebinds the notebook-level `ques` and `ans` names.
ques= "Who is imaam bukhari"
ans = query(vectorstore, ques)
print(ans)

Imaam Bukhari was a scholar of Hadith and the compiler of the book "Sahih Bukhari", a collection of authentic Hadiths of the Holy Prophet Muhammad.
