In [None]:
Make the RAG pipeline strictly PDF-only, so that:
=========================================
It never answers using pretrained model knowledge.

It only answers from your loaded PDF chunks.

If nothing relevant is found, it always says "Answer not found in the document."


In [None]:
#  Install (if not already installed)
# pip install langchain langchain_community faiss-cpu sentence-transformers pypdf

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# Retrieval-only smoke test: index the PDF, then print the chunks that clear
# the relevance threshold for a sample query. No LLM is called in this cell.

# Step 1: Load your PDF (or use any small sample PDF)
loader = PyPDFLoader("attention.pdf")
documents = loader.load()

# Step 2: Split text into chunks
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
docs = splitter.split_documents(documents)

# Step 3: Create embeddings using Hugging Face
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Step 4: Build FAISS vector database
vectorstore = FAISS.from_documents(docs, embedding=embeddings)

# Step 5: Create retriever with threshold
# The threshold and k are named once so the retriever and the scored search
# below cannot drift apart.
SCORE_THRESHOLD = 0.2
TOP_K = 3
retriever_obj = vectorstore.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"score_threshold": SCORE_THRESHOLD, "k": TOP_K},
)

# Step 6: Ask a question to test retrieval
query = "Explain what artificial intelligence means."

# Use the relevance-score search with the same threshold as the retriever.
# `similarity_search_with_score` would return a raw distance (lower = closer,
# per the LangChain FAISS docs) and apply no threshold, so irrelevant chunks
# would always be printed and the number would be mislabelled as a similarity.
search_results = vectorstore.similarity_search_with_relevance_scores(
    query, k=TOP_K, score_threshold=SCORE_THRESHOLD
)

print(f"\n Query: {query}")
print("=======================================")

# PDF-only contract: if no chunk clears the threshold, say so explicitly.
if not search_results:
    print("Answer not found in the document.")

for i, (doc, score) in enumerate(search_results, start=1):
    print(f"\n Result {i}:  (Similarity Score: {round(score, 3)})")
    print("---------------------------------------")
    print(doc.page_content[:400], "...")


In [None]:
Full Example: PDF Summarization using Groq (LangChain RAG)

In [None]:
#  Step 1: Install required packages
# pip install langchain langchain_community langchain_groq sentence-transformers faiss-cpu pypdf

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
from langchain.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
import os

# Step 2: Make sure a Groq API key is available
# Never hardcode the key. If GROQ_API_KEY is already set in the environment it
# is left untouched; otherwise prompt for it without echoing. (The previous code
# overwrote the variable with the literal text "os.getenv('GROQ_API_KEY')".)
if not os.getenv("GROQ_API_KEY"):
    from getpass import getpass

    os.environ["GROQ_API_KEY"] = getpass("Enter your Groq API key: ")

#  Step 3: Load the PDF
loader = PyPDFLoader("attention.pdf")   # Replace with your own PDF filename
documents = loader.load()
print(f" Loaded {len(documents)} pages from PDF")

#  Step 4: Split into chunks
splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
docs = splitter.split_documents(documents)
print(f"Split into {len(docs)} chunks")

#  Step 5: Create embeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Step 6: Create FAISS vectorstore
vectorstore = FAISS.from_documents(docs, embedding=embeddings)

#  Step 7: Create retriever
# Only chunks whose relevance score clears the threshold are passed to the LLM.
retriever = vectorstore.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"score_threshold": 0.2, "k": 4}
)

#  Step 8: Initialize Groq LLM
llm_model = ChatGroq(model="llama-3.1-8b-instant", api_key=os.getenv("GROQ_API_KEY"))

#  Step 9: Create prompt
# {context} is filled with the retrieved chunks, {input} with the user query.
prompt = ChatPromptTemplate.from_template("""
You are a professional document summarizer.
Use ONLY the following context from the document to summarize the content.
If you don’t find enough information, say “Information not found in the document.”

Context:
{context}

User question:
{input}
""")

#  Step 10: Create Stuff Documents Chain
qa_chain = create_stuff_documents_chain(llm_model, prompt)

#  Step 11: Combine Retriever + QA chain = RAG pipeline
rag_chain = create_retrieval_chain(retriever, qa_chain)

#  Step 12: Run query — ask for summary
query = "Summarize the key results and findings from this PDF."
response = rag_chain.invoke({"input": query})

print("\n Final Summary:\n")
# The chain's result carries the retrieved chunks under "context". If nothing
# cleared the threshold, the model had no document text to work from, so do not
# show its output; report the fallback message instead.
if response.get("context"):
    print(response["answer"])
else:
    print("Information not found in the document.")
