# RAG Agent for PDF Documents

A question-answering system for PDF documents, built on retrieval-augmented generation (RAG).

Features:
- Load PDF documents
- Extract text and metadata
- Create searchable vector index
- Query with natural language
- Get answers with sources

In [None]:
# Notebook setup: standard-library imports, environment loading, and the
# project helpers used by the cells below.
import os, sys
from dotenv import load_dotenv
# Extend the module search path with the parent directory. This must run
# before the Python_RAG_Agent imports below.
# NOTE(review): presumably the package lives one level above this notebook - confirm.
sys.path.append('..')
# Load variables from a .env file into the process environment.
# NOTE(review): presumably this supplies the API key that ChatAnthropic needs
# in a later cell - confirm against the project's .env template.
load_dotenv()

# Project-local helpers: PDF loading, embedding model factory, vector store wrapper.
from Python_RAG_Agent.data_loader import load_pdf_documents
from Python_RAG_Agent.Embeddings import get_default_embeddings
from Python_RAG_Agent.vector_store import VectorStoreManager

In [None]:
# Ingest the sample PDFs; the loader's result is kept in `pdf_docs` for the
# indexing cell, and its length is reported as a page count.
pdf_dir = '../sample_data/pdf_files'
pdf_docs = load_pdf_documents(pdf_dir)
page_count = len(pdf_docs)
print(f'Loaded {page_count} pages from PDFs')

In [None]:
# Index the loaded PDFs: obtain the default embeddings, build the vector
# store from `pdf_docs`, then save it under the data-storage folder.
store_path = '../data_storage/vector_store'
chunking_options = {'chunk_size': 1000, 'chunk_overlap': 200}

embeddings = get_default_embeddings()
vector_manager = VectorStoreManager(embeddings, **chunking_options)
vector_manager.create_vector_store(pdf_docs)
vector_manager.save(store_path)

In [None]:
# Wire the chat model and the vector-store retriever into a RetrievalQA chain.
from langchain.chains import RetrievalQA
from langchain_anthropic import ChatAnthropic

# temperature=0 is passed to the model; k=3 is passed through as the
# retriever's search_kwargs.
llm = ChatAnthropic(
    model='claude-3-5-sonnet-20241022',
    temperature=0,
)
retrieval_options = {'k': 3}
retriever = vector_manager.get_retriever(search_kwargs=retrieval_options)

# return_source_documents=True so the response also carries the documents
# read back under "source_documents" in the query cell.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
)

In [None]:
# Run one sample query through the chain, then print the question, the
# answer, and the metadata of each returned source document.
question = 'What is the company revenue?'
response = qa_chain.invoke({'query': question})

print(f'Q: {question}')

answer = response['result']
print(f'A: {answer}')

source_metadata = [doc.metadata for doc in response['source_documents']]
print(f'Sources: {source_metadata}')