In [None]:
# Installations:
# pip install langchain langchain-community langchain-google-genai chromadb pypdf sentence-transformers tiktoken python-dotenv

from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.retrievers import MultiQueryRetriever, ContextualCompressionRetriever
from langchain.retrievers.document_compressors import EmbeddingsFilter
import os

In [4]:
# Set API key (get from: https://aistudio.google.com/app/apikey)
# The key is read from GEMINI_API_KEY (typically via a local .env file) and
# mirrored into GOOGLE_API_KEY for the Google GenAI clients used below.
from dotenv import load_dotenv

load_dotenv()

_gemini_api_key = os.getenv("GEMINI_API_KEY")
if _gemini_api_key:
    os.environ["GOOGLE_API_KEY"] = _gemini_api_key
elif not os.environ.get("GOOGLE_API_KEY"):
    # Assigning None into os.environ raises an opaque TypeError; fail with an
    # actionable message instead.
    raise RuntimeError(
        "No API key found: set GEMINI_API_KEY (e.g. in a .env file) "
        "or GOOGLE_API_KEY before running this notebook."
    )

In [21]:
def load_documents(pdf_directory):
    """Load the PDFs matching ``*.pdf`` in ``pdf_directory`` and split them into chunks.

    Args:
        pdf_directory: Directory handed to ``DirectoryLoader``.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_documents`` returns for
        the loaded documents (1000-character chunks, 200-character overlap,
        with the start index recorded on each chunk).
    """
    loaded_docs = DirectoryLoader(
        pdf_directory,
        glob="*.pdf",
        loader_cls=PyPDFLoader,
        show_progress=True,
    ).load()

    chunker = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        add_start_index=True,
    )
    return chunker.split_documents(loaded_docs)

In [22]:
print("Loading documents...")
# The default is the original machine-specific folder; set the PAPERS_DIR
# environment variable to point the notebook elsewhere without editing it.
doc_dir = os.getenv("PAPERS_DIR", r"D:\AI_ML_DL\papers")
docs = load_documents(doc_dir)
if not docs:
    # Fail here rather than later with a confusing vector-store error.
    raise RuntimeError(f"No PDF chunks were loaded from {doc_dir}")
print(f"Loaded {len(docs)} document chunks from {doc_dir}")


Loading documents...


100%|██████████| 9/9 [00:02<00:00,  3.51it/s]

Loaded 653 document chunks from D:\AI_ML_DL\papers





In [30]:
# --- Embedding model ---------------------------------------------------
# One shared embedding client, used both to index the chunks and to score
# retrieved documents in the embeddings filter.
EMBEDDING_MODEL_NAME = "models/embedding-001"
embeddings = GoogleGenerativeAIEmbeddings(model=EMBEDDING_MODEL_NAME)

In [31]:
def create_vector_store(docs, persist_directory="./chroma_multi_pdf"):
    """Create (or refresh) the persisted ChromaDB store for ``docs``.

    Each chunk gets a deterministic id derived from its position, its
    ``source`` / ``page`` / ``start_index`` metadata and its text. Without
    explicit ids every re-run of this cell would add the same chunks again
    under fresh random ids, so the persisted collection would fill with
    duplicates. With stable ids a re-run is expected to overwrite the existing
    entries instead (the LangChain Chroma wrapper upserts by id -- verify
    against the installed version).

    Args:
        docs: Iterable of document chunks exposing ``page_content`` and
            ``metadata``.
        persist_directory: Where Chroma persists the collection. Defaults to
            the location this notebook has always used.

    Returns:
        The vector store returned by ``Chroma.from_documents``, built with the
        module-level ``embeddings`` client and cosine distance.
    """
    import hashlib

    docs = list(docs)  # allow any iterable; it is walked twice below

    chunk_ids = []
    for position, doc in enumerate(docs):
        meta = getattr(doc, "metadata", None) or {}
        # The position keeps ids unique within the batch even when two chunks
        # share identical text and metadata.
        fingerprint = "\x1f".join(
            str(part)
            for part in (
                position,
                meta.get("source", ""),
                meta.get("page", ""),
                meta.get("start_index", ""),
                doc.page_content,
            )
        )
        digest = hashlib.sha256(fingerprint.encode("utf-8", "surrogatepass"))
        chunk_ids.append(digest.hexdigest())

    return Chroma.from_documents(
        documents=docs,
        embedding=embeddings,
        ids=chunk_ids,
        collection_metadata={"hnsw:space": "cosine"},
        persist_directory=persist_directory,
    )

# Index the loaded chunks; the retriever below searches this store.
print("Creating vector store...")
vector_store = create_vector_store(docs=docs)

Creating vector store...


In [32]:
def _sources_under_directory(store, directory):
    """Return the distinct ``source`` metadata values in ``store`` that lie under ``directory``."""
    root = os.path.normcase(os.path.abspath(directory))
    # Compare against "root + separator" so "/papers" does not match "/papers_old".
    prefix = root if root.endswith(os.sep) else root + os.sep

    matches = set()
    stored = store.get(include=["metadatas"]) or {}
    for meta in stored.get("metadatas") or []:
        source = (meta or {}).get("source")
        if not isinstance(source, str):
            continue
        if os.path.normcase(os.path.abspath(source)).startswith(prefix):
            # Keep the string exactly as stored: the filter matches on equality.
            matches.add(source)
    return sorted(matches)


def create_advanced_retriever(pdf_directory):
    """Create a multi-query retriever with contextual compression.

    The pipeline is: MMR search over the module-level ``vector_store``
    (restricted to chunks whose ``source`` file lives under ``pdf_directory``)
    -> LLM-generated query variants -> an embeddings-similarity filter.

    The ``source`` metadata holds a per-file path, so filtering on the
    directory string itself can never match. The filter is therefore built
    from the file paths actually stored in the index.

    Args:
        pdf_directory: Directory whose indexed PDFs should be searchable.

    Returns:
        A ``ContextualCompressionRetriever`` wrapping the multi-query retriever.

    Raises:
        ValueError: If the vector store holds no chunk whose ``source`` is
            under ``pdf_directory`` (the retriever could only return nothing).
    """
    sources = _sources_under_directory(vector_store, pdf_directory)
    if not sources:
        raise ValueError(
            f"No indexed documents found under {pdf_directory!r}; "
            "build the vector store from that directory first."
        )

    # Base retriever. ``score_threshold`` is intentionally not passed: it only
    # applies to the "similarity_score_threshold" search type, not to MMR.
    base_retriever = vector_store.as_retriever(
        search_type="mmr",
        search_kwargs={
            "k": 10,
            "filter": {"source": {"$in": sources}},
        },
    )

    # Multi-query expansion.
    # NOTE(review): confirm the "gemini-pro" model id is still served by the API.
    multi_retriever = MultiQueryRetriever.from_llm(
        retriever=base_retriever,
        llm=ChatGoogleGenerativeAI(model="gemini-pro", temperature=0),
    )

    # Contextual compression: drop retrieved chunks that are not similar
    # enough to the query according to the shared embedding model.
    embeddings_filter = EmbeddingsFilter(
        embeddings=embeddings,
        similarity_threshold=0.75,
    )
    return ContextualCompressionRetriever(
        base_compressor=embeddings_filter,
        base_retriever=multi_retriever,
    )

# Assemble the retrieval pipeline for the documents directory loaded above.
print("Creating retriever...")
retriever = create_advanced_retriever(pdf_directory=doc_dir)

Creating retriever...


In [33]:
# Custom prompt template with source awareness.
# The template has two placeholders: {context} and {question}. It instructs
# the model to answer only from the supplied context, in Markdown, with
# per-source citations, and to reply "I need more context" when unsure.
prompt_template = """
You are an expert research assistant analyzing multiple documents.
Use ONLY the following context from multiple PDF sources to answer:

{context}

---
Question: {question}
Answer in MARKDOWN format with SOURCE CITATIONS. If uncertain, say "I need more context".

Structure:
1. Direct answer
2. Supporting evidence with [Source: filename.pdf, page X]
3. Comparison of perspectives if multiple sources exist
"""

# Wrap the raw string in a PromptTemplate, declaring the two variables the
# template text expects.
PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

# Create QA chain with Gemini Pro
# NOTE(review): confirm the "gemini-pro" model id is still served by the API.
llm = ChatGoogleGenerativeAI(
    model="gemini-pro",
    temperature=0.1,
    convert_system_message_to_human=True
)

# PROMPT asks for "[Source: filename.pdf, page X]" citations, but by default
# the "stuff" chain inserts only each chunk's text into {context}, so the
# model would have no source or page to cite. This per-document prompt puts
# that metadata in front of every chunk.
# The "page" metadata is treated as 0-based here, matching query(), which
# adds 1 before display.
# NOTE(review): every retrieved chunk must carry "source" and "page" metadata
# for this prompt to format -- presumably true for PDF-loaded chunks; confirm.
DOCUMENT_PROMPT = PromptTemplate(
    template=(
        "[Source: {source}, page index {page} "
        "(0-based; add 1 for the page number)]\n{page_content}"
    ),
    input_variables=["page_content", "source", "page"]
)

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": PROMPT, "document_prompt": DOCUMENT_PROMPT},
    return_source_documents=True
)

def query(question):
    """Run ``question`` through the QA chain and summarise the cited sources.

    Args:
        question: The user's question; sent to the chain under the ``"query"`` key.

    Returns:
        dict with:
          * ``"answer"``  - the chain's ``result`` text.
          * ``"sources"`` - one line per source file, formatted as
            ``"<file name> (pages 1, 3)"``, with 1-based, de-duplicated,
            sorted page numbers. Lines follow first-appearance order.

    Raises:
        ValueError: If the module-level ``qa_chain`` is falsy.
    """
    import ntpath  # its basename() splits on both "/" and "\\" on every OS

    if not qa_chain:
        raise ValueError("qa_chain is not initialized; run the QA chain setup cell first")

    result = qa_chain.invoke({"query": question})

    # Group the (1-based) page numbers by source file.
    pages_by_source = {}
    for doc in result.get("source_documents", []):
        meta = doc.metadata or {}
        source = meta.get("source", "unknown")
        # Loader pages are 0-based; a missing page is reported as page 1.
        page_number = meta.get("page", 0) + 1
        pages_by_source.setdefault(source, set()).add(page_number)

    # One line per source. The name is shortened for Windows and POSIX paths
    # alike, falling back to the raw value if the basename is empty.
    source_lines = [
        f"{ntpath.basename(source) or source} "
        f"(pages {', '.join(map(str, sorted(pages)))})"
        for source, pages in pages_by_source.items()
    ]

    return {
        "answer": result["result"],
        "sources": "\n".join(source_lines),
    }

In [None]:
# Example conversation
questions = [
    "What are the main differences in methodology between paper A and paper B?",
    "How do these papers approach ethical considerations?",
    "Create a comparison table of results from all papers",
]

for question in questions:
    print(f"\n\033[1mQuestion:\033[0m {question}")
    # query() is the module-level helper defined above; this notebook never
    # defines a `chatbot` object, so `chatbot.query(...)` raised NameError.
    response = query(question)
    print(f"\n\033[1mAnswer:\033[0m\n{response['answer']}")
    print(f"\n\033[1mSources:\033[0m\n{response['sources']}")
    print("\n" + "-"*80)