In [33]:
import hashlib
import json
import os
from pathlib import Path
from typing import List, Any

import chromadb
import numpy as np
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sklearn.feature_extraction.text import TfidfVectorizer


In [34]:
# Folder holding the source PDFs. Set the RAG_DATA_PATH environment variable to
# point the notebook at another folder; the original location stays the default.
DATA_PATH = os.environ.get("RAG_DATA_PATH", r"C:\Users\andia\OneDrive\Desktop\EXAM")

In [36]:
# Load every PDF that sits directly inside DATA_PATH.
# The names are sorted because os.listdir() returns entries in arbitrary order,
# and the extension test is case-insensitive so a file such as "REPORT.PDF"
# is not silently skipped.
pdf_files = sorted(f for f in os.listdir(DATA_PATH) if f.lower().endswith(".pdf"))

document = []  # accumulates what the loader returns for all PDFs

for pdf_file in pdf_files:
    file_path = os.path.join(DATA_PATH, pdf_file)
    loader = PyPDFLoader(file_path)
    documents = loader.load()
    document.extend(documents)  # accumulate all pages from all PDFs

print(f"Loaded {len(document)} pages from {len(pdf_files)} PDFs")

Loaded 14 pages from 2 PDFs


In [37]:
# Alternative loading route: let DirectoryLoader walk DATA_PATH and hand each
# matching file to PyMuPDFLoader.
dir_loader = DirectoryLoader(
    DATA_PATH,
    glob="**/*.pdf",           # pattern used to select files
    loader_cls=PyMuPDFLoader,  # loader class applied to each match
    show_progress=False,
)
pdf_documents = dir_loader.load()

In [38]:
#READING THE PDF
def process_all_pdfs(pdf_directory):
    """Load every PDF found under ``pdf_directory`` (searched recursively).

    Args:
        pdf_directory: Folder to scan. The original implementation ignored
            this argument and always read the module-level ``DATA_PATH``; to
            stay compatible with callers that relied on that, a value that is
            not an existing directory falls back to ``DATA_PATH``.

    Returns:
        A list of the documents returned by ``PyPDFLoader`` for every PDF that
        loaded successfully. Each document's metadata gains ``source_file``
        (the PDF's file name) and ``file_type`` (``'pdf'``). PDFs that raise
        while loading are reported and skipped.
    """
    all_documents = []

    pdf_dir = Path(pdf_directory) if pdf_directory else None
    if pdf_dir is None or not pdf_dir.is_dir():
        # Legacy behaviour: fall back to the notebook-wide data folder.
        print(f"'{pdf_directory}' is not a directory; falling back to DATA_PATH")
        pdf_dir = Path(DATA_PATH)

    # Find all PDF files recursively; sorted so the order is reproducible.
    pdf_files = sorted(pdf_dir.glob("**/*.pdf"))

    print(f"Found {len(pdf_files)} PDF files to process")

    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()

            # Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'

            all_documents.extend(documents)
            print(f"  ✓ Loaded {len(documents)} pages")

        except Exception as e:
            # Best effort: one unreadable PDF must not abort the whole run.
            print(f"  ✗ Error: {e}")

    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents




In [39]:
# Process all PDFs in the data directory.
# Pass the DATA_PATH variable itself, not the literal string "DATA_PATH".
all_pdf_documents = process_all_pdfs(DATA_PATH)

Found 2 PDF files to process

Processing: DecisionTrees.pdf
  ✓ Loaded 11 pages

Processing: RAG_and_HR_Policies_FAQs.pdf
  ✓ Loaded 3 pages

Total documents loaded: 14


In [40]:
#CHUNKING AND SPLITTING DATA
def split_documents(documents,chunk_size=2000,chunk_overlap=100):
    """Cut ``documents`` into overlapping chunks and report how many resulted.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Overlap between consecutive chunks, same unit.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_documents`` returns
        for the given documents.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        # Separators are tried in this order.
        "separators": ["\n\n", "\n", " ", ""],
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    pieces = splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(pieces)} chunks")
    return pieces


In [41]:
# Chunk everything that was loaded, using the default size and overlap
chunks = split_documents(documents=all_pdf_documents)

Split 14 documents into 17 chunks


In [42]:
#DUMMY EMBEDDING CLASS
class EmbeddingManager:
    """TF-IDF stand-in for an embedding model.

    The vectorizer is fitted once, on the texts of the first
    ``generate_embeddings`` call; later calls only transform.
    """

    def __init__(self):
        self.fitted = False
        self.vectorizer = TfidfVectorizer(max_features=2048, stop_words="english")
        print("TF-IDF Embedding Manager Initialized")

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """Return the dense TF-IDF matrix for ``texts``, fitting first if needed."""
        needs_fit = not self.fitted
        if needs_fit:
            print("Fitting TF-IDF vectorizer...")
            self.vectorizer.fit(texts)
            self.fitted = True
        dense = self.vectorizer.transform(texts).toarray()
        print(f"Generated embeddings: {dense.shape}")
        return dense

    def embed_query(self, query: str) -> np.ndarray:
        """Return the TF-IDF vector of one query string.

        Raises:
            RuntimeError: if ``generate_embeddings`` has not fitted the
                vectorizer yet.
        """
        if self.fitted:
            query_matrix = self.vectorizer.transform([query]).toarray()
            return query_matrix[0]
        raise RuntimeError("Vectorizer not fitted yet!")


In [43]:
# Create the TF-IDF embedding manager instance that later cells reuse
embedding_manager = EmbeddingManager()

TF-IDF Embedding Manager Initialized


In [44]:
# Embed the raw text of every chunk (the first call also fits the vectorizer)
texts = [chunk.page_content for chunk in chunks]
embeddings = embedding_manager.generate_embeddings(texts=texts)

Fitting TF-IDF vectorizer...
Generated embeddings: (17, 1017)


In [45]:
#DUMMY VECTOR STORE #for decision trees.pdf this worked
class VectorStore:
    """Wrapper around a persistent Chroma collection of document chunks.

    Note: a later cell of this notebook defines ``VectorStore`` again;
    whichever definition was executed last is the one in effect.
    """

    def __init__(self, collection_name="pdf_documents", persist_directory="./vector_store"):
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self._initialize_store()

    def _initialize_store(self):
        """Create the persistence folder and open (or create) the collection."""
        os.makedirs(self.persist_directory, exist_ok=True)
        self.client = chromadb.PersistentClient(path=self.persist_directory)
        self.collection = self.client.get_or_create_collection(
            name=self.collection_name,
            metadata={
                "description": "PDF document embeddings for RAG",
                "hnsw:space": "cosine",  # specify cosine distance metric here
            },
        )
        print(f"✅ Chroma collection '{self.collection_name}' ready")
        print(f"Existing documents: {self.collection.count()}")

    @staticmethod
    def _make_ids(documents: List[Any]) -> List[str]:
        """Derive a stable id for each document from its text and metadata.

        Random ids made every re-run of the notebook append a fresh copy of
        the same chunks. Content-derived ids let ``upsert`` overwrite instead.
        Identical chunks inside one batch receive an occurrence suffix so the
        ids of a batch stay unique.
        """
        ids, occurrences = [], {}
        for doc in documents:
            payload = json.dumps(
                {"content": doc.page_content, "metadata": doc.metadata},
                sort_keys=True,
                default=str,
            )
            digest = hashlib.sha256(payload.encode("utf-8")).hexdigest()[:16]
            occurrences[digest] = occurrences.get(digest, 0) + 1
            ids.append(f"doc_{digest}_{occurrences[digest]}")
        return ids

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """Store ``documents`` with their ``embeddings``; re-adding is idempotent.

        Raises:
            ValueError: if the two inputs differ in length.
        """
        if len(documents) != len(embeddings):
            raise ValueError("Mismatch: documents and embeddings count")
        if len(documents) == 0:
            print("No documents to add")
            return

        ids = self._make_ids(documents)
        metadatas = [doc.metadata for doc in documents]
        contents = [doc.page_content for doc in documents]
        vectors = [emb.tolist() for emb in embeddings]

        # upsert (not add): an id that already exists is overwritten.
        self.collection.upsert(
            ids=ids,
            embeddings=vectors,
            metadatas=metadatas,
            documents=contents
        )
        print(f"Added {len(documents)} docs to Chroma")
        print(f"Total in collection: {self.collection.count()}")

In [46]:
#for faqs
# DUMMY VECTOR STORE
import os
import uuid
import numpy as np
import chromadb
from typing import List, Any
from chromadb.errors import InvalidDimensionException

class VectorStore:
    """Wrapper around a persistent Chroma collection of document chunks.

    On an embedding-dimension mismatch the collection is dropped and
    recreated, so the new vectors can be stored.
    """

    def __init__(self, collection_name="pdf_documents", persist_directory="./vector_store"):
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self._initialize_store()

    def _initialize_store(self):
        """Create the persistence folder and open (or create) the collection."""
        os.makedirs(self.persist_directory, exist_ok=True)
        self.client = chromadb.PersistentClient(path=self.persist_directory)
        self.collection = self.client.get_or_create_collection(
            name=self.collection_name,
            metadata={
                "description": "PDF document embeddings for RAG",
                "hnsw:space": "cosine"  # specify cosine distance metric here
            }
        )
        print(f"Chroma collection '{self.collection_name}' ready")
        print(f"Existing documents: {self.collection.count()}")

    @staticmethod
    def _make_ids(documents: List[Any]) -> List[str]:
        """Derive a stable id for each document from its text and metadata.

        Random ids made every re-run of the notebook append a fresh copy of
        the same chunks. Content-derived ids let ``upsert`` overwrite instead.
        Identical chunks inside one batch receive an occurrence suffix so the
        ids of a batch stay unique.
        """
        ids, occurrences = [], {}
        for doc in documents:
            payload = json.dumps(
                {"content": doc.page_content, "metadata": doc.metadata},
                sort_keys=True,
                default=str,
            )
            digest = hashlib.sha256(payload.encode("utf-8")).hexdigest()[:16]
            occurrences[digest] = occurrences.get(digest, 0) + 1
            ids.append(f"doc_{digest}_{occurrences[digest]}")
        return ids

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """Store ``documents`` with their ``embeddings``; re-adding is idempotent.

        Raises:
            ValueError: if the two inputs differ in length.
        """
        if len(documents) != len(embeddings):
            raise ValueError("Mismatch: documents and embeddings count")
        if len(documents) == 0:
            print("No documents to add")
            return

        batch = {
            "ids": self._make_ids(documents),
            "embeddings": [emb.tolist() for emb in embeddings],
            "metadatas": [doc.metadata for doc in documents],
            "documents": [doc.page_content for doc in documents],
        }

        try:
            # upsert (not add): an id that already exists is overwritten.
            self.collection.upsert(**batch)
            print(f"Added {len(documents)} docs to Chroma")
            print(f"Total in collection: {self.collection.count()}")

        except InvalidDimensionException as e:
            print("Embedding dimension mismatch detected!")
            print("Resetting the collection to match the new embedding size...")

            # delete and recreate the collection
            # NOTE: this discards everything previously stored in it.
            self.client.delete_collection(self.collection_name)
            self.collection = self.client.create_collection(
                name=self.collection_name,
                metadata={
                    "description": "PDF document embeddings for RAG (reset after dimension mismatch)",
                    "hnsw:space": "cosine"
                }
            )

            # retry adding documents
            self.collection.upsert(**batch)
            print(f"Collection reset and {len(documents)} docs added successfully")


In [47]:

# Open the default collection and store the chunks with their embeddings
vector_store = VectorStore()
vector_store.add_documents(documents=chunks, embeddings=embeddings)

Chroma collection 'pdf_documents' ready
Existing documents: 68
Added 17 docs to Chroma
Total in collection: 85


In [48]:
#DUMMY RETRIEVER CLASS
class RAGRetriever:
    """Finds the stored chunks closest to a query."""

    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0):
        """Query the collection and return the hits that pass the threshold.

        Args:
            query: Text to search for.
            top_k: Number of results requested from the collection.
            score_threshold: Minimum ``similarity_score`` (computed as
                ``1 - distance``) a hit needs to be kept.

        Returns:
            A list of dicts with the keys ``id``, ``content``, ``metadata``,
            ``similarity_score`` and ``rank`` (1-based position in the raw
            result list). Empty when nothing was found.
        """
        print(f"\nQuery: {query}")
        query_vector = self.embedding_manager.embed_query(query).tolist()
        results = self.vector_store.collection.query(
            query_embeddings=[query_vector],
            n_results=top_k
        )

        found_any = results and results.get("documents") and results["documents"][0]
        if not found_any:
            print("No documents found")
            return []

        texts = results["documents"][0]
        metadata_rows = results["metadatas"][0]
        distances = results["distances"][0]
        doc_ids = results["ids"][0]

        retrieved_docs = []
        rows = zip(doc_ids, texts, metadata_rows, distances)
        for position, (doc_id, text, meta, dist) in enumerate(rows, start=1):
            similarity = 1 - dist
            if similarity >= score_threshold:
                retrieved_docs.append({
                    "id": doc_id,
                    "content": text,
                    "metadata": meta,
                    "similarity_score": similarity,
                    "rank": position
                })
        print(f"Retrieved {len(retrieved_docs)} documents")
        return retrieved_docs


In [49]:
# Wire the retriever to the store and the embedding manager
rag_retriever = RAGRetriever(vector_store=vector_store, embedding_manager=embedding_manager)

In [51]:
# Example retrieval
rag_retriever = RAGRetriever(vector_store, embedding_manager)
results = rag_retriever.retrieve("what is Leave Policy?", top_k=3)
for hit in results:
    rank = hit['rank']
    score = hit['similarity_score']
    snippet = hit['content'][:250]  # preview only the first 250 characters
    print(f"\nRank {rank}:")
    print(f"Score: {score:.3f}")
    print(f"Content snippet: {snippet}...")


Query: what is Leave Policy?
Retrieved 3 documents

Rank 1:
Score: 0.375
Content snippet: - Regularly re-embed when documents are updated.
**FAQs:**
Q1: Why does RAG sometimes return irrelevant context?
A1: Chunking or embedding mismatch can cause poor retrieval. Recheck embeddings and
retriever logic.
Q2: Can I use local LLMs like Ollama...

Rank 2:
Score: 0.375
Content snippet: - Regularly re-embed when documents are updated.
**FAQs:**
Q1: Why does RAG sometimes return irrelevant context?
A1: Chunking or embedding mismatch can cause poor retrieval. Recheck embeddings and
retriever logic.
Q2: Can I use local LLMs like Ollama...

Rank 3:
Score: 0.375
Content snippet: - Regularly re-embed when documents are updated.
**FAQs:**
Q1: Why does RAG sometimes return irrelevant context?
A1: Chunking or embedding mismatch can cause poor retrieval. Recheck embeddings and
retriever logic.
Q2: Can I use local LLMs like Ollama...


In [52]:
import os
from dotenv import load_dotenv
from langchain_core.messages import HumanMessage
from langchain_openai import ChatOpenAI
from langchain.chat_models import ChatOpenAI

# Read settings from the environment (load_dotenv is called first)
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Stop right away when no key is configured
if OPENAI_API_KEY in (None, ""):
    raise ValueError("Please set your OPENAI_API_KEY in the environment variables (.env file)")

class OpenAILLM:
    """Small wrapper that sends context-grounded prompts to a ChatOpenAI model."""

    def __init__(self, model_name: str = "gpt-3.5-turbo", api_key: str = None):
        """
        Initialize OpenAI LLM

        Args:
            model_name: Model identifier passed to ChatOpenAI.
            api_key: Key to use; when falsy, the module-level OPENAI_API_KEY is used.
        """
        self.model_name = model_name
        self.api_key = api_key or OPENAI_API_KEY

        chat_options = dict(
            model=model_name,        # new param name is `model` not `model_name`
            temperature=0.1,
            api_key=self.api_key,    # new param name is `api_key`
            max_tokens=1024,
        )
        self.llm = ChatOpenAI(**chat_options)

        print(f"Initialized OpenAI LLM with model: {self.model_name}")

    def _ask(self, prompt: str, error_prefix: str) -> str:
        """Send one human message; on failure return ``error_prefix`` plus the error text."""
        try:
            # use .invoke() instead of calling the model directly
            reply = self.llm.invoke([HumanMessage(content=prompt)])
            return reply.content
        except Exception as exc:
            return f"{error_prefix}{str(exc)}"

    def generate_response(self, query: str, context: str, max_length: int = 500) -> str:
        """
        Generate response using retrieved context

        ``max_length`` is accepted but not used by this method. Errors are
        returned as a string starting with "Error generating response: ".
        """
        prompt = f"""
You are a helpful AI assistant. Use the following context to answer the question accurately and concisely.

Context:
{context}

Question: {query}

Answer:"""
        return self._ask(prompt, "Error generating response: ")

    def generate_response_simple(self, query: str, context: str) -> str:
        """
        Simple response generation without complex prompting

        Errors are returned as a string starting with "Error: ".
        """
        prompt = f"""Based on this context: {context}

Question: {query}

Answer:"""
        return self._ask(prompt, "Error: ")


In [53]:
# The OpenRouter key is read from the environment instead of being embedded in
# the notebook. The key that used to be hard-coded here has been exposed to
# every reader of this file and should be revoked.
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
if not OPENROUTER_API_KEY:
    raise ValueError("Please set your OPENROUTER_API_KEY in the environment variables (.env file)")

llm = ChatOpenAI(
    model_name="gpt-4o-mini",
    openai_api_base="https://openrouter.ai/api/v1",
    openai_api_key=OPENROUTER_API_KEY,
)

from langchain.schema import HumanMessage, SystemMessage

def rag_simple(query, retriever, llm, top_k=3):
    """Retrieve top-k context and generate a concise answer using ChatOpenAI

    Args:
        query: The user's question.
        retriever: Object whose ``retrieve(query, top_k=...)`` returns a list
            of dicts that each have a ``'content'`` entry.
        llm: Chat model exposing ``invoke(messages)``.
        top_k: Number of chunks to request from the retriever.

    Returns:
        The ``content`` of the model's reply, or a fixed message when the
        retriever produced no context.
    """

    # Retrieve relevant documents
    results = retriever.retrieve(query, top_k=top_k)
    context = "\n\n".join([doc['content'] for doc in results]) if results else ""

    if not context:
        return "No relevant context found to answer the question."

    # Build the prompt
    prompt = f"""Use the following context to answer the question exactly as given in context for educational purpose. If the context has no answers say I don't know

Context:
{context}

Question: {query}

Answer:"""

    # Print retrieved context
    print("Context retrieved:\n", context)

    # Use .invoke(); calling the chat model object directly is deprecated.
    response = llm.invoke(
        [
            SystemMessage(content="You are a helpful assistant that answers questions concisely using provided context."),
            HumanMessage(content=prompt),
        ]
    )

    # Return the answer text
    return response.content

In [62]:
answer = rag_simple(
    "explain random forests and who introduced in which year?",
    rag_retriever,
    llm,
)
print("Answer is\n\n")
print(answer)


Query: explain random forests and who introduced in which year?
Retrieved 3 documents
Context retrieved:
 1-9
3 Random Forests
Random Forests is a popular technique to solve classification and regression problems and was introduced
by Breiman [2001]. It builds on a combination of tree predictors (ensemble method) that operates by
constructing a multitude of trees during the training phase and asking each tree to output the mode of the
classes (most popular class in classification) or mean predictions (in regression). Then we pick the majority
(classification) or mean (regression) across trees to make predictions. From a mathematical standpoint, we
can interpret the function approximation of random forests as follows:
f(x) =
BX
t=1
ft(x)
B ,
where B is the number of trees in the ensemble, and eachft(x) is a CART that trains on a subset of
the data that is chosen randomly with replacement (this is known asbootstrapping). Bagging (bootstrap
aggregation) uses the aboveB bootstrap samples 

In [60]:
answer = rag_simple(
    "what is the Leave policy?",
    rag_retriever,
    llm,
)
print("Answer is\n\n")
print(answer)


Query: what is the Leave policy?
Retrieved 3 documents
Context retrieved:
 - Regularly re-embed when documents are updated.
**FAQs:**
Q1: Why does RAG sometimes return irrelevant context?
A1: Chunking or embedding mismatch can cause poor retrieval. Recheck embeddings and
retriever logic.
Q2: Can I use local LLMs like Ollama or Llama 3?
A2: Yes, with LangChain or LlamaIndex integration using `OllamaLLM` or API endpoints.
Q3: What’s the difference between RAG and fine-tuning?
A3: RAG retrieves knowledge dynamically; fine-tuning embeds it permanently into the
model.
Q4: How to improve accuracy?
A4: Use domain-specific embeddings, better retriever ranking, or hybrid (keyword +
semantic) search.
Section 2: HR Policies and Employee FAQs
**1. Leave Policy**
- Employees are entitled to annual leave, sick leave, and public holidays as per labor law.
- All leave must be approved by reporting managers.
- Emergency leave must be informed at the earliest.
**2. Attendance & Working Hours**
- Standa

In [59]:
answer = rag_simple(
    "To control the bias-variance trade-off what is used?",
    rag_retriever,
    llm,
)
print("Answer is\n\n")
print(answer)


Query: To control the bias-variance trade-off what is used?
Retrieved 3 documents
Context retrieved:
 1-8
2.4 Pruning a tree
There is a trade-off between the model interpretability and its performance on the training set:
• Small tree with fewer splits⇒ More interpretable (lower variance), but poorer fit in training set (higher
bias);
• Larger tree with many splits⇒ Less interpretable (higher variance), better fit in training set (lower
bias).
Yet, what often really matters is the model performance on the test set, meaning that we are interested in
the bias-variance trade-off (illustrated in Figure 1.6)—where biasis the error introduced by approximating
a functionf with a simpler functionˆf, while variancerefers to how the estimation would change if we used
a different training set. This leads to the following trade-off:
• More complex model ˆf ⇒ Low bias and high variance, since the model will capture many features of
the data;
• Simpler model ˆf ⇒ High bias and low variance.
Figure 

In [64]:
answer = rag_simple(
    "give function approximation of random forests?",
    rag_retriever,
    llm,
)
print("Answer is\n\n")
print(answer)


Query: give function approximation of random forests?
Retrieved 3 documents
Context retrieved:
 1-9
3 Random Forests
Random Forests is a popular technique to solve classification and regression problems and was introduced
by Breiman [2001]. It builds on a combination of tree predictors (ensemble method) that operates by
constructing a multitude of trees during the training phase and asking each tree to output the mode of the
classes (most popular class in classification) or mean predictions (in regression). Then we pick the majority
(classification) or mean (regression) across trees to make predictions. From a mathematical standpoint, we
can interpret the function approximation of random forests as follows:
f(x) =
BX
t=1
ft(x)
B ,
where B is the number of trees in the ensemble, and eachft(x) is a CART that trains on a subset of
the data that is chosen randomly with replacement (this is known asbootstrapping). Bagging (bootstrap
aggregation) uses the aboveB bootstrap samples and averag

In [80]:
from fastapi import FastAPI, Request
from fastapi.responses import HTMLResponse, JSONResponse
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from fastapi.middleware.cors import CORSMiddleware
from fastapi.exceptions import RequestValidationError

# Import your LLM and LangChain schema
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage

# -----------------------------
# FastAPI App Setup
# -----------------------------
app = FastAPI(title="RAG Chatbot")

# CORS: any origin may call the API. Credentials are switched off because a
# wildcard origin must not be combined with credentialed requests; none of the
# endpoints defined in this notebook read cookies or authorization headers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Serve static files
app.mount("/static", StaticFiles(directory="static"), name="static")

# HTML templates
templates = Jinja2Templates(directory="templates")

# -----------------------------
# LLM Setup
# -----------------------------
# The OpenRouter key is read from the environment instead of being embedded in
# the notebook. The key that used to be hard-coded here has been exposed to
# every reader of this file and should be revoked.
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
if not OPENROUTER_API_KEY:
    raise ValueError("Please set your OPENROUTER_API_KEY in the environment variables (.env file)")

llm = ChatOpenAI(
    model_name="gpt-4o-mini",
    openai_api_base="https://openrouter.ai/api/v1",
    openai_api_key=OPENROUTER_API_KEY,
)

# -----------------------------
# Exception Handling
# -----------------------------
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(request, exc):
    """Answer request-validation failures with HTTP 400 and the error text."""
    error_body = {"error": str(exc)}
    return JSONResponse(content=error_body, status_code=400)

# -----------------------------
# Serve Frontend
# -----------------------------
@app.get("/", response_class=HTMLResponse)
def serve_frontend(request: Request):
    """Render the ``index.html`` template for the site root."""
    template_context = {"request": request}
    return templates.TemplateResponse("index.html", template_context)

# -----------------------------
# RAG Query Function
# -----------------------------
def rag_simple(query, retriever, llm, top_k=3):
    """Retrieve top-k context and generate a concise answer using ChatOpenAI

    Args:
        query: The user's question.
        retriever: Object whose ``retrieve(query, top_k=...)`` returns a list
            of dicts that each have a ``'content'`` entry.
        llm: Chat model exposing ``invoke(messages)``.
        top_k: Number of chunks to request from the retriever.

    Returns:
        The ``content`` of the model's reply, or a fixed message when the
        retriever produced no context.
    """
    results = retriever.retrieve(query, top_k=top_k)
    context = "\n\n".join([doc['content'] for doc in results]) if results else ""

    if not context:
        return "No relevant context found to answer the question."

    prompt = f"""Use the following context to answer the question exactly as given in context for educational purpose. If the context has no answers say I don't know

Context:
{context}

Question: {query}

Answer:"""

    # Use .invoke(); calling the chat model object directly is deprecated.
    response = llm.invoke(
        [
            SystemMessage(content="You are a helpful assistant that answers questions concisely using provided context."),
            HumanMessage(content=prompt),
        ]
    )

    return response.content

# -----------------------------
# Query Endpoint
# -----------------------------
@app.post("/query")
async def query(request: Request):
    """Answer the question found under the ``query`` key of the JSON body.

    Returns ``{"answer": ...}`` on success and ``{"error": ...}`` for an empty
    query or when anything raises while handling the request.
    """
    try:
        payload = await request.json()
        query_text = payload.get("query", "").strip()

        if query_text:
            # Call the RAG + LLM function
            return {"answer": rag_simple(query_text, rag_retriever, llm, top_k=3)}

        return {"error": "Empty query"}

    except Exception as exc:
        print("Error:", exc)
        return {"error": str(exc)}


In [81]:
import nest_asyncio
# Applied before uvicorn.run() is called from the notebook further down;
# presumably needed because the notebook kernel already runs an event loop.
nest_asyncio.apply()


In [None]:
import uvicorn

# Run in notebook (this call blocks the cell while the server is up)
uvicorn.run(
    app,
    host="0.0.0.0",
    port=8000,
)


INFO:     Started server process [20252]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


INFO:     127.0.0.1:52759 - "GET / HTTP/1.1" 200 OK
INFO:     127.0.0.1:52759 - "GET /static/app.js HTTP/1.1" 304 Not Modified

Query: random forest
Retrieved 3 documents
INFO:     127.0.0.1:52765 - "POST /query HTTP/1.1" 200 OK


In [75]:
# Display the raw hit dictionaries held in `results`
results

[{'id': 'doc_c5c2928e',
  'content': '- Regularly re-embed when documents are updated.\n**FAQs:**\nQ1: Why does RAG sometimes return irrelevant context?\nA1: Chunking or embedding mismatch can cause poor retrieval. Recheck embeddings and\nretriever logic.\nQ2: Can I use local LLMs like Ollama or Llama 3?\nA2: Yes, with LangChain or LlamaIndex integration using `OllamaLLM` or API endpoints.\nQ3: What’s the difference between RAG and fine-tuning?\nA3: RAG retrieves knowledge dynamically; fine-tuning embeds it permanently into the\nmodel.\nQ4: How to improve accuracy?\nA4: Use domain-specific embeddings, better retriever ranking, or hybrid (keyword +\nsemantic) search.\nSection 2: HR Policies and Employee FAQs\n**1. Leave Policy**\n- Employees are entitled to annual leave, sick leave, and public holidays as per labor law.\n- All leave must be approved by reporting managers.\n- Emergency leave must be informed at the earliest.\n**2. Attendance & Working Hours**\n- Standard working hours: 9