1. Environment Setup

In [None]:

!pip install langchain langchain_community sentence-transformers chromadb pymupdf rank_bm25

Collecting langchain_community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting chromadb
  Downloading chromadb-1.0.15-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting pymupdf
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.m

2. Mounting Drive

In [None]:

from google.colab import drive

# Directory under which Google Drive is mounted in this runtime.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Location of the source PDF inside the mounted Drive; the data-loading cell opens it.
pdf_path = "/content/drive/MyDrive/HSC26-Bangla1st-Paper.pdf"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


3. Imports and API Key Configuration

In [42]:
# All our necessary imports go here.
import pymupdf
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_groq import ChatGroq
from google.colab import userdata
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers import BM25Retriever, EnsembleRetriever


# Read the Groq API key from Colab Secrets. A failed lookup is reported but is not
# fatal here: GROQ_API_KEY is set to None and execution continues.
try:
    GROQ_API_KEY = userdata.get('GROQ_API_KEY')
except Exception:
    GROQ_API_KEY = None
    print("⚠️ Could not load Groq API Key. Please ensure it's set correctly in Colab Secrets.")
else:
    print("✅ Groq API Key loaded successfully.")

✅ Groq API Key loaded successfully.


4. Data Loading and Preprocessing

In [43]:
# Page window of the story inside the PDF, as 1-based page numbers (pages 6-17),
# both ends inclusive.
STORY_FIRST_PAGE = 6
STORY_LAST_PAGE = 17

# Chunking parameters passed to RecursiveCharacterTextSplitter.
CHUNK_SIZE = 512
CHUNK_OVERLAP = 128

story_pages_docs = []

# The PDF's text layer is unreliable, so text is pulled word by word from the story
# pages only and re-joined with single spaces, one Document per page.
# NOTE(review): the sample chunk in this cell's saved output is still mis-encoded
# Bangla, so word-level extraction does not repair the text layer. Retrieval quality
# downstream is limited by this; OCR of the page images may be needed.
with pymupdf.open(pdf_path) as pdf_file:
    # Fail with a clear message, rather than whatever load_page() raises for a missing
    # page, when the file is shorter than the expected document.
    if pdf_file.page_count < STORY_LAST_PAGE:
        raise ValueError(
            f"Expected at least {STORY_LAST_PAGE} pages in {pdf_path!r}, "
            f"found {pdf_file.page_count}."
        )
    for page_number in range(STORY_FIRST_PAGE, STORY_LAST_PAGE + 1):
        page = pdf_file.load_page(page_number - 1)  # load_page() takes a 0-based index
        words = page.get_text("words")
        # Item 4 of each "words" entry is the word's text.
        text = " ".join(word[4] for word in words)
        story_pages_docs.append(
            Document(page_content=text, metadata={"page": page_number})
        )

print(f"Surgically extracted {len(story_pages_docs)} pages using a robust method.")

# Split each page into smaller, overlapping chunks so the retriever can return
# more specific context.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP
)
docs = text_splitter.split_documents(story_pages_docs)
print(f"Split the story into {len(docs)} high-precision chunks.")

# Print the first chunk as a quick visual sanity check of the extracted text.
if docs:
    print("\n--- Sample Chunk ---")
    print(docs[0].page_content)
    print("--------------------")

Surgically extracted 12 pages using a robust method.
Split the story into 66 high-precision chunks.

--- Sample Chunk ---
েূ গ্ে আিআমািব্ সসাতািমাত্র।এিীব্নটানাদদকঘিযির্হসাকব্ব়্ে, নাগুকনির্হসাকব্।তব্ুইহািএকটুর্ব্কিষ মূলযআকি।ইহাযসইফুকলিমকতাোহািব্ুককিউপকিভ্রমিআর্স াব্র্স ার্িল, এব্ংযসইপদকক্ষকপিইর্তহাস তাহািিীব্কনিমাঝখাকনফকলিমকতাগুটিধর্ি াউঠি াকি। যসইইর্তহাসটুকুআকাকিযিাকটা, তাহাককযিাকটাকর্ি াইজলর্খব্।যিাকটাককেঁাহািাসামানযব্জল াভুলককিন নাতঁাহািা ইহািিসব্ুজঝকব্ন। ককলকিেতগুকলাপিীক্ষাপািকর্িব্ািসব্আর্মচুকাই ার্ি।যিকলকব্লা আমাি সুন্দিযচহািালই াপজণ্ডতমিা আমাককর্িমুলফুলওমাকালফকলিসর্হত তুলনা কর্ি া, র্ব্দ্রুপকর্িব্ািসু্কোগ পাই
--------------------


5. Building the Hybrid Retriever

In [44]:

# The EnsembleRetriever combines multiple search methods to get more accurate results.

# Number of results requested from each retriever.
BM25_TOP_K = 2
SEMANTIC_TOP_K = 4

# Explicit collection name so the vector store can be rebuilt from scratch on re-run.
CHROMA_COLLECTION_NAME = "oporichita_story"

# Keyword Search (BM25): Great for finding exact words and names.
bm25_retriever = BM25Retriever.from_documents(docs)
bm25_retriever.k = BM25_TOP_K

# Semantic Search (MiniLM): Great for understanding the meaning and context of the query.
# We're using the 'MiniLM' model because it's lightweight and has proven more robust
# for this specific noisy dataset.
model_name = "paraphrase-multilingual-MiniLM-L12-v2"
embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs={'device': 'cpu'})

# Drop any collection left over from an earlier run of this cell before rebuilding it.
# Without this, re-running the cell in the same kernel adds the same chunks again to
# the existing in-memory collection and the semantic retriever returns duplicates.
Chroma(
    collection_name=CHROMA_COLLECTION_NAME, embedding_function=embeddings
).delete_collection()
vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    collection_name=CHROMA_COLLECTION_NAME,
)
semantic_retriever = vectorstore.as_retriever(search_kwargs={"k": SEMANTIC_TOP_K})

# Combine them into a single hybrid retriever.
# We're giving them equal weight in finding the best documents.
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, semantic_retriever], weights=[0.5, 0.5]
)

print("✅ Definitive Hybrid Retriever built successfully.")

✅ Definitive Hybrid Retriever built successfully.


6. Constructing the RAG Chain

In [45]:
# Groq-hosted chat model that produces the final answer.
# temperature=0 is set explicitly: the prompt asks for fact extraction, where sampling
# randomness only makes answers vary from run to run.
# NOTE(review): confirm that the model id "llama3-8b-8192" is still served by Groq;
# hosted model ids get retired over time.
llm = ChatGroq(model_name="llama3-8b-8192", groq_api_key=GROQ_API_KEY, temperature=0)

# Prompt for the LLM. It tells the model to answer only from the supplied context with
# a single word or short phrase, to use one fixed fallback sentence when the context
# lacks the answer, and to reply in the language of the question.
# `{context}` and `{question}` are the two template variables filled in at run time.
prompt_template = """
You are a fact-checking expert for the story 'Oporichita'. Your task is to find the direct answer to the user's question from the provided text.

**Instructions:**
1. Carefully read the provided context.
2. Identify the single word or short phrase that directly answers the question.
3. Provide only that answer and nothing else. Do not add any conversational text.
4. If the answer is not in the context, and only then, state: "The answer is not available in the provided text."
5. Respond in the same language as the question.

**Context:**
{context}

**Question:**
{question}
"""
prompt = ChatPromptTemplate.from_template(prompt_template)

# Helper that flattens retrieved documents into one context string for the prompt.
def format_docs(docs):
    """Join the ``page_content`` of every document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        One string with the contents in iteration order; empty if ``docs`` is empty.
    """
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)

# Assemble the RAG pipeline stage by stage.

# Stage 1: the incoming question is sent to the hybrid retriever and the returned
# documents are flattened into a single context string.
retrieve_context = ensemble_retriever | format_docs

# Stage 2: build the prompt variables. The question is passed through unchanged.
rag_inputs = {"context": retrieve_context, "question": RunnablePassthrough()}

# Stages 3-5: fill the prompt, call the LLM, and parse its reply into a string.
rag_chain = rag_inputs | prompt | llm | StrOutputParser()

print("✅ RAG Chain is ready to go!")

✅ RAG Chain is ready to go!


7. Final Testing

In [47]:
print("--- Running Final Tests ---\n")

# (question, expected answer) pairs used as a smoke test of the chain.
# Add a new case by appending a tuple; no other code needs to change.
TEST_CASES = [
    ("অনুপমের ভাষায় সুপুরুষ কাকে বলা হয়েছে?", "শম্ভুনাথ"),
    ("কাকে অনুপমের ভাগ্য দেবতা বলে উল্লেখ করা হয়েছে?", "মামাকে"),
    ("বিয়ের সময় কল্যাণীর প্রকৃত বয়স কত ছিল?", "১৫ বছর"),
]

# Responses are kept, in case order, for inspection after the cell has run.
test_responses = []
for case_index, (query, expected) in enumerate(TEST_CASES):
    response = rag_chain.invoke(query)
    test_responses.append(response)
    # Every case after the first is preceded by a blank line.
    separator = "" if case_index == 0 else "\n"
    print(f"{separator}Query: {query}")
    print(f"Response: {response}")
    print(f"Expected: {expected}")

--- Running Final Tests ---

Query: অনুপমের ভাষায় সুপুরুষ কাকে বলা হয়েছে?
Response: তুর্ম
Expected: শম্ভুনাথ

Query: কাকে অনুপমের ভাগ্য দেবতা বলে উল্লেখ করা হয়েছে?
Response: মামা
Expected: মামাকে

Query: বিয়ের সময় কল্যাণীর প্রকৃত বয়স কত ছিল?
Response: The answer is not available in the provided text.
Expected: ১৫ বছর


8. Defining and Building the API

In [48]:
# This cell defines a small REST API with FastAPI.
# The endpoint below calls the 'rag_chain' object built in an earlier cell, so that
# cell must have been run first.

# Install the libraries used by the API cells.
# NOTE(review): the success message below is printed unconditionally; it does not
# check whether the pip command actually succeeded.
print("Installing API libraries...")
!pip install fastapi uvicorn python-multipart pyngrok --quiet
print("✅ API libraries installed successfully!")

#  Import FastAPI and define our data models
from fastapi import FastAPI
from pydantic import BaseModel

# Request body for POST /query: a JSON object with one string field, "query",
# which holds the user's question.
class QueryRequest(BaseModel):
    query: str

# Response body for POST /query: a JSON object with one string field, "answer",
# which holds the text returned by the RAG chain.
class QueryResponse(BaseModel):
    answer: str

# Create the FastAPI application object. The route decorators further down register
# their handlers on it; `title` and `description` are metadata passed to FastAPI.
app = FastAPI(
    title="Oporichita RAG API",
    description="An API to ask questions about the story 'Oporichita'."
)

# Define the API endpoint.
# Handles POST requests to /query; the returned dict is validated against QueryResponse.
@app.post("/query", response_model=QueryResponse)
async def ask_question(request: QueryRequest):
    """
    Accepts a user's query and returns the RAG system's answer.
    This directly uses the 'rag_chain' variable from our notebook.

    Args:
        request: Parsed request body; `request.query` is the question text.

    Returns:
        A dict with a single "answer" key holding the chain's string output.
    """
    # Await the chain's async entry point. Calling the blocking `invoke()` inside an
    # `async def` handler would stall the server's event loop, and with it every other
    # request, for the whole retrieval + LLM round trip.
    response_text = await rag_chain.ainvoke(request.query)
    return {"answer": response_text}

# Landing route for the root URL: replies with a short usage hint.
@app.get("/")
async def root():
    welcome_text = "Welcome! The RAG API is running. Use the /query endpoint to ask questions."
    return dict(message=welcome_text)

print("\n✅ API application defined successfully.")

Installing API libraries...
✅ API libraries installed successfully!

✅ API application defined successfully.


9. Launching the API Server

In [51]:
# Launch the API server.
# This cell opens a public ngrok tunnel to local port 8000 and then serves the
# FastAPI app with uvicorn.

from pyngrok import ngrok
import uvicorn
import asyncio
from google.colab import userdata

# Read the ngrok authtoken from Colab Secrets; a failed lookup leaves it as None.
try:
    NGROK_AUTHTOKEN = userdata.get('NGROK_AUTHTOKEN')
except Exception:
    NGROK_AUTHTOKEN = None
    print("⚠️ Could not load ngrok authtoken. Please ensure it's set correctly in Colab Secrets.")
else:
    print("✅ ngrok authtoken loaded successfully.")

# Without a token there is nothing to authenticate with: report it and abort the cell.
if not NGROK_AUTHTOKEN:
    print("❌ Could not authenticate pyngrok without an authtoken. Please add NGROK_AUTHTOKEN to Colab Secrets.")
    raise ValueError("NGROK_AUTHTOKEN not found in Colab Secrets.")

# Register the token with pyngrok.
ngrok.set_auth_token(NGROK_AUTHTOKEN)
print("✅ pyngrok authenticated with authtoken.")


# Open the public tunnel and serve the app until the server is stopped.
import nest_asyncio

# uvicorn.run() needs to drive an event loop from inside the notebook;
# nest_asyncio is applied first to allow that.
nest_asyncio.apply()

# Local port shared by the ngrok tunnel and the uvicorn server.
API_PORT = 8000

try:
    # Create a public URL using pyngrok. `public_url` is the tunnel object; its
    # `.public_url` attribute is the address itself, so that is what gets printed
    # (printing the object shows its repr, not a usable URL).
    public_url = ngrok.connect(API_PORT)
    print(f"🚀 Your API is live! Access it at: {public_url.public_url}")
    print("You can now send POST requests to the /query endpoint at this URL.")

    # Blocking call: the cell keeps running until the server is stopped
    # (for example by interrupting the cell).
    uvicorn.run(app, host="0.0.0.0", port=API_PORT)
except KeyboardInterrupt:
    print("Server stopped.")
finally:
    # Always tear the tunnel down. The server can shut down gracefully and return
    # without KeyboardInterrupt reaching this code, in which case cleanup placed only
    # in the `except` branch would never run and the tunnel would stay open.
    ngrok.kill()

✅ ngrok authtoken loaded successfully.
✅ pyngrok authenticated with authtoken.
🚀 Your API is live! Access it at: NgrokTunnel: "https://3d209ca03fd1.ngrok-free.app" -> "http://localhost:8000"
You can now send POST requests to the /query endpoint at this URL.


INFO:     Started server process [290]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


INFO:     103.134.240.18:0 - "GET / HTTP/1.1" 200 OK
INFO:     103.134.240.18:0 - "GET /favicon.ico HTTP/1.1" 404 Not Found


INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [290]
