In [79]:
# ===============================
# 0. Install dependencies (run once)
# ===============================
!pip install pandas langchain chromadb sentence-transformers python-dotenv langchain-google-genai --quiet

# ===============================
# 1. Imports
# ===============================
import pandas as pd
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from textwrap import wrap
import os
from dotenv import load_dotenv

from langchain_google_genai import ChatGoogleGenerativeAI

# ===============================
# 2. Load dataset
# ===============================
csv_path = "data/bw_courses - Sheet1.csv"  # replace with your path
df = pd.read_csv(csv_path)

# Quick overview
print(df.shape)
df.head()

# ===============================
# 3. Preprocessing
# ===============================

# Fill missing values in "Who This Course is For"
df['Who This Course is For'] = df['Who This Course is For'].fillna("Not specified")

# Strip whitespace from all string fields
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

# Map language codes to names
lang_map = {
    6: "Hindi",
    7: "Kannada",
    11: "Malayalam",
    20: "Tamil",
    21: "Telugu",
    24: "English"
}

def map_languages(cell):
    codes = str(cell).split(",")
    return [lang_map.get(int(c.strip()), f"Unknown-{c.strip()}") for c in codes]

df['Released Languages'] = df['Released Languages'].apply(map_languages)

# Show the 2 rows that were missing originally (for notebook demo)
missing_rows_demo = df[df['Who This Course is For'] == "Not specified"]
missing_rows_demo

# ===============================
# 4. Prepare documents with chunking
# ===============================

documents = []
metadata_list = []

MAX_TOKENS = 200  # Approximate chunk size for description

for _, row in df.iterrows():
    # Chunk the description if it is long
    description_chunks = wrap(row['Course Description'], MAX_TOKENS) or [""]  # fallback empty string
    
    for desc_chunk in description_chunks:
        text = f"Course Title: {row['Course Title']}\n" \
               f"Description: {desc_chunk}\n" \
               f"Who This Course is For: {row['Who This Course is For']}\n" \
               f"Languages: {', '.join(row['Released Languages'])}"
        documents.append(text)
        metadata_list.append({
            "course_no": row['Course No'],
            "course_title": row['Course Title'],
            "released_languages": ', '.join(row['Released Languages']),  # join with commas
            "who_for": row['Who This Course is For']
        })


print(f"Total chunks created: {len(documents)}")

# ===============================
# 5. Create embeddings & build ChromaDB
# ===============================
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

vectordb = Chroma.from_texts(
    texts=documents,
    embedding=embedding_model,
    metadatas=metadata_list,
    persist_directory="data/chroma_db"
)

vectordb.persist()
print("✅ Chroma Vector DB created and persisted!")

# ===============================
# 6. Setup Google Gemini LLM
# ===============================
load_dotenv()  # load API key from .env
api_key = os.getenv("GEMINI_API_KEY")
if not api_key:
    raise ValueError("GEMINI_API_KEY not found in .env")

gemini_llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    google_api_key=api_key,
    temperature=0.0,
    max_tokens=512,
    timeout=30,
    max_retries=3
)

# ===============================
# 7. Retriever function
# ===============================
def get_relevant_courses(query, k=3):
    """
    Retrieve top-k relevant course chunks for a query
    """
    results = vectordb.similarity_search(query, k=k)
    return [res.page_content for res in results]

# ===============================
# 8. Query function using Gemini
# ===============================
def generate_answer(context_docs, user_query):
    """
    Use Gemini LLM to answer user queries strictly based on dataset.
    """
    context = "\n\n".join(context_docs)
    prompt = f"""
You are an AI Support Agent for **BossWallah**, specializing in answering questions
about the available courses. Follow these rules strictly:

1. Only answer using the provided dataset context. 
2. If the answer is not present in the dataset, say politely:
   "Sorry, I could not find a relevant course in the BossWallah catalog."
3. Always include the **Course Title** and key details if available.
4. If multiple courses are relevant, list them clearly in bullet points.
5. If the user asks in general terms (e.g., poultry farming, financial freedom),
   map it to the most relevant courses from the dataset.
6. Be clear, concise, and helpful. Do not make up content beyond the dataset.

---
📘 Dataset Context:
{context}

---
💡 User Question:
{user_query}

Now provide the best possible helpful answer:
    """

    response = gemini_llm.invoke(prompt)
    return response.content.strip()


# ===============================
# 9. Demo: Sample queries
# ===============================
sample_queries = [
    "Tell me about honey bee farming course",
    "I want to learn how to start a poultry farm",
    "Do you have any courses in Tamil?",
    "I am a recent high school graduate, are there any opportunities for me?"
]

for q in sample_queries:
    relevant_docs = get_relevant_courses(q)
    answer = generate_answer(relevant_docs, q)
    print(f"\n--- Query: {q} ---")
    print("Answer:")
    print(answer)
    print("\nDocuments considered:")
    for doc in relevant_docs:
        print(doc)
        print("------")



[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: C:\Users\nikhi\AppData\Local\Programs\Python\Python312\python.exe -m pip install --upgrade pip


(100, 5)


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


Total chunks created: 100
✅ Chroma Vector DB created and persisted!

--- Query: Tell me about honey bee farming course ---
Answer:
We have a **Course on Honey Bee Farming** available.

This course is designed to help you transform your passion for bees into a lucrative career.

It is suitable for:
*   Beginners looking to start a career in beekeeping
*   Experienced beekeepers looking to expand their knowledge and skills
*   Entrepreneurs interested in starting their own beekeeping business
*   Farmers and landowners looking to diversify their income
*   Anyone with a passion for bees and a desire to learn about the industry

The course is available in Hindi, Kannada, Malayalam, Tamil, Telugu, and English.

Documents considered:
Course Title: Course on Honey Bee Farming
Description: Transform Your Passion for Bees into a Lucrative Career: Join Our Honey Bee Farming Course Now!
Who This Course is For: Beginners looking to start a career in beekeeping ||| Experienced beekeepers looking t

In [67]:
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0  # make results deterministic

# Map langdetect codes to your supported language names
lang_code_map = {
    "hi": "Hindi",
    "kn": "Kannada",
    "ta": "Tamil",
    "te": "Telugu",
    "ml": "Malayalam",
    "en": "English"
}

def detect_language(text: str) -> str:
    """
    Detects the language of the input text and maps it
    to one of the supported languages. Defaults to English if unsupported.
    """
    try:
        detected_code = detect(text)
        return lang_code_map.get(detected_code, "English")
    except Exception:
        return "English"

In [68]:
user_query = "ഒരു ഡെയ്‌റി ഫാം ആരംഭിക്കാൻ എത്ര പശുക്കൾ ആവശ്യമാകും?"

detect_language(user_query)

'Malayalam'

In [74]:
# ===============================
# 6. Setup Google Gemini LLM
# ===============================
load_dotenv()  # load API key from .env
api_key = os.getenv("GEMINI_API_KEY")
if not api_key:
    raise ValueError("GEMINI_API_KEY not found in .env")

gemini_llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    google_api_key=api_key,
    temperature=0.0,
    max_tokens=512,
    timeout=30,
    max_retries=3
)

In [76]:
from langdetect import detect
from langchain_google_genai import ChatGoogleGenerativeAI


def translator_agent(text: str, target_lang: str) -> str:
    """
    Translate English text into target language using Gemini.
    """
    translation_prompt = f"""
    You are a translation agent. Your task is to translate the following English text
    into **{target_lang}**. Keep the meaning accurate and natural for a native speaker. 

    Text to translate:
    {text}
    """
    return gemini_llm.invoke(translation_prompt).content.strip()


def translate_to_english(text: str) -> str:
    """
    Translate text from any supported language into English.
    """
    translation_prompt = f"""
    You are a translation agent. Translate the following text into **English** only. 
    Keep the meaning intact, do not summarize, just translate.

    Text:
    {text}
    """
    return gemini_llm.invoke(translation_prompt).content.strip()



In [83]:
def generate_answer(user_query: str) -> str:
    """
    End-to-end pipeline with debug logs:
    1. Detect query language
    2. Translate query → English (if needed)
    3. RAG Agent finds the answer in English
    4. Translator Agent translates back to user’s language
    """

    print("\n================ DEBUG START ================")
    print(f"User Query: {user_query}")

    # Step 1: Detect query language
    user_lang = detect_language(user_query)
    print(f"[DEBUG] Detected Language: {user_lang}")

    # Step 2: Translate query to English if needed
    query_in_english = (
        translate_to_english(user_query) if user_lang != "English" else user_query
    )
    print(f"[DEBUG] Query in English: {query_in_english}")

    # Step 3: Retrieve relevant docs
    relevant_docs = get_relevant_courses(query_in_english)
    print(f"[DEBUG] Retrieved {len(relevant_docs)} relevant docs")
    if relevant_docs:
        print("[DEBUG] Sample Relevant Doc:\n", relevant_docs[0][:300], "...\n")

    # Step 4: Build RAG prompt
    context = "\n\n".join(relevant_docs)
    rag_prompt = f"""
    Answer the user's question based on the following course information:
    {context}

    User Question: {query_in_english}
    Answer in English:
    """
    print(f"[DEBUG] RAG Prompt:\n{rag_prompt[:500]}...\n")

    # Step 5: RAG Answer
    rag_response = gemini_llm.invoke(rag_prompt).content.strip()
    print(f"[DEBUG] Raw RAG Response: {rag_response}")

    # Step 6: Translate back if needed
    if user_lang != "English":
        rag_response = translator_agent(rag_response, user_lang)
        print(f"[DEBUG] Translated Response: {rag_response}")

    print("================ DEBUG END ================\n")
    return rag_response


In [84]:
queries = [
    "ഒരു ഡെയ്‌റി ഫാം ആരംഭിക്കാൻ എത്ര പശുക്കൾ ആവശ്യമാകും?"        # English
]

for q in queries:
    print(f"\nUser Query: {q}")
    print("Detected Language:", detect_language(q))
    print("Final Answer:", generate_answer(q))



User Query: ഒരു ഡെയ്‌റി ഫാം ആരംഭിക്കാൻ എത്ര പശുക്കൾ ആവശ്യമാകും?
Detected Language: Malayalam

User Query: ഒരു ഡെയ്‌റി ഫാം ആരംഭിക്കാൻ എത്ര പശുക്കൾ ആവശ്യമാകും?
[DEBUG] Detected Language: Malayalam
[DEBUG] Query in English: How many cows will be needed to start a dairy farm?
[DEBUG] Retrieved 3 relevant docs
[DEBUG] Sample Relevant Doc:
 Course Title: Dairy Farming Course
Description: Ready to milk your potential? Learn the art of dairy farming and earn substantial profit from just 10 cows.
Who This Course is For: Farmers looking for new income streams ||| Aspiring dairy farm business owners ||| Individuals or mention a specific cat ...

[DEBUG] RAG Prompt:

    Answer the user's question based on the following course information:
    Course Title: Dairy Farming Course
Description: Ready to milk your potential? Learn the art of dairy farming and earn substantial profit from just 10 cows.
Who This Course is For: Farmers looking for new income streams ||| Aspiring dairy farm business owner

In [87]:
# ============================================
# 🔹 Imports
# ============================================
import os
from langdetect import detect
from langchain_google_genai import ChatGoogleGenerativeAI
from dotenv import load_dotenv


# ============================================
# 🔹 Language Detection
# ============================================
def detect_language(text: str) -> str:
    """
    Detect the language of the given text using langdetect.
    """
    try:
        lang_code = detect(text)
    except Exception as e:
        print(f"[ERROR] Language detection failed: {e}")
        return "English"

    lang_map = {
        "te": "Telugu",
        "hi": "Hindi",
        "kn": "Kannada",
        "ta": "Tamil",
        "ml": "Malayalam",
        "en": "English",
    }
    return lang_map.get(lang_code, "English")

# ============================================
# 🔹 Translator (Bi-directional)
# ============================================
def translate_text(text: str, target_lang: str) -> str:
    """
    Translate text into the target language using Gemini.
    If target_lang = 'English', ensures output is English.
    """
    prompt = f"""
    You are a strict translation agent.
    Translate the following text into {target_lang}.
    Do not explain, do not summarize, do not add anything extra.
    Return only the translated text.

    Text:
    {text}
    """
    try:
        response = gemini_llm.invoke(prompt)
        if hasattr(response, "content") and response.content:
            return response.content.strip()
        else:
            return str(response).strip()
    except Exception as e:
        print(f"[ERROR] Translation failed: {e}")
        return text  # fallback

# ============================================
# 🔹 Dummy Retriever (replace with vectorstore retrieval)
# ============================================
def get_relevant_courses(query: str):
    """
    Mock retriever: in real app, fetch relevant docs from vectorstore (ChromaDB).
    """
    return [
        "Course Title: Dairy Farming Course\n"
        "Description: Ready to milk your potential? Learn the art of dairy farming and earn substantial profit from just 10 cows.\n"
        "Who This Course is For: Farmers looking for new income streams ||| Aspiring dairy farm business owners ||| Entrepreneurs seeking to invest in the industry"
    ]

# ============================================
# 🔹 Main Pipeline
# ============================================
def generate_answer(user_query: str) -> str:
    """
    End-to-end pipeline with debug logs:
    1. Detect query language
    2. Translate query → English (if needed)
    3. RAG Agent finds the answer in English
    4. Translate answer back to user’s language
    """

    print("\n================ DEBUG START ================")
    print(f"User Query: {user_query}")

    # Step 1: Detect query language
    user_lang = detect_language(user_query)
    print(f"[DEBUG] Detected Language: {user_lang}")

    # Step 2: Translate query to English if needed
    query_in_english = (
        translate_text(user_query, "English") if user_lang != "English" else user_query
    )
    print(f"[DEBUG] Query in English: {query_in_english}")

    # Step 3: Retrieve relevant docs
    relevant_docs = get_relevant_courses(query_in_english)
    print(f"[DEBUG] Retrieved {len(relevant_docs)} relevant docs")
    if relevant_docs:
        print("[DEBUG] Sample Relevant Doc:\n", relevant_docs[0][:300], "...\n")

    # Step 4: Build RAG prompt
    context = "\n\n".join(relevant_docs)
    rag_prompt = f"""
    Answer the user's question based on the following course information:
    {context}

    User Question: {query_in_english}
    Answer in English:
    """
    print(f"[DEBUG] RAG Prompt:\n{rag_prompt[:500]}...\n")

    # Step 5: Get RAG Answer
    try:
        rag_response = gemini_llm.invoke(rag_prompt).content.strip()
    except Exception as e:
        print(f"[ERROR] RAG generation failed: {e}")
        rag_response = "Sorry, I could not generate an answer."

    print(f"[DEBUG] Raw RAG Response: {rag_response}")

    # Step 6: Translate back to original language if needed
    if user_lang != "English":
        rag_response_translated = translate_text(rag_response, user_lang)
        print(f"[DEBUG] Translated Response: {rag_response_translated}")
        rag_response = rag_response_translated

    print("================ DEBUG END ================\n")
    return rag_response

# ============================================
# 🔹 Test
# ============================================
queries = [
    "ഒരു ഡെയ്‌റി ഫാം ആരംഭിക്കാൻ എത്ര പശുക്കൾ ആവശ്യമാകും?",  # Malayalam
    "पोल्ट्री फार्म कैसे शुरू करें?",                       # Hindi
    "Can you tell me about the honey bee farming course?"   # English
]

for q in queries:
    print(f"\nUser Query: {q}")
    print("Final Answer:", generate_answer(q))
    print("__________________________________")



User Query: ഒരു ഡെയ്‌റി ഫാം ആരംഭിക്കാൻ എത്ര പശുക്കൾ ആവശ്യമാകും?

User Query: ഒരു ഡെയ്‌റി ഫാം ആരംഭിക്കാൻ എത്ര പശുക്കൾ ആവശ്യമാകും?
[DEBUG] Detected Language: Malayalam
[DEBUG] Query in English: How many cows will be needed to start a dairy farm?
[DEBUG] Retrieved 1 relevant docs
[DEBUG] Sample Relevant Doc:
 Course Title: Dairy Farming Course
Description: Ready to milk your potential? Learn the art of dairy farming and earn substantial profit from just 10 cows.
Who This Course is For: Farmers looking for new income streams ||| Aspiring dairy farm business owners ||| Entrepreneurs seeking to invest in th ...

[DEBUG] RAG Prompt:

    Answer the user's question based on the following course information:
    Course Title: Dairy Farming Course
Description: Ready to milk your potential? Learn the art of dairy farming and earn substantial profit from just 10 cows.
Who This Course is For: Farmers looking for new income streams ||| Aspiring dairy farm business owners ||| Entrepreneurs seeking t

In [6]:
# ===============================
# 0. Install dependencies (run once)
# ===============================
# !pip install pandas langchain chromadb sentence-transformers python-dotenv langchain-google-genai langdetect --quiet

# ===============================
# 1. Imports
# ===============================
import pandas as pd
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from textwrap import wrap
import os
from dotenv import load_dotenv
from langdetect import detect
from langchain_google_genai import ChatGoogleGenerativeAI

# ===============================
# 2. Load dataset
# ===============================
csv_path = "data/bw_courses - Sheet1.csv"  # replace with your path
df = pd.read_csv(csv_path)

# Quick overview
print(df.shape)
df.head()

# ===============================
# 3. Preprocessing
# ===============================
df['Who This Course is For'] = df['Who This Course is For'].fillna("Not specified")
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

lang_map = {
    6: "Hindi",
    7: "Kannada",
    11: "Malayalam",
    20: "Tamil",
    21: "Telugu",
    24: "English"
}

def map_languages(cell):
    codes = str(cell).split(",")
    return [lang_map.get(int(c.strip()), f"Unknown-{c.strip()}") for c in codes]

df['Released Languages'] = df['Released Languages'].apply(map_languages)

# Show the missing rows demo
missing_rows_demo = df[df['Who This Course is For'] == "Not specified"]
missing_rows_demo

# ===============================
# 4. Prepare documents with chunking
# ===============================
documents = []
metadata_list = []
MAX_TOKENS = 200  

for _, row in df.iterrows():
    description_chunks = wrap(row['Course Description'], MAX_TOKENS) or [""]

    for desc_chunk in description_chunks:
        text = f"Course Title: {row['Course Title']}\n" \
               f"Description: {desc_chunk}\n" \
               f"Who This Course is For: {row['Who This Course is For']}\n" \
               f"Languages: {', '.join(row['Released Languages'])}"
        documents.append(text)
        metadata_list.append({
            "course_no": row['Course No'],
            "course_title": row['Course Title'],
            "released_languages": ', '.join(row['Released Languages']),
            "who_for": row['Who This Course is For']
        })

print(f"Total chunks created: {len(documents)}")

# ===============================
# 5. Create embeddings & build ChromaDB
# ===============================
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

vectordb = Chroma.from_texts(
    texts=documents,
    embedding=embedding_model,
    metadatas=metadata_list,
    persist_directory="data/chroma_db"
)

vectordb.persist()
print("✅ Chroma Vector DB created and persisted!")

# ===============================
# 6. Setup Google Gemini LLM
# ===============================
load_dotenv()
api_key = os.getenv("GEMINI_API_KEY")
if not api_key:
    raise ValueError("GEMINI_API_KEY not found in .env")

gemini_llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    google_api_key=api_key,
    temperature=0.0,
    max_tokens=512,
    timeout=30,
    max_retries=3
)

(100, 5)
Total chunks created: 100


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


✅ Chroma Vector DB created and persisted!


In [9]:
# ===============================
# 7. Translation & Language Handling
# ===============================
from translate import Translator

def detect_language(text: str) -> str:
    """Detect query language using langdetect."""
    try:
        return detect(text)
    except:
        return "en"  # default to English if uncertain

def translate_text(text: str, target_lang: str) -> str:
    """Translate text using translate library."""
    try:
        translator = Translator(to_lang=target_lang)
        return translator.translate(text)
    except Exception as e:
        print(f"⚠️ Translation error: {e}")
        return text  # fallback to original text if error


# ===============================
# 8. RAG Query Function
# ===============================
def rag_query(user_query: str):
    # Step 1: Detect input language
    query_lang = detect_language(user_query)
    print(f"🔎 Detected language: {query_lang}")

    # Step 2: If not English, translate to English
    if query_lang != "en":
        translated_query = translate_text(user_query, "en")
        print(f"🌐 Translated query: {translated_query}")
    else:
        translated_query = user_query

    # Step 3: Perform retrieval
    retriever = vectordb.as_retriever(search_kwargs={"k": 3})
    docs = retriever.get_relevant_documents(translated_query)

    context = "\n\n".join([doc.page_content for doc in docs])

    prompt = f"""
    You are an assistant helping users understand course details. 
    Use the context below to answer the query.

    Context:
    {context}

    Question: {translated_query}
    Answer:
    """

    # Step 4: Get response from Gemini
    response = gemini_llm.invoke(prompt)
    rag_answer = response.content.strip()

    # Step 5: If original query language ≠ English, translate response back
    if query_lang != "en":
        rag_answer_translated = translate_text(rag_answer, query_lang)
        return rag_answer_translated

    return rag_answer


# ===============================
# 9. Example Run
# ===============================
if __name__ == "__main__":
    # Example in English
    q1 = "Which courses are available in Tamil?"
    print("💬 User:", q1)
    print("🤖 RAG Answer:", rag_query(q1))
    print("="*80)

    # Example in Hindi
    q2 = "ഒരു ഡെയ്‌റി ഫാം ആരംഭിക്കാൻ എത്ര പശുക്കൾ ആവശ്യമാകും?"
    print("💬 User:", q2)
    print("🤖 RAG Answer:", rag_query(q2))


💬 User: Which courses are available in Tamil?
🔎 Detected language: en


  docs = retriever.get_relevant_documents(translated_query)


🤖 RAG Answer: *   Course on National Pension Scheme
*   Education Loan Course
💬 User: ഒരു ഡെയ്‌റി ഫാം ആരംഭിക്കാൻ എത്ര പശുക്കൾ ആവശ്യമാകും?
🔎 Detected language: ml
🌐 Translated query: ഒരു ഡെയ്‌റി ഫാം ആരംഭിക്കാൻ എത്ര പശുക്കൾ ആവശ്യമാകും?
🤖 RAG Answer: ഈ ചോയ്സ് തത്തുല്യമായ ഉൽപന്നങ്ങൾ നൽകുന്നതിൽ പരാജയപ്പെട്ടാൽ കോളിഫ്ളവർ ലഭ്യമല്ല. ഇന്‍ഷുറന്‍സ് പോളിസിപോളിസി എടുക്കുക.


In [11]:
# ===============================
# Improved Multilingual RAG Pipeline
# ===============================

import pandas as pd
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from textwrap import wrap
import os
from dotenv import load_dotenv
from langdetect import detect, DetectorFactory
from langchain_google_genai import ChatGoogleGenerativeAI
import requests
import re

# Set seed for consistent language detection
DetectorFactory.seed = 0

# ===============================
# Enhanced Language Detection & Translation
# ===============================

# Language mappings for Indian languages
LANGUAGE_MAPPINGS = {
    'ml': 'Malayalam',
    'hi': 'Hindi', 
    'ta': 'Tamil',
    'te': 'Telugu',
    'kn': 'Kannada',
    'en': 'English'
}

SUPPORTED_LANGUAGES = {'ml', 'hi', 'ta', 'te', 'kn', 'en'}

def enhanced_language_detection(text: str) -> str:
    """Enhanced language detection with fallbacks for Indian languages."""
    
    # Simple script-based detection for better accuracy
    def detect_by_script(text):
        # Malayalam Unicode range
        if re.search(r'[\u0D00-\u0D7F]', text):
            return 'ml'
        # Hindi/Devanagari Unicode range
        elif re.search(r'[\u0900-\u097F]', text):
            return 'hi'
        # Tamil Unicode range
        elif re.search(r'[\u0B80-\u0BFF]', text):
            return 'ta'
        # Telugu Unicode range
        elif re.search(r'[\u0C00-\u0C7F]', text):
            return 'te'
        # Kannada Unicode range
        elif re.search(r'[\u0C80-\u0CFF]', text):
            return 'kn'
        return None
    
    # Try script-based detection first
    script_lang = detect_by_script(text)
    if script_lang:
        return script_lang
    
    # Fallback to langdetect
    try:
        detected = detect(text)
        return detected if detected in SUPPORTED_LANGUAGES else 'en'
    except:
        return 'en'

def translate_with_gemini(text: str, target_lang: str, source_lang: str = 'en') -> str:
    """Use Gemini for translation with proper language names."""
    
    if source_lang == target_lang:
        return text
    
    source_name = LANGUAGE_MAPPINGS.get(source_lang, 'English')
    target_name = LANGUAGE_MAPPINGS.get(target_lang, 'English')
    
    translation_prompt = f"""
    Translate the following text from {source_name} to {target_name}.
    Provide only the translation, no explanations or additional text.
    
    Text to translate: {text}
    
    Translation:
    """
    
    try:
        # Use the same gemini_llm instance
        response = gemini_llm.invoke(translation_prompt)
        return response.content.strip()
    except Exception as e:
        print(f"⚠️ Gemini translation error: {e}")
        return text

# ===============================
# Data Loading and Processing (Same as before)
# ===============================

def load_and_process_data(csv_path: str):
    """Load and preprocess the course data."""
    df = pd.read_csv(csv_path)
    
    # Fill missing values
    df['Who This Course is For'] = df['Who This Course is For'].fillna("Not specified")
    df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
    
    # Language mapping
    lang_map = {
        6: "Hindi",
        7: "Kannada", 
        11: "Malayalam",
        20: "Tamil",
        21: "Telugu",
        24: "English"
    }
    
    def map_languages(cell):
        codes = str(cell).split(",")
        return [lang_map.get(int(c.strip()), f"Unknown-{c.strip()}") for c in codes]
    
    df['Released Languages'] = df['Released Languages'].apply(map_languages)
    return df

def create_documents_and_vectordb(df, persist_dir: str = "data/chroma_db"):
    """Create document chunks and build vector database."""
    documents = []
    metadata_list = []
    MAX_TOKENS = 200
    
    for _, row in df.iterrows():
        description_chunks = wrap(row['Course Description'], MAX_TOKENS) or [""]
        
        for desc_chunk in description_chunks:
            text = f"Course Title: {row['Course Title']}\n" \
                   f"Description: {desc_chunk}\n" \
                   f"Who This Course is For: {row['Who This Course is For']}\n" \
                   f"Languages: {', '.join(row['Released Languages'])}"
            documents.append(text)
            metadata_list.append({
                "course_no": row['Course No'],
                "course_title": row['Course Title'], 
                "released_languages": ', '.join(row['Released Languages']),
                "who_for": row['Who This Course is For']
            })
    
    # Create embeddings
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    
    vectordb = Chroma.from_texts(
        texts=documents,
        embedding=embedding_model,
        metadatas=metadata_list,
        persist_directory=persist_dir
    )
    
    vectordb.persist()
    return vectordb

# ===============================
# Setup LLM
# ===============================

def setup_gemini_llm():
    """Initialize Gemini LLM."""
    load_dotenv()
    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        raise ValueError("GEMINI_API_KEY not found in .env file")
    
    return ChatGoogleGenerativeAI(
        model="gemini-2.5-flash",
        google_api_key=api_key,
        temperature=0.0,
        max_tokens=512,
        timeout=30,
        max_retries=3
    )

# ===============================
# Enhanced RAG Query Function
# ===============================

def enhanced_rag_query(user_query: str, vectordb, llm):
    """
    Enhanced RAG query with improved multilingual support.
    """
    print(f"🔍 Processing query: {user_query[:50]}...")
    
    # Step 1: Enhanced language detection
    query_lang = enhanced_language_detection(user_query)
    lang_name = LANGUAGE_MAPPINGS.get(query_lang, 'Unknown')
    print(f"🌐 Detected language: {lang_name} ({query_lang})")
    
    # Step 2: Translate to English for retrieval if needed
    if query_lang != 'en':
        print("🔄 Translating query to English...")
        english_query = translate_with_gemini(user_query, 'en', query_lang)
        print(f"📝 English query: {english_query}")
    else:
        english_query = user_query
    
    # Step 3: Retrieve relevant documents
    print("🔍 Searching for relevant courses...")
    retriever = vectordb.as_retriever(search_kwargs={"k": 3})
    docs = retriever.get_relevant_documents(english_query)
    
    context = "\n\n".join([doc.page_content for doc in docs])
    
    # Step 4: Generate response
    prompt = f"""
    You are a helpful assistant for Boswallah courses. Use the provided context to answer the user's question accurately and helpfully.
    
    Context:
    {context}
    
    Question: {english_query}
    
    Please provide a comprehensive answer based on the context. If the context doesn't contain enough information, mention that clearly.
    
    Answer:
    """
    
    print("🤖 Generating response...")
    response = llm.invoke(prompt)
    english_answer = response.content.strip()
    
    # Step 5: Translate response back to original language if needed
    if query_lang != 'en':
        print(f"🔄 Translating response to {lang_name}...")
        final_answer = translate_with_gemini(english_answer, query_lang, 'en')
    else:
        final_answer = english_answer
    
    return {
        'query': user_query,
        'detected_language': lang_name,
        'answer': final_answer,
        'retrieved_docs': len(docs)
    }

# ===============================
# Main Pipeline Class
# ===============================

class MultilingualRAGPipeline:
    """Complete multilingual RAG pipeline for Boswallah courses."""
    
    def __init__(self, csv_path: str, persist_dir: str = "data/chroma_db"):
        self.csv_path = csv_path
        self.persist_dir = persist_dir
        self.vectordb = None
        self.llm = None
        
    def initialize(self):
        """Initialize the pipeline."""
        print("🚀 Initializing Multilingual RAG Pipeline...")
        
        # Setup LLM
        print("🔧 Setting up Gemini LLM...")
        self.llm = setup_gemini_llm()
        
        # Load and process data
        print("📁 Loading course data...")
        df = load_and_process_data(self.csv_path)
        print(f"✅ Loaded {len(df)} courses")
        
        # Create vector database
        print("🔍 Creating vector database...")
        self.vectordb = create_documents_and_vectordb(df, self.persist_dir)
        print("✅ Vector database created and persisted!")
        
        # Set global LLM for translation functions
        global gemini_llm
        gemini_llm = self.llm
        
        print("🎉 Pipeline initialization complete!")
    
    def query(self, user_query: str):
        """Process a user query."""
        if not self.vectordb or not self.llm:
            raise ValueError("Pipeline not initialized. Call initialize() first.")
        
        return enhanced_rag_query(user_query, self.vectordb, self.llm)

# ===============================
# Usage Example
# ===============================

if __name__ == "__main__":
    # Initialize pipeline
    pipeline = MultilingualRAGPipeline("data/bw_courses - Sheet1.csv")
    pipeline.initialize()
    
    # Test queries in different languages
    test_queries = [
        "Which courses are available in Tamil?",
        "ഒരു ഡെയ്‌റി ഫാം ആരംഭിക്കാൻ എത്ര പശുക്കൾ ആവശ്യമാകും?",  # Malayalam
    ]
    
    for query in test_queries:
        print("=" * 80)
        print(f"💬 User Query: {query}")
        print("-" * 40)
        
        try:
            result = pipeline.query(query)
            print(f"🌐 Detected Language: {result['detected_language']}")
            print(f"📚 Retrieved Documents: {result['retrieved_docs']}")
            print(f"🤖 Answer: {result['answer']}")
        except Exception as e:
            print(f"❌ Error: {e}")
        
        print()

🚀 Initializing Multilingual RAG Pipeline...
🔧 Setting up Gemini LLM...
📁 Loading course data...
✅ Loaded 100 courses
🔍 Creating vector database...


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


✅ Vector database created and persisted!
🎉 Pipeline initialization complete!
💬 User Query: Which courses are available in Tamil?
----------------------------------------
🔍 Processing query: Which courses are available in Tamil?...
🌐 Detected language: English (en)
🔍 Searching for relevant courses...
🤖 Generating response...
🌐 Detected Language: English
📚 Retrieved Documents: 3
🤖 Answer: Based on the provided context, the following courses are available in Tamil:

*   **Course on National Pension Scheme**
*   **Education Loan Course**

💬 User Query: ഒരു ഡെയ്‌റി ഫാം ആരംഭിക്കാൻ എത്ര പശുക്കൾ ആവശ്യമാകും?
----------------------------------------
🔍 Processing query: ഒരു ഡെയ്‌റി ഫാം ആരംഭിക്കാൻ എത്ര പശുക്കൾ ആവശ്യമാകും...
🌐 Detected language: Malayalam (ml)
🔄 Translating query to English...
📝 English query: How many cows are needed to start a dairy farm?
🔍 Searching for relevant courses...
🤖 Generating response...
🔄 Translating response to Malayalam...
🌐 Detected Language: Malayalam
📚 Retrieved 

In [15]:
# ===============================
# Complete Multilingual RAG Pipeline with Google Translate
# ===============================

# First install required packages:
# pip install pandas langchain chromadb sentence-transformers python-dotenv langchain-google-genai langdetect googletrans==4.0.0rc1

import pandas as pd
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from textwrap import wrap
import os
from dotenv import load_dotenv
from langdetect import detect, DetectorFactory
from langchain_google_genai import ChatGoogleGenerativeAI
from googletrans import Translator
import time
import re

# Set seed for consistent language detection
DetectorFactory.seed = 0

# ===============================
# Language Detection & Translation
# ===============================

# Language mappings for Indian languages
LANGUAGE_MAPPINGS = {
    'ml': 'Malayalam',
    'hi': 'Hindi', 
    'ta': 'Tamil',
    'te': 'Telugu',
    'kn': 'Kannada',
    'en': 'English'
}

SUPPORTED_LANGUAGES = {'ml', 'hi', 'ta', 'te', 'kn', 'en'}

# Google Translate language codes
GOOGLE_LANG_CODES = {
    'ml': 'ml',  # Malayalam
    'hi': 'hi',  # Hindi
    'ta': 'ta',  # Tamil
    'te': 'te',  # Telugu
    'kn': 'kn',  # Kannada
    'en': 'en'   # English
}

def enhanced_language_detection(text: str) -> str:
    """Enhanced language detection with fallbacks for Indian languages."""
    
    # Simple script-based detection for better accuracy
    def detect_by_script(text):
        # Malayalam Unicode range
        if re.search(r'[\u0D00-\u0D7F]', text):
            return 'ml'
        # Hindi/Devanagari Unicode range
        elif re.search(r'[\u0900-\u097F]', text):
            return 'hi'
        # Tamil Unicode range
        elif re.search(r'[\u0B80-\u0BFF]', text):
            return 'ta'
        # Telugu Unicode range
        elif re.search(r'[\u0C00-\u0C7F]', text):
            return 'te'
        # Kannada Unicode range
        elif re.search(r'[\u0C80-\u0CFF]', text):
            return 'kn'
        return None
    
    # Try script-based detection first
    script_lang = detect_by_script(text)
    if script_lang:
        return script_lang
    
    # Fallback to langdetect
    try:
        detected = detect(text)
        return detected if detected in SUPPORTED_LANGUAGES else 'en'
    except:
        return 'en'

def translate_with_googletrans(text: str, target_lang: str, source_lang: str = 'en') -> str:
    """Translation using Google Translate."""
    
    if source_lang == target_lang or not text.strip():
        return text
    
    source_code = GOOGLE_LANG_CODES.get(source_lang, 'en')
    target_code = GOOGLE_LANG_CODES.get(target_lang, 'en')
    
    try:
        translator = Translator()
        
        # Add small delay to avoid rate limiting
        time.sleep(0.1)
        
        print(f"🔄 Translating from {LANGUAGE_MAPPINGS.get(source_lang, 'Unknown')} to {LANGUAGE_MAPPINGS.get(target_lang, 'Unknown')}...")
        print(f"📝 Original text: {text[:100]}{'...' if len(text) > 100 else ''}")
        
        result = translator.translate(text, src=source_code, dest=target_code)
        translated = result.text
        
        print(f"✅ Translation result: {translated[:100]}{'...' if len(translated) > 100 else ''}")
        
        if not translated or not translated.strip():
            print("⚠️ Translation returned empty result, using original text")
            return text
            
        return translated
        
    except Exception as e:
        print(f"⚠️ Google Translate error: {e}")
        print(f"🔄 Returning original text as fallback")
        return text

def translate_with_gemini(text: str, target_lang: str, source_lang: str = 'en', llm=None) -> str:
    """Backup translation using Gemini (if Google Translate fails)."""
    
    if source_lang == target_lang or not text.strip():
        return text
    
    if not llm:
        print("⚠️ No LLM provided for Gemini translation")
        return text
    
    source_name = LANGUAGE_MAPPINGS.get(source_lang, 'English')
    target_name = LANGUAGE_MAPPINGS.get(target_lang, 'English')
    
    translation_prompt = f"""You are a professional translator. Translate the following text accurately from {source_name} to {target_name}.

Rules:
1. Provide ONLY the translation, no explanations
2. Maintain the original meaning and context
3. Use natural, fluent {target_name}
4. Do not add any prefixes, suffixes, or explanations

Text to translate: {text}

Translation:"""
    
    try:
        print(f"🔄 Using Gemini for {source_name} to {target_name} translation...")
        
        response = llm.invoke(translation_prompt)
        translated = response.content.strip()
        
        print(f"✅ Gemini translation result: {translated[:100]}{'...' if len(translated) > 100 else ''}")
        
        if not translated or not translated.strip():
            print("⚠️ Gemini translation returned empty result")
            return text
            
        return translated
        
    except Exception as e:
        print(f"⚠️ Gemini translation error: {e}")
        return text

def robust_translate(text: str, target_lang: str, source_lang: str = 'en', llm=None) -> str:
    """Try Google Translate first, fallback to Gemini if needed."""
    
    if source_lang == target_lang or not text.strip():
        return text
    
    # Try Google Translate first
    result = translate_with_googletrans(text, target_lang, source_lang)
    
    # If Google Translate fails or returns unchanged text, try Gemini
    if not result or not result.strip() or result == text:
        print("🔄 Google Translate failed, trying Gemini as fallback...")
        if llm:
            result = translate_with_gemini(text, target_lang, source_lang, llm)
        else:
            print("⚠️ No LLM available for Gemini fallback")
    
    return result

# ===============================
# Data Loading and Processing
# ===============================

def load_and_process_data(csv_path: str):
    """Load and preprocess the course data."""
    df = pd.read_csv(csv_path)
    
    # Fill missing values
    df['Who This Course is For'] = df['Who This Course is For'].fillna("Not specified")
    df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
    
    # Language mapping
    lang_map = {
        6: "Hindi",
        7: "Kannada", 
        11: "Malayalam",
        20: "Tamil",
        21: "Telugu",
        24: "English"
    }
    
    def map_languages(cell):
        if pd.isna(cell):
            return ["English"]
        codes = str(cell).split(",")
        return [lang_map.get(int(c.strip()), f"Unknown-{c.strip()}") for c in codes if c.strip().isdigit()]
    
    df['Released Languages'] = df['Released Languages'].apply(map_languages)
    return df

def create_documents_and_vectordb(df, persist_dir: str = "data/chroma_db"):
    """Create document chunks and build vector database."""
    documents = []
    metadata_list = []
    MAX_TOKENS = 200
    
    for _, row in df.iterrows():
        description_chunks = wrap(str(row['Course Description']), MAX_TOKENS) or [""]
        
        for desc_chunk in description_chunks:
            text = f"Course Title: {row['Course Title']}\n" \
                   f"Description: {desc_chunk}\n" \
                   f"Who This Course is For: {row['Who This Course is For']}\n" \
                   f"Languages: {', '.join(row['Released Languages'])}"
            documents.append(text)
            metadata_list.append({
                "course_no": row['Course No'],
                "course_title": row['Course Title'], 
                "released_languages": ', '.join(row['Released Languages']),
                "who_for": row['Who This Course is For']
            })
    
    # Create embeddings
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    
    vectordb = Chroma.from_texts(
        texts=documents,
        embedding=embedding_model,
        metadatas=metadata_list,
        persist_directory=persist_dir
    )
    
    vectordb.persist()
    return vectordb

# ===============================
# Setup LLM
# ===============================

def setup_gemini_llm():
    """Initialize Gemini LLM."""
    load_dotenv()
    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        raise ValueError("GEMINI_API_KEY not found in .env file")
    
    return ChatGoogleGenerativeAI(
        model="gemini-2.5-flash",
        google_api_key=api_key,
        temperature=0.0,
        max_tokens=512,
        timeout=30,
        max_retries=3
    )

# ===============================
# Enhanced RAG Query Function
# ===============================

def enhanced_rag_query(user_query: str, vectordb, llm):
    """
    Enhanced RAG query with Google Translate and Gemini fallback.
    """
    print(f"🔍 Processing query: {user_query[:50]}...")
    
    # Step 1: Enhanced language detection
    query_lang = enhanced_language_detection(user_query)
    lang_name = LANGUAGE_MAPPINGS.get(query_lang, 'Unknown')
    print(f"🌐 Detected language: {lang_name} ({query_lang})")
    
    # Step 2: Translate to English for retrieval if needed
    if query_lang != 'en':
        print("🔄 Translating query to English...")
        english_query = robust_translate(user_query, 'en', query_lang, llm)
        print(f"📝 English query: {english_query}")
        
        # Fallback if translation completely fails
        if not english_query or english_query.strip() == "":
            print("⚠️ Translation failed completely, using original query")
            english_query = user_query
    else:
        english_query = user_query
    
    # Step 3: Retrieve relevant documents
    print("🔍 Searching for relevant courses...")
    retriever = vectordb.as_retriever(search_kwargs={"k": 3})
    docs = retriever.get_relevant_documents(english_query)
    
    context = "\n\n".join([doc.page_content for doc in docs])
    print(f"📄 Retrieved context length: {len(context)} characters")
    
    # Step 4: Generate response with improved prompt
    prompt = f"""You are a helpful assistant for Boswallah courses. Based on the provided course information, answer the user's question completely and accurately.

Course Information Available:
{context}

User Question: {english_query}

Instructions:
- Provide a complete, detailed answer based on the course information
- If the exact information isn't available, explain what related information is available
- Be specific about course names, details, and requirements
- Write a comprehensive response (at least 2-3 sentences)

Complete Answer:"""
    
    print("🤖 Generating response...")
    try:
        response = llm.invoke(prompt)
        english_answer = response.content.strip()
        print(f"✅ Generated English answer: {english_answer[:150]}{'...' if len(english_answer) > 150 else ''}")
        
        # Check if English answer is empty
        if not english_answer or not english_answer.strip():
            print("⚠️ Generated answer is empty")
            english_answer = "I couldn't find specific information about your question in the available course data. Please try rephrasing your question or contact support for more details."
        
    except Exception as e:
        print(f"❌ Error generating response: {e}")
        english_answer = "I encountered an error while processing your question. Please try again or contact support."
    
    # Step 5: Translate response back to original language if needed
    if query_lang != 'en':
        print(f"🔄 Translating response back to {lang_name}...")
        final_answer = robust_translate(english_answer, query_lang, 'en', llm)
        
        # Double-check final answer isn't empty
        if not final_answer or not final_answer.strip():
            print("⚠️ Final translation failed, providing English answer with note")
            final_answer = f"[English response - translation unavailable]: {english_answer}"
            
    else:
        final_answer = english_answer
    
    return {
        'query': user_query,
        'detected_language': lang_name,
        'english_query': english_query,
        'english_answer': english_answer,
        'answer': final_answer,
        'retrieved_docs': len(docs)
    }

# ===============================
# Main Pipeline Class
# ===============================

class MultilingualRAGPipeline:
    """Complete multilingual RAG pipeline for Boswallah courses."""
    
    def __init__(self, csv_path: str, persist_dir: str = "data/chroma_db"):
        self.csv_path = csv_path
        self.persist_dir = persist_dir
        self.vectordb = None
        self.llm = None
        
    def initialize(self):
        """Initialize the pipeline."""
        print("🚀 Initializing Multilingual RAG Pipeline...")
        
        # Setup LLM
        print("🔧 Setting up Gemini LLM...")
        self.llm = setup_gemini_llm()
        
        # Load and process data
        print("📁 Loading course data...")
        df = load_and_process_data(self.csv_path)
        print(f"✅ Loaded {len(df)} courses")
        
        # Create vector database
        print("🔍 Creating vector database...")
        self.vectordb = create_documents_and_vectordb(df, self.persist_dir)
        print("✅ Vector database created and persisted!")
        
        print("🎉 Pipeline initialization complete!")
    
    def query(self, user_query: str):
        """Process a user query."""
        if not self.vectordb or not self.llm:
            raise ValueError("Pipeline not initialized. Call initialize() first.")
        
        return enhanced_rag_query(user_query, self.vectordb, self.llm)

# ===============================
# Usage Example
# ===============================

if __name__ == "__main__":
    # Initialize pipeline
    pipeline = MultilingualRAGPipeline("data/bw_courses - Sheet1.csv")
    pipeline.initialize()
    
    # Test queries in different languages
    test_queries = [
        "Which courses are available in Tamil?",
        "ഒരു ഡെയ്‌റി ഫാം ആരംഭിക്കാൻ എത്ര പശുക്കൾ ആവശ്യമാകും?",  # Malayalam
        
    ]
    
    for i, query in enumerate(test_queries, 1):
        print("=" * 80)
        print(f"TEST {i}: {query}")
        print("-" * 40)
        
        try:
            result = pipeline.query(query)
            print(f"🌐 Detected Language: {result['detected_language']}")
            print(f"📚 Retrieved Documents: {result['retrieved_docs']}")
            print(f"🔍 English Query: {result.get('english_query', 'N/A')}")
            print(f"🔤 English Answer: {result.get('english_answer', 'N/A')[:200]}{'...' if len(result.get('english_answer', '')) > 200 else ''}")
            print(f"🤖 Final Answer: {result['answer']}")
        except Exception as e:
            print(f"❌ Error: {e}")
            import traceback
            traceback.print_exc()
        
        print()
        time.sleep(1)  # Small delay between requests

🚀 Initializing Multilingual RAG Pipeline...
🔧 Setting up Gemini LLM...
📁 Loading course data...
✅ Loaded 100 courses
🔍 Creating vector database...


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


✅ Vector database created and persisted!
🎉 Pipeline initialization complete!
TEST 1: Which courses are available in Tamil?
----------------------------------------
🔍 Processing query: Which courses are available in Tamil?...
🌐 Detected language: English (en)
🔍 Searching for relevant courses...
📄 Retrieved context length: 1906 characters
🤖 Generating response...
✅ Generated English answer: Based on the provided course information, the following courses are available in Tamil:

*   **Course on National Pension Scheme:** This course is des...
🌐 Detected Language: English
📚 Retrieved Documents: 3
🔍 English Query: Which courses are available in Tamil?
🔤 English Answer: Based on the provided course information, the following courses are available in Tamil:

*   **Course on National Pension Scheme:** This course is designed to help you make your golden years truly gol...
🤖 Final Answer: Based on the provided course information, the following courses are available in Tamil:

*   **Course on Na