In [79]:
# ===============================
# 0. Install dependencies (run once)
# ===============================
!pip install pandas langchain chromadb sentence-transformers python-dotenv langchain-google-genai --quiet

# ===============================
# 1. Imports
# ===============================
import pandas as pd
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from textwrap import wrap
import os
from dotenv import load_dotenv

from langchain_google_genai import ChatGoogleGenerativeAI

# ===============================
# 2. Load dataset
# ===============================
csv_path = "data/bw_courses - Sheet1.csv"  # replace with your path
df = pd.read_csv(csv_path)

# Quick overview
print(df.shape)
df.head()

# ===============================
# 3. Preprocessing
# ===============================

# Fill missing values in "Who This Course is For"
df['Who This Course is For'] = df['Who This Course is For'].fillna("Not specified")

# Strip whitespace from all string fields
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

# Map language codes to names
lang_map = {
    6: "Hindi",
    7: "Kannada",
    11: "Malayalam",
    20: "Tamil",
    21: "Telugu",
    24: "English"
}

def map_languages(cell):
    codes = str(cell).split(",")
    return [lang_map.get(int(c.strip()), f"Unknown-{c.strip()}") for c in codes]

df['Released Languages'] = df['Released Languages'].apply(map_languages)

# Show the 2 rows that were missing originally (for notebook demo)
missing_rows_demo = df[df['Who This Course is For'] == "Not specified"]
missing_rows_demo

# ===============================
# 4. Prepare documents with chunking
# ===============================

documents = []
metadata_list = []

MAX_TOKENS = 200  # Approximate chunk size for description

for _, row in df.iterrows():
    # Chunk the description if it is long
    description_chunks = wrap(row['Course Description'], MAX_TOKENS) or [""]  # fallback empty string
    
    for desc_chunk in description_chunks:
        text = f"Course Title: {row['Course Title']}\n" \
               f"Description: {desc_chunk}\n" \
               f"Who This Course is For: {row['Who This Course is For']}\n" \
               f"Languages: {', '.join(row['Released Languages'])}"
        documents.append(text)
        metadata_list.append({
            "course_no": row['Course No'],
            "course_title": row['Course Title'],
            "released_languages": ', '.join(row['Released Languages']),  # join with commas
            "who_for": row['Who This Course is For']
        })


print(f"Total chunks created: {len(documents)}")

# ===============================
# 5. Create embeddings & build ChromaDB
# ===============================
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

vectordb = Chroma.from_texts(
    texts=documents,
    embedding=embedding_model,
    metadatas=metadata_list,
    persist_directory="data/chroma_db"
)

vectordb.persist()
print("✅ Chroma Vector DB created and persisted!")

# ===============================
# 6. Setup Google Gemini LLM
# ===============================
load_dotenv()  # load API key from .env
api_key = os.getenv("GEMINI_API_KEY")
if not api_key:
    raise ValueError("GEMINI_API_KEY not found in .env")

gemini_llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    google_api_key=api_key,
    temperature=0.0,
    max_tokens=512,
    timeout=30,
    max_retries=3
)

# ===============================
# 7. Retriever function
# ===============================
def get_relevant_courses(query, k=3):
    """
    Retrieve top-k relevant course chunks for a query
    """
    results = vectordb.similarity_search(query, k=k)
    return [res.page_content for res in results]

# ===============================
# 8. Query function using Gemini
# ===============================
def generate_answer(context_docs, user_query):
    """
    Use Gemini LLM to answer user queries strictly based on dataset.
    """
    context = "\n\n".join(context_docs)
    prompt = f"""
You are an AI Support Agent for **BossWallah**, specializing in answering questions
about the available courses. Follow these rules strictly:

1. Only answer using the provided dataset context. 
2. If the answer is not present in the dataset, say politely:
   "Sorry, I could not find a relevant course in the BossWallah catalog."
3. Always include the **Course Title** and key details if available.
4. If multiple courses are relevant, list them clearly in bullet points.
5. If the user asks in general terms (e.g., poultry farming, financial freedom),
   map it to the most relevant courses from the dataset.
6. Be clear, concise, and helpful. Do not make up content beyond the dataset.

---
📘 Dataset Context:
{context}

---
💡 User Question:
{user_query}

Now provide the best possible helpful answer:
    """

    response = gemini_llm.invoke(prompt)
    return response.content.strip()


# ===============================
# 9. Demo: Sample queries
# ===============================
sample_queries = [
    "Tell me about honey bee farming course",
    "I want to learn how to start a poultry farm",
    "Do you have any courses in Tamil?",
    "I am a recent high school graduate, are there any opportunities for me?"
]

for q in sample_queries:
    relevant_docs = get_relevant_courses(q)
    answer = generate_answer(relevant_docs, q)
    print(f"\n--- Query: {q} ---")
    print("Answer:")
    print(answer)
    print("\nDocuments considered:")
    for doc in relevant_docs:
        print(doc)
        print("------")



[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: C:\Users\nikhi\AppData\Local\Programs\Python\Python312\python.exe -m pip install --upgrade pip


(100, 5)


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


Total chunks created: 100
✅ Chroma Vector DB created and persisted!

--- Query: Tell me about honey bee farming course ---
Answer:
We have a **Course on Honey Bee Farming** available.

This course is designed to help you transform your passion for bees into a lucrative career.

It is suitable for:
*   Beginners looking to start a career in beekeeping
*   Experienced beekeepers looking to expand their knowledge and skills
*   Entrepreneurs interested in starting their own beekeeping business
*   Farmers and landowners looking to diversify their income
*   Anyone with a passion for bees and a desire to learn about the industry

The course is available in Hindi, Kannada, Malayalam, Tamil, Telugu, and English.

Documents considered:
Course Title: Course on Honey Bee Farming
Description: Transform Your Passion for Bees into a Lucrative Career: Join Our Honey Bee Farming Course Now!
Who This Course is For: Beginners looking to start a career in beekeeping ||| Experienced beekeepers looking t

In [67]:
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0  # make results deterministic

# Map langdetect codes to your supported language names
lang_code_map = {
    "hi": "Hindi",
    "kn": "Kannada",
    "ta": "Tamil",
    "te": "Telugu",
    "ml": "Malayalam",
    "en": "English"
}

def detect_language(text: str) -> str:
    """
    Detects the language of the input text and maps it
    to one of the supported languages. Defaults to English if unsupported.
    """
    try:
        detected_code = detect(text)
        return lang_code_map.get(detected_code, "English")
    except Exception:
        return "English"

In [68]:
user_query = "ഒരു ഡെയ്‌റി ഫാം ആരംഭിക്കാൻ എത്ര പശുക്കൾ ആവശ്യമാകും?"

detect_language(user_query)

'Malayalam'

In [74]:
# ===============================
# 6. Setup Google Gemini LLM
# ===============================
load_dotenv()  # load API key from .env
api_key = os.getenv("GEMINI_API_KEY")
if not api_key:
    raise ValueError("GEMINI_API_KEY not found in .env")

gemini_llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    google_api_key=api_key,
    temperature=0.0,
    max_tokens=512,
    timeout=30,
    max_retries=3
)

In [76]:
from langdetect import detect
from langchain_google_genai import ChatGoogleGenerativeAI


def translator_agent(text: str, target_lang: str) -> str:
    """
    Translate English text into target language using Gemini.
    """
    translation_prompt = f"""
    You are a translation agent. Your task is to translate the following English text
    into **{target_lang}**. Keep the meaning accurate and natural for a native speaker. 

    Text to translate:
    {text}
    """
    return gemini_llm.invoke(translation_prompt).content.strip()


def translate_to_english(text: str) -> str:
    """
    Translate text from any supported language into English.
    """
    translation_prompt = f"""
    You are a translation agent. Translate the following text into **English** only. 
    Keep the meaning intact, do not summarize, just translate.

    Text:
    {text}
    """
    return gemini_llm.invoke(translation_prompt).content.strip()



In [83]:
def generate_answer(user_query: str) -> str:
    """
    End-to-end pipeline with debug logs:
    1. Detect query language
    2. Translate query → English (if needed)
    3. RAG Agent finds the answer in English
    4. Translator Agent translates back to user’s language
    """

    print("\n================ DEBUG START ================")
    print(f"User Query: {user_query}")

    # Step 1: Detect query language
    user_lang = detect_language(user_query)
    print(f"[DEBUG] Detected Language: {user_lang}")

    # Step 2: Translate query to English if needed
    query_in_english = (
        translate_to_english(user_query) if user_lang != "English" else user_query
    )
    print(f"[DEBUG] Query in English: {query_in_english}")

    # Step 3: Retrieve relevant docs
    relevant_docs = get_relevant_courses(query_in_english)
    print(f"[DEBUG] Retrieved {len(relevant_docs)} relevant docs")
    if relevant_docs:
        print("[DEBUG] Sample Relevant Doc:\n", relevant_docs[0][:300], "...\n")

    # Step 4: Build RAG prompt
    context = "\n\n".join(relevant_docs)
    rag_prompt = f"""
    Answer the user's question based on the following course information:
    {context}

    User Question: {query_in_english}
    Answer in English:
    """
    print(f"[DEBUG] RAG Prompt:\n{rag_prompt[:500]}...\n")

    # Step 5: RAG Answer
    rag_response = gemini_llm.invoke(rag_prompt).content.strip()
    print(f"[DEBUG] Raw RAG Response: {rag_response}")

    # Step 6: Translate back if needed
    if user_lang != "English":
        rag_response = translator_agent(rag_response, user_lang)
        print(f"[DEBUG] Translated Response: {rag_response}")

    print("================ DEBUG END ================\n")
    return rag_response


In [84]:
queries = [
    "ഒരു ഡെയ്‌റി ഫാം ആരംഭിക്കാൻ എത്ര പശുക്കൾ ആവശ്യമാകും?"        # English
]

for q in queries:
    print(f"\nUser Query: {q}")
    print("Detected Language:", detect_language(q))
    print("Final Answer:", generate_answer(q))



User Query: ഒരു ഡെയ്‌റി ഫാം ആരംഭിക്കാൻ എത്ര പശുക്കൾ ആവശ്യമാകും?
Detected Language: Malayalam

User Query: ഒരു ഡെയ്‌റി ഫാം ആരംഭിക്കാൻ എത്ര പശുക്കൾ ആവശ്യമാകും?
[DEBUG] Detected Language: Malayalam
[DEBUG] Query in English: How many cows will be needed to start a dairy farm?
[DEBUG] Retrieved 3 relevant docs
[DEBUG] Sample Relevant Doc:
 Course Title: Dairy Farming Course
Description: Ready to milk your potential? Learn the art of dairy farming and earn substantial profit from just 10 cows.
Who This Course is For: Farmers looking for new income streams ||| Aspiring dairy farm business owners ||| Individuals or mention a specific cat ...

[DEBUG] RAG Prompt:

    Answer the user's question based on the following course information:
    Course Title: Dairy Farming Course
Description: Ready to milk your potential? Learn the art of dairy farming and earn substantial profit from just 10 cows.
Who This Course is For: Farmers looking for new income streams ||| Aspiring dairy farm business owner

In [None]:
# ============================================
# 🔹 Imports
# ============================================
import os
from langdetect import detect
from langchain_google_genai import ChatGoogleGenerativeAI
from dotenv import load_dotenv


# ============================================
# 🔹 Language Detection
# ============================================
def detect_language(text: str) -> str:
    """
    Detect the language of the given text using langdetect.
    """
    try:
        lang_code = detect(text)
    except Exception as e:
        print(f"[ERROR] Language detection failed: {e}")
        return "English"

    lang_map = {
        "te": "Telugu",
        "hi": "Hindi",
        "kn": "Kannada",
        "ta": "Tamil",
        "ml": "Malayalam",
        "en": "English",
    }
    return lang_map.get(lang_code, "English")

# ============================================
# 🔹 Translator (Bi-directional)
# ============================================
def translate_text(text: str, target_lang: str) -> str:
    """
    Translate text into the target language using Gemini.
    If target_lang = 'English', ensures output is English.
    """
    prompt = f"""
    You are a strict translation agent.
    Translate the following text into {target_lang}.
    Do not explain, do not summarize, do not add anything extra.
    Return only the translated text.

    Text:
    {text}
    """
    try:
        response = gemini_llm.invoke(prompt)
        if hasattr(response, "content") and response.content:
            return response.content.strip()
        else:
            return str(response).strip()
    except Exception as e:
        print(f"[ERROR] Translation failed: {e}")
        return text  # fallback

# ============================================
# 🔹 Dummy Retriever (replace with vectorstore retrieval)
# ============================================
def get_relevant_courses(query: str):
    """
    Mock retriever: in real app, fetch relevant docs from vectorstore (ChromaDB).
    """
    return [
        "Course Title: Dairy Farming Course\n"
        "Description: Ready to milk your potential? Learn the art of dairy farming and earn substantial profit from just 10 cows.\n"
        "Who This Course is For: Farmers looking for new income streams ||| Aspiring dairy farm business owners ||| Entrepreneurs seeking to invest in the industry"
    ]

# ============================================
# 🔹 Main Pipeline
# ============================================
def generate_answer(user_query: str) -> str:
    """
    End-to-end pipeline with debug logs:
    1. Detect query language
    2. Translate query → English (if needed)
    3. RAG Agent finds the answer in English
    4. Translate answer back to user’s language
    """

    print("\n================ DEBUG START ================")
    print(f"User Query: {user_query}")

    # Step 1: Detect query language
    user_lang = detect_language(user_query)
    print(f"[DEBUG] Detected Language: {user_lang}")

    # Step 2: Translate query to English if needed
    query_in_english = (
        translate_text(user_query, "English") if user_lang != "English" else user_query
    )
    print(f"[DEBUG] Query in English: {query_in_english}")

    # Step 3: Retrieve relevant docs
    relevant_docs = get_relevant_courses(query_in_english)
    print(f"[DEBUG] Retrieved {len(relevant_docs)} relevant docs")
    if relevant_docs:
        print("[DEBUG] Sample Relevant Doc:\n", relevant_docs[0][:300], "...\n")

    # Step 4: Build RAG prompt
    context = "\n\n".join(relevant_docs)
    rag_prompt = f"""
    Answer the user's question based on the following course information:
    {context}

    User Question: {query_in_english}
    Answer in English:
    """
    print(f"[DEBUG] RAG Prompt:\n{rag_prompt[:500]}...\n")

    # Step 5: Get RAG Answer
    try:
        rag_response = gemini_llm.invoke(rag_prompt).content.strip()
    except Exception as e:
        print(f"[ERROR] RAG generation failed: {e}")
        rag_response = "Sorry, I could not generate an answer."

    print(f"[DEBUG] Raw RAG Response: {rag_response}")

    # Step 6: Translate back to original language if needed
    if user_lang != "English":
        rag_response_translated = translate_text(rag_response, user_lang)
        print(f"[DEBUG] Translated Response: {rag_response_translated}")
        rag_response = rag_response_translated

    print("================ DEBUG END ================\n")
    return rag_response

# ============================================
# 🔹 Test
# ============================================
queries = [
    "ഒരു ഡെയ്‌റി ഫാം ആരംഭിക്കാൻ എത്ര പശുക്കൾ ആവശ്യമാകും?",  # Malayalam
    "पोल्ट्री फार्म कैसे शुरू करें?",                       # Hindi
    "Can you tell me about the honey bee farming course?"   # English
]

for q in queries:
    print(f"\nUser Query: {q}")
    print("Final Answer:", generate_answer(q))


DefaultCredentialsError: Your default credentials were not found. To set up Application Default Credentials, see https://cloud.google.com/docs/authentication/external/set-up-adc for more information.