# GUVI Multilingual GPT Chatbot

# GPT-2

In [None]:
pip install transformers datasets



In [None]:
import pandas as pd
import json

# Load Excel file
df = pd.read_excel("/content/guvi_qa_table.xlsx")

# Clean and convert to chat-style JSONL format
with open("guvi_chat_gpt2.jsonl", "w", encoding="utf-8") as f:
    for _, row in df.iterrows():
        question = str(row['Question']).strip()
        answer = str(row['Answer']).strip()
        chat_text = f"<|user|> {question} <|assistant|> {answer}"
        json.dump({"text": chat_text}, f)
        f.write("\n")


In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [None]:
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
import torch

# Load dataset
dataset = load_dataset("json", data_files="guvi_chat_gpt2.jsonl", split="train")

# Load tokenizer and model
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Needed for padding
model = GPT2LMHeadModel.from_pretrained(model_name)

# Resize embedding if tokenizer updated
model.resize_token_embeddings(len(tokenizer))

# Tokenization
def tokenize(example):
    inputs = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=512
    )
    inputs["labels"] = inputs["input_ids"].copy()
    return inputs

tokenized_dataset = dataset.map(tokenize, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["text"])
tokenized_dataset.set_format("torch")

# Training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-guvi-finetuned",
    per_device_train_batch_size=2,
    num_train_epochs=3,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    fp16=True,
    report_to="none"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

# Train
trainer.train()

# Save model and tokenizer after training
trainer.save_model("./gpt2-guvi-finetuned")             # saves model weights
tokenizer.save_pretrained("./gpt2-guvi-finetuned")      # saves tokenizer config



Generating train split: 0 examples [00:00, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Map:   0%|          | 0/561 [00:00<?, ? examples/s]

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
10,9.6438
20,0.749
30,0.2
40,0.2001
50,0.1654
60,0.1694
70,0.1616
80,0.1666
90,0.1572
100,0.1355


('./gpt2-guvi-finetuned/tokenizer_config.json',
 './gpt2-guvi-finetuned/special_tokens_map.json',
 './gpt2-guvi-finetuned/vocab.json',
 './gpt2-guvi-finetuned/merges.txt',
 './gpt2-guvi-finetuned/added_tokens.json')

Saving to Drive

In [None]:
!zip -r gpt2-guvi-finetuned.zip gpt2-guvi-finetuned


  adding: gpt2-guvi-finetuned/ (stored 0%)
  adding: gpt2-guvi-finetuned/generation_config.json (deflated 24%)
  adding: gpt2-guvi-finetuned/model.safetensors (deflated 7%)
  adding: gpt2-guvi-finetuned/checkpoint-843/ (stored 0%)
  adding: gpt2-guvi-finetuned/checkpoint-843/generation_config.json (deflated 24%)
  adding: gpt2-guvi-finetuned/checkpoint-843/model.safetensors (deflated 7%)
  adding: gpt2-guvi-finetuned/checkpoint-843/scaler.pt (deflated 60%)
  adding: gpt2-guvi-finetuned/checkpoint-843/trainer_state.json (deflated 79%)
  adding: gpt2-guvi-finetuned/checkpoint-843/optimizer.pt (deflated 8%)
  adding: gpt2-guvi-finetuned/checkpoint-843/config.json (deflated 51%)
  adding: gpt2-guvi-finetuned/checkpoint-843/scheduler.pt (deflated 55%)
  adding: gpt2-guvi-finetuned/checkpoint-843/rng_state.pth (deflated 25%)
  adding: gpt2-guvi-finetuned/checkpoint-843/training_args.bin (deflated 52%)
  adding: gpt2-guvi-finetuned/checkpoint-562/ (stored 0%)
  adding: gpt2-guvi-finetuned/che

In [None]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!cp /content/gpt2-guvi-finetuned.zip /content/drive/MyDrive/


# Model predict

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("./gpt2-guvi-finetuned")

prompt = "<|user|> what courses offered? <|assistant|>"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=100, pad_token_id=tokenizer.eos_token_id)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))


<|user|> what courses offered? <|assistant|> GUVI offers courses in Full Stack Development, Data Science, Automation Testing, UI/UX, DevOps, VFX, CAD, and more.


In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("./gpt2-guvi-finetuned")

prompt = "<|user|>  placement support? <|assistant|>"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=100, pad_token_id=tokenizer.eos_token_id)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))


<|user|>  placement support? <|assistant|> Yes, GUVI provides job support, but placement depends on student effort, test clearance, and preparation.


In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("./gpt2-guvi-finetuned")

prompt = "<|user|>   are guvi mentor supportive? <|assistant|>"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=100, pad_token_id=tokenizer.eos_token_id)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))


<|user|>   are guvi mentor supportive? <|assistant|> Yes, mentors are available to assist in project preparation, mock interviews, and project reviews.


In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("./gpt2-guvi-finetuned")

prompt = "<|user|>   guvi certificate? <|assistant|>"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=100, pad_token_id=tokenizer.eos_token_id)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))


<|user|>   guvi certificate? <|assistant|> Yes, GUVI certificates are valid for life.


In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("./gpt2-guvi-finetuned")

prompt = "<|user|>   guvi mentors? <|assistant|>"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=100, pad_token_id=tokenizer.eos_token_id)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))


<|user|>   guvi mentors? <|assistant|>  Yes, GUVI mentors are trained in Python, Java, and SQL.


# Translator

In [2]:
!pip install transformers sentencepiece sacremoses torch

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collec

In [3]:
!pip install streamlit pyngrok transformers langdetect


Collecting streamlit
  Downloading streamlit-1.47.1-py3-none-any.whl.metadata (9.0 kB)
Collecting pyngrok
  Downloading pyngrok-7.3.0-py3-none-any.whl.metadata (8.1 kB)
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.47.1-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyngrok-7.3.0-py3-none-any.whl (

In [4]:
import zipfile
import os

# Path to the ZIP file inside Google Drive
zip_path = '/content/drive/MyDrive/gpt2-guvi-finetuned.zip'

# Destination path outside the drive (in local Colab filesystem)
extract_to = '/content/gpt2-guvi-finetuned'

# Create the directory if it doesn't exist
os.makedirs(extract_to, exist_ok=True)

# Extract
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to)

print("Unzipped to:", extract_to)


Unzipped to: /content/gpt2-guvi-finetuned


In [5]:
import os
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline
from langdetect import detect
import gradio as gr
import torch

# Set Hugging Face token for accessing gated models
os.environ["HF_TOKEN"] = "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"

# Load fine-tuned GPT-2 model and tokenizer
model_path = "/content/gpt2-guvi-finetuned/gpt2-guvi-finetuned"
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained(model_path)
model.eval()

# Translator to English (multi-language → English)
translator_to_en = pipeline(
    "translation",
    model="Helsinki-NLP/opus-mt-mul-en",
    token=os.getenv("HF_TOKEN")
)

# Mapping English → Indian language translators
def get_translator_from_en(lang_code):
    model_map = {
        "hi": "Helsinki-NLP/opus-mt-en-hi",     # Hindi
        "ta": "Helsinki-NLP/opus-mt-en-ta",    # Tamil
        "te": "Helsinki-NLP/opus-mt-en-tel",    # Telugu
        "kn": "Helsinki-NLP/opus-mt-en-kan",    # Kannada
        "ml": "Helsinki-NLP/opus-mt-en-mal",    # Malayalam
        "bn": "Helsinki-NLP/opus-mt-en-ben",    # Bengali
        "mr": "Helsinki-NLP/opus-mt-en-mar",    # Marathi
        "gu": "Helsinki-NLP/opus-mt-en-guj",    # Gujarati
        "pa": "Helsinki-NLP/opus-mt-en-pan",    # Punjabi
        "ur": "Helsinki-NLP/opus-mt-en-ur",     # Urdu
        "or": "Helsinki-NLP/opus-mt-en-ory",     # Odia
        "de": "Helsinki-NLP/opus-mt-en-de",     # German
        "ru": "Helsinki-NLP/opus-mt-en-ru"      # Russian
    }

    model_name = model_map.get(lang_code)
    return pipeline("translation", model=model_name, token=os.getenv("HF_TOKEN")) if model_name else None

# Generate GPT-2 response
def generate_response(prompt):
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    output_ids = model.generate(
        input_ids,
        max_new_tokens=100,
        pad_token_id=tokenizer.eos_token_id
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True).replace(prompt, "").strip()

# Multilingual chat logic
def multilingual_chat(user_input):
    original_lang = detect(user_input)
    user_input_en = translator_to_en(user_input)[0]["translation_text"] if original_lang != "en" else user_input
    prompt = f"<|user|> {user_input_en} <|assistant|>"
    response_en = generate_response(prompt)

    if original_lang != "en":
        translator_back = get_translator_from_en(original_lang)
        if translator_back:
            return translator_back(response_en)[0]["translation_text"]
    return response_en


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/310M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/310M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/707k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/791k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


In [None]:
test_inputs = [
    "मुझे प्रमाणपत्र कैसे मिलेगा?",             # Hindi
    "Tell me about GUVI Zen classes."            # English
]

for user_input in test_inputs:
    print("🗣 Input:", user_input)
    print("🔍 Detected language:", detect(user_input))
    print("🤖 Chatbot response:", multilingual_chat(user_input))
    print("-" * 80)


🗣 Input: मुझे प्रमाणपत्र कैसे मिलेगा?
🔍 Detected language: hi


Device set to use cuda:0


🤖 Chatbot response: आपके प्रमाणपत्र को प्राप्त करने के लिए, आपको सभी प्रश्नोत्तरीओं को पूरा करना होगा और (यदि लागू हो तो) 100% प्रगति के साथ जमा करना होगा.
--------------------------------------------------------------------------------
🗣 Input: Tell me about GUVI Zen classes.
🔍 Detected language: en
🤖 Chatbot response: GUVI offers Zen Class programs in Tamil, Telugu, Hindi, Kannada, Marathi, Bengali, and English.
--------------------------------------------------------------------------------


In [None]:
test_inputs = [
    "GUVI Zen  کلاسز میں کیا پڑھایا جاتا ہے؟"  # urudu
]

for user_input in test_inputs:
    print("🗣 Input:", user_input)
    print("🔍 Detected language:", detect(user_input))
    print("🤖 Chatbot response:", multilingual_chat(user_input))
    print("-" * 80)


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


🗣 Input: GUVI Zen  کلاسز میں کیا پڑھایا جاتا ہے؟
🔍 Detected language: ur


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/306M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/816k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/848k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


🤖 Chatbot response: انٹرنیٹ پر ایک باقاعدہ تحقیقی پروگرام ہے جس میں ماہرِنفسیات سائنس ، سائنسی تحقیق ، یو . ایس .
--------------------------------------------------------------------------------


In [7]:
test_inputs = [
    "Как получить сертификат?",             # Russian
    "Können Sie mir etwas über die GUVI Zen-Kurse erzählen?"            # German
]

for user_input in test_inputs:
    print("🗣 Input:", user_input)
    print("🔍 Detected language:", detect(user_input))
    print("🤖 Chatbot response:", multilingual_chat(user_input))
    print("-" * 80)

🗣 Input: Как получить сертификат?
🔍 Detected language: ru


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/307M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/307M [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


🤖 Chatbot response: Для получения сертификата вы должны заполнить все викторины и представления (если это применимо) с 100-процентным продвижением.
--------------------------------------------------------------------------------
🗣 Input: Können Sie mir etwas über die GUVI Zen-Kurse erzählen?
🔍 Detected language: de


Device set to use cuda:0


🤖 Chatbot response: Ja, Zen ist ein Kurs in KI, Data Science, UI/UX, DevOps, VFX, CAD und mehr.
--------------------------------------------------------------------------------


# Streamlit

In [8]:
!pip install streamlit



In [9]:
!pip install pyngrok



In [18]:
%%writefile app.py
import os
import torch
import streamlit as st
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline
from langdetect import detect

# --- Hugging Face Token ---
os.environ["HF_TOKEN"] = "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"

# --- Streamlit Page Config ---
st.set_page_config(page_title="GUVI Multilingual Chatbot", layout="wide")

# --- Sidebar ---
st.sidebar.markdown("🧭 Sidebar Navigation")
page = st.sidebar.radio("Go to", ["Home", "Chatbot", "Report Page"])

# --- Load GPT-2 Fine-Tuned Model ---
@st.cache_resource
def load_model():
    model_path = "/content/gpt2-guvi-finetuned/gpt2-guvi-finetuned"
    tokenizer = GPT2Tokenizer.from_pretrained(model_path)
    model = GPT2LMHeadModel.from_pretrained(model_path)
    model.eval()
    return tokenizer, model

tokenizer, model = load_model()

# --- Translation Pipeline: any → English ---
@st.cache_resource
def load_translator_to_en():
    return pipeline(
        "translation",
        model="Helsinki-NLP/opus-mt-mul-en",
        token=os.getenv("HF_TOKEN")
    )

translator_to_en = load_translator_to_en()

# --- English → Other languages ---
def get_translator_from_en(lang_code):
    model_map = {
        "hi": "Helsinki-NLP/opus-mt-en-hi",
        "ta": "Helsinki-NLP/opus-mt-en-tam",
        "te": "Helsinki-NLP/opus-mt-en-tel",
        "kn": "Helsinki-NLP/opus-mt-en-kan",
        "ml": "Helsinki-NLP/opus-mt-en-mal",
        "bn": "Helsinki-NLP/opus-mt-en-ben",
        "mr": "Helsinki-NLP/opus-mt-en-mar",
        "gu": "Helsinki-NLP/opus-mt-en-guj",
        "pa": "Helsinki-NLP/opus-mt-en-pan",
        "ur": "Helsinki-NLP/opus-mt-en-ur",
        "or": "Helsinki-NLP/opus-mt-en-ory",
        "de": "Helsinki-NLP/opus-mt-en-de",
        "ru": "Helsinki-NLP/opus-mt-en-ru"
    }
    model_name = model_map.get(lang_code)
    return pipeline("translation", model=model_name, token=os.getenv("HF_TOKEN")) if model_name else None

# --- Generate GPT Response ---
def generate_response(prompt):
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    output_ids = model.generate(input_ids, max_new_tokens=100, pad_token_id=tokenizer.eos_token_id)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True).replace(prompt, "").strip()

# --- Chat Pipeline ---
def multilingual_chat(user_input):
    original_lang = detect(user_input)
    user_input_en = translator_to_en(user_input)[0]["translation_text"] if original_lang != "en" else user_input
    prompt = f"<|user|> {user_input_en} <|assistant|>"
    response_en = generate_response(prompt)

    if original_lang != "en":
        translator_back = get_translator_from_en(original_lang)
        if translator_back:
            return translator_back(response_en)[0]["translation_text"]
    return response_en

# --- Home Page ---
if page == "Home":
    st.header("🏠 GUVI Multilingual GPT Chatbot using Streamlit – Integrated Translation and Domain-Specific Model Deployment")
    st.markdown("""
    This is an integrated chatbot built using:
    - GPT-2 Fine-Tuned Model
    - Helsinki-NLP Translation Pipelines
    - Deployed in Hugging Face

    **Navigation:**
    Use the sidebar to access the Chatbot or view the Report Page.
    """)

    st.markdown("### 🔗 Links")
    st.markdown("""
    - [💻 GitHub Repository](https://github.com/mmakshaya/GUVI-Multilingual-GPT-Chatbot-.git)
    - [🚀 Live Demo on Hugging Face Spaces](https://huggingface.co/spaces/mmar92/Guvi_chatbot)
    """)

# --- Chatbot Page ---
elif page == "Chatbot":
    st.title("🗣️ Multilingual Chatbot for GUVI Learners")
    st.markdown("Ask questions in English, Hindi, Urdu, Russian, or German.")

    if "history" not in st.session_state:
        st.session_state.history = []

    # --- Display Chat History ---
    for i, (user, bot) in enumerate(st.session_state.history):
        with st.chat_message("user", avatar="👤"):
            st.markdown(user)
        with st.chat_message("assistant", avatar="🤖"):
            st.markdown(bot)

    # --- Chat Input ---
    user_input = st.chat_input("Type your message here...")
    if user_input:
        response = multilingual_chat(user_input)
        st.session_state.history.append((user_input, response))
        st.rerun()

    # --- Divider ---
    st.markdown("---")
    st.markdown("### 🤖 Ask a Frequently Asked Question")

    # --- FAQ Lists ---
    faqs_english = [
        "What is GUVI?",
        "What is the main goal of GUVI?",
        "Guvi placement details?",
        "What courses are offered in GUVI?",
        "Guvi recruiters?"
    ]

    faqs_hindi = [
        "GUVI क्या है?",
        "GUVI का मुख्य उद्देश्य क्या है?",
        "GUVI किन क्षेत्रों में कोर्स प्रदान करता है?",
        "GUVI में किस प्रकार की शिक्षण विधियाँ हैं?",
        "GUVI के मुफ़्त कोर्स कौन से हैं?"
    ]

    faqs_urdu = [
        "GUVI کیا ہے؟",
        "GUVI کا بنیادی مقصد کیا ہے؟",
        "GUVI کی ملازمت کی تفصیلات؟",
        "GUVI میں سیکھنے کے کون سے طریقے ہیں؟",
        "GUVI کے کون سے ریکروٹرز ہیں؟"
    ]

    faqs_russian = [
        "Что такое GUVI?",
        "Какова основная цель GUVI?",
        "Детали трудоустройства от GUVI?",
        "Какие формы обучения предлагает GUVI?",
        "Какие компании нанимают через GUVI?"
    ]

    faqs_german = [
        "Was ist GUVI?",
        "Was ist das Hauptziel von GUVI?",
        "Informationen zur Vermittlung bei GUVI?",
        "Welche Lernmethoden bietet GUVI an?",
        "Welche Unternehmen rekrutieren über GUVI?"
    ]

    # --- FAQ Dropdowns ---
    selected_en = st.selectbox("📘 Select an English FAQ", ["-- Select --"] + faqs_english, key="faq_en")
    if selected_en and selected_en != "-- Select --":
        response = multilingual_chat(selected_en)
        st.success(f"**Answer:** {response}")

    selected_hi = st.selectbox("📙 कोई हिन्दी प्रश्न चुनें", ["-- चुनें --"] + faqs_hindi, key="faq_hi")
    if selected_hi and selected_hi != "-- चुनें --":
        response = multilingual_chat(selected_hi)
        st.success(f"**उत्तर:** {response}")

    selected_ur = st.selectbox("📗 ایک سوال منتخب کریں (اردو)", ["-- منتخب کریں --"] + faqs_urdu, key="faq_ur")
    if selected_ur and selected_ur != "-- منتخب کریں --":
        response = multilingual_chat(selected_ur)
        st.success(f"**جواب:** {response}")

    selected_ru = st.selectbox("📒 Выберите вопрос (Russian)", ["-- Выбрать --"] + faqs_russian, key="faq_ru")
    if selected_ru and selected_ru != "-- Выбрать --":
        response = multilingual_chat(selected_ru)
        st.success(f"**Ответ:** {response}")

    selected_de = st.selectbox("📕 Wählen Sie eine Frage (German)", ["-- Wählen --"] + faqs_german, key="faq_de")
    if selected_de and selected_de != "-- Wählen --":
        response = multilingual_chat(selected_de)
        st.success(f"**Antwort:** {response}")

# --- Report Page ---
elif page == "Report Page":
    st.title("📊 Report Page")
        # New Section - GUVI Multilingual Chatbot Project Report
    st.markdown("## 🧾 GUVI Multilingual GPT Chatbot Report")

    st.markdown("""
### 🧠 Objective
To build a multilingual AI-powered chatbot using a fine-tuned GPT-2 model that supports real-time interactions in multiple languages, helping GUVI learners access information in their native language.

### 🌐 Domain
- Artificial Intelligence (AI)
- Natural Language Processing (NLP)
- Web Development (Streamlit)

### ❓ Problem Statement
Build a chatbot that:
- Accepts user input in Indian/international languages
- Translates to English
- Generates GUVI-specific responses
- Translates back to the original language
- Presents results via a Streamlit interface

### 💼 Business Use Cases
1. Customer Support Automation
2. E-Learning Accessibility
3. Career Guidance & Mentorship
4. Course Recommendation System

### ✨ Key Features
- Language detection (`langdetect`)
- Translation (↔ English) via Hugging Face models
- GPT-2 based response generation
- Multilingual FAQ support
- Persistent chat history using `st.session_state`

### 🛠️ Tools & Technologies
- Python 3.11
- Hugging Face Transformers
- Streamlit
- Langdetect
- GitHub, VS Code / Jupyter

### 🏗️ Architecture
1. User Input Layer
2. Language Detection
3. Translation Layer
4. GPT-2 Response Generation
5. Streamlit UI Rendering

### 🔁 Fine-Tuning Process
1. Data Collection
2. Preprocessing
3. Tokenization
4. Fine-Tuning GPT-2
5. Evaluation

### ✅ Results
- Multilingual chatbot with real-time support in 10+ languages
- Dynamic FAQs auto-translated
- Domain-specific contextual answers
- Extensible design for future languages or enterprises

### 📊 Evaluation Metrics
- Code Modularity
- Multilingual Capability
- Response Clarity
- UI Responsiveness
- GitHub Documentation Quality

### 🧾 Dataset Info
- GUVI-specific textual content
- Hugging Face translation datasets
- ~13,500 tokens

### 📦 Deliverables
- `app.py`
- `requirements.txt`
- `README.md`
- GitHub Repository
- (Optional) Demo Video

### 📈 Suggested Improvements
- Add speech-to-text / text-to-speech
- Add analytics dashboard
- Add user feedback system
- Optimize for mobile
- Support additional languages

### 🚀 Deployment
- Hugging Face Spaces
- Secure token handling
- Cached model loading for faster inference

### 🏁 Conclusion
This project demonstrates how deep learning, NLP, and translation can be integrated into a multilingual chatbot tailored for GUVI. It improves learner accessibility, engagement, and personalization.
""")



Overwriting app.py


In [14]:
!npm install localtunnel

[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K
up to date, audited 23 packages in 1s
[1G[0K⠧[1G[0K
[1G[0K⠧[1G[0K3 packages are looking for funding
[1G[0K⠧[1G[0K  run `npm fund` for details
[1G[0K⠧[1G[0K
2 [31m[1mhigh[22m[39m severity vulnerabilities

To address all issues (including breaking changes), run:
  npm audit fix --force

Run `npm audit` for details.
[1G[0K⠇[1G[0K

In [17]:
!streamlit run /content/app.py &>/content/logs.txt & npx localtunnel --port 8501 & curl ipv4.icanhazip.com

34.16.250.178
[1G[0K⠙[1G[0Kyour url is: https://whole-tables-joke.loca.lt


# Translator IndicTrans2

In [None]:
pip install transformers torch gradio langdetect




In [None]:
!huggingface-cli login



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `hf auth whoami` to get more information or `hf auth logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: fineGrained).
The tok

In [None]:

import os
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer, GPT2LMHeadModel, GPT2Tokenizer
from langdetect import detect
import torch
import gradio as gr

# Set your Hugging Face token (required for IndicTrans2)
os.environ["HF_TOKEN"] = "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"

# Load fine-tuned GPT-2 model
model_path = "./gpt2-guvi-finetuned"
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_model = GPT2LMHeadModel.from_pretrained(model_path)
gpt2_model.eval()

# Load IndicTrans2 models
print("Loading IndicTrans2 models...")
indic_to_en_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indictrans2-multilingual-en", token=os.environ["HF_TOKEN"])
indic_to_en_model = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/indictrans2-multilingual-en", token=os.environ["HF_TOKEN"])

en_to_indic_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indictrans2-en-multilingual", token=os.environ["HF_TOKEN"])
en_to_indic_model = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/indictrans2-en-multilingual", token=os.environ["HF_TOKEN"])

# Supported language codes
LANG_CODE_MAP = {
    "hi": "hi",  # Hindi
    "ta": "ta",  # Tamil
    "te": "te",  # Telugu
    "kn": "kn",  # Kannada
    "ml": "ml",  # Malayalam
    "bn": "bn",  # Bengali
    "mr": "mr",  # Marathi
    "gu": "gu",  # Gujarati
    "pa": "pa",  # Punjabi
    "ur": "ur",  # Urdu
    "or": "or"   # Odia
}

# Translation pipelines using IndicTrans2
def translate_to_en(text, src_lang):
    inputs = indic_to_en_tokenizer(text, return_tensors="pt", padding=True)
    inputs['lang_code'] = indic_to_en_tokenizer.convert_ids_to_tokens(indic_to_en_tokenizer.encode(src_lang))[0]
    with torch.no_grad():
        output = indic_to_en_model.generate(**inputs, max_length=256)
    return indic_to_en_tokenizer.decode(output[0], skip_special_tokens=True)

def translate_from_en(text, tgt_lang):
    inputs = en_to_indic_tokenizer(text, return_tensors="pt", padding=True)
    inputs['lang_code'] = en_to_indic_tokenizer.convert_ids_to_tokens(en_to_indic_tokenizer.encode(tgt_lang))[0]
    with torch.no_grad():
        output = en_to_indic_model.generate(**inputs, max_length=256)
    return en_to_indic_tokenizer.decode(output[0], skip_special_tokens=True)

# Generate GPT-2 response
def generate_response(prompt):
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(gpt2_model.device)
    output_ids = gpt2_model.generate(
        input_ids,
        max_new_tokens=100,
        pad_token_id=tokenizer.eos_token_id
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True).replace(prompt, "").strip()

# Multilingual Chat Logic
def multilingual_chat(user_input):
    original_lang = detect(user_input)
    if original_lang in LANG_CODE_MAP and original_lang != "en":
        src_lang = LANG_CODE_MAP[original_lang]
        try:
            user_input_en = translate_to_en(user_input, src_lang)
        except:
            return "❌ Error translating to English."
    else:
        user_input_en = user_input

    prompt = f"<|user|> {user_input_en} <|assistant|>"
    response_en = generate_response(prompt)

    if original_lang in LANG_CODE_MAP and original_lang != "en":
        tgt_lang = LANG_CODE_MAP[original_lang]
        try:
            translated_response = translate_from_en(response_en, tgt_lang)
            return translated_response
        except:
            return "❌ Error translating back to your language."

    return response_en

# Gradio UI
demo = gr.Interface(fn=multilingual_chat, inputs="text", outputs="text", title="🇮🇳 Multilingual Chatbot (GPT2 + IndicTrans2)",
                    description="Chat with GPT2 in Tamil, Telugu, Hindi, Malayalam, and more using IndicTrans2 translation!")

demo.launch()


Loading IndicTrans2 models...


OSError: ai4bharat/indictrans2-multilingual-en is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `hf auth login` or by passing `token=<your_token>`

# Translator NLLB

In [None]:
!pip install langdetect



In [None]:
pip install transformers sentencepiece




In [None]:
pip install --upgrade transformers


Collecting transformers
  Downloading transformers-4.54.1-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.54.1-py3-none-any.whl (11.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.2/11.2 MB[0m [31m61.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.54.0
    Uninstalling transformers-4.54.0:
      Successfully uninstalled transformers-4.54.0
Successfully installed transformers-4.54.1


In [None]:
import transformers
print(transformers.__version__)


4.54.1


In [None]:
from transformers import NllbTokenizer

model_id = "facebook/nllb-200-distilled-600M"
tokenizer = NllbTokenizer.from_pretrained(model_id)

print("Tokenizer Type:", type(tokenizer))
print("Has lang_code_to_id?", hasattr(tokenizer, "lang_code_to_id"))
print("Example (Tamil):", tokenizer.lang_code_to_id["tam_Taml"])


Tokenizer Type: <class 'transformers.models.nllb.tokenization_nllb.NllbTokenizer'>
Has lang_code_to_id? False


AttributeError: NllbTokenizer has no attribute lang_code_to_id

In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import AutoModelForSeq2SeqLM, NllbTokenizer
from langdetect import detect

# ========== Load GPT-2 Fine-tuned Model ==========
gpt2_model_path = "./gpt2-guvi-finetuned"  # Update to your path
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_model = GPT2LMHeadModel.from_pretrained(gpt2_model_path)
gpt2_model.eval()

# ========== Load NLLB-200 Translation Model ==========
nllb_model_id = "facebook/nllb-200-distilled-600M"
nllb_tokenizer = NllbTokenizer.from_pretrained(nllb_model_id)
nllb_model = AutoModelForSeq2SeqLM.from_pretrained(nllb_model_id)

# ========== Language Map (ISO code ➝ NLLB code) ==========
lang_map = {
    "as": "asm_Beng",   # Assamese
    "bn": "ben_Beng",   # Bengali
    "gu": "guj_Gujr",   # Gujarati
    "hi": "hin_Deva",   # Hindi
    "kn": "kan_Knda",   # Kannada
    "ml": "mal_Mlym",   # Malayalam
    "mr": "mar_Deva",   # Marathi
    "ne": "npi_Deva",   # Nepali
    "or": "ory_Orya",   # Odia
    "pa": "pan_Guru",   # Punjabi
    "sa": "san_Deva",   # Sanskrit
    "ta": "tam_Taml",   # Tamil
    "te": "tel_Telu",   # Telugu
    "ur": "urd_Arab",   # Urdu
    "ks": "kas_Arab",   # Kashmiri
    "sd": "snd_Arab",   # Sindhi
    "kok": "kok_Deva",  # Konkani
    "mai": "mai_Deva",  # Maithili
    "bho": "bho_Deva",  # Bhojpuri
    "mni": "mni_Beng",  # Manipuri
    "doi": "doi_Deva",  # Dogri
    "en": "eng_Latn"    # English
}

# ========== Translate using NLLB ==========
def translate_nllb(text, src_lang_code, tgt_lang_code):
    inputs = nllb_tokenizer(text, return_tensors="pt")
    inputs["forced_bos_token_id"] = nllb_tokenizer.lang_code_to_id[tgt_lang_code]
    with torch.no_grad():
        output_tokens = nllb_model.generate(**inputs, max_new_tokens=200)
    return nllb_tokenizer.decode(output_tokens[0], skip_special_tokens=True)

# ========== Generate GPT-2 Response ==========
def generate_response(prompt):
    input_ids = gpt2_tokenizer(prompt, return_tensors="pt").input_ids.to(gpt2_model.device)
    with torch.no_grad():
        output_ids = gpt2_model.generate(
            input_ids,
            max_new_tokens=100,
            pad_token_id=gpt2_tokenizer.eos_token_id
        )
    return gpt2_tokenizer.decode(output_ids[0], skip_special_tokens=True).replace(prompt, "").strip()

# ========== Chatbot Logic ==========
def multilingual_chat(user_input):
    # Step 1: Detect language
    detected_lang = detect(user_input)
    src_lang = lang_map.get(detected_lang, "eng_Latn")  # fallback to English

    # Step 2: Translate to English if needed
    if detected_lang != "en":
        user_input_en = translate_nllb(user_input, src_lang, "eng_Latn")
    else:
        user_input_en = user_input

    # Step 3: Format prompt and get GPT-2 response
    prompt = f"<|user|> {user_input_en} <|assistant|>"
    response_en = generate_response(prompt)

    # Step 4: Translate back to original language if needed
    if detected_lang != "en":
        response_final = translate_nllb(response_en, "eng_Latn", src_lang)
    else:
        response_final = response_en

    return response_final




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
if __name__ == "__main__":
    # Sample inputs in different Indian languages
    test_inputs = [
        "மொழி எதனால் முக்கியம்?",                   # Tamil
        "ప్రత్యామ్నాయ విద్య అంటే ఏమిటి?",           # Telugu
        "ভবিষ্যতের পরিকল্পনা কী?",                    # Bengali
        "गूवी कोर्स कैसे जॉइन करूं?",                # Hindi
        "What is GUVI's Zen Class?"                   # English
    ]

    for query in test_inputs:
        print(f"\nUser ({detect(query)}): {query}")
        print("Bot:", multilingual_chat(query))


User (ta): மொழி எதனால் முக்கியம்?


AttributeError: NllbTokenizer has no attribute lang_code_to_id

In [None]:
print(nllb_tokenizer.lang_code_to_id["tam_Taml"])  # Tamil


AttributeError: NllbTokenizer has no attribute lang_code_to_id

In [None]:
print(multilingual_chat("मुझे प्रमाणपत्र कैसे मिलेगा?"))        # Hindi
print(multilingual_chat("சான்றிதழ் பெற என்ன செய்ய வேண்டும்?"))  # Tamil
print(multilingual_chat("Tell me about GUVI Zen class"))         # English


AttributeError: NllbTokenizer has no attribute lang_code_to_id

In [None]:
import os
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langdetect import detect

# Load fine-tuned GPT-2 model and tokenizer
model_path = "./gpt2-guvi-finetuned"
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained(model_path)
model.eval()

# Load NLLB model and tokenizer
nllb_model_id = "facebook/nllb-200-distilled-600M"
nllb_tokenizer = AutoTokenizer.from_pretrained(nllb_model_id)
nllb_model = AutoModelForSeq2SeqLM.from_pretrained(nllb_model_id)

# Indian language codes for NLLB
lang_codes = {
    "hi": "hin_Deva",   # Hindi
    "ta": "tam_Taml",   # Tamil
    "te": "tel_Telu",   # Telugu
    "kn": "kan_Knda",   # Kannada
    "ml": "mal_Mlym",   # Malayalam
    "bn": "ben_Beng",   # Bengali
    "mr": "mar_Deva",   # Marathi
    "gu": "guj_Gujr",   # Gujarati
    "pa": "pan_Guru",   # Punjabi
    "ur": "urd_Arab",   # Urdu
    "or": "ory_Orya",   # Odia
    "en": "eng_Latn"    # English
}

# Use NLLB for translation between any two languages
def translate_with_nllb(text, source_lang, target_lang):
    inputs = nllb_tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(nllb_model.device)
    inputs['forced_bos_token_id'] = nllb_tokenizer.lang_code_to_id[target_lang]
    outputs = nllb_model.generate(**inputs, max_new_tokens=256)
    return nllb_tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

# Generate GPT-2 response
def generate_response(prompt):
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    output_ids = model.generate(
        input_ids,
        max_new_tokens=100,
        pad_token_id=tokenizer.eos_token_id
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True).replace(prompt, "").strip()

# Multilingual chat logic
def multilingual_chat(user_input):
    original_lang = detect(user_input)
    print(f"🔍 Detected: {original_lang}")

    if original_lang not in lang_codes:
        return "❗ Unsupported language"

    # Translate to English if needed
    if original_lang != "en":
        user_input_en = translate_with_nllb(user_input, lang_codes[original_lang], lang_codes["en"])
    else:
        user_input_en = user_input

    # Generate response in English
    prompt = f"<|user|> {user_input_en} <|assistant|>"
    response_en = generate_response(prompt)

    # Translate back to original language
    if original_lang != "en":
        response_translated = translate_with_nllb(response_en, lang_codes["en"], lang_codes[original_lang])
    else:
        response_translated = response_en

    return response_translated


In [1]:
print(multilingual_chat("சான்றிதழ் பெற என்ன செய்ய வேண்டும்?"))  # Tamil
print(multilingual_chat("मुझे प्रमाणपत्र कैसे मिलेगा?"))           # Hindi
print(multilingual_chat("Tell me about GUVI Zen class"))           # English


NameError: name 'multilingual_chat' is not defined