In [None]:
!pip install datasets==2.14.5

Collecting datasets==2.14.5
  Downloading datasets-2.14.5-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets==2.14.5)
  Downloading dill-0.3.7-py3-none-any.whl.metadata (9.9 kB)
Collecting xxhash (from datasets==2.14.5)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets==2.14.5)
  Downloading multiprocess-0.70.17-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<2023.9.0,>=2023.1.0 (from fsspec[http]<2023.9.0,>=2023.1.0->datasets==2.14.5)
  Downloading fsspec-2023.6.0-py3-none-any.whl.metadata (6.7 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
Collecting multiprocess (from datasets==2.14.5)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
  Downloading multiprocess-0.70.15-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.14.5

In [None]:
import json
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)

# Step 1: Load and parse intents.json
print("🔍 Loading and parsing intents.json...")
# Decode explicitly as UTF-8 instead of relying on the platform default encoding.
with open("/content/intents.json", "r", encoding="utf-8") as f:
    data = json.load(f)

records = []

# Handle both Format A and Format B. In either layout the intent name becomes
# the "question" and each response becomes one "answer" record.
if isinstance(data, dict):
    # Format B: {"intent": ["response1", "response2", ...]}
    for intent, responses in data.items():
        for response in responses:
            records.append({"question": intent, "answer": response})
elif isinstance(data, list) and data and isinstance(data[0], dict):
    # Format A: [{"intent": ..., "responses": [...]}, ...]
    # The `data and` guard routes an empty list to the ValueError below
    # instead of failing with IndexError on data[0].
    for item in data:
        intent = item.get("intent", "")
        responses = item.get("responses", [])
        for response in responses:
            records.append({"question": intent, "answer": response})
else:
    raise ValueError("Unsupported format in intents.json")

# Fail with a clear message here; an empty record list would otherwise surface
# as a KeyError from dropna() because the frame would have no columns.
if not records:
    raise ValueError("intents.json contains no intent/response pairs")

# ✅ Limit to first 100 records
records = records[:100]

# Step 2: Create DataFrame (rows with a missing question or answer are dropped)
df = pd.DataFrame(records).dropna(subset=["question", "answer"])

# Render one question/answer row as a two-line chat sample.
def format_chat(row):
    """Return the row as a "User: <question>" line followed by a "Bot: <answer>" line."""
    question = row["question"]
    answer = row["answer"]
    return f"User: {question}\nBot: {answer}"

df["formatted"] = df.apply(format_chat, axis=1)

# Step 3: Convert to HuggingFace Dataset
# preserve_index=False keeps the pandas index (which may have gaps after the
# dropna filter) from being carried over as an extra dataset column.
hf_dataset = Dataset.from_pandas(df[["formatted"]], preserve_index=False)
# Fixed seed: the held-out split is only a handful of rows, so an unseeded
# shuffle would make the reported validation loss differ on every run.
hf_dataset = hf_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = hf_dataset["train"]
eval_dataset = hf_dataset["test"]

# Step 4: Fetch the pretrained "gpt2" tokenizer and causal-LM weights
print("🔧 Loading GPT-2 tokenizer and model...")
base_checkpoint = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
model = AutoModelForCausalLM.from_pretrained(base_checkpoint)
# The end-of-sequence token doubles as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Step 5: Tokenization
def tokenize(example):
    """Tokenize the "formatted" texts of a batch, padded/truncated to exactly 256 tokens."""
    encode_options = {"padding": "max_length", "truncation": True, "max_length": 256}
    return tokenizer(example["formatted"], **encode_options)

# Tokenize both splits in batched mode.
train_dataset = train_dataset.map(tokenize, batched=True)
eval_dataset = eval_dataset.map(tokenize, batched=True)

# Step 6: Data collator (mlm=False selects causal rather than masked language modelling)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Step 7: Training arguments
# The dataset is capped at 100 records with 10% held out, so one device sees at
# most 90 rows -> 23 steps per epoch -> 69 optimizer steps over 3 epochs. A
# fixed 100-step warmup would therefore never finish and the learning rate
# would never reach its target; a ratio scales with the real run length.
training_args = TrainingArguments(
    output_dir="./chatbot_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir="./logs",
    # Log once per epoch; the default step interval is longer than this whole
    # run, which left every "Training Loss" cell as "No log".
    logging_strategy="epoch",
    report_to="none"
)

# Step 8: Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Step 9: Train
print("🚀 Training GPT-2 chatbot model...")
trainer.train()

# Step 10: Save model and tokenizer side by side in the same directory
print("💾 Saving trained model and tokenizer...")
for artifact in (model, tokenizer):
    artifact.save_pretrained("trained_chatbot")

print("✅ Done! Chatbot fine-tuned and saved.")


🔍 Loading and parsing intents.json...
🔧 Loading GPT-2 tokenizer and model...


Map:   0%|          | 0/35 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]



🚀 Training GPT-2 chatbot model...


Epoch,Training Loss,Validation Loss
1,No log,3.017894
2,No log,2.71504
3,No log,2.360424


💾 Saving trained model and tokenizer...
✅ Done! Chatbot fine-tuned and saved.


In [None]:
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, DataCollatorWithPadding
)
import os

# Disable WandB logging to avoid API key errors
os.environ["WANDB_DISABLED"] = "true"

# Load dataset
dataset = load_dataset("csv", data_files="/content/emotion_intelligence.csv")

# Keep at most the first 100 examples. The count is clamped to the dataset size
# because select() raises when asked for indices past the end of a shorter CSV.
subset_size = min(100, len(dataset["train"]))
dataset["train"] = dataset["train"].select(range(subset_size))

# Split dataset into train and validation sets (seeded so the split, and the
# validation loss computed on it, is reproducible between runs)
dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Register '[PAD]' as the padding token explicitly.
# NOTE(review): the checkpoint's tokenizer presumably already defines this
# token, which would make the call a no-op — confirm.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Tokenization function
def tokenize_function(example):
    """Tokenize the "text" field of a batch, padded/truncated to exactly 128 tokens."""
    return tokenizer(
        example["text"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )

# Apply tokenization to every split in batched mode
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Names of the per-emotion columns; their order fixes the order of the label vector
label_columns = [
    "admiration", "amusement", "anger", "annoyance",
    "approval", "caring", "confusion", "curiosity",
    "desire", "disappointment", "disapproval", "disgust",
    "embarrassment", "excitement", "fear", "gratitude",
    "grief", "joy", "love", "nervousness",
    "optimism", "pride", "realization", "relief",
    "remorse", "sadness", "surprise", "neutral",
]

# Build the multi-label target for a single example
def encode_labels(example):
    """Gather the example's emotion columns, in `label_columns` order, as a list of floats under "labels"."""
    label_vector = []
    for column in label_columns:
        label_vector.append(float(example[column]))
    return {"labels": label_vector}

# Attach the "labels" vector to every example (one example per call, not batched)
tokenized_datasets = tokenized_datasets.map(encode_labels)

# Pre-trained DistilBERT with a multi-label head sized to the emotion columns
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    problem_type="multi_label_classification",
    num_labels=len(label_columns),
)

# Training arguments
training_args = TrainingArguments(
    output_dir="./emotion_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    # Log the training loss once per epoch. With 90 training rows and batch
    # size 8 the run has 12 steps per epoch (36 in total), so the previous
    # 50-step interval never fired and the loss column only showed "No log".
    logging_strategy="epoch",
    report_to="none"
)

# Collator that pads each batch with the tokenizer's padding token
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Trainer setup: train on the "train" split, evaluate on the "test" split
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

# Train the model
trainer.train()

# Save the trained model and its tokenizer into the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained("trained_emotion_model")

print(" Emotion Intelligence Model Training Complete (First 100 examples)!")


Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,0.477956
2,No log,0.386562
3,No log,0.3615


 Emotion Intelligence Model Training Complete (First 5000 examples)!


In [None]:
import torch
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, DataCollatorWithPadding
)
import os
import re

# Turn Weights & Biases logging off through its environment switch
os.environ.update({"WANDB_DISABLED": "true"})

# Normalise a sentence before tokenization
def clean_text(text):
    """Lowercase `text`, drop every character that is not an ASCII letter,
    digit or whitespace, then collapse whitespace runs to single spaces and
    trim both ends."""
    lowered = text.lower()
    alnum_only = re.sub(r"[^a-zA-Z0-9\s]", "", lowered)
    return re.sub(r"\s+", " ", alnum_only).strip()

# Batched preprocessing: every input row yields two output rows
def preprocess_batch(batch):
    """Expand each (sent_more, sent_less) pair into two cleaned examples.

    The cleaned `sent_more` text is emitted first with label 1, immediately
    followed by the cleaned `sent_less` text with label 0.
    """
    texts, labels = [], []
    for more, less in zip(batch["sent_more"], batch["sent_less"]):
        texts.extend((clean_text(more), clean_text(less)))
        labels.extend((1, 0))
    return {"text": texts, "label": labels}

# Load dataset
raw_dataset = load_dataset("csv", data_files="/content/crows_pairs.csv")

# ✅ Limit to (at most) the first 100 rows before expanding. The count is clamped
# to the dataset size because select() raises on indices past the end.
pair_limit = min(100, len(raw_dataset["train"]))
raw_dataset["train"] = raw_dataset["train"].select(range(pair_limit))

# Apply preprocessing and flatten: each pair becomes two labelled sentences,
# and the original columns are dropped because the row count changes.
processed_dataset = raw_dataset["train"].map(
    preprocess_batch,
    batched=True,
    remove_columns=raw_dataset["train"].column_names
)

# Train-test split (seeded so the held-out rows are the same on every run).
# NOTE(review): the split happens after expansion, so the two sentences of one
# pair can land on opposite sides of the split — confirm this is acceptable.
split_dataset = processed_dataset.train_test_split(test_size=0.1, seed=42)

# Load tokenizer and register '[PAD]' as its padding token
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Tokenization
def tokenize_function(example):
    """Tokenize the "text" field of a batch, padded/truncated to exactly 64 tokens."""
    return tokenizer(
        example["text"],
        max_length=64,
        truncation=True,
        padding="max_length",
    )

# Tokenize both splits in batched mode
tokenized_datasets = split_dataset.map(tokenize_function, batched=True)

# Pre-trained DistilBERT with a two-class classification head
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
# Keep the embedding matrix in step with the tokenizer's vocabulary size
vocab_size = len(tokenizer)
model.resize_token_embeddings(vocab_size)

# Training arguments
training_args = TrainingArguments(
    output_dir="./ethical_bias_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    # Log the training loss once per epoch so it lines up with the per-epoch
    # evaluation. The previous 50-step interval fired only once in a run this
    # short, leaving "No log" for the first two epochs.
    logging_strategy="epoch",
    report_to="none"
)

# Collator that pads each batch with the tokenizer's padding token
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Trainer setup: train on the "train" split, evaluate on the "test" split
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

# Train
trainer.train()

# Save the model and its tokenizer into the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained("trained_ethical_bias_model")

print("✅ Ethical Bias Model Training Complete with 100 Examples!")


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,0.714057
2,No log,0.724316
3,0.698800,0.717185


✅ Ethical Bias Model Training Complete with 100 Examples!


In [None]:
pip install openai==0.28

Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl.metadata (13 kB)
Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.70.0
    Uninstalling openai-1.70.0:
      Successfully uninstalled openai-1.70.0
Successfully installed openai-0.28.0


In [None]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.23.3-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 

In [None]:
pip install gradio transformers torch


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
import torch
import torch.nn.functional as F
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForCausalLM
)
import gradio as gr

# ===== Load Trained Models =====
print("Loading models...")

# Chatbot: fine-tuned causal LM; EOS doubles as the padding token
chatbot_tokenizer = AutoTokenizer.from_pretrained("trained_chatbot")
chatbot_model = AutoModelForCausalLM.from_pretrained("trained_chatbot")
chatbot_tokenizer.pad_token = chatbot_tokenizer.eos_token

# Emotion Detection: one label name per output logit, in logit order
emotion_tokenizer = AutoTokenizer.from_pretrained("trained_emotion_model")
emotion_model = AutoModelForSequenceClassification.from_pretrained("trained_emotion_model")
emotion_labels = [
    "admiration", "amusement", "anger", "annoyance",
    "approval", "caring", "confusion", "curiosity",
    "desire", "disappointment", "disapproval", "disgust",
    "embarrassment", "excitement", "fear", "gratitude",
    "grief", "joy", "love", "nervousness",
    "optimism", "pride", "realization", "relief",
    "remorse", "sadness", "surprise", "neutral",
]

# Bias Detection
bias_tokenizer = AutoTokenizer.from_pretrained("trained_ethical_bias_model")
bias_model = AutoModelForSequenceClassification.from_pretrained("trained_ethical_bias_model")
# NOTE(review): this list is indexed by predicted class id. Fine-tuning assigned
# label 1 to `sent_more` and label 0 to `sent_less` — confirm the order here
# matches that assignment.
bias_labels = ["Biased", "Neutral"]

print("✅ All models loaded successfully.")

# ===== Define Inference Functions =====

# Chatbot logic
def chatbot_response(message, history):
    """Generate the bot's reply to `message` with the fine-tuned chatbot model.

    Args:
        message: The user's newest message.
        history: Earlier turns as a sequence of (user_message, bot_message)
            pairs; only the last three are used as context.

    Returns:
        The first line of the newly generated text, stripped of surrounding
        whitespace (may be an empty string).
    """
    max_prompt_tokens = 512
    max_reply_tokens = 50

    # Same "User: ...\nBot: ..." layout as the fine-tuning samples, ending on an
    # open "Bot:" cue for the model to complete.
    turns = [f"User: {user_msg}\nBot: {bot_msg}\n" for user_msg, bot_msg in history[-3:]]
    prompt = "".join(turns) + f"User: {message}\nBot:"

    inputs = chatbot_tokenizer(prompt, return_tensors="pt")
    # Keep the LAST max_prompt_tokens tokens. Truncating from the right would cut
    # off the newest message and the trailing "Bot:" cue once the context is long.
    inputs = {name: tensor[:, -max_prompt_tokens:] for name, tensor in inputs.items()}
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = chatbot_model.generate(
            **inputs,
            max_new_tokens=max_reply_tokens,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.7,
            pad_token_id=chatbot_tokenizer.eos_token_id
        )

    # Decode only the tokens produced after the prompt. Splitting the full text
    # on "Bot:" and taking the last piece would return a later, model-invented
    # "Bot:" turn whenever generation runs on past the actual reply.
    generated_tokens = outputs[0][prompt_length:]
    continuation = chatbot_tokenizer.decode(generated_tokens, skip_special_tokens=True)
    reply = continuation.strip().split("\n")[0]
    return reply

# Emotion detection logic
def detect_emotion(text):
    """Score `text` with the emotion model and return its five strongest emotions.

    Returns a dict mapping emotion name to its sigmoid score rounded to three
    decimals, ordered from highest to lowest score.
    """
    encoded = emotion_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        logits = emotion_model(**encoded).logits
        probabilities = torch.sigmoid(logits)[0].tolist()
    ranked = sorted(zip(emotion_labels, probabilities), key=lambda pair: pair[1], reverse=True)
    return {name: round(probability, 3) for name, probability in ranked[:5]}

# Bias detection logic
def detect_bias(text, tokenizer=None, model=None):
    """Classify `text` with the bias model.

    Args:
        text: Sentence to classify.
        tokenizer: Optional tokenizer override; defaults to the module-level
            `bias_tokenizer`.
        model: Optional model override; defaults to the module-level
            `bias_model`.

    Returns:
        A `(label, scores)` tuple: `label` is "Biased" or "Neutral" and
        `scores` maps both names to their softmax probability rounded to
        three decimals.
    """
    tokenizer = bias_tokenizer if tokenizer is None else tokenizer
    model = bias_model if model is None else model

    # Class ids as assigned by the training preprocessing in this notebook:
    # label 1 was attached to `sent_more` and label 0 to `sent_less`, so class 1
    # is the "Biased" class. The module-level `bias_labels` list is deliberately
    # not indexed here because its order is the reverse of these ids.
    neutral_id, biased_id = 0, 1
    class_names = {neutral_id: "Neutral", biased_id: "Biased"}

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits
        probs = F.softmax(logits, dim=1)[0].tolist()
        pred_label = torch.argmax(logits, dim=1).item()
    scores = {"Biased": round(probs[biased_id], 3), "Neutral": round(probs[neutral_id], 3)}
    return class_names[pred_label], scores

# ===== Gradio UI =====
# One Blocks app with three tabs, each wired to one of the inference functions
# defined above. Components are created inside nested context managers, so the
# nesting and creation order below determine the layout.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🤖 NLP Intelligence App")
    gr.Markdown("This app combines a **chatbot**, **emotion detector**, and **bias detector** using your locally trained models.")

    # Tab 1: chat interface backed by chatbot_response(message, history)
    with gr.Tab("🧠 Chatbot"):
        gr.ChatInterface(fn=chatbot_response, title="Chat with GPT-2 (Locally Fine-Tuned)")

    # Tab 2: text box + button in one column, label output beside it in the same row
    with gr.Tab("🎭 Emotion Detection"):
        with gr.Row():
            with gr.Column():
                emotion_input = gr.Textbox(label="Input Text")
                emotion_button = gr.Button("Detect Emotions")
            emotion_output = gr.Label(label="Top Emotions")
        # detect_emotion returns a {name: score} dict, shown by the Label component
        emotion_button.click(fn=detect_emotion, inputs=emotion_input, outputs=emotion_output)

    # Tab 3: same layout, with two label outputs
    with gr.Tab("⚖️ Bias Detection"):
        with gr.Row():
            with gr.Column():
                bias_input = gr.Textbox(label="Input Text")
                bias_button = gr.Button("Detect Bias")
            bias_result = gr.Label(label="Bias Prediction")
            bias_probs = gr.Label(label="Confidence Scores")
        # detect_bias returns (label, scores); the two values fill the two outputs in order
        bias_button.click(fn=detect_bias, inputs=bias_input, outputs=[bias_result, bias_probs])

# Start the app with default launch settings
demo.launch()


Loading models...
✅ All models loaded successfully.


  self.chatbot = Chatbot(


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://96f11f41d6ed53ee1f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


