# **Task 5: Mental Health Support Chatbot (Fine-Tuned)**

# Install dependencies


In [None]:
!pip install -q transformers datasets torch sentencepiece

# Import libraries

In [None]:
import pandas as pd
import torch
from datasets import Dataset,DatasetDict
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling

## **1. Load Dataset**

In [None]:
# Load the raw dialogue CSV into a DataFrame. The absolute path points at the
# /content directory, so the file must already be present there.
df = pd.read_csv("/content/emotion-emotion_69k.csv")
# Preview the first rows to check the column layout.
df.head()

Unnamed: 0.1,Unnamed: 0,Situation,emotion,empathetic_dialogues,labels,Unnamed: 5,Unnamed: 6
0,0,I remember going to the fireworks with my best...,sentimental,Customer :I remember going to see the firework...,"Was this a friend you were in love with, or ju...",,
1,1,I remember going to the fireworks with my best...,sentimental,Customer :This was a best friend. I miss her.\...,Where has she gone?,,
2,2,I remember going to the fireworks with my best...,sentimental,Customer :We no longer talk.\nAgent :,Oh was this something that happened because of...,,
3,3,I remember going to the fireworks with my best...,sentimental,Customer :Was this a friend you were in love w...,This was a best friend. I miss her.,,
4,4,I remember going to the fireworks with my best...,sentimental,Customer :Where has she gone?\nAgent :,We no longer talk.,,


## **2. Preprocess Dataset**

* Convert to **input → target pairs** for fine-tuning.

In [None]:
# Build a clean working frame in one chain (no in-place edits on a slice of the
# raw frame, which triggers SettingWithCopy warnings):
#   * keep only the four columns used to build the prompt and the target,
#   * drop rows with a missing field -- concatenating NaN with a string yields
#     NaN, which would later be tokenized as the literal text "nan",
#   * force every field to str so the concatenation below cannot fail on a
#     stray non-string value,
#   * reset the index so Dataset.from_pandas does not carry the old row index
#     along as an extra column.
df = (
    df[['Situation', 'empathetic_dialogues', 'labels', 'emotion']]
    .dropna()
    .astype(str)
    .rename(columns={
        'Situation': 'situation',
        'empathetic_dialogues': 'user_utterance',
        'labels': 'agent_reply',
        'emotion': 'emotion_label'
    })
    .reset_index(drop=True)
)

# Prompt shown to the model; it ends with "Agent:" so that the reply is the
# natural continuation.
df['input_text'] = (
    "Context: " + df['situation']
    + "\nUser: " + df['user_utterance']
    + "\nEmotion: " + df['emotion_label']
    + "\nAgent:"
)

# Text the model is expected to produce after the prompt.
df['target_text'] = df['agent_reply']

df = df[['input_text', 'target_text']]
df.head(3)


Unnamed: 0,input_text,target_text
0,Context: I remember going to the fireworks wit...,"Was this a friend you were in love with, or ju..."
1,Context: I remember going to the fireworks wit...,Where has she gone?
2,Context: I remember going to the fireworks wit...,Oh was this something that happened because of...


## **3. Split Dataset**

I use only a subset of the dataset (10,000 training and 2,000 evaluation examples) because I do not have much compute power.

In [None]:
full_dataset = Dataset.from_pandas(df)

# Shuffle ONCE and cut two disjoint slices out of the same permutation.
# Shuffling twice with the same seed yields the same order, so selecting
# range(2000) for evaluation would just reuse the first 2000 training examples
# and the evaluation loss would be measured on data the model was trained on.
shuffled_dataset = full_dataset.shuffle(seed=42)

# Clamp the sizes so a smaller dataset does not raise an out-of-range error.
n_train = min(10000, len(shuffled_dataset))
n_eval = min(2000, len(shuffled_dataset) - n_train)

train_dataset = shuffled_dataset.select(range(n_train))
eval_dataset = shuffled_dataset.select(range(n_train, n_train + n_eval))

dataset = DatasetDict({
    'train': train_dataset,
    'eval': eval_dataset
})

## **4. Tokenizer & Model**


In [None]:
# Name of the pretrained checkpoint used for both the tokenizer and the model.
model_name = "distilgpt2"

# Load the matching tokenizer and causal language model for that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Reuse the end-of-sequence token as the padding token so padded batches can be
# built, and record the same id in the model config.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

## **5. Tokenize Data**


In [None]:
max_length = 128


def tokenize_function(examples):
    """Tokenize a batch as complete training sequences: prompt + reply + EOS.

    The language-modeling collator used in this notebook (mlm=False) builds the
    labels from a copy of ``input_ids``, so a reply that is tokenized separately
    (e.g. via ``text_target``) never reaches the loss. The reply therefore has
    to be part of the input sequence itself for the model to learn to produce
    it after "Agent:".

    Args:
        examples: batch dict with the columns ``input_text`` (prompt ending in
            "Agent:") and ``target_text`` (the agent reply).

    Returns:
        The tokenizer output (``input_ids`` and ``attention_mask``), with every
        sequence padded or truncated to ``max_length`` tokens.
    """
    # NOTE(review): truncation cuts from the end of the sequence by default, so
    # for very long prompts part of the reply may be lost -- consider a larger
    # max_length if that happens often.
    eos = tokenizer.eos_token
    texts = [
        f"{str(prompt)} {str(reply)}{eos}"
        for prompt, reply in zip(examples['input_text'], examples['target_text'])
    ]

    return tokenizer(texts,
                     padding="max_length",
                     truncation=True,
                     max_length=max_length)


tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_eval = eval_dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

## **6. Data Collator**


In [None]:
# Batch collator for language modeling. mlm=False turns off masked-language-model
# masking, i.e. the batches are prepared for causal (next-token) training.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

## **7. Training Arguments**

In [None]:
# TrainingArguments is already imported in the imports cell at the top of the
# notebook, so it is not re-imported here.
training_args = TrainingArguments(
    output_dir="./mental_health_bot",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,          # small epochs for quick test
    save_steps=500,
    save_total_limit=2,
    # `evaluation_strategy` was renamed to `eval_strategy` in recent
    # transformers releases; with the new name periodic evaluation works again.
    eval_strategy="steps",
    eval_steps=250,
    logging_steps=100,
    learning_rate=5e-5,
    weight_decay=0.01,
    # Mixed precision requires a CUDA device; enable it only when one is
    # present so the notebook also runs on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    # Disable experiment-tracker integrations: otherwise training stops at an
    # interactive Weights & Biases prompt, which breaks "Restart & Run All".
    report_to="none",
    push_to_hub=False
)

## **8. Trainer Initialization**

In [None]:
# Wire the model, the training configuration, the tokenized splits and the
# collator into a Trainer.
# `processing_class` replaces the deprecated `tokenizer` keyword, which emits a
# FutureWarning and is scheduled for removal from Trainer.__init__.
# The pad token is configured on the tokenizer and model right after they are
# loaded, so it is not set a second time here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    processing_class=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(


## **9. Train Model**


In [None]:
# Run the fine-tuning loop; the returned TrainOutput is displayed as the cell result.
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,2.5911
200,2.2545
300,2.2237
400,2.281
500,2.2322
600,2.1998
700,2.1924
800,2.1741
900,2.1374
1000,2.1594


TrainOutput(global_step=2500, training_loss=2.1178226318359377, metrics={'train_runtime': 362.947, 'train_samples_per_second': 55.104, 'train_steps_per_second': 6.888, 'total_flos': 653241876480000.0, 'train_loss': 2.1178226318359377, 'epoch': 2.0})

## **10. Test the Chatbot**

In [None]:
def generate_reply(prompt, max_length=50, return_full_text=True):
    """Sample a continuation of ``prompt`` from the fine-tuned model.

    Args:
        prompt: text in the training prompt format, ending with "Agent:".
        max_length: maximum number of NEW tokens to generate. It is passed as
            ``max_new_tokens`` so the budget does not shrink with the prompt;
            a total-length limit would leave little or nothing for the reply
            once the prompt approaches that limit.
        return_full_text: if True (default), return the prompt followed by the
            generated text; if False, return only the generated reply.

    Returns:
        The decoded text with special tokens removed.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[1]
    output_ids = model.generate(
        **inputs,
        max_new_tokens=max_length,
        do_sample=True,
        temperature=0.7,
        # Pass the pad id explicitly so generation does not have to guess it.
        pad_token_id=tokenizer.eos_token_id,
    )
    # The generated sequence starts with the prompt tokens; slice them off when
    # only the reply is wanted.
    generated = output_ids[0] if return_full_text else output_ids[0][prompt_len:]
    return tokenizer.decode(generated, skip_special_tokens=True)


# Smoke test: a hand-written prompt in the same Context/User/Emotion/Agent
# layout that the preprocessing step builds for the training data.
test_prompt = "Context: I feel anxious about exams.\nUser: I'm really stressed.\nEmotion: anxious\nAgent:"
print(generate_reply(test_prompt))


Context: I feel anxious about exams.
User: I'm really stressed.
Emotion: anxious
Agent: anxious
Agent: anxious
Agent: anxious
Agent: anxious
Agent: anxious
Agent: anxious
Agent: anxious



## 11. Download Fine Tuned Model

In [None]:
# Recursively pack the training output directory into a zip archive.
!zip -r mental_health_bot.zip mental_health_bot

  adding: mental_health_bot/ (stored 0%)
  adding: mental_health_bot/checkpoint-2500/ (stored 0%)
  adding: mental_health_bot/checkpoint-2500/special_tokens_map.json (deflated 60%)
  adding: mental_health_bot/checkpoint-2500/tokenizer_config.json (deflated 54%)
  adding: mental_health_bot/checkpoint-2500/training_args.bin (deflated 53%)
  adding: mental_health_bot/checkpoint-2500/config.json (deflated 52%)
  adding: mental_health_bot/checkpoint-2500/scheduler.pt (deflated 61%)
  adding: mental_health_bot/checkpoint-2500/optimizer.pt (deflated 8%)
  adding: mental_health_bot/checkpoint-2500/merges.txt (deflated 53%)
  adding: mental_health_bot/checkpoint-2500/trainer_state.json (deflated 76%)
  adding: mental_health_bot/checkpoint-2500/scaler.pt (deflated 64%)
  adding: mental_health_bot/checkpoint-2500/vocab.json (deflated 59%)
  adding: mental_health_bot/checkpoint-2500/model.safetensors (deflated 7%)
  adding: mental_health_bot/checkpoint-2500/rng_state.pth (deflated 26%)
  adding: m

In [None]:
# Colab helper module used below to trigger a browser download.
from google.colab import files
# Pack the training output directory into a gzipped tarball, listing each file as it is added.
!tar -czvf mental_health_bot.tar.gz mental_health_bot
# Send the archive to the browser.
files.download('mental_health_bot.tar.gz')


mental_health_bot/
mental_health_bot/checkpoint-2500/
mental_health_bot/checkpoint-2500/special_tokens_map.json
mental_health_bot/checkpoint-2500/tokenizer_config.json
mental_health_bot/checkpoint-2500/training_args.bin
mental_health_bot/checkpoint-2500/config.json
mental_health_bot/checkpoint-2500/scheduler.pt
mental_health_bot/checkpoint-2500/optimizer.pt
mental_health_bot/checkpoint-2500/merges.txt
mental_health_bot/checkpoint-2500/trainer_state.json
mental_health_bot/checkpoint-2500/scaler.pt
mental_health_bot/checkpoint-2500/vocab.json
mental_health_bot/checkpoint-2500/model.safetensors
mental_health_bot/checkpoint-2500/rng_state.pth
mental_health_bot/checkpoint-2500/tokenizer.json
mental_health_bot/checkpoint-2500/generation_config.json
mental_health_bot/runs/
mental_health_bot/runs/Jan02_15-44-10_21971d63e648/
mental_health_bot/runs/Jan02_15-44-10_21971d63e648/events.out.tfevents.1767368651.21971d63e648.4234.0
mental_health_bot/checkpoint-2000/
mental_health_bot/checkpoint-2000/

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!cp -r ./mental_health_bot /content/drive/MyDrive/