<a href="https://colab.research.google.com/github/nandika-9/ai-mental-health-chatbot/blob/main/aimental.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets transformers torch scikit-learn

In [None]:
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)

# Load the "go_emotions" dataset; only its "train" split is used further down.
dataset = load_dataset("go_emotions")

# Label ids (as stored in each example's "labels" list) that this notebook
# keeps. Id 2 becomes class 0 ("Anger") and id 19 becomes class 1 ("Anxiety")
# later in the pipeline.
TARGET_LABELS = [2, 19]


def filter_emotions(example):
    """Return True when the example carries at least one target label id.

    Args:
        example: A dataset row whose "labels" entry is an iterable of
            integer label ids.

    Returns:
        True if any id in ``example["labels"]`` is listed in
        ``TARGET_LABELS``; False otherwise (including for an empty list).
    """
    for label_id in example["labels"]:
        if label_id in TARGET_LABELS:
            return True
    return False

# Keep only the training rows that contain at least one of the TARGET_LABELS ids.
filtered_dataset = dataset["train"].filter(filter_emotions)


def map_labels(example):
    """Collapse the example's list of label ids into one binary class id.

    The example is modified in place and also returned.

    Args:
        example: A dataset row whose "labels" entry is a collection of
            integer label ids.

    Returns:
        The same ``example`` object with ``example["labels"]`` replaced by
        0 when label id 2 was present, and by 1 otherwise. A row that has
        both id 2 and another id is therefore assigned class 0.
    """
    if 2 in example["labels"]:
        binary_label = 0
    else:
        binary_label = 1
    example["labels"] = binary_label
    return example

# Replace each row's list of label ids with a single binary class id.
filtered_dataset = filtered_dataset.map(map_labels)

# Drop every column except the raw text and the binary label.
filtered_dataset = filtered_dataset.remove_columns(
    [c for c in filtered_dataset.column_names if c not in ["text", "labels"]]
)

# Tokenizer for the same "distilbert-base-uncased" checkpoint the model is loaded from.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize(example):
    """Tokenize the "text" field of a dataset row with the module tokenizer.

    Truncation is enabled; padding is disabled here, so sequences keep
    their own length at this stage.

    Args:
        example: A dataset row (or batch) exposing a "text" entry.

    Returns:
        Whatever the tokenizer returns for that text.
    """
    raw_text = example["text"]
    encoded = tokenizer(raw_text, truncation=True, padding=False)
    return encoded

# Tokenize in batches: the tokenizer accepts a list of strings, so batched
# mapping avoids one Python-level tokenizer call per row.
tokenized_dataset = filtered_dataset.map(tokenize, batched=True)
# The raw text is no longer needed once the token ids exist.
tokenized_dataset = tokenized_dataset.remove_columns(["text"])
# Have the dataset return torch tensors for the remaining columns.
tokenized_dataset.set_format("torch")

# Sequences were tokenized without padding, so the collator pads each
# batch when it is assembled.
data_collator = DataCollatorWithPadding(tokenizer)

# Load the pretrained "distilbert-base-uncased" checkpoint as a sequence
# classifier with two output classes (the binary ids written by map_labels).
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2
)

# Fine-tuning hyperparameters. No evaluation settings are configured here.
training_args = TrainingArguments(
    output_dir="./results",  # where the Trainer writes its outputs
    per_device_train_batch_size=16,
    num_train_epochs=2,
    learning_rate=2e-5,
    logging_steps=100,  # log the training loss every 100 optimizer steps
    report_to="none"  # presumably disables external logging integrations; confirm in the transformers docs
)

# Build the Trainer on the tokenized training rows only; no eval_dataset is
# passed, so nothing in this script measures held-out performance.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)

# Run fine-tuning; this updates `model` in place.
trainer.train()

def predict(text):
    """Classify text as "Anger" or "Anxiety" with the fine-tuned model.

    Args:
        text: A single string, or a list/tuple of strings to classify
            together as one padded batch.

    Returns:
        "Anger" (class 0) or "Anxiety" (class 1) when ``text`` is a single
        string; otherwise a list with one such label per input, in order.
    """
    # Training can leave the model in training mode; switch to eval mode so
    # dropout is disabled and the same text always yields the same label.
    model.eval()

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # The Trainer may have moved the model to an accelerator; the inputs
    # must live on the same device or the forward pass raises.
    inputs = inputs.to(model.device)

    # Inference needs no gradients; skipping them saves memory and time.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Take the argmax along the class dimension so every row of a batch gets
    # its own class id (a bare argmax would index the flattened tensor).
    class_ids = logits.argmax(dim=-1).tolist()
    labels = ["Anger" if class_id == 0 else "Anxiety" for class_id in class_ids]

    return labels[0] if isinstance(text, str) else labels

# Quick sanity check: print the predicted label for two hand-written sentences.
for sample_text in (
    "I am very worried about my exams",
    "I am extremely angry and frustrated",
):
    print(predict(sample_text))

Filter:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/1729 [00:00<?, ? examples/s]

Map:   0%|          | 0/1729 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

DistilBertForSequenceClassification LOAD REPORT from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
vocab_projector.bias    | UNEXPECTED | 
pre_classifier.bias     | MISSING    | 
classifier.bias         | MISSING    | 
classifier.weight       | MISSING    | 
pre_classifier.weight   | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Step,Training Loss
100,0.266565
200,0.102773


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Anxiety
Anger
