In [None]:
!pip install transformers scikit-learn



In [None]:
!pip install -U datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency r

In [None]:
# 2. Import Libraries
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.metrics import f1_score, accuracy_score, classification_report
import numpy as np

# 3. Load full multi-label GoEmotions dataset
# No split is requested here; later cells index the result by split name
# ("train", "validation").
dataset = load_dataset("go_emotions")
# The "labels" column is a sequence feature; its inner feature carries the
# list of class names.
label_names = dataset["train"].features["labels"].feature.names
# Number of classes: used as the width of the multi-hot target vector and as
# the size of the classifier output layer.
num_labels = len(label_names)

In [None]:
# 4. Convert labels to multi-hot format (as floats)
def encode_labels(example):
    """Replace an example's list of label indices with a multi-hot float vector.

    Args:
        example: One dataset row; ``example["labels"]`` is an iterable of
            integer class indices.

    Returns:
        The same ``example`` object, with ``"labels"`` overwritten by a list
        of ``num_labels`` floats: 1.0 at every listed index, 0.0 elsewhere.
    """
    vector = [0.0 for _ in range(num_labels)]
    for class_idx in example["labels"]:
        vector[class_idx] = 1.0
    example["labels"] = vector
    return example

# Apply the multi-hot conversion to every example of the loaded dataset.
dataset = dataset.map(encode_labels)

# 5. Tokenize
# `model_name` is the single source of truth for the checkpoint: the tokenizer
# loaded here and the model loaded later must come from the same checkpoint,
# so the name is not repeated as a second string literal.
model_name = "distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(example):
    """Tokenize the ``"text"`` field with the module-level ``tokenizer``.

    The tokenizer is asked to truncate and to pad to ``max_length`` with
    ``max_length=128``, so every encoded sequence has the same length.

    Args:
        example: A row or batch of rows exposing a ``"text"`` entry.

    Returns:
        Whatever the tokenizer returns for that text.
    """
    texts = example["text"]
    return tokenizer(texts, truncation=True, padding="max_length", max_length=128)

# Run `tokenize` over the dataset; batched=True hands it many rows per call.
dataset = dataset.map(tokenize, batched=True)
# Serve rows as torch tensors, restricted to the three columns the model and
# loss consume.
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# 6. Format dataset
# dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
# dataset = dataset.map(lambda x: {"labels": torch.tensor(x["labels"], dtype=torch.float)}, batched=False)

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

  dataset = dataset.map(lambda x: {"labels": torch.tensor(x["labels"], dtype=torch.float)}, batched=False)


Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

In [None]:
import torch.nn as nn
# 7. Load model
# The emotion names are attached to the config (id2label / label2id) so the
# saved checkpoint and anything that loads it report real label names instead
# of generic LABEL_<i> placeholders. `label_names` was previously unused.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=dict(enumerate(label_names)),
    label2id={name: idx for idx, name in enumerate(label_names)},
    problem_type="multi_label_classification"
)

# 8. Custom Trainer to override loss function
class CustomTrainer(Trainer):
    """Trainer whose loss is BCE-with-logits computed on float-cast labels."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the multi-label loss for one batch.

        Args:
            model: The model being trained; called with the remaining inputs.
            inputs: Batch dict. Its ``"labels"`` entry is removed (the dict is
                mutated) so the model is called without labels.
            return_outputs: When true, also return the model outputs.
            **kwargs: Accepted and ignored, so extra arguments supplied by the
                caller do not raise.

        Returns:
            ``loss``, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        # Cast explicitly: BCEWithLogitsLoss needs floating-point targets and
        # the batch may not deliver the labels as floats.
        targets = inputs.pop("labels").type(torch.float)
        outputs = model(**inputs)
        criterion = nn.BCEWithLogitsLoss()
        loss = criterion(outputs.logits, targets)
        if return_outputs:
            return (loss, outputs)
        return loss

# 9. Metrics
def compute_metrics(eval_pred):
    """Compute macro- and micro-averaged F1 for multi-label predictions.

    Args:
        eval_pred: A ``(logits, labels)`` pair. Logits are passed through a
            sigmoid and thresholded at 0.5 to obtain 0/1 predictions.

    Returns:
        Dict with keys ``"macro_f1"`` and ``"micro_f1"``.
    """
    raw_logits, targets = eval_pred
    probabilities = torch.sigmoid(torch.tensor(raw_logits)).numpy()
    binary_preds = (probabilities >= 0.5).astype(int)

    scores = {}
    for key, averaging in (("macro_f1", "macro"), ("micro_f1", "micro")):
        # zero_division=0 is forwarded unchanged for both averaging modes.
        scores[key] = f1_score(targets, binary_preds, average=averaging, zero_division=0)
    return scores

# 10. Training arguments
training_args = TrainingArguments(
    # Output locations for checkpoints and logs.
    output_dir="./goemotion-multilabel",
    logging_dir="./logs",
    # Optimisation settings.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint on the same per-epoch schedule, then reload the
    # best checkpoint when training ends.
    # NOTE(review): no metric_for_best_model is given, so the library default
    # decides what "best" means - confirm that is the intended criterion.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# 11. Initialize trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    # `tokenizer=` is deprecated on Trainer (the recorded run emits a warning
    # at this call); `processing_class=` is its replacement and still makes
    # the tokenizer available to the trainer. Requires a transformers release
    # that supports `processing_class` (4.46 or newer).
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = CustomTrainer(


In [None]:
# Run fine-tuning with the configured arguments; the returned TrainOutput is
# shown as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Macro F1,Micro F1
1,0.0943,0.088107,0.368542,0.558024
2,0.0824,0.083385,0.424141,0.575229
3,0.0725,0.083221,0.443259,0.58493


TrainOutput(global_step=16281, training_loss=0.08957337803017938, metrics={'train_runtime': 1874.3856, 'train_samples_per_second': 69.479, 'train_steps_per_second': 8.686, 'total_flos': 4314807064442880.0, 'train_loss': 0.08957337803017938, 'epoch': 3.0})

In [None]:
# Evaluate on the dataset passed as `eval_dataset` (the validation split) and
# show the resulting metrics dict.
trainer.evaluate()

{'eval_loss': 0.08322116732597351,
 'eval_macro_f1': 0.4432589250782062,
 'eval_micro_f1': 0.5849297573435505,
 'eval_runtime': 18.478,
 'eval_samples_per_second': 293.646,
 'eval_steps_per_second': 36.746,
 'epoch': 3.0}

In [None]:
# Colab-only: mount Google Drive so the trained artifacts can be written under
# /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Despite the ".h5" suffix this path is a *directory*: save_pretrained writes
# the model files and the tokenizer files inside it (see the recorded output).
# The path is left unchanged so anything already loading from it keeps working.
save_dir = '/content/drive/MyDrive/my_model-emotion-detection.h5'

# `save_format="h5"` is a Keras option, not a save_pretrained parameter for
# this PyTorch model or its tokenizer, so it has been removed rather than left
# in to suggest an HDF5 file is produced.
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('/content/drive/MyDrive/my_model-emotion-detection.h5/tokenizer_config.json',
 '/content/drive/MyDrive/my_model-emotion-detection.h5/special_tokens_map.json',
 '/content/drive/MyDrive/my_model-emotion-detection.h5/vocab.json',
 '/content/drive/MyDrive/my_model-emotion-detection.h5/merges.txt',
 '/content/drive/MyDrive/my_model-emotion-detection.h5/added_tokens.json',
 '/content/drive/MyDrive/my_model-emotion-detection.h5/tokenizer.json')