In [1]:
# Step 1: Install required libraries
#!pip install -q transformers datasets evaluate

# Step 2: Import libraries
from datasets import load_dataset
from transformers import AutoTokenizer
import pandas as pd

# Step 3: Load the GoEmotions dataset
dataset = load_dataset("go_emotions")  # Contains train, validation, test splits

# (Optional) View a few samples
dataset["train"].shuffle(seed=42).select(range(3))


  from .autonotebook import tqdm as notebook_tqdm

A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/Users/pankajdhyani/Desktop/DistilGoEmotion/.venv/lib/python3.9/site-packages/ipykernel_launcher.py", 

Dataset({
    features: ['text', 'labels', 'id'],
    num_rows: 3
})

In [2]:
# Check dataset structure and label types
print(dataset["train"].features)

# Check for missing values or malformed examples
for split in ["train", "validation", "test"]:
    print(f"{split} size: {len(dataset[split])}")
    print(f"Null check: {dataset[split].filter(lambda x: x['text'] is None)}")

# Print label distribution
labels = dataset["train"].features["labels"].feature.names
print(f"Total emotion labels: {len(labels)}")


{'text': Value('string'), 'labels': List(ClassLabel(names=['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral'])), 'id': Value('string')}
train size: 43410
Null check: Dataset({
    features: ['text', 'labels', 'id'],
    num_rows: 0
})
validation size: 5426
Null check: Dataset({
    features: ['text', 'labels', 'id'],
    num_rows: 0
})
test size: 5427
Null check: Dataset({
    features: ['text', 'labels', 'id'],
    num_rows: 0
})
Total emotion labels: 28


In [3]:
#!pip install torch

In [6]:
from transformers import AutoTokenizer

checkpoint = "distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Tokenize dataset
def tokenize_batch(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)

tokenized_dataset = dataset.map(tokenize_batch, batched=True)

# ✅ Convert multi-label lists to multi-hot vectors
NUM_LABELS = 28
def encode_labels(example):
    label_vector = [0] * NUM_LABELS
    for label_id in example["labels"]:
        label_vector[label_id] = 1
    example["label"] = np.array(label_vector, dtype=np.float32)  # ✅ cast to float32
    return example

# ✅ Apply the multi-hot encoding to all splits
tokenized_dataset = tokenized_dataset.map(encode_labels)

# ✅ Format the dataset for PyTorch
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])


Map: 100%|██████████| 43410/43410 [00:19<00:00, 2241.99 examples/s]
Map: 100%|██████████| 5426/5426 [00:01<00:00, 4630.91 examples/s]
Map: 100%|██████████| 5427/5427 [00:01<00:00, 4476.15 examples/s]


In [5]:
#!pip install transformers[torch]

In [6]:
#pip install 'accelerate>=0.26.0

In [7]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding

# Load model with multi-label classification setup
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=28,
    problem_type="multi_label_classification"  # ✅ Important fix
)

# Define data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

# Training arguments
training_args = TrainingArguments(
    output_dir="./emotion-model",
    eval_strategy="epoch",  # ✅ corrected from `eval_strategy`
    save_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_dir="./logs",
    logging_strategy="epoch",
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,  # ✅ Important fix
    compute_metrics=None  # To be set later
)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [8]:
#!pip install sklearn

In [9]:
import evaluate
import numpy as np
import torch

# Load multi-label compatible metrics
f1 = evaluate.load("f1")
accuracy = evaluate.load("accuracy")

# Threshold for sigmoid outputs
THRESHOLD = 0.5

# Define evaluation logic for multi-label classification
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = torch.sigmoid(torch.tensor(logits)).numpy()  # Get probabilities
    preds = (probs >= THRESHOLD).astype(int)             # Binarize predictions
    
    # Compute weighted F1 and accuracy
    f1_score = f1.compute(predictions=preds, references=labels, average="weighted")
    acc_score = accuracy.compute(predictions=preds, references=labels)
    
    return {
        "f1_weighted": f1_score["f1"],
        "accuracy": acc_score["accuracy"]
    }

# Assign the new metric function
trainer.compute_metrics = compute_metrics

# Start training
trainer.train()

# Evaluate on validation set
eval_result = trainer.evaluate()
print("📊 Evaluation Results:", eval_result)


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Save model and tokenizer
model.save_pretrained("./emotion-model")
tokenizer.save_pretrained("./emotion-model")
