# Train `Emotion Detection` Model

## Setup and Data Loading

In [None]:
!pip install transformers datasets torch scikit-learn

In [2]:
import torch
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, DatasetDict

from sklearn.model_selection import train_test_split

In [3]:
emotions = load_dataset("google-research-datasets/go_emotions")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/9.40k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.77M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/350k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/347k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/43410 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5426 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5427 [00:00<?, ? examples/s]

In [5]:
emotions['train'][2]

{'text': 'WHY THE FUCK IS BAYLESS ISOING', 'labels': [2], 'id': 'eezlygj'}

In [6]:
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [7]:
def tokenize(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128) # Adjust max_length if needed

tokenized_emotions = emotions.map(tokenize, batched=True)

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

In [8]:
label_list = emotions["train"].features["labels"].feature.names
id2label = {str(i): label for i, label in enumerate(label_list)}
label2id = {v: k for k, v in id2label.items()}

In [9]:
def convert_labels_to_ids(example):
    example["labels"] = [int(label) for label in example["labels"]] # Convert labels to integer indices
    return example
tokenized_emotions = tokenized_emotions.map(convert_labels_to_ids, batched=False)

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

In [11]:
# Splitting the dataset using sklearn
train_valid, test = train_test_split(tokenized_emotions, test_size=0.1, random_state=42) # 10% for testing
train, valid = train_test_split(train_valid, test_size=0.1, random_state=42) # Another 10% (of the remaining 90%) for validation



KeyError: 0

Break

In [8]:
import torch
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import load_dataset, concatenate_datasets


# Load the goEmotions dataset
emotions = load_dataset("go_emotions")

# Preprocessing
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
def tokenize_and_convert_labels(examples):
    tokenized = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)
    tokenized["labels"] = [[int(l) for l in labels] for labels in examples["labels"]]  # List of lists of ints
    return tokenized

tokenized_emotions = emotions.map(tokenize_and_convert_labels, batched=True)

# Convert labels to list of integer indices
label_list = emotions["train"].features["labels"].feature.names
id2label = {str(i): label for i, label in enumerate(label_list)}
label2id = {v: k for k, v in id2label.items()}

def convert_labels_to_ids(example):
    example["labels"] = [int(label) for label in example["labels"]]
    return example

tokenized_emotions = tokenized_emotions.map(convert_labels_to_ids, batched=False)

train_valid_dataset = concatenate_datasets([tokenized_emotions["train"], tokenized_emotions["validation"]])

# Calculate split sizes
train_size = int(len(train_valid_dataset) * 0.8)  # 80% for training
valid_size = int(len(train_valid_dataset) * 0.1)  # 10% for validation
test_size = len(train_valid_dataset) - train_size - valid_size  # Remaining 10% for testing

# Split using select
train_final = train_valid_dataset.select(range(train_size))
valid_split = train_valid_dataset.select(range(train_size, train_size+valid_size))
test_final = train_valid_dataset.select(range(train_size + valid_size, len(train_valid_dataset)))

data_collator = DataCollatorWithPadding(tokenizer, padding="longest", max_length=128, return_tensors="pt")

# Define the model
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=len(label_list),
    problem_type="multi_label_classification",  # VERY IMPORTANT!
    id2label=id2label,
    label2id=label2id
)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="f1", # Example: replace with a proper multi-label metric
    save_total_limit=3,
    remove_unused_columns=False,
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_final,
    eval_dataset=valid_split,
    data_collator=data_collator,  # Use the data collator!
    compute_metrics=lambda p: {"f1": (2 * p.precision * p.recall) / (p.precision + p.recall) if (p.precision + p.recall)>0.0 else 0.0}  # Example metric, replace!
)

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate(test)

# Save the model and tokenizer
model.save_pretrained("./goemotions-roberta-model")
tokenizer.save_pretrained("./goemotions-roberta-tokenizer")

# Prediction function
def predict_emotion(text):
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    sigmoid = torch.sigmoid(logits)
    threshold = 0.5
    predicted_labels = [model.config.id2label[str(i)] for i, score in enumerate(sigmoid[0]) if score > threshold]
    return predicted_labels


example_text = "I am so excited about this!"
predicted_emotions = predict_emotion(example_text)
print(f"Predicted emotions: {predicted_emotions}")

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`text` in this case) have excessive nesting (inputs type `list` where type `int` is expected).