In [2]:
# Install prerequesite libraries
#!pip install torch torchvision transformers datasets

In [3]:
# Load dataset
from datasets import load_dataset

# Load the GoEmotions dataset
dataset = load_dataset("go_emotions", "simplified")

# Access the train, validation, and test splits
train_dataset = dataset["train"]
val_dataset = dataset["validation"]
test_dataset = dataset["test"]

train_dataset[0]

Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/7.03k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.11k [00:00<?, ?B/s]

Downloading and preparing dataset go_emotions/simplified to /root/.cache/huggingface/datasets/go_emotions/simplified/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d...


Downloading data:   0%|          | 0.00/1.61M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/203k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/201k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/43410 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5426 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5427 [00:00<?, ? examples/s]

Dataset go_emotions downloaded and prepared to /root/.cache/huggingface/datasets/go_emotions/simplified/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

{'text': "My favourite food is anything I didn't have to cook myself.",
 'labels': [27],
 'id': 'eebbqej'}

In [4]:
from transformers import AutoTokenizer

# Replace 'bert-base-uncased' with the pre-trained model of your choice
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Max sequence length in dataset = 30
def preprocess_dataset(example):
    # Tokenize the input text and return the encoded result
    encoding = tokenizer(example["text"], truncation=True, padding="max_length", max_length=30)
    return encoding

from torch.utils.data import DataLoader

# Preprocess the dataset
train_dataset = train_dataset.map(preprocess_dataset, batched=True)
val_dataset = val_dataset.map(preprocess_dataset, batched=True)
test_dataset = test_dataset.map(preprocess_dataset, batched=True)

# Set dataset format to PyTorch tensors
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

In [5]:
import torch
from transformers import AutoModelForSequenceClassification

# Replace 'bert-base-uncased' with the pre-trained model of your choice
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=28)

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [6]:
label_map = {
    18: "love",
    11: "disgust",
    17: "joy",
    25: "sadness",
    13: "excitement",
    14: "fear",
    2: "anger",
    15: "gratitude",
    12: "embarrassment",
    4: "approval",
    10: "disapproval",
    7: "curiosity",
    23: "relief",
    27: "neutral"
}

def filter_selected_labels(example):
    for i in example["labels"]:
        if int(i) in label_map.keys():
            return True
    return False

train_dataset = train_dataset.filter(filter_selected_labels)
val_dataset = val_dataset.filter(filter_selected_labels)
test_dataset = test_dataset.filter(filter_selected_labels)

labels_debug = []
def map_selected_labels(example):
  label = [label for label in example["labels"] if int(label) in label_map.keys()][0]
  labels_debug.append(label)
  example["labels"] = [label]
  return example

train_dataset = train_dataset.map(map_selected_labels)
val_dataset = val_dataset.map(map_selected_labels)
test_dataset = test_dataset.map(map_selected_labels)

#print(labels_debug)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

Filter:   0%|          | 0/43410 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5426 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5427 [00:00<?, ? examples/s]

Map:   0%|          | 0/31232 [00:00<?, ? examples/s]

Map:   0%|          | 0/3916 [00:00<?, ? examples/s]

Map:   0%|          | 0/3903 [00:00<?, ? examples/s]

In [7]:
# stats
first_100 = [int(label) for label in train_dataset["labels"]]
print(first_100)

[27, 27, 2, 14, 15, 27, 4, 27, 12, 15, 2, 27, 27, 12, 27, 27, 27, 2, 27, 25, 15, 27, 2, 27, 2, 17, 27, 25, 27, 15, 15, 27, 27, 7, 10, 27, 27, 27, 27, 27, 4, 27, 13, 10, 27, 27, 27, 15, 12, 27, 13, 27, 15, 27, 27, 27, 27, 27, 27, 13, 27, 13, 4, 25, 4, 27, 25, 15, 4, 27, 4, 27, 18, 4, 27, 7, 27, 7, 27, 10, 27, 27, 27, 7, 27, 15, 27, 27, 17, 27, 2, 2, 27, 27, 27, 4, 27, 2, 7, 2, 27, 15, 15, 27, 11, 27, 27, 7, 27, 2, 13, 27, 27, 13, 15, 23, 27, 15, 7, 10, 27, 27, 14, 18, 27, 4, 27, 27, 27, 27, 14, 7, 18, 27, 27, 27, 27, 17, 27, 7, 15, 10, 27, 17, 27, 7, 4, 2, 4, 15, 27, 27, 11, 27, 18, 27, 7, 7, 27, 17, 10, 27, 15, 27, 4, 18, 27, 27, 27, 27, 25, 4, 25, 15, 7, 27, 27, 27, 27, 17, 27, 11, 27, 4, 27, 27, 17, 10, 10, 7, 27, 27, 4, 4, 27, 27, 25, 10, 27, 27, 7, 2, 25, 27, 27, 27, 18, 27, 27, 15, 4, 27, 27, 15, 11, 15, 27, 7, 15, 27, 27, 15, 4, 27, 7, 27, 4, 14, 27, 18, 27, 17, 27, 10, 27, 17, 27, 27, 18, 10, 27, 27, 27, 25, 27, 27, 27, 7, 27, 27, 10, 4, 15, 4, 27, 4, 14, 27, 27, 27, 27, 7, 11, 

In [8]:
label_counts = {label: 0 for label in label_map.values()}

for example in train_dataset:
    for label_idx in example["labels"]:
        label = label_map[int(label_idx)]
        label_counts[label] += 1

print("Label statistics:")
for label, count in label_counts.items():
    print(f"{label}: {count}")

Label statistics:
love: 1830
disgust: 683
joy: 1227
sadness: 1179
excitement: 802
fear: 553
anger: 1567
gratitude: 2510
embarrassment: 276
approval: 2931
disapproval: 1929
curiosity: 2144
relief: 117
neutral: 13484


In [9]:
print("Number of examples in filtered train dataset:", len(train_dataset))
print("Number of examples in filtered validation dataset:", len(val_dataset))
print("Number of examples in filtered test dataset:", len(test_dataset))

Number of examples in filtered train dataset: 31232
Number of examples in filtered validation dataset: 3916
Number of examples in filtered test dataset: 3903


---
## Now the training..

In [10]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"


In [11]:
%%script false --no-raise-error

import torch.optim as optim
from torch import nn
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#model.to(device)
print(device)

num_epochs = 3
lr = 5e-5

optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss(ignore_index=-100)

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    model.train()
    train_loss = 0.0
    #print(train_loader[0])
    for batch in tqdm(train_loader):
      #print(batch["input_ids"])
      input_ids = batch["input_ids"].to(device)
      attention_mask = batch["attention_mask"].to(device)
      labels = batch["labels"].to(device)

      optimizer.zero_grad()
      outputs = model(input_ids, attention_mask=attention_mask)
      logits = outputs.logits

      # Squeeze labels
      labels = labels.squeeze()
      
      loss = criterion(logits, labels)
      loss.backward()
      optimizer.step()

      train_loss += loss.item()

    print(f"Training loss: {train_loss / len(train_loader)}")


cuda
Epoch 1/3


  0%|          | 0/976 [00:05<?, ?it/s]


RuntimeError: ignored

In [13]:
from transformers import Trainer, TrainingArguments

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)

# Set training arguments
training_args = TrainingArguments(
    output_dir="output",
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="epoch",
    logging_dir="logs",
    learning_rate=5e-5,
    weight_decay=0.01,
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()



Epoch,Training Loss,Validation Loss
1,1.3078,0.996987
2,0.7843,0.988398
3,0.5116,1.095557


TrainOutput(global_step=2928, training_loss=0.8179451363985656, metrics={'train_runtime': 726.9338, 'train_samples_per_second': 128.892, 'train_steps_per_second': 4.028, 'total_flos': 1444816898334720.0, 'train_loss': 0.8179451363985656, 'epoch': 3.0})

In [16]:
from datasets import load_metric
import numpy as np

accuracy_metric = load_metric("accuracy")
f1_metric = load_metric("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_metric.compute(predictions=predictions, references=labels),
        "f1": f1_metric.compute(predictions=predictions, references=labels, average="weighted"),
    }

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Evaluate the model on the validation dataset
val_results = trainer.evaluate()
print("Validation results:", val_results)

# Evaluate the model on the test dataset
trainer.eval_dataset = test_dataset
test_results = trainer.evaluate()
print("Test results:", test_results)

Trainer is attempting to log a value of "{'accuracy': 0.6567926455566905}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'f1': 0.6545189532262219}" of type <class 'dict'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


Validation results: {'eval_loss': 1.0955569744110107, 'eval_accuracy': {'accuracy': 0.6567926455566905}, 'eval_f1': {'f1': 0.6545189532262219}, 'eval_runtime': 7.7706, 'eval_samples_per_second': 503.953, 'eval_steps_per_second': 15.829}


Trainer is attempting to log a value of "{'accuracy': 0.6707660773763772}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'f1': 0.6717156307223772}" of type <class 'dict'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


Test results: {'eval_loss': 1.0385491847991943, 'eval_accuracy': {'accuracy': 0.6707660773763772}, 'eval_f1': {'f1': 0.6717156307223772}, 'eval_runtime': 7.254, 'eval_samples_per_second': 538.046, 'eval_steps_per_second': 16.818}


In [17]:
output_dir = "."
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('./tokenizer_config.json',
 './special_tokens_map.json',
 './vocab.txt',
 './added_tokens.json',
 './tokenizer.json')