In [64]:
# Install prerequesite libraries
#!pip install torch torchvision transformers datasets

In [65]:
# Load dataset
from datasets import load_dataset

# Load the GoEmotions dataset
dataset = load_dataset("go_emotions", "simplified")

# Access the train, validation, and test splits
train_dataset = dataset["train"]
val_dataset = dataset["validation"]
test_dataset = dataset["test"]

train_dataset[0]



  0%|          | 0/3 [00:00<?, ?it/s]

{'text': "My favourite food is anything I didn't have to cook myself.",
 'labels': [27],
 'id': 'eebbqej'}

In [66]:
from transformers import AutoTokenizer

# Replace 'bert-base-uncased' with the pre-trained model of your choice
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Max sequence length in dataset = 30
def preprocess_dataset(example):
    # Tokenize the input text and return the encoded result
    encoding = tokenizer(example["text"], truncation=True, padding="max_length", max_length=30)
    return encoding

from torch.utils.data import DataLoader

# Preprocess the dataset
train_dataset = train_dataset.map(preprocess_dataset, batched=True)
val_dataset = val_dataset.map(preprocess_dataset, batched=True)
test_dataset = test_dataset.map(preprocess_dataset, batched=True)

# Set dataset format to PyTorch tensors
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])



In [67]:
import torch
from transformers import AutoModelForSequenceClassification

# Replace 'bert-base-uncased' with the pre-trained model of your choice
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=14)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [75]:
label_map = {
    18: "love",
    11: "disgust",
    17: "joy",
    25: "sadness",
    13: "excitement",
    14: "fear",
    2: "anger",
    15: "gratitude",
    12: "embarrassment",
    4: "approval",
    10: "disapproval",
    7: "curiosity",
    23: "relief",
    27: "neutral"
}

def filter_selected_labels(example):
    for i in example["labels"]:
        if int(i) in label_map.keys():
            return True
    return False

train_dataset = train_dataset.filter(filter_selected_labels)
val_dataset = val_dataset.filter(filter_selected_labels)
test_dataset = test_dataset.filter(filter_selected_labels)

labels_debug = []
def map_selected_labels(example):
  label = [label for label in example["labels"] if int(label) in label_map.keys()][0]
  labels_debug.append(label)
  example["labels"] = [label]
  return example

train_dataset = train_dataset.map(map_selected_labels)
val_dataset = val_dataset.map(map_selected_labels)
test_dataset = test_dataset.map(map_selected_labels)

#print(labels_debug)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)


Filter:   0%|          | 0/31232 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3916 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3903 [00:00<?, ? examples/s]

Map:   0%|          | 0/31232 [00:00<?, ? examples/s]

Map:   0%|          | 0/3916 [00:00<?, ? examples/s]

Map:   0%|          | 0/3903 [00:00<?, ? examples/s]

[tensor([27]), tensor([27]), tensor([2]), tensor([14]), tensor([15]), tensor([27]), tensor([4]), tensor([27]), tensor([12]), tensor([15]), tensor([2]), tensor([27]), tensor([27]), tensor([12]), tensor([27]), tensor([27]), tensor([27]), tensor([2]), tensor([27]), tensor([25]), tensor([15]), tensor([27]), tensor([2]), tensor([27]), tensor([2]), tensor([17]), tensor([27]), tensor([25]), tensor([27]), tensor([15]), tensor([15]), tensor([27]), tensor([27]), tensor([7]), tensor([10]), tensor([27]), tensor([27]), tensor([27]), tensor([27]), tensor([27]), tensor([4]), tensor([27]), tensor([13]), tensor([10]), tensor([27]), tensor([27]), tensor([27]), tensor([15]), tensor([12]), tensor([27]), tensor([13]), tensor([27]), tensor([15]), tensor([27]), tensor([27]), tensor([27]), tensor([27]), tensor([27]), tensor([27]), tensor([13]), tensor([27]), tensor([13]), tensor([4]), tensor([25]), tensor([4]), tensor([27]), tensor([25]), tensor([15]), tensor([4]), tensor([27]), tensor([4]), tensor([27]), ten

In [77]:
# stats
first_100 = [int(label) for label in train_dataset["labels"]]
print(first_100)

[27, 27, 2, 14, 15, 27, 4, 27, 12, 15, 2, 27, 27, 12, 27, 27, 27, 2, 27, 25, 15, 27, 2, 27, 2, 17, 27, 25, 27, 15, 15, 27, 27, 7, 10, 27, 27, 27, 27, 27, 4, 27, 13, 10, 27, 27, 27, 15, 12, 27, 13, 27, 15, 27, 27, 27, 27, 27, 27, 13, 27, 13, 4, 25, 4, 27, 25, 15, 4, 27, 4, 27, 18, 4, 27, 7, 27, 7, 27, 10, 27, 27, 27, 7, 27, 15, 27, 27, 17, 27, 2, 2, 27, 27, 27, 4, 27, 2, 7, 2, 27, 15, 15, 27, 11, 27, 27, 7, 27, 2, 13, 27, 27, 13, 15, 23, 27, 15, 7, 10, 27, 27, 14, 18, 27, 4, 27, 27, 27, 27, 14, 7, 18, 27, 27, 27, 27, 17, 27, 7, 15, 10, 27, 17, 27, 7, 4, 2, 4, 15, 27, 27, 11, 27, 18, 27, 7, 7, 27, 17, 10, 27, 15, 27, 4, 18, 27, 27, 27, 27, 25, 4, 25, 15, 7, 27, 27, 27, 27, 17, 27, 11, 27, 4, 27, 27, 17, 10, 10, 7, 27, 27, 4, 4, 27, 27, 25, 10, 27, 27, 7, 2, 25, 27, 27, 27, 18, 27, 27, 15, 4, 27, 27, 15, 11, 15, 27, 7, 15, 27, 27, 15, 4, 27, 7, 27, 4, 14, 27, 18, 27, 17, 27, 10, 27, 17, 27, 27, 18, 10, 27, 27, 27, 25, 27, 27, 27, 7, 27, 27, 10, 4, 15, 4, 27, 4, 14, 27, 27, 27, 27, 7, 11, 

In [69]:
label_counts = {label: 0 for label in label_map.values()}

for example in train_dataset:
    for label_idx in example["labels"]:
        label = label_map[int(label_idx)]
        label_counts[label] += 1

print("Label statistics:")
for label, count in label_counts.items():
    print(f"{label}: {count}")

Label statistics:
love: 1830
disgust: 683
joy: 1227
sadness: 1179
excitement: 802
fear: 553
anger: 1567
gratitude: 2510
embarrassment: 276
approval: 2931
disapproval: 1929
curiosity: 2144
relief: 117
neutral: 13484


In [70]:
print("Number of examples in filtered train dataset:", len(train_dataset))
print("Number of examples in filtered validation dataset:", len(val_dataset))
print("Number of examples in filtered test dataset:", len(test_dataset))

Number of examples in filtered train dataset: 31232
Number of examples in filtered validation dataset: 3916
Number of examples in filtered test dataset: 3903


---
## Now the training..

In [80]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"


In [81]:
import torch.optim as optim
from torch import nn
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(device)

num_epochs = 3
lr = 5e-5

optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss(ignore_index=-100)

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    model.train()
    train_loss = 0.0
    #print(train_loader[0])
    for batch in tqdm(train_loader):
      print(batch["input_ids"])
      input_ids = batch["input_ids"].to(device)
      attention_mask = batch["attention_mask"].to(device)
      labels = batch["labels"].to(device)

      optimizer.zero_grad()
      outputs = model(input_ids, attention_mask=attention_mask)
      logits = outputs.logits

      # Reshaping labels to handle different shapes
      labels = labels.view(-1)
      
      loss = criterion(logits.view(-1, logits.size(-1)), labels)
      loss.backward()
      optimizer.step()

      train_loss += loss.item()

    print(f"Training loss: {train_loss / len(train_loader)}")


cuda
Epoch 1/3


  0%|          | 0/976 [00:00<?, ?it/s]

tensor([[  101,  1998,  2061,  2024,  2017,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  1057,  5603,  1012,  1031,  2171,  1033,  2003,  2061, 20355,
          1012,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  2004,  2619,  2040,  2253,  2083,  2082,  2077,  1996,  4274,
          5839,  1010,  2748,  1010,  6881, 17109,  4268,  5839,  1012,  2348,
          2009,  2001,  2062,  1999,  2115,  2227,  1012,   102,     0,     0],
        [  101,  4469,  2051,  3125,  2008,  7906,  2149,  2006,  1996,  2448,
          2000,  2663,  1996, 11942,  3597,   999,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  2057, 10657,  5466,  1996,  287




RuntimeError: ignored

In [None]:
from transformers import Trainer, TrainingArguments

# Set training arguments
training_args = TrainingArguments(
    output_dir="output",
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="epoch",
    logging_dir="logs",
    learning_rate=5e-5,
    weight_decay=0.01,
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()
