In [None]:
!pip install transformers datasets torch scikit-learn




In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Load the English subtask-5 configuration of SemEval-2018 Task 1.
# This dataset repository ships a loading script; without `trust_remote_code=True`
# `load_dataset` stops at an interactive [y/N] prompt, which breaks unattended
# "Restart & Run All" executions.
# NOTE(review): this opts in to executing code from the dataset repository — inspect
# https://hf.co/datasets/SemEvalWorkshop/sem_eval_2018_task_1 before trusting it.
dataset = load_dataset(
    "SemEvalWorkshop/sem_eval_2018_task_1",
    "subtask5.english",
    trust_remote_code=True,
)

# Tokenizer matching the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_function(examples):
    """Tokenize a batch of tweets and attach one label vector per tweet.

    Args:
        examples: Column-oriented batch indexed by column name. "Tweet" holds the
            texts; each emotion column listed below is indexed by row position.

    Returns:
        The tokenizer output for the tweets, with an extra "labels" entry holding,
        for every row, one float per emotion column in the fixed order below.
    """
    emotion_columns = (
        'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love',
        'optimism', 'pessimism', 'sadness', 'surprise', 'trust',
    )
    tweets = examples["Tweet"]
    # Pad/truncate every tweet to the tokenizer's maximum length.
    encoded = tokenizer(tweets, padding="max_length", truncation=True)
    # Values are cast to float — presumably because the multi-label loss expects
    # float targets; confirm against the model configuration.
    encoded["labels"] = [
        [float(examples[column][row]) for column in emotion_columns]
        for row in range(len(tweets))
    ]
    return encoded

# Run the preprocessing over the loaded dataset in batches, so each call to
# `preprocess_function` receives columns of values rather than a single example.
tokenized_dataset = dataset.map(
    function=preprocess_function,
    batched=True,
)


sem_eval_2018_task_1.py:   0%|          | 0.00/6.29k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

The repository for SemEvalWorkshop/sem_eval_2018_task_1 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/SemEvalWorkshop/sem_eval_2018_task_1.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


Downloading data:   0%|          | 0.00/5.98M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6838 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3259 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/886 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoModelForSequenceClassification

# Pretrained BERT encoder with a sequence-classification head sized for the
# emotion labels and configured for multi-label classification.
pretrained_name = "bert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_name,
    problem_type="multi_label_classification",
    num_labels=11,  # one output per emotion label
)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments
import numpy as np
from sklearn.metrics import f1_score, accuracy_score

# Hyperparameters and bookkeeping for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./results",  # checkpoints are written below this directory
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated in recent Transformers releases.
    eval_strategy="epoch",  # run evaluation at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    weight_decay=0.01,
    logging_dir="./logs",
)

def compute_metrics(eval_pred):
    """Turn raw model outputs into micro-averaged F1 and accuracy.

    Args:
        eval_pred: A pair ``(logits, labels)``; ``logits`` must support ``> 0``
            and ``.astype`` (e.g. a NumPy array).

    Returns:
        A dict with the keys "f1" and "accuracy".
    """
    raw_scores, gold = eval_pred
    # A label is predicted as present when its logit is strictly positive.
    hard_predictions = (raw_scores > 0).astype(int)
    return {
        "f1": f1_score(gold, hard_predictions, average="micro"),
        "accuracy": accuracy_score(gold, hard_predictions),
    }



In [None]:
from transformers import Trainer

# Fine-tune on the train split; the validation split is used for evaluation.
train_split = tokenized_dataset["train"]
validation_split = tokenized_dataset["validation"]

trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_split,
    eval_dataset=validation_split,
)

trainer.train()


In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# Checkpoint directory to restore — presumably the one written at training step
# 1000 under the run's output directory; verify it exists before running.
checkpoint_path = "results/checkpoint-1000"

# Rebuild the classifier from the weights stored in that checkpoint.
# NOTE(review): this rebinds `model` and `trainer`, replacing the objects created
# by the training cells above.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)

# Evaluation-only Trainer: no train split, test split as the evaluation set.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_dataset["test"],
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
# Evaluate the current `trainer` on its configured evaluation set and print the
# resulting metrics dictionary.
test_results = trainer.evaluate()
print(test_results)

{'eval_loss': 0.3037513792514801, 'eval_model_preparation_time': 0.009, 'eval_f1': 0.7056613898577945, 'eval_accuracy': 0.28751150659711566, 'eval_runtime': 34.948, 'eval_samples_per_second': 93.253, 'eval_steps_per_second': 5.837}
