In [None]:
!pip install transformers[torch] datasets accelerate -U

In [None]:
from datasets import load_dataset

# Load the "emotion" dataset (all of its splits) through the datasets library.
emotions = load_dataset("emotion")

# Keep a handle on the training split for the quick look below.
train_ds = emotions["train"]

# Print, in order: the column schema, the first five rows, and the first five
# entries of the "text" column.
for preview in (train_ds.features, train_ds[:5], train_ds["text"][:5]):
    print(preview)

In [None]:
# Recent `datasets` releases no longer export `list_datasets` (the first cell
# installs with `-U`, so a fresh run can hit that). Try the original import
# first and fall back to the Hub client, which this notebook already uses for
# `notebook_login`.
try:
    from datasets import list_datasets

    all_datasets = list_datasets()
except ImportError:
    from huggingface_hub import list_datasets

    # The Hub client yields dataset-info objects; keep only their ids so the
    # result is a plain list of names, as with the original helper.
    all_datasets = [info.id for info in list_datasets()]

print(f"There are {len(all_datasets)} datasets currently available on the Hub")
print(f"The first 10 are: {all_datasets[:10]}")

In [None]:
# hide_output
# Load the "emotion" dataset again so the step-by-step walkthrough below starts
# from a known `emotions` object (the same call already appears in an earlier cell).
from datasets import load_dataset

emotions = load_dataset("emotion")

In [None]:
# Display the loaded dataset object (cell output is its repr).
emotions

In [None]:
# Select the training split and display it.
train_ds = emotions["train"]
train_ds

In [None]:
# Number of examples in the training split.
len(train_ds)

In [None]:
# Names of the columns available in the training split.
train_ds.column_names

In [None]:
# Print the feature schema (column names and their types) of the training split.
print(train_ds.features)

In [None]:
# Print the first five rows of the training split.
print(train_ds[:5])

In [None]:
# Print the first five entries of the "text" column only.
print(train_ds["text"][:5])

### **From Datasets to DataFrames**

In [None]:
import pandas as pd

# Switch the dataset's output format to pandas so that slicing a split yields a
# DataFrame. This changes state on `emotions`; a later cell calls
# `emotions.reset_format()` to undo it.
emotions.set_format(type="pandas")
# Materialise the whole training split and preview its first rows.
df = emotions["train"][:]
df.head()

In [None]:
def label_int2str(row):
    """Return the class name for an integer label id.

    The lookup goes through the "label" feature of the training split, so the
    mapping always matches the dataset's own label definition.
    """
    label_feature = emotions["train"].features["label"]
    return label_feature.int2str(row)

# Add a human-readable companion column next to the integer "label" column.
label_ids = df["label"]
df["label_name"] = label_ids.apply(label_int2str)
df.head()

### **Looking at the Class Distribution**

Whenever you are working on text classification problems, it is a good idea to examine the distribution of examples across the classes. A dataset with a skewed class distribution might require a different treatment in terms of the training loss and evaluation metrics than a balanced one.

With Pandas and Matplotlib, we can quickly visualize the class distribution as follows:

In [None]:
import matplotlib.pyplot as plt

# Count examples per class name, smallest first, and draw one horizontal bar
# per class.
class_counts = df["label_name"].value_counts(ascending=True)
class_counts.plot.barh()
plt.title("Frequency of Classes")
plt.show()

In this case, we can see that the dataset is heavily imbalanced; the joy and sadness classes appear frequently, whereas love and surprise are about 5–10 times rarer. There are several ways to deal with imbalanced data, including:

* **Randomly oversample the minority class.**
* **Randomly undersample the majority class.**
* **Gather more labeled data from the underrepresented classes.**

To keep things simple in this chapter, we'll work with the raw, unbalanced class frequencies. If you want to learn more about these sampling techniques, we recommend checking out the Imbalanced-learn library. Just make sure that you don't apply sampling methods before creating your train/test splits, or you'll get plenty of leakage between them!

Now that we've looked at the classes, let's take a look at the tweets themselves.

### **How Long Are Our Tweets?**

In [None]:
# Length of each tweet measured in whitespace-separated words.
word_lists = df["text"].str.split()
df["Words Per Tweet"] = word_lists.apply(len)

# One box per emotion class; outliers are hidden and the grid is switched off.
df.boxplot(
    "Words Per Tweet",
    by="label_name",
    grid=False,
    showfliers=False,
    color="black",
)
# Blank out the figure-level title and the x-axis label before showing the plot.
plt.suptitle("")
plt.xlabel("")
plt.show()

In [None]:
# Undo the earlier `set_format(type="pandas")` call now that the DataFrame
# exploration is finished.
emotions.reset_format()

### **From Text to Tokens**

In [None]:
# hide_output
from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer here and the classification model
# loaded further down.
model_ckpt = "distilbert-base-uncased"
# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
def tokenize(batch):
    """Encode the "text" entries of a batch with the notebook-level `tokenizer`.

    Args:
        batch: Mapping with a "text" key holding the raw strings to encode.

    Returns:
        Whatever `tokenizer` returns for those texts, with padding and
        truncation both enabled.
    """
    texts = batch["text"]
    return tokenizer(texts, padding=True, truncation=True)

In [None]:
# Sanity check: tokenize the first two training examples and print the result.
print(tokenize(emotions["train"][:2]))

In [None]:
#hide_input
# Pair every special token of the tokenizer with its id.
tokens2ids = [
    (token, token_id)
    for token, token_id in zip(tokenizer.all_special_tokens, tokenizer.all_special_ids)
]
# Order the pairs by token id (the last element of each pair).
data = sorted(tokens2ids, key=lambda pair: pair[-1])
# NOTE: this rebinds `df`, which earlier held the training-split DataFrame.
df = pd.DataFrame(data, columns=["Special Token", "Special Token ID"])
# Transpose so the tokens run across the columns.
df.T

In [None]:
 # hide_output
# Run `tokenize` over every split in batched mode. NOTE(review): batch_size=None
# is understood to pass each split as one batch, so padding=True would pad to a
# single common length per split — confirm against the datasets `map` docs.
emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)

In [None]:
# Show which columns the training split has after tokenization.
print(emotions_encoded["train"].column_names)

# Fine-Tuning Transformers

In [None]:
from transformers import AutoModelForSequenceClassification
import torch
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# hide_output
# Class-id -> class-name mapping stored in the model config (keys are the ids
# written as strings).
id2label = {
    "0": "sadness",
    "1": "joy",
    "2": "love",
    "3": "anger",
    "4": "fear",
    "5": "surprise",
}

# The reverse mapping (name -> integer id) and the class count both follow
# directly from `id2label`.
label2id = {name: int(idx) for idx, name in id2label.items()}
num_labels = len(id2label)

# Load the checkpoint with a classification head sized for the six classes,
# then move the model to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)
model = model.to(device)

In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Score predictions with accuracy and weighted F1.

    Passed to `Trainer` as its `compute_metrics` callback.

    Args:
        pred: Object exposing `label_ids` (the reference labels) and
            `predictions` (per-class scores; the arg-max over the last axis is
            taken as the predicted class).

    Returns:
        Dict with the keys "accuracy" and "f1".
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }

In [None]:
from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub; the training arguments below set
# push_to_hub=True.
notebook_login()

In [None]:
from transformers import Trainer, TrainingArguments

batch_size = 64
# Number of whole batches in the training split; used as the logging interval.
# NOTE(review): on a single device this should be about one log entry per epoch — confirm.
logging_steps = len(emotions_encoded["train"]) // batch_size
# Output directory name for the fine-tuned model.
model_name = f"{model_ckpt}-finetuned-emotion"
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=2,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=True,
    log_level="error",
)

In [None]:
from transformers import Trainer

# Wire the model, data splits, tokenizer and metric callback into a Trainer,
# then fine-tune. Evaluation uses the validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=emotions_encoded["train"],
    eval_dataset=emotions_encoded["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
# Trailing semicolon keeps the return value of train() out of the cell output.
trainer.train();

In [None]:
# hide_output
# Run the fine-tuned model over the validation split; the result is kept for
# the metric lookups in the following cells.
preds_output = trainer.predict(emotions_encoded["validation"])

In [None]:
# Metrics computed during the predict() call above.
preds_output.metrics

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.metrics import accuracy_score

In [None]:
# RoBERTa model
model_name_roberta = "roberta-base"
tokenizer_roberta = RobertaTokenizer.from_pretrained(model_name_roberta)
# Pass the same label mappings as the DistilBERT model so both configs carry
# readable class names.
model_roberta = RobertaForSequenceClassification.from_pretrained(
    model_name_roberta,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)


def tokenize_roberta(batch):
    """Encode the "text" entries of a batch with the RoBERTa tokenizer.

    The shared `tokenize` helper closes over the DistilBERT tokenizer, whose
    vocabulary and padding id differ from RoBERTa's, so RoBERTa needs its own
    encoding function.

    Args:
        batch: Mapping with a "text" key holding the raw strings to encode.

    Returns:
        The output of `tokenizer_roberta` for those texts, with padding and
        truncation enabled.
    """
    return tokenizer_roberta(batch["text"], padding=True, truncation=True)


# Tokenize the data for RoBERTa (with RoBERTa's own tokenizer, not DistilBERT's)
emotions_encoded_roberta = emotions.map(tokenize_roberta, batched=True, batch_size=None)

# Define training arguments for RoBERTa (same hyperparameters as DistilBERT,
# different output directory)
training_args_roberta = TrainingArguments(
    output_dir="roberta-finetuned-emotion",
    num_train_epochs=2,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=True,
    log_level="error"
)

# Initialize Trainer for RoBERTa
trainer_roberta = Trainer(
    model=model_roberta,
    args=training_args_roberta,
    compute_metrics=compute_metrics,
    train_dataset=emotions_encoded_roberta["train"],
    eval_dataset=emotions_encoded_roberta["validation"],
    tokenizer=tokenizer_roberta
)

# Train RoBERTa
trainer_roberta.train()

# Evaluate RoBERTa on the validation split
preds_output_roberta = trainer_roberta.predict(emotions_encoded_roberta["validation"])
metrics_roberta = preds_output_roberta.metrics
print("RoBERTa Metrics:", metrics_roberta)

# Evaluate DistilBERT (assuming 'preds_output' is available from the earlier
# DistilBERT predict cell)
metrics_distilbert = preds_output.metrics
print("DistilBERT Metrics:", metrics_distilbert)


In [None]:
# Compare metrics
print("Comparison of Metrics:")
# Accuracy of each model, side by side. The metric keys carry a "test_" prefix,
# but the data scored by predict() in this notebook is the validation split.
acc_distilbert = metrics_distilbert["test_accuracy"]
acc_roberta = metrics_roberta["test_accuracy"]
print("Accuracy - DistilBERT:", acc_distilbert, ", RoBERTa:", acc_roberta)

In [None]:
# Weighted F1 of each model, side by side. `compute_metrics` reports it under
# "f1", which predict() exposes as "test_f1" (same prefix as "test_accuracy").
print("F1 Score - DistilBERT:", metrics_distilbert["test_f1"], ", RoBERTa:", metrics_roberta["test_f1"])