In [1]:
!wget -P data/full_dataset/ https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_1.csv
!wget -P data/full_dataset/ https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_2.csv
!wget -P data/full_dataset/ https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_3.csv

--2025-04-10 18:24:43--  https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_1.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 108.177.121.207, 209.85.145.207, 142.250.125.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|108.177.121.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14174600 (14M) [application/octet-stream]
Saving to: ‘data/full_dataset/goemotions_1.csv’


2025-04-10 18:24:44 (30.7 MB/s) - ‘data/full_dataset/goemotions_1.csv’ saved [14174600/14174600]

--2025-04-10 18:24:44--  https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_2.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 108.177.121.207, 209.85.145.207, 142.250.125.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|108.177.121.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14173154 (14M) [application/octet-stream]
Saving to:

In [2]:
import pandas as pd

# Read the three downloaded shards in order, then stack them into one frame
# with a fresh 0..n-1 index.
df1, df2, df3 = (
    pd.read_csv(f'data/full_dataset/goemotions_{shard}.csv')
    for shard in (1, 2, 3)
)

df = pd.concat((df1, df2, df3), ignore_index=True)
df.head(2)

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,0,...,0,0,0,0,0,0,0,1,0,0
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084000.0,37,True,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Identify the emotion indicator columns: the contiguous run of columns from
# "admiration" through "neutral" (the last emotion column in the CSV header).
# The slice is bounded on the right so that columns appended to df afterwards
# ('label', 'label_id') are not mistaken for emotion flags if this cell is
# re-run later in the session.
cols = df.columns.tolist()
emotion_start_idx = cols.index("admiration")
emotion_end_idx = cols.index("neutral")
emotion_cols = cols[emotion_start_idx:emotion_end_idx + 1]

# Create a new column with only the first emotion
def get_first_emotion(row, columns=None):
    """Return the name of the first emotion column whose value is 1 in ``row``.

    When several emotions are flagged on the same row, only the one that
    comes first in ``columns`` is returned; the others are ignored.

    Args:
        row: Record indexable by column name (e.g. a DataFrame row or a dict).
        columns: Emotion column names to scan, in priority order. Defaults to
            the notebook-level ``emotion_cols`` list, so the function can still
            be passed directly to ``df.apply(..., axis=1)``.

    Returns:
        The first column name whose value equals 1, or ``None`` if no column
        in ``columns`` is set (no emotion tagged).
    """
    if columns is None:
        columns = emotion_cols
    return next((name for name in columns if row[name] == 1), None)

# Attach the single-label column by scanning each row's emotion flags.
df = df.assign(label=df.apply(get_first_emotion, axis=1))

# Peek at the first few text/label pairs.
print(df.loc[:, ['text', 'label']].head())

                                                text    label
0                                    That game hurt.  sadness
1   >sexuality shouldn’t be a grouping category I...     None
2     You do right, if you don't care then fuck 'em!  neutral
3                                 Man I love reddit.     love
4  [NAME] was nowhere near them, he was by the Fa...  neutral


In [4]:
from sklearn.preprocessing import LabelEncoder

# The previous cell already derived 'label'; repeating the row-wise apply over
# the whole frame would be a second full pass that produces the same column.
# It is only computed here if the column is genuinely missing.
if 'label' not in df.columns:
    df['label'] = df.apply(get_first_emotion, axis=1)

# Drop examples for which no emotion was tagged, then renumber the index.
df = df[df['label'].notna()].reset_index(drop=True)

# Encode string labels into integers
label_encoder = LabelEncoder()
df['label_id'] = label_encoder.fit_transform(df['label'])

In [5]:
# Fine-grained emotion -> coarse class lookup, written as one entry per coarse
# class and flattened into the {fine: coarse} dict used for mapping.
label_mapping = {
    fine: coarse
    for coarse, fine_labels in {
        'Anger': ('anger', 'annoyance'),
        'Disgust': ('disapproval', 'disgust'),
        'Fear': ('confusion', 'embarrassment', 'fear', 'nervousness'),
        'Happy': (
            'admiration', 'amusement', 'curiosity', 'desire', 'excitement',
            'gratitude', 'joy', 'love', 'optimism', 'pride', 'relief',
        ),
        'Neutral': ('approval', 'caring', 'realization', 'surprise', 'neutral'),
        'Sad': ('disappointment', 'grief', 'remorse', 'sadness'),
    }.items()
    for fine in fine_labels
}

# Swap every fine-grained emotion for its coarse class, then report class sizes.
df = df.assign(label=df['label'].map(label_mapping))
print(df.loc[:, 'label'].value_counts())

label
Neutral    84572
Happy      66466
Anger      19885
Sad        12774
Disgust    12337
Fear       11780
Name: count, dtype: int64


In [6]:
# Discard rows whose text or label is missing, then rows whose text is blank
# once surrounding whitespace is stripped.
df = df.dropna(subset=['text', 'label'])
df = df.loc[df['text'].str.strip().ne("")]

# Keep a single row per distinct text and renumber the index.
df = df.drop_duplicates(subset='text').reset_index(drop=True)
# Restrict to the six expected classes (optional safeguard against stray labels).
df = df.loc[df['label'].isin(['Anger', 'Disgust', 'Fear', 'Happy', 'Neutral', 'Sad'])]

In [7]:
# Randomize row order with a fixed seed, then renumber the index.
from sklearn.utils import shuffle
df = shuffle(df, random_state=42)
df = df.reset_index(drop=True)

# Fit a fresh encoder on the coarse labels and store the integer ids.
label_encoder = LabelEncoder().fit(df['label'])
df['label_id'] = label_encoder.transform(df['label'])

# Class sizes, followed by the class names in encoder (id) order.
print(df['label'].value_counts())
print(label_encoder.classes_)

label
Neutral    23753
Happy      18916
Anger       5265
Disgust     3311
Sad         3301
Fear        3184
Name: count, dtype: int64
['Anger' 'Disgust' 'Fear' 'Happy' 'Neutral' 'Sad']


### Tokenize and Load Pretrained Model

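Base checkpoint: [michellejieli/emotion_text_classifier](https://huggingface.co/michellejieli/emotion_text_classifier). Its tokenizer is loaded in the next cell, and the model itself is loaded and fine-tuned in the Training section below.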

In [8]:
! pip install datasets
from datasets import Dataset

dataset = Dataset.from_pandas(df[['text', 'label_id']]) # Include 'label_id' in the Dataset
dataset = dataset.train_test_split(test_size=0.1, seed=42)

# Load the tokenizer that ships with the pretrained emotion-classifier checkpoint
from transformers import AutoTokenizer

model_ckpt = "michellejieli/emotion_text_classifier"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_ckpt)

def tokenize(batch):
    """Tokenize the ``"text"`` entries of a batch with the notebook's tokenizer.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        batch: Mapping with a ``"text"`` entry holding the raw strings.

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    texts = batch["text"]
    return tokenizer(texts, truncation=True, max_length=128, padding="max_length")

# Tokenize both splits in batches, then expose the class ids under the
# 'labels' column name, which is the format the Trainer expects.
encoded_dataset = (
    dataset
    .map(tokenize, batched=True)
    .rename_column("label_id", "labels")
)

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/413 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Map:   0%|          | 0/51957 [00:00<?, ? examples/s]

Map:   0%|          | 0/5773 [00:00<?, ? examples/s]

##  Training

In [9]:
!pip install transformers
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from transformers import Trainer, AutoModelForSequenceClassification
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=6, ignore_mismatched_sizes=True)



config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/329M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at michellejieli/emotion_text_classifier and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([7, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([7]) in the checkpoint and torch.Size([6]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def compute_metrics(pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (per-class scores, argmax-ed
            along axis 1) and ``label_ids`` (the reference class ids).

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_pred = np.argmax(pred.predictions, axis=1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average='weighted'),
    }

# Fine-tuning configuration: evaluate and checkpoint once per epoch.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # current name of the deprecated `evaluation_strategy`
    save_strategy="epoch",
    logging_dir="./logs",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
)


# Wire the model, data splits and metric function into the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    processing_class=tokenizer,  # replaces the deprecated `tokenizer=` argument
    compute_metrics=compute_metrics
)

trainer.train()

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjanehuynh1411[0m ([33mjanehuynh1411-the-university-of-texas-at-austin[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss


### **Evaluation**

In [None]:
import torch
from sklearn.metrics import classification_report
from torch.utils.data import DataLoader
from transformers import default_data_collator, DataCollatorWithPadding

# Evaluate on the held-out split. Only the columns the model consumes are
# kept: the raw string `text` column is still present in the encoded dataset
# and cannot be turned into tensors by the padding collator.
model_columns = ("input_ids", "attention_mask", "labels")
val_dataset = encoded_dataset["test"]
val_dataset = val_dataset.remove_columns(
    [name for name in val_dataset.column_names if name not in model_columns]
)

# Collate examples into padded tensor batches.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
val_loader = DataLoader(val_dataset, batch_size=32, collate_fn=data_collator)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The batches below are moved to `device`, so the model must live there too.
model.to(device)
model.eval()
all_preds = []
all_true = []

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)

        # Labels are only compared on the host, so they never need to leave it.
        all_preds.extend(preds.cpu().tolist())
        all_true.extend(batch['labels'].tolist())

# Per-class precision/recall/F1, with rows named in encoder (id) order.
print(classification_report(all_true, all_preds, target_names=label_encoder.classes_))

In [None]:
# Write the fine-tuned model and its tokenizer side by side into one directory.
save_dir = "./emotion_text_classifier"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)