### Load & clean the dataset

In [7]:
!wget -P data/full_dataset/ https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_1.csv
!wget -P data/full_dataset/ https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_2.csv
!wget -P data/full_dataset/ https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_3.csv

--2025-04-08 20:02:34--  https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_1.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.186.207, 142.250.115.207, 142.250.113.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.186.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14174600 (14M) [application/octet-stream]
Saving to: ‘data/full_dataset/goemotions_1.csv.3’


2025-04-08 20:02:34 (116 MB/s) - ‘data/full_dataset/goemotions_1.csv.3’ saved [14174600/14174600]

--2025-04-08 20:02:34--  https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_2.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.114.207, 142.250.113.207, 142.251.186.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.114.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14173154 (14M) [application/octet-stream]
Savin

In [8]:
import pandas as pd

# Read the three downloaded shards and stack them into one frame with a fresh
# 0..n-1 index; the trailing expression previews the first two rows.
shard_paths = [
    'data/full_dataset/goemotions_1.csv',
    'data/full_dataset/goemotions_2.csv',
    'data/full_dataset/goemotions_3.csv',
]
df1, df2, df3 = map(pd.read_csv, shard_paths)

df = pd.concat([df1, df2, df3], ignore_index=True)
df.head(2)

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,0,...,0,0,0,0,0,0,0,1,0,0
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084000.0,37,True,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Print the column names, non-null counts and dtypes of the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211225 entries, 0 to 211224
Data columns (total 37 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   text                  211225 non-null  object 
 1   id                    211225 non-null  object 
 2   author                211225 non-null  object 
 3   subreddit             211225 non-null  object 
 4   link_id               211225 non-null  object 
 5   parent_id             211225 non-null  object 
 6   created_utc           211225 non-null  float64
 7   rater_id              211225 non-null  int64  
 8   example_very_unclear  211225 non-null  bool   
 9   admiration            211225 non-null  int64  
 10  amusement             211225 non-null  int64  
 11  anger                 211225 non-null  int64  
 12  annoyance             211225 non-null  int64  
 13  approval              211225 non-null  int64  
 14  caring                211225 non-null  int64  
 15  

In [10]:
# The emotion indicator columns are every column from "admiration" through
# the last column of the frame.
cols = list(df.columns)
emotion_start_idx = cols.index("admiration")
emotion_cols = cols[emotion_start_idx:]

# Create a new column with only the first emotion
def get_first_emotion(row):
    """Return the name of the first emotion column that equals 1 in ``row``.

    Columns are checked in the order of the module-level ``emotion_cols``
    list. Returns ``None`` when no emotion column is set to 1.
    """
    flagged = (emotion for emotion in emotion_cols if row[emotion] == 1)
    return next(flagged, None)

# Label each row with its first flagged emotion. This is a row-wise apply
# (one Python call per row); rows with no flagged emotion get None.
df['label'] = df.apply(get_first_emotion, axis=1)

# Preview the text next to the derived label for the first five rows.
print(df[['text', 'label']].head())

                                                text    label
0                                    That game hurt.  sadness
1   >sexuality shouldn’t be a grouping category I...     None
2     You do right, if you don't care then fuck 'em!  neutral
3                                 Man I love reddit.     love
4  [NAME] was nowhere near them, he was by the Fa...  neutral


In [11]:
from sklearn.preprocessing import LabelEncoder

# Reuse the 'label' column if it already exists instead of repeating the slow
# row-wise apply over the whole frame; only compute it when it is missing.
if 'label' not in df.columns:
    df['label'] = df.apply(get_first_emotion, axis=1)

# Drop rows where no emotion was flagged (label is missing) and renumber.
df = df[df['label'].notna()].reset_index(drop=True)

# Encode string labels into integers
label_encoder = LabelEncoder()
df['label_id'] = label_encoder.fit_transform(df['label'])

In [12]:
# Coarse class -> the fine-grained emotion labels that are folded into it.
_coarse_groups = {
    'Anger': ['anger', 'annoyance'],
    'Disgust': ['disapproval', 'disgust'],
    'Fear': ['confusion', 'embarrassment', 'fear', 'nervousness'],
    'Happy': [
        'admiration', 'amusement', 'curiosity', 'desire', 'excitement',
        'gratitude', 'joy', 'love', 'optimism', 'pride', 'relief',
    ],
    'Neutral': ['approval', 'caring', 'realization', 'surprise', 'neutral'],
    'Sad': ['disappointment', 'grief', 'remorse', 'sadness'],
}

# Flatten the groups into the fine -> coarse lookup used for relabelling.
label_mapping = {
    fine: coarse
    for coarse, fine_labels in _coarse_groups.items()
    for fine in fine_labels
}

# Collapse fine-grained labels into the coarse classes. Values that are not
# keys of label_mapping (for example labels already collapsed by an earlier
# run of this cell) are kept unchanged instead of becoming NaN, so running
# the cell twice no longer wipes out the label column.
df['label'] = df['label'].map(label_mapping).fillna(df['label'])
print(df['label'].value_counts())

label
Neutral    84572
Happy      66466
Anger      19885
Sad        12774
Disgust    12337
Fear       11780
Name: count, dtype: int64


In [13]:
# Cleaning pipeline, applied in order:
#   1. keep rows that have both a text and a label,
#   2. drop rows whose text is empty after trimming whitespace,
#   3. keep the first row for each distinct text and renumber the index,
#   4. keep only the six coarse classes.
df = (
    df
    .loc[lambda frame: frame['text'].notna() & frame['label'].notna()]
    .loc[lambda frame: frame['text'].str.strip().ne("")]
    .drop_duplicates(subset='text')
    .reset_index(drop=True)
    .loc[lambda frame: frame['label'].isin(['Anger', 'Disgust', 'Fear', 'Happy', 'Neutral', 'Sad'])]
)

In [19]:
# Randomly reorder the rows with a fixed seed, then renumber the index.
from sklearn.utils import shuffle
df = shuffle(df, random_state=42)
df = df.reset_index(drop=True)

# Fit a fresh encoder on the final label set and store the integer ids.
label_encoder = LabelEncoder().fit(df['label'])
df['label_id'] = label_encoder.transform(df['label'])

# Show how many rows each class has and the id -> class-name order.
print(df['label'].value_counts())
print(label_encoder.classes_)

label
Neutral    23753
Happy      18916
Anger       5265
Disgust     3311
Sad         3301
Fear        3184
Name: count, dtype: int64
['Anger' 'Disgust' 'Fear' 'Happy' 'Neutral' 'Sad']


### Tokenizing & data loader

In [20]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np
from torch.nn import BCEWithLogitsLoss
from tqdm import tqdm
import ast

In [21]:
# Load the pretrained tokenizer published under the name "bert-base-uncased".
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

class EmotionDataset(Dataset):
    """Dataset that tokenizes every text once, up front, and serves items by index.

    Args:
        texts: The input strings, passed to ``tokenizer`` in a single call.
        labels: Integer class ids, one per text; stored as a ``torch.long`` tensor.
        tokenizer: Callable accepting ``padding``, ``truncation``, ``max_length``
            and ``return_tensors`` keyword arguments and returning a mapping of
            field name -> tensor whose first dimension indexes the texts.
        max_len: Length every sequence is padded or truncated to.

    Raises:
        ValueError: If any encoded field does not have exactly one row per label.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.encodings = tokenizer(texts, padding='max_length', truncation=True, max_length=max_len, return_tensors='pt')
        self.labels = torch.tensor(labels, dtype=torch.long)

        # Fail fast on misaligned inputs: without this check a short label list
        # silently drops texts, and a long one only fails later in __getitem__.
        expected = len(self.labels)
        for field, values in self.encodings.items():
            if len(values) != expected:
                raise ValueError(
                    f"EmotionDataset: field '{field}' has {len(values)} rows but {expected} labels were given"
                )

    def __len__(self):
        """Return the number of labelled examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return every encoded field at ``idx`` plus the label under 'labels'."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

# Plain Python lists of the input strings and their integer class ids.
texts = df['text'].astype(str).tolist()
labels = df['label_id'].tolist()

from sklearn.model_selection import train_test_split
# Hold out 10% for validation, stratified by class, with a fixed seed.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    stratify=labels,
    test_size=0.1,
    random_state=42,
)

# Tokenize each split once (training split first, then validation).
train_dataset, val_dataset = (
    EmotionDataset(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in ((train_texts, train_labels), (val_texts, val_labels))
)

# Only the training loader reshuffles its batches.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16)

### Model architecture

In [22]:
!pip install huggingface_hub[hf_xet]
from transformers import BertForSequenceClassification

# Use the GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One output class per label known to the encoder.
num_labels = len(label_encoder.classes_)
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
).to(device)

optimizer = torch.optim.AdamW(params=model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Training

In [23]:
from torch.cuda.amp import autocast, GradScaler
# NOTE(review): `scaler` and `autocast` are not used by the loop below.
scaler = GradScaler()
from tqdm import tqdm

EPOCHS = 5
for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader):
        # Move the three tensors the model consumes onto the training device.
        input_ids, attention_mask, labels = (
            batch[field].to(device)
            for field in ('input_ids', 'attention_mask', 'labels')
        )

        # Clear gradients left over from the previous step before the new pass.
        optimizer.zero_grad()

        # Passing labels makes the model return its own loss.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses over the epoch.
    print(f"Average training loss: {total_loss / len(train_loader):.4f}")

Epoch 1/5


100%|██████████| 3248/3248 [04:32<00:00, 11.91it/s]


Average training loss: 1.0705
Epoch 2/5


100%|██████████| 3248/3248 [04:32<00:00, 11.92it/s]


Average training loss: 0.9174
Epoch 3/5


100%|██████████| 3248/3248 [04:32<00:00, 11.92it/s]


Average training loss: 0.7316
Epoch 4/5


100%|██████████| 3248/3248 [04:32<00:00, 11.93it/s]


Average training loss: 0.4984
Epoch 5/5


100%|██████████| 3248/3248 [04:32<00:00, 11.93it/s]

Average training loss: 0.3087





### Evaluation

In [24]:
from sklearn.metrics import classification_report

# Switch to evaluation mode and collect predictions over the validation set.
model.eval()
all_preds, all_true = [], []

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # No labels are passed here, so only the logits are used.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = outputs.logits.argmax(dim=1)

        all_preds += list(preds.cpu().numpy())
        all_true += list(labels.cpu().numpy())

# Per-class precision/recall/F1, with rows named after the encoder's classes.
print(classification_report(all_true, all_preds, target_names=label_encoder.classes_))

              precision    recall  f1-score   support

       Anger       0.40      0.39      0.39       527
     Disgust       0.28      0.18      0.22       331
        Fear       0.33      0.33      0.33       318
       Happy       0.70      0.70      0.70      1892
     Neutral       0.61      0.64      0.62      2375
         Sad       0.34      0.36      0.35       330

    accuracy                           0.58      5773
   macro avg       0.44      0.43      0.44      5773
weighted avg       0.57      0.58      0.57      5773



In [25]:
# Save the fine-tuned model and the tokenizer into the same directory.
save_dir = "./bert_text_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('./bert_text_model/tokenizer_config.json',
 './bert_text_model/special_tokens_map.json',
 './bert_text_model/vocab.txt',
 './bert_text_model/added_tokens.json')