## Load Dataset

In [1]:
!wget -P data/full_dataset/ https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_1.csv
!wget -P data/full_dataset/ https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_2.csv
!wget -P data/full_dataset/ https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_3.csv

--2025-04-10 20:30:34--  https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_1.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.204.207, 172.217.203.207, 142.250.98.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.204.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14174600 (14M) [application/octet-stream]
Saving to: ‘data/full_dataset/goemotions_1.csv.4’


2025-04-10 20:30:35 (199 MB/s) - ‘data/full_dataset/goemotions_1.csv.4’ saved [14174600/14174600]

--2025-04-10 20:30:35--  https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_2.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.204.207, 172.217.203.207, 142.250.98.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.204.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14173154 (14M) [application/octet-stream]
Saving 

In [2]:
import pandas as pd

df1 = pd.read_csv('data/full_dataset/goemotions_1.csv')
df2 = pd.read_csv('data/full_dataset/goemotions_2.csv')
df3 = pd.read_csv('data/full_dataset/goemotions_3.csv')

df = pd.concat([df1, df2, df3], ignore_index=True)
df.head(2)

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,0,...,0,0,0,0,0,0,0,1,0,0
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084000.0,37,True,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Identify emotion columns starting from "admiration"
cols = df.columns.tolist()
emotion_start_idx = cols.index("admiration")
emotion_cols = cols[emotion_start_idx:]

# Create a new column with only the first emotion
def get_first_emotion(row):
    for emotion in emotion_cols:
        if row[emotion] == 1:
            return emotion
    return None  # In case there's no emotion tagged

df['label'] = df.apply(get_first_emotion, axis=1)

# Preview
print(df[['text', 'label']].head())

                                                text    label
0                                    That game hurt.  sadness
1   >sexuality shouldn’t be a grouping category I...     None
2     You do right, if you don't care then fuck 'em!  neutral
3                                 Man I love reddit.     love
4  [NAME] was nowhere near them, he was by the Fa...  neutral


In [4]:
from sklearn.preprocessing import LabelEncoder

df['label'] = df.apply(get_first_emotion, axis=1)
df = df[df['label'].notna()].reset_index(drop=True)

# Encode string labels into integers
label_encoder = LabelEncoder()
df['label_id'] = label_encoder.fit_transform(df['label'])

In [5]:
label_mapping = {
    'anger': 'Anger',
    'annoyance': 'Anger',
    'disapproval': 'Disgust',
    'disgust': 'Disgust',
    'confusion': 'Fear',
    'embarrassment': 'Fear',
    'fear': 'Fear',
    'nervousness': 'Fear',
    'admiration': 'Happy',
    'amusement': 'Happy',
    'curiosity': 'Happy',
    'desire': 'Happy',
    'excitement': 'Happy',
    'gratitude': 'Happy',
    'joy': 'Happy',
    'love': 'Happy',
    'optimism': 'Happy',
    'pride': 'Happy',
    'relief': 'Happy',
    'approval': 'Neutral',
    'caring': 'Neutral',
    'realization': 'Neutral',
    'surprise': 'Neutral',
    'neutral': 'Neutral',
    'disappointment': 'Sad',
    'grief': 'Sad',
    'remorse': 'Sad',
    'sadness': 'Sad'
}

df['label'] = df['label'].map(label_mapping)
print(df['label'].value_counts())

label
Neutral    84572
Happy      66466
Anger      19885
Sad        12774
Disgust    12337
Fear       11780
Name: count, dtype: int64


In [6]:
# Remove rows with missing or empty text or label
df = df[df['text'].notna() & df['label'].notna()]
df = df[df['text'].str.strip() != ""]

# Drop duplicates
df = df.drop_duplicates(subset='text').reset_index(drop=True)
# Remove rare or unknown labels if needed (optional)
df = df[df['label'].isin(['Anger', 'Disgust', 'Fear', 'Happy', 'Neutral', 'Sad'])]

In [7]:
# Shuffle dataset
from sklearn.utils import shuffle
df = shuffle(df, random_state=42).reset_index(drop=True)

# Encode final labels as integers
label_encoder = LabelEncoder()
df['label_id'] = label_encoder.fit_transform(df['label'])

# Check class balance
print(df['label'].value_counts())
print(label_encoder.classes_)

label
Neutral    23753
Happy      18916
Anger       5265
Disgust     3311
Sad         3301
Fear        3184
Name: count, dtype: int64
['Anger' 'Disgust' 'Fear' 'Happy' 'Neutral' 'Sad']


## Preprocess Text

In [8]:
!pip install gensim --quiet

In [15]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab') # Download the punkt_tab data

stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    tokens = word_tokenize(text)
    return [word for word in tokens if word not in stop_words and len(word) > 1]

df['tokens'] = df['text'].apply(preprocess)
df['tokens'].head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,tokens
0,"[enjoy, toxic, umoderated, cesspool, apparentl..."
1,"[dear, name, please, help, name, less, disgust..."
2,"[also, name, raised, money, many, subscribers,..."
3,"[used, use, phillips, switched, state, farm, y..."
4,"[make, sense, anxiety, afraid, things, since, ..."


## Train Word2Vec Model

In [10]:
!pip install --upgrade --force-reinstall gensim

Collecting gensim
  Using cached gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Using cached scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Using cached smart_open-7.1.0-py3-none-any.whl.metadata (24 kB)
Collecting wrapt (from smart-open>=1.8.1->gensim)
  Using cached wrapt-1.17.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.4 kB)
Using cached gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
Using cached scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (

In [16]:
from gensim.models import Word2Vec
sentences = df['tokens'].tolist()
model = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=5, workers=4, sg=1)
model.save('goemotions_word2vec.model')

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Assuming 'tokens' column contains the features and 'label_id' is the target
# Convert the tokens to numerical representations using Word2Vec embeddings
X = df['tokens'].apply(lambda tokens: np.mean([model.wv[token] for token in tokens if token in model.wv], axis=0) if any(token in model.wv for token in tokens) else np.zeros(model.vector_size)).tolist()
# Check if all elements in X are of the same shape
X = [x if x.shape == (model.vector_size,) else np.zeros(model.vector_size) for x in X]

X = np.array(X)

y = df['label_id']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = LogisticRegression(max_iter=200)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

## Evaluation

In [27]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[  93    1    0  131  831    8]
 [  19    1    0   83  536   11]
 [   9    0    0   72  519    7]
 [  20    0    1 2028 1760   15]
 [  51    4    3  706 4008   21]
 [  11    0    0   70  491   36]]
              precision    recall  f1-score   support

           0       0.46      0.09      0.15      1064
           1       0.17      0.00      0.00       650
           2       0.00      0.00      0.00       607
           3       0.66      0.53      0.59      3824
           4       0.49      0.84      0.62      4793
           5       0.37      0.06      0.10       608

    accuracy                           0.53     11546
   macro avg       0.36      0.25      0.24     11546
weighted avg       0.49      0.53      0.47     11546



In [29]:
model.save("./word2vec_goemotions")