# Download Dataset

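Download the GoEmotions dataset from HuggingFace and prepare it for training: each example's fine-grained label (27 emotions plus neutral) is mapped to one of 7 simplified categories (joy, sadness, anger, fear, surprise, disgust, neutral), and the train/validation/test splits are saved as CSV files. Only the first label of a multi-label example is used.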


## Install Dependencies


In [10]:
%pip install datasets


Note: you may need to restart the kernel to use updated packages.


## Import Libraries


In [11]:
from datasets import load_dataset
import pandas as pd
import os


## Download Dataset


In [12]:
# Load the "simplified" configuration of go_emotions. Later cells index the
# result by split name ('train', 'validation', 'test').
dataset = load_dataset("go_emotions", "simplified")


In [13]:
# Report the row count of every split, the grand total, and the column names
# of the training split.
print(" | ".join(
    f"{title}: {len(dataset[split]):,}"
    for title, split in (("Train", "train"), ("Val", "validation"), ("Test", "test"))
))
print(f"Total: {sum(len(dataset[split]) for split in ('train', 'validation', 'test')):,}")
print(f"Columns: {dataset['train'].column_names}")


Train: 43,410 | Val: 5,426 | Test: 5,427
Total: 54,263
Columns: ['text', 'labels', 'id']


In [14]:
# Convert each split into its own pandas DataFrame (same order as the names on the left).
train_df, val_df, test_df = (
    pd.DataFrame(dataset[split]) for split in ('train', 'validation', 'test')
)

# Sanity check: (rows, columns) of every frame.
print(f"Train: {train_df.shape} | Val: {val_df.shape} | Test: {test_df.shape}")


Train: (43410, 3) | Val: (5426, 3) | Test: (5427, 3)


## Map to 7 Emotions


In [15]:
# GoEmotions label id -> simplified category.
# Ids follow the dataset's label order; the names in the trailing comments can be
# verified with dataset['train'].features['labels'].feature.names.
# Any id not listed here (27 = neutral) falls back to 'neutral'.
# NOTE(review): the published Ekman grouping files excitement (13) under joy;
# it is kept under 'surprise' here because that was this notebook's explicit choice.
_LABEL_TO_EMOTION = {
    label_id: emotion
    for emotion, label_ids in {
        # joy, amusement, gratitude, optimism, approval, pride, relief,
        # plus admiration, caring, desire, love (previously fell through to 'neutral')
        'joy': (17, 1, 15, 20, 4, 21, 23, 0, 5, 8, 18),
        # sadness, disappointment, grief, remorse,
        # plus embarrassment (previously fell through to 'neutral')
        'sadness': (25, 9, 16, 24, 12),
        # anger, annoyance, disapproval
        'anger': (2, 3, 10),
        # fear, nervousness
        'fear': (14, 19),
        # surprise, excitement, realization,
        # plus confusion, curiosity (previously fell through to 'neutral')
        'surprise': (26, 13, 22, 6, 7),
        # disgust
        'disgust': (11,),
    }.items()
    for label_id in label_ids
}


def map_to_7_emotions(labels):
    """Collapse a GoEmotions label list into one of 7 emotion categories.

    Only the first entry of ``labels`` is considered; any further labels of a
    multi-label example are ignored.

    Args:
        labels: Sequence of integer GoEmotions label ids (may be empty).

    Returns:
        One of 'joy', 'sadness', 'anger', 'fear', 'surprise', 'disgust' or
        'neutral'. An empty sequence and any id without an entry in the
        lookup table yield 'neutral'.
    """
    # len() rather than truthiness so array-like label containers also work.
    if len(labels) == 0:
        return 'neutral'
    return _LABEL_TO_EMOTION.get(labels[0], 'neutral')

# Add an 'emotion' column to every split, derived from its 'labels' column.
for split_df in (train_df, val_df, test_df):
    split_df['emotion'] = split_df['labels'].apply(map_to_7_emotions)


In [16]:
# Keep only the text and its simplified emotion in every split.
train_df, val_df, test_df = (
    frame[['text', 'emotion']] for frame in (train_df, val_df, test_df)
)

# Preview the first rows of the training split.
train_df.head()


Unnamed: 0,text,emotion
0,My favourite food is anything I didn't have to...,neutral
1,"Now if he does off himself, everyone will thin...",neutral
2,WHY THE FUCK IS BAYLESS ISOING,anger
3,To make her feel threatened,fear
4,Dirty Southern Wankers,anger


## Save to CSV


In [17]:
# Make sure the output directory exists, then write one CSV per split
# (without the DataFrame index).
os.makedirs('../data/raw', exist_ok=True)

for csv_path, frame in (
    ('../data/raw/train_raw.csv', train_df),
    ('../data/raw/val_raw.csv', val_df),
    ('../data/raw/test_raw.csv', test_df),
):
    frame.to_csv(csv_path, index=False)

print("Saved datasets")


Saved datasets


## Summary


In [18]:
# Final recap: total and per-split sizes, the emotion categories present in
# the training split, and the names of the written files.
print("Dataset Summary:")
print(f"Total samples: {sum(len(frame) for frame in (train_df, val_df, test_df)):,}")
print(" | ".join(
    f"{title}: {len(frame):,}"
    for title, frame in (("Train", train_df), ("Val", val_df), ("Test", test_df))
))
print(f"\nEmotions: {sorted(train_df['emotion'].unique())}")
print("\nFiles saved: train_raw.csv, val_raw.csv, test_raw.csv")


Dataset Summary:
Total samples: 54,263
Train: 43,410 | Val: 5,426 | Test: 5,427

Emotions: ['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']

Files saved: train_raw.csv, val_raw.csv, test_raw.csv
