## load the dataset (emotion)

In [1]:
# Load the "emotion" dataset through the `datasets` library. The result is
# bound to `raw_datasets` and is indexed by split name in the cells below.
# NOTE(review): "emotion" is an un-namespaced dataset id; the namespaced
# "dair-ai/emotion" may be the more robust spelling on current Hub/library
# versions — confirm it resolves to the same data before switching.
from datasets import load_dataset
raw_datasets = load_dataset("emotion")

In [2]:
# Display the loaded object as the cell output to see its splits, features and row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [3]:
# Keep a handle on the training split; later cells reuse `raw_train_dataset`.
raw_train_dataset = raw_datasets["train"]
# Peek at the first training example (shown as the cell output).
raw_train_dataset[0]

{'text': 'i didnt feel humiliated', 'label': 0}

In [4]:
# Inspect the column schema of the training split (feature types per column).
raw_train_dataset.features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], id=None)}

In [5]:
# Check the class distribution of the training split.
from collections import Counter

# Read the 'label' column in a single columnar access instead of iterating the
# dataset example by example, which builds a full row (text included) for every
# sample only to throw everything but the label away.
# list(...) keeps `labels` a plain Python list for the cells that reuse it.
labels = list(raw_train_dataset['label'])
print(Counter(labels))

Counter({1: 5362, 0: 4666, 3: 2159, 4: 1937, 2: 1304, 5: 572})


In [6]:
# Resolve the class names from the 'label' feature of the training split.
label_feature = raw_datasets['train'].features['label']
label_names = label_feature.names
print(label_names)

['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']


In [7]:
# Check for missing values in the training split.
import pandas as pd

# Use the dataset's own pandas export rather than handing the dataset to the
# DataFrame constructor, which has to consume it one example at a time.
df = raw_train_dataset.to_pandas()
# Per-column count of null entries (shown as the cell output).
df.isnull().sum()

text     0
label    0
dtype: int64

## preprocessing

In [8]:
# Collapse the training labels into the set of distinct class ids.
unique_labels = {*labels}
print("Unique labels in the training set:", unique_labels)


Unique labels in the training set: {0, 1, 2, 3, 4, 5}


In [9]:
# Dataset statistics: build every report line first, then emit them together.
summary_lines = [
    f"Number of training samples: {len(raw_datasets['train'])}",
    f"Number of validation samples: {len(raw_datasets['validation'])}",
    f"Number of test samples: {len(raw_datasets['test'])}",
    f"Number of classes: {len(label_names)}",
    f"Classes: {label_names}",
    f"Sample text: {raw_datasets['train'][0]['text']}",
]
print("\n".join(summary_lines))


Number of training samples: 16000
Number of validation samples: 2000
Number of test samples: 2000
Number of classes: 6
Classes: ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
Sample text: i didnt feel humiliated
