## load the dataset (emotion)

In [1]:
from datasets import load_dataset

# Hub identifier of the corpus; every available split is fetched in one call.
DATASET_NAME = "emotion"
raw_datasets = load_dataset(DATASET_NAME)

In [2]:
# Show the split layout: split names, column names, and row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [3]:
# Keep a handle on the training split; later cells read from this name.
raw_train_dataset = raw_datasets["train"]
# Peek at the first record to see what a single example looks like.
raw_train_dataset[0]

{'text': 'i didnt feel humiliated', 'label': 0}

In [4]:
# Inspect the column schema of the training split (types of 'text' and 'label').
raw_train_dataset.features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], id=None)}

## preprocessing

In [10]:
from collections import Counter

# Human-readable class names; the list index is the integer label id.
label_names = raw_train_dataset.features['label'].names

# Read the label column directly instead of materializing every row dict
# (row iteration also loads each example's text only to throw it away).
# list(...) keeps `labels` a plain list whatever type column access returns.
labels = list(raw_train_dataset['label'])
label_counts = Counter(labels)

# Class distribution, printed in first-seen order (Counter keeps insertion order).
for label_id, count in label_counts.items():
    print(f"{label_names[label_id]} ({label_id}): {count}")

sadness (0): 4666
anger (3): 2159
love (2): 1304
surprise (5): 572
fear (4): 1937
joy (1): 5362


In [8]:
# Collapse the per-example labels into the distinct class ids that occur.
unique_labels = {*labels}
print("Unique labels in the training set:", unique_labels)


Unique labels in the training set: {0, 1, 2, 3, 4, 5}


In [9]:
# Dataset overview: split sizes, label inventory, and one example sentence.
train_split = raw_datasets['train']
n_train = len(train_split)
n_validation = len(raw_datasets['validation'])
n_test = len(raw_datasets['test'])
n_classes = len(label_names)
first_text = train_split[0]['text']

# Report the values gathered above, one fact per line.
print(f"Number of training samples: {n_train}")
print(f"Number of validation samples: {n_validation}")
print(f"Number of test samples: {n_test}")
print(f"Number of classes: {n_classes}")
print(f"Classes: {label_names}")
print(f"Sample text: {first_text}")


Number of training samples: 16000
Number of validation samples: 2000
Number of test samples: 2000
Number of classes: 6
Classes: ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
Sample text: i didnt feel humiliated


In [12]:
import numpy as np

# Word count per training example, using whitespace tokenization (str.split()).
# The 'text' column is read directly rather than building a dict for every row.
text_lengths = [len(text.split()) for text in raw_train_dataset['text']]

# Convert once so each statistic below does not re-convert the Python list.
lengths_arr = np.asarray(text_lengths)

# Central tendency and spread of the example lengths.
print(f"Average text length (in words): {np.mean(lengths_arr):.2f}")
print(f"Median text length (in words): {np.median(lengths_arr):.2f}")
print(f"Standard deviation of text length (in words): {np.std(lengths_arr):.2f}")
# Extremes: longest and shortest example.
print(f"Max text length (in words): {np.max(lengths_arr)}")
print(f"Min text length (in words): {np.min(lengths_arr)}")

Average text length (in words): 19.17
Median text length (in words): 17.00
Standard deviation of text length (in words): 10.99
Max text length (in words): 66
Min text length (in words): 2
