In [1]:
from datasets import load_dataset

# Fetch the GoEmotions dataset from the Hugging Face Hub
dataset = load_dataset("google-research-datasets/go_emotions")

# Bind each split to its own name for the cells that follow
train_data, val_data, test_data = (
    dataset[split] for split in ('train', 'validation', 'test')
)

# Final expression of the cell: displays the DatasetDict summary
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 43410
    })
    validation: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5427
    })
})

In [2]:
# Peek at the first record of the training split to see its fields
train_data[0]

{'text': "My favourite food is anything I didn't have to cook myself.",
 'labels': [27],
 'id': 'eebbqej'}

In [3]:
# Peek at the first record of the validation split
val_data[0]

{'text': 'Is this in New Orleans?? I really feel like this is New Orleans.',
 'labels': [27],
 'id': 'edgurhb'}

In [4]:
# Peek at the first record of the test split
test_data[0]

{'text': 'I’m really sorry about your situation :( Although I love the names Sapphira, Cirilla, and Scarlett!',
 'labels': [25],
 'id': 'eecwqtt'}

In [5]:
# For every split, pull out the 'text' column (inputs) and the 'labels' column (targets)
X_train, y_train = train_data['text'], train_data['labels']
X_val, y_val = val_data['text'], val_data['labels']
X_test, y_test = test_data['text'], test_data['labels']

In [6]:
# Display the training text column as a quick sanity check
X_train

Column(["My favourite food is anything I didn't have to cook myself.", 'Now if he does off himself, everyone will think hes having a laugh screwing with people instead of actually dead', 'WHY THE FUCK IS BAYLESS ISOING', 'To make her feel threatened', 'Dirty Southern Wankers'])

In [7]:
import re
from nltk.stem import WordNetLemmatizer
import nltk

# Fetch the NLTK data packages needed below (quiet=True suppresses download logs)
for corpus in ('wordnet', 'omw-1.4'):
    nltk.download(corpus, quiet=True)

# Single shared lemmatizer instance, reused by lemmatize_tokenizer
lemmatizer = WordNetLemmatizer()

def lemmatize_tokenizer(text):
    """
    Tokenize `text` the way CountVectorizer does by default, then lemmatize.

    The text is lowercased, split into tokens of two or more word characters,
    and every token is passed through the module-level `lemmatizer`.

    Returns:
        list: the lemmatized tokens, in order of appearance.
    """
    lowered = text.lower()

    # Same regex as CountVectorizer's default token_pattern:
    # runs of at least two word characters delimited by word boundaries.
    words = re.findall(r"(?u)\b\w\w+\b", lowered)

    # Lemmatization is the only step added on top of the default tokenization.
    return list(map(lemmatizer.lemmatize, words))

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Define configurations to test.
# Each entry holds a display 'name' plus keyword arguments forwarded to CountVectorizer.
configs = [
    # Unigrams only
    {'max_features': 5000, 'ngram_range': (1,1), 'name': 'unigram_5k'},
    {'max_features': 10000, 'ngram_range': (1,1), 'name': 'unigram_10k'},
    {'max_features': None, 'ngram_range': (1,1), 'name': 'unigram_all'},

    # Bigrams only
    {'max_features': 5000, 'ngram_range': (2,2), 'name': 'bigram_5k'},
    {'max_features': 10000, 'ngram_range': (2,2), 'name': 'bigram_10k'},
    {'max_features': None, 'ngram_range': (2,2), 'name': 'bigram_all'},

    # Unigrams + Bigrams combined
    {'max_features': 5000, 'ngram_range': (1,2), 'name': 'uni+bi_5k'},
    {'max_features': 10000, 'ngram_range': (1,2), 'name': 'uni+bi_10k'},
    {'max_features': 15000, 'ngram_range': (1,2), 'name': 'uni+bi_15k'},
    {'max_features': None, 'ngram_range': (1,2), 'name': 'uni+bi_all'},

    # Unigrams + Bigrams with lemmatization.
    # token_pattern is ignored once a custom tokenizer is supplied; setting it to
    # None avoids scikit-learn's "token_pattern will not be used" warning at fit time.
    {'max_features': 5000, 'ngram_range': (1,2), 'tokenizer': lemmatize_tokenizer,
     'token_pattern': None, 'name': 'uni+bi_5k_lemma'},
]

# Create vectorizers
vectorizers = []
for config in configs:
    name = config['name']
    # Build the keyword arguments without mutating `config`: popping 'name' would
    # strip the names from `configs` and make a re-run of this loop raise KeyError.
    params = {key: value for key, value in config.items() if key != 'name'}
    vectorizer = CountVectorizer(
        min_df=2,
        max_df=0.8,
        **params
    )
    vectorizers.append((name, vectorizer))

print(f"Total configurations to test: {len(vectorizers)}")

Total configurations to test: 11


In [9]:
from sklearn.preprocessing import MultiLabelBinarizer

# Turn the per-example label lists into a binary indicator matrix.
# The binarizer is fitted on the training labels only and then reused as-is
# for the validation and test labels.
mlb = MultiLabelBinarizer()
y_train_binary = mlb.fit_transform(y_train)
y_val_binary, y_test_binary = (
    mlb.transform(labels) for labels in (y_val, y_test)
)

print(f"Number of emotion classes: {len(mlb.classes_)}")
print(f"y_train shape: {y_train_binary.shape}")

Number of emotion classes: 28
y_train shape: (43410, 28)


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, f1_score, hamming_loss
import time

# One summary dict per vectorizer configuration, filled in by the loop below
results = []

for name, vectorizer in vectorizers:
    print(f"\n{'='*60}")
    print(f"Training with: {name}")
    print(f"{'='*60}")

    # Transform data: the vocabulary is learned on the training texts only,
    # then applied unchanged to the validation and test texts.
    # perf_counter() is a monotonic clock meant for measuring intervals;
    # time.time() can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    X_train_vec = vectorizer.fit_transform(X_train)
    X_val_vec = vectorizer.transform(X_val)
    # Test features are produced here but not evaluated in this loop.
    X_test_vec = vectorizer.transform(X_test)
    transform_time = time.perf_counter() - start_time

    print(f"Feature matrix shape: {X_train_vec.shape}")
    print(f"Actual vocabulary size: {len(vectorizer.vocabulary_)}")
    print(f"Transform time: {transform_time:.2f}s")

    # Train classifier: one binary logistic regression per label column
    print("Training classifier...")
    start_time = time.perf_counter()
    classifier = OneVsRestClassifier(
        LogisticRegression(max_iter=1000, random_state=42)
    )
    classifier.fit(X_train_vec, y_train_binary)
    train_time = time.perf_counter() - start_time
    print(f"Training time: {train_time:.2f}s")

    # Evaluate on validation set
    y_val_pred = classifier.predict(X_val_vec)

    val_accuracy = accuracy_score(y_val_binary, y_val_pred)
    # zero_division=0 yields the same score as the default for labels that are
    # never predicted, but without emitting an UndefinedMetricWarning each time.
    val_f1_micro = f1_score(y_val_binary, y_val_pred, average='micro', zero_division=0)
    val_f1_macro = f1_score(y_val_binary, y_val_pred, average='macro', zero_division=0)
    val_hamming = hamming_loss(y_val_binary, y_val_pred)

    print(f"\nValidation Results:")
    print(f"  Accuracy: {val_accuracy:.4f}")
    print(f"  F1 (micro): {val_f1_micro:.4f}")
    print(f"  F1 (macro): {val_f1_macro:.4f}")
    print(f"  Hamming Loss: {val_hamming:.4f}")

    # Store results
    results.append({
        'name': name,
        'vocab_size': len(vectorizer.vocabulary_),
        'accuracy': val_accuracy,
        'f1_micro': val_f1_micro,
        'f1_macro': val_f1_macro,
        'hamming_loss': val_hamming,
        'train_time': train_time,
        'transform_time': transform_time
    })


Training with: unigram_5k
Feature matrix shape: (43410, 5000)
Actual vocabulary size: 5000
Transform time: 8.22s
Training classifier...
Training time: 16.44s

Validation Results:
  Accuracy: 0.3496
  F1 (micro): 0.4881
  F1 (macro): 0.3449
  Hamming Loss: 0.0343

Training with: unigram_10k
Feature matrix shape: (43410, 10000)
Actual vocabulary size: 10000
Transform time: 2.08s
Training classifier...
Training time: 13.45s

Validation Results:
  Accuracy: 0.3481
  F1 (micro): 0.4877
  F1 (macro): 0.3421
  Hamming Loss: 0.0343

Training with: unigram_all
Feature matrix shape: (43410, 13077)
Actual vocabulary size: 13077
Transform time: 2.04s
Training classifier...
Training time: 29.23s

Validation Results:
  Accuracy: 0.3491
  F1 (micro): 0.4881
  F1 (macro): 0.3423
  Hamming Loss: 0.0342

Training with: bigram_5k
Feature matrix shape: (43410, 5000)
Actual vocabulary size: 5000
Transform time: 4.31s
Training classifier...
Training time: 5.01s

Validation Results:
  Accuracy: 0.1635
  F1 



Feature matrix shape: (43410, 5000)
Actual vocabulary size: 5000
Transform time: 12.50s
Training classifier...
Training time: 11.89s

Validation Results:
  Accuracy: 0.3559
  F1 (micro): 0.4950
  F1 (macro): 0.3609
  Hamming Loss: 0.0344


In [11]:
import pandas as pd

# Build the comparison table in one step: one row per configuration,
# best micro-averaged F1 first.
results_df = pd.DataFrame(results).sort_values('f1_micro', ascending=False)

# Banner and title, one item per line
print(
    "\n" + "=" * 80,
    "RESULTS COMPARISON (sorted by F1-micro)",
    "=" * 80,
    sep="\n",
)
print(results_df.to_string(index=False))


RESULTS COMPARISON (sorted by F1-micro)
           name  vocab_size  accuracy  f1_micro  f1_macro  hamming_loss  train_time  transform_time
      uni+bi_5k        5000  0.354220  0.495269  0.357238      0.034411   11.335574        4.851443
uni+bi_5k_lemma        5000  0.355879  0.494979  0.360937      0.034424   11.890828       12.497140
    unigram_all       13077  0.349060  0.488118  0.342288      0.034168   29.234254        2.044244
     unigram_5k        5000  0.349613  0.488116  0.344946      0.034306   16.440951        8.222292
     uni+bi_10k       10000  0.348507  0.488008  0.353413      0.034845   14.192036        4.006302
    unigram_10k       10000  0.348139  0.487651  0.342120      0.034273   13.453387        2.084634
     uni+bi_all       58338  0.342794  0.486108  0.341363      0.034332   51.732908        3.332034
     uni+bi_15k       15000  0.345374  0.485414  0.347416      0.034832   37.777761        3.865401
     bigram_all       45261  0.178400  0.297406  0.170118  