### Imports

In [5]:
from datasets import load_dataset
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

### Load the data

In [4]:
# https://huggingface.co/datasets/google-research-datasets/go_emotions
ds = load_dataset("google-research-datasets/go_emotions", "simplified")

In [14]:
print(ds['train'][0])

{'text': "My favourite food is anything I didn't have to cook myself.", 'labels': [27], 'id': 'eebbqej'}


In [15]:
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 43410
    })
    validation: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5427
    })
})

### Prepare the data - Labels encoding

In [18]:
train_texts = ds['train']['text']
val_texts = ds['validation']['text']
test_texts = ds['test']['text']

train_labels = ds['train']['labels']
val_labels = ds['validation']['labels']
test_labels = ds['test']['labels']

In [25]:
label_encoder = LabelEncoder()

train_labels = [label[0] for label in ds["train"]["labels"]]
val_labels = [label[0] for label in ds["validation"]["labels"]]
test_labels = [label[0] for label in ds["test"]["labels"]]

train_labels_encoded = label_encoder.fit_transform(train_labels)
val_labels_encoded = label_encoder.fit_transform(val_labels)
test_labels_encoded = label_encoder.fit_transform(test_labels)

num_classes = len(label_encoder.classes_)
num_classes

28

In [27]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='[OOV]')
tokenizer.fit_on_texts(train_texts)

In [32]:
def preprocess_text(texts):
    seq = tokenizer.texts_to_sequences(texts)
    pad = tf.keras.preprocessing.sequence.pad_sequences(seq, padding='post')
    return pad

X_train = preprocess_text(train_texts)
X_val = preprocess_text(val_texts)
X_test = preprocess_text(test_texts)
y_train = train_labels_encoded
y_val = val_labels_encoded
y_test = test_labels_encoded

### Model building

In [35]:
vocab = len(tokenizer.word_index)+1
embedding_dim = 64
max_length = X_train.shape[1]

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab, embedding_dim),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(num_classes, activation='softmax'),
    tf.keras.layers.Dense(64, activation='relu')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [37]:
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=32)

Epoch 1/10
[1m1357/1357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 44ms/step - accuracy: 0.1124 - loss: 5.4510 - val_accuracy: 0.0899 - val_loss: 4.1589
Epoch 2/10
[1m1357/1357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 46ms/step - accuracy: 0.0951 - loss: 4.1589 - val_accuracy: 0.0899 - val_loss: 4.1589
Epoch 3/10
[1m1357/1357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 45ms/step - accuracy: 0.0938 - loss: 4.1589 - val_accuracy: 0.0899 - val_loss: 4.1589
Epoch 4/10
[1m1357/1357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 43ms/step - accuracy: 0.0954 - loss: 4.1589 - val_accuracy: 0.0899 - val_loss: 4.1589
Epoch 5/10
[1m1357/1357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 39ms/step - accuracy: 0.0957 - loss: 4.1589 - val_accuracy: 0.0899 - val_loss: 4.1589
Epoch 6/10
[1m1357/1357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 39ms/step - accuracy: 0.0940 - loss: 4.1589 - val_accuracy: 0.0899 - val_loss: 4.1589
Epoc