In [5]:
!pip install -q datasets scikit-learn tqdm

In [8]:
from datasets import load_dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow.keras.utils import to_categorical

# Load GoEmotions dataset
dataset = load_dataset("go_emotions")
df = pd.DataFrame(dataset["train"])

# Keep only examples with one label
df = df[df["labels"].apply(lambda x: len(x) == 1)].copy()
df["label"] = df["labels"].apply(lambda x: x[0])

# Encode labels
le = LabelEncoder()
df["label_enc"] = le.fit_transform(df["label"])

# Split data
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["text"], df["label_enc"], test_size=0.2, random_state=42
)

# Tokenization
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

X_train_seq = tokenizer.texts_to_sequences(train_texts)
X_test_seq = tokenizer.texts_to_sequences(test_texts)

max_len = 100
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

# One-hot encode labels for multi-class classification
y_train = to_categorical(train_labels, num_classes=28)
y_test = to_categorical(test_labels, num_classes=28)



FileNotFoundError: Couldn't find any data file at /content/go_emotions. Couldn't find 'go_emotions' on the Hugging Face Hub either: LocalEntryNotFoundError: An error happened while trying to locate the file on the Hub and we cannot find the requested files in the local cache. Please check your connection and try again or make sure your Internet connection is on.

# **LSTM** **Model**

In [None]:
# Define LSTM model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

lstm_model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=max_len),
    LSTM(64),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(28, activation='softmax')
])

lstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
lstm_model.summary()

# Train the model
lstm_model.fit(X_train_pad, y_train, epochs=3, batch_size=128, validation_split=0.1)
lstm_model.save("lstm_emotion_model.h5")

# Evaluate
from sklearn.metrics import classification_report

lstm_preds = np.argmax(lstm_model.predict(X_test_pad), axis=1)
true = np.argmax(y_test, axis=1)

target_names = [str(label) for label in le.classes_]

print("LSTM Classification Report:")
print(classification_report(true, lstm_preds, target_names=target_names))




Epoch 1/3
[1m205/205[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 302ms/step - accuracy: 0.3222 - loss: 2.8322 - val_accuracy: 0.3559 - val_loss: 2.6052
Epoch 2/3
[1m205/205[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 206ms/step - accuracy: 0.3505 - loss: 2.6718 - val_accuracy: 0.3559 - val_loss: 2.5979
Epoch 3/3
[1m205/205[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 213ms/step - accuracy: 0.3531 - loss: 2.6495 - val_accuracy: 0.3559 - val_loss: 2.5979
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 18ms/step
LSTM Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       548
           1       0.00      0.00      0.00       333
           2       0.00      0.00      0.00       207
           3       0.00      0.00      0.00       292
           4       0.00      0.00      0.00       373
           5       0.00      0.00      0.00       120
           6       0.00  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# **Convolutional** **Neural** **Network**

In [None]:
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D

# Define CNN model
cnn_model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=max_len),
    Conv1D(128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(28, activation='softmax')
])

cnn_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
cnn_model.summary()

# Train the model
cnn_model.fit(X_train_pad, y_train, epochs=3, batch_size=128, validation_split=0.1)
cnn_model.save("cnn_emotion_model.h5")

# Evaluate
cnn_preds = np.argmax(cnn_model.predict(X_test_pad), axis=1)

target_names = [str(label) for label in le.classes_]

print("CNN Classification Report:")
print(classification_report(true, cnn_preds, target_names=target_names))




Epoch 1/3
[1m205/205[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 187ms/step - accuracy: 0.3314 - loss: 2.8307 - val_accuracy: 0.4571 - val_loss: 2.1446
Epoch 2/3
[1m205/205[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 172ms/step - accuracy: 0.4768 - loss: 2.1148 - val_accuracy: 0.5174 - val_loss: 1.8833
Epoch 3/3
[1m205/205[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 164ms/step - accuracy: 0.5270 - loss: 1.8282 - val_accuracy: 0.5463 - val_loss: 1.7873
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step
CNN Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.67      0.64       548
           1       0.66      0.86      0.74       333
           2       0.39      0.23      0.29       207
           3       0.00      0.00      0.00       292
           4       0.67      0.01      0.01       373
           5       0.00      0.00      0.00       120
           6       0.00   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# **Transformer** **Model**

In [None]:
from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification
import tensorflow as tf
from transformers import AdamWeightDecay



# Load tokenizer and model (28 emotion classes)
tokenizer_hf = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
model_hf = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=28)

# Tokenize text
train_enc = tokenizer_hf(list(train_texts), truncation=True, padding=True, max_length=128, return_tensors="tf")
test_enc = tokenizer_hf(list(test_texts), truncation=True, padding=True, max_length=128, return_tensors="tf")

# Convert to TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((
    {
        "input_ids": train_enc["input_ids"],
        "attention_mask": train_enc["attention_mask"]
    },
    tf.keras.utils.to_categorical(train_labels, num_classes=28)
)).shuffle(1000).batch(16)

test_dataset = tf.data.Dataset.from_tensor_slices((
    {
        "input_ids": test_enc["input_ids"],
        "attention_mask": test_enc["attention_mask"]
    },
    tf.keras.utils.to_categorical(test_labels, num_classes=28)
)).batch(16)

# Compile and train
model_hf.compile(
    optimizer = AdamWeightDecay(learning_rate=5e-5),
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"]
)

model_hf.fit(train_dataset, validation_data=test_dataset, epochs=3)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

Epoch 1/3
  35/1816 [..............................] - ETA: 5:41:32 - loss: 2.7486 - accuracy: 0.3750

KeyboardInterrupt: 

In [None]:
# Predict on test data
logits = model_hf.predict(test_dataset).logits
bert_preds = np.argmax(logits, axis=1)

# True labels
target_names = [str(label) for label in le.classes_]

print("DistilBERT Classification Report:")
print(classification_report(test_labels, bert_preds, target_names=target_names))
