In [None]:
from datasets import load_dataset

# Load the "split" configuration of the dair-ai/emotion dataset.
ds = load_dataset("dair-ai/emotion", name="split")

In [None]:
import numpy as np 
import pandas as pd 

In [None]:
# Display the loaded dataset object to inspect what it contains.
ds

In [None]:
# Peek at one training example: integer indexing on a split returns a single record.
sample = ds["train"][0]
sample

# Output: {'text': '...', 'label': 0}


In [None]:
# Slice the first five validation rows; the slice is column-oriented,
# so each key holds a list with five entries.
samples = ds["validation"][:5]
# Print the five texts on one line, then the five labels on the next.
print(samples["text"], samples["label"], sep="\n")


In [None]:
import pandas as pd

# Convert the full training split (all rows via [:]) into a DataFrame and preview it.
train = pd.DataFrame(ds["train"][:])
train.head()


In [None]:
# Convert the full test split (all rows via [:]) into a DataFrame.
test = pd.DataFrame(ds["test"][:])

In [None]:
# Convert the full validation split (all rows via [:]) into a DataFrame.
validation = pd.DataFrame(ds["validation"][:])

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [None]:
# Fit the tokenizer on the training text only.
# Passing the whole DataFrame would iterate over its column names
# ('text', 'label') instead of the sentences, yielding a two-word vocabulary.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(train['text'])

In [None]:
# Split every DataFrame into its feature column (raw text) and target column (label).
X_train, y_train = train['text'], train['label']
X_test, y_test = test['text'], test['label']
X_val, y_val = validation['text'], validation['label']

In [None]:
# Convert text to integer sequences, then pad.
# The sequences are built here (from the DataFrame columns, not from X_*),
# so the cell does not depend on variables defined further down and can be
# re-run safely even after X_* have been overwritten with padded arrays.
X_train_seq = tokenizer.texts_to_sequences(train['text'])
X_val_seq = tokenizer.texts_to_sequences(validation['text'])
X_test_seq = tokenizer.texts_to_sequences(test['text'])

# Pad every split to the length of the longest *training* sequence.
max_length = max(len(x) for x in X_train_seq)
X_train = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_val = pad_sequences(X_val_seq, maxlen=max_length, padding='post')
X_test = pad_sequences(X_test_seq, maxlen=max_length, padding='post')


In [None]:
# train_labels = ['Sadness', 'anger', 'love', 'surprise', 'fear', 'joy']
# val_labels = ['Sadness', 'anger', 'love', 'surprise', 'fear', 'joy']
# test_labels = ['Sadness', 'anger', 'love', 'surprise', 'fear', 'joy']

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the label columns as consecutive integers.
# The encoder is fitted on the training labels only and then re-used for
# validation/test, so all three splits share one class -> index mapping.
# (Reading from the DataFrames keeps the cell independent of any earlier
# reassignment of y_train / y_val / y_test.)
encoder = LabelEncoder()
y_train = encoder.fit_transform(train['label'])
y_val = encoder.transform(validation['label'])
y_test = encoder.transform(test['label'])

In [None]:
from tensorflow.keras.utils import to_categorical
# One-hot encode the integer labels.
# The width is taken from the training labels and passed explicitly, so the
# validation/test matrices always have the same number of columns as y_train
# even if one of those splits happens to lack the highest class id.
num_classes = int(np.max(y_train)) + 1
y_train = to_categorical(y_train, num_classes=num_classes)
y_val = to_categorical(y_val, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)


In [None]:
# from tensorflow.keras.utils import to_categorical
# y_train = to_categorical(y_train)
# y_val = to_categorical(y_val)
# y_test = to_categorical(y_test)


In [None]:
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder


# End-to-end preprocessing: tokenise -> pad -> one-hot encode labels.
# Everything is derived from the train / validation / test DataFrames, so the
# cell can be re-run at any time and always produces the same result.

# Tokenizer: vocabulary is learned from the training text only;
# unseen words in the other splits map to the '<OOV>' token.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(train['text'])

X_train_seq = tokenizer.texts_to_sequences(train['text'])
X_val_seq = tokenizer.texts_to_sequences(validation['text'])
X_test_seq = tokenizer.texts_to_sequences(test['text'])

# Padding: every split is padded (at the end) to the longest training sequence.
max_length = max(len(x) for x in X_train_seq)
X_train = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_val = pad_sequences(X_val_seq, maxlen=max_length, padding='post')
X_test = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

# Encode labels: fit on train, re-use the same mapping for val/test.
# num_classes is pinned so all three one-hot matrices have identical width,
# regardless of which classes happen to occur in each split.
encoder = LabelEncoder()
y_train_ids = encoder.fit_transform(train['label'])
num_classes = len(encoder.classes_)
y_train = to_categorical(y_train_ids, num_classes=num_classes)
y_val = to_categorical(encoder.transform(validation['label']), num_classes=num_classes)
y_test = to_categorical(encoder.transform(test['label']), num_classes=num_classes)


In [None]:
# Sanity check: report the dimensions of the padded inputs and one-hot targets.
for caption, matrix in (("X_train shape:", X_train), ("y_train shape:", y_train)):
    print(caption, matrix.shape)


In [None]:
# Inspect the vocabulary without dumping the whole mapping into the output:
# report its size and show only the first 20 (word, index) pairs.
print("Vocabulary size:", len(tokenizer.word_index))
list(tokenizer.word_index.items())[:20]

In [None]:
# Parse the GloVe text file: each line is "<word> <v1> <v2> ...".
# Build a word -> float32 vector lookup.
embedding_index = {}
with open("glove.6B.100d.txt", encoding='utf8') as glove_file:
    for raw_line in glove_file:
        fields = raw_line.split()
        embedding_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

# Assemble the embedding matrix: row i holds the vector of the word whose
# tokenizer index is i. Row 0 and rows for words absent from GloVe stay all-zero.
embedding_dim = 100
word_index = tokenizer.word_index
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))

for token, row in word_index.items():
    vector = embedding_index.get(token)
    if vector is not None:
        embedding_matrix[row] = vector

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Input

In [None]:
# Two stacked LSTM layers on top of frozen, pre-trained word embeddings,
# followed by a small dense head with a softmax over the classes.
model = Sequential()
model.add(Input(shape=(max_length,)))  # one padded sequence of token ids per sample

# The sequence length is already fixed by the Input layer above, so the
# deprecated `input_length` argument is not passed to Embedding.
# trainable=False keeps the pre-trained vectors fixed during training.
# NOTE(review): inputs are zero-padded at the end (padding='post'); consider
# mask_zero=True so the LSTMs skip padding steps - confirm before changing,
# as it alters training results.
model.add(Embedding(input_dim=len(word_index) + 1,
                    output_dim=embedding_dim,
                    weights=[embedding_matrix],
                    trainable=False))

model.add(LSTM(128, return_sequences=True))  # emits the full sequence for the next LSTM
model.add(Dropout(0.5))
model.add(LSTM(64))                          # emits only the final state
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
# One output unit per one-hot label column.
model.add(Dense(y_train.shape[1], activation='softmax'))


In [None]:
# Configure training: Adam optimiser, categorical cross-entropy (targets are
# one-hot encoded), accuracy as the reported metric. Then print the layer summary.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train for 20 epochs, reporting loss/accuracy on the validation split after each epoch.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=20,
    validation_data=(X_val, y_val),
)

In [None]:
# Calls fit again on the same `model` object for 10 more epochs.
# NOTE(review): this continues from the weights left by the previous fit call
# rather than starting fresh - confirm that the extra training is intentional.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_val, y_val),
)


In [None]:
# Final check on the held-out test split; evaluate returns the loss followed
# by the metrics configured at compile time (here: accuracy).
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print(f"Test Accuracy: {accuracy:.2f}")
