# Data Loading/Preprocessing

Downloads

In [None]:
# Install the third-party packages that the import cell below relies on.
# NOTE(review): versions are unpinned — consider pinning them for reproducibility.
%pip install gensim
%pip install nltk



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
import numpy as np
import nltk
from nltk.corpus import stopwords
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import re
import string

# Fetch the NLTK stopword corpus, then keep the English entries in a set so
# membership checks during tokenization are cheap.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

# Read the emotions dataset from the current working directory.
df = pd.read_csv(filepath_or_buffer='emotions.csv')

# tokenization and stopword removal
def clean_and_tokenize(text):
    """Split `text` on whitespace and drop every token found in `stop_words`.

    The stopword comparison is done on the lower-cased token, but surviving
    tokens are returned with their original casing and in their original order.
    """
    kept_tokens = []
    for token in text.split():
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

# Tokenize every row of the text column into a list of non-stopword tokens.
df['tokenized'] = df['text'].map(clean_and_tokenize)

# Train Word2Vec on the tokenized rows (100-dimensional vectors, context window of 5).
# NOTE(review): the embeddings are fit on every row, including rows that the
# later train/test split assigns to the test set — confirm this is intended.
model_Word2Vec = Word2Vec(
    sentences=df['tokenized'],
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Function to create sentence vectors by averaging word vectors
def sentence_vector(sentence):
    """Return the mean Word2Vec embedding of the tokens in `sentence`.

    Args:
        sentence: iterable of string tokens.

    Returns:
        A 1-D array of length `model_Word2Vec.vector_size`: the element-wise
        mean of the vectors of all in-vocabulary tokens, or an all-zero vector
        when no token is in the vocabulary (including an empty sentence).
    """
    keyed_vectors = model_Word2Vec.wv
    # Only tokens present in the trained vocabulary contribute to the average.
    vectors = [keyed_vectors[word] for word in sentence if word in keyed_vectors]
    if not vectors:
        # Allocate the fallback in the embeddings' own dtype: np.zeros defaults
        # to float64, and a single float64 row would upcast the whole stacked
        # feature matrix.
        return np.zeros(model_Word2Vec.vector_size, dtype=keyed_vectors.vectors.dtype)
    return np.mean(vectors, axis=0)

# Feature matrix: one averaged embedding per tokenized row.
X = np.array(list(map(sentence_vector, df['tokenized'])))

# Labels, one-hot encoded across the 6 classes.
# (presumably integer class ids 0-5 — verify against the CSV)
y = df['label']
y_encoded = to_categorical(y, num_classes=6)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# NOTE(review): X_train / X_test are already fixed-width float matrices, so
# padding looks unnecessary. With pad_sequences' defaults this appears to keep
# only 20 of the embedding columns and cast them to integers — confirm whether
# these *_padded arrays are needed before using them anywhere.
X_train_padded = pad_sequences(X_train, maxlen=20)
X_test_padded = pad_sequences(X_test, maxlen=20)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Length, in tokens, of the longest tokenized row.
max_length = df['tokenized'].map(len).max()
print(max_length)

79


In [None]:
# Spot-check the feature vector of a single training sample.
X_train[3]

array([-0.60353845,  1.0190351 ,  0.32119653,  0.47224626,  0.59523934,
       -0.11014067,  0.15660159,  0.20875622,  0.20007655, -0.15408695,
       -0.64697069, -0.70873183,  0.76810968, -0.51497394,  0.24410537,
       -0.33394974,  0.03793345, -0.24877511,  0.17589469,  0.45609182,
       -0.16143882,  0.50447434, -0.29286125,  0.32366529,  0.40783188,
       -0.07883056, -0.56505448, -0.06840909, -0.0425799 , -0.07970779,
        0.07295298,  0.62033862, -0.05344135, -0.05995287, -0.17092492,
        0.32160494, -0.60756195, -0.61878973, -0.64720213,  0.04056277,
       -0.12165699, -0.18092521,  0.46596351,  0.03574416,  1.29345989,
       -0.4503732 ,  0.17304529, -0.93053895, -0.77637392, -0.43578172,
        1.08976626, -0.80655909, -0.49175313,  0.08778358, -0.58111298,
       -0.53948307,  0.53371555,  0.31081915, -0.24971655,  0.27887902,
        0.38650855,  0.48014975,  0.07341887, -0.25780389, -0.66796941,
       -0.07578379,  0.02085705,  0.13966917, -0.14474228,  0.70

# RNN Model

In [None]:
from tensorflow.keras.metrics import Precision, Recall
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Reshape, SimpleRNN, Dense

# Custom F1 Score Metric
class F1Score(tf.keras.metrics.Metric):
    """Streaming F1 score derived from Keras `Precision` and `Recall` metrics.

    Both underlying metrics are created with their default configuration and
    accumulate state across batches; `result()` combines their current values
    as the harmonic mean `2 * p * r / (p + r)`.

    Args:
        name: metric name reported in training logs.
        **kwargs: forwarded to `tf.keras.metrics.Metric`.
    """

    def __init__(self, name='f1_score', **kwargs):
        super().__init__(name=name, **kwargs)
        self.precision = Precision()
        self.recall = Recall()

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Accumulate one batch of targets and predictions into both sub-metrics."""
        self.precision.update_state(y_true, y_pred, sample_weight=sample_weight)
        self.recall.update_state(y_true, y_pred, sample_weight=sample_weight)

    def result(self):
        """Return the F1 score computed from the accumulated precision and recall."""
        p = self.precision.result()
        r = self.recall.result()
        # The small constant avoids division by zero when p and r are both 0.
        return 2 * ((p * r) / (p + r + 1e-7))

    def reset_state(self):
        """Clear the accumulated precision and recall state.

        Keras calls `reset_state()` (no trailing "s"); implementing only the
        old `reset_states()` name triggers a deprecation warning during `fit`.
        """
        self.precision.reset_state()
        self.recall.reset_state()

    def reset_states(self):
        """Deprecated alias of `reset_state()`, kept for existing callers."""
        self.reset_state()

num_classes = 6

# Architecture: each 100-value sentence vector is reshaped to 100 steps of a
# single feature, fed through a 64-unit SimpleRNN, then classified by a
# softmax layer with one output per class.
rnn = Sequential([
    Reshape((100, 1), input_shape=(100,)),
    SimpleRNN(units=64),
    Dense(units=num_classes, activation='softmax'),
])

# Compile with Adam and categorical cross-entropy; track accuracy, precision,
# recall and the custom F1 metric.
rnn.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy', Precision(), Recall(), F1Score()],
)

epochs = 10
batch_size = 256

# Train, holding back 20% of the training data for validation.
rnn.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
)

# Evaluate on the held-out test set; values come back in the order
# loss followed by the compiled metrics.
loss, accuracy, precision, recall, f1 = rnn.evaluate(X_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Epoch 1/10

  m.reset_state()


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.8726698756217957
Test Accuracy: 0.6716609597206116
Precision: 0.7494884729385376
Recall: 0.5843909978866577
F1 Score: 0.6567224264144897


In [None]:
# Persist the trained model to a `.keras` file in the working directory.
rnn.save('emotions_rnn.keras')