In [2]:
import nltk
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense 
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from keras.callbacks import EarlyStopping, ModelCheckpoint




In [None]:
# Read the pickle file
features = pd.read_pickle('./Datasets/feature_extraction_full.pkl')

# Filter out rows where the emotion is "neutral"
filtered_features = features[features['emotion'] != 'neutral']
filtered_features.head()

In [4]:
# Tokenize the sentences
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts(filtered_features['sentence'])
vocab_size = len(tokenizer.word_index)
tokenized_sentences = tokenizer.texts_to_sequences(filtered_features['sentence'])

In [5]:
# Train Word2Vec model
word2vec_model_custom = Word2Vec(sentences=tokenized_sentences, vector_size=300, window=5, min_count=1, sg=1, workers=4)
X = tokenized_sentences
y = filtered_features['emotion']

In [6]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Pad sequences
max_sequence_length = 30  # Adjust according to your maximum sequence length
X_train_padded = pad_sequences(X_train, maxlen=max_sequence_length)
X_test_padded = pad_sequences(X_test, maxlen=max_sequence_length)


In [8]:
# Convert labels to one-hot encoding
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)
num_classes = len(np.unique(y_train_encoded))
y_train_encoded_onehot = to_categorical(y_train_encoded, num_classes=num_classes)
y_test_encoded_onehot = to_categorical(y_test_encoded, num_classes=num_classes)

In [9]:
# Map words to their corresponding word vectors in the custom Word2Vec model
embedding_matrix_custom = np.zeros((vocab_size + 1, 300))
for word, i in tokenizer.word_index.items():
    if word in word2vec_model_custom.wv:
        embedding_matrix_custom[i] = word2vec_model_custom.wv[word]

In [10]:
# Define the RNN model
model = Sequential()
model.add(Embedding(input_dim=vocab_size + 1, output_dim=300, input_length=max_sequence_length, weights=[embedding_matrix_custom], trainable=False))
model.add(LSTM(128))
model.add(Dense(num_classes, activation='softmax'))




In [11]:
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])




In [12]:
# Define callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1)

# Train the model
history = model.fit(X_train_padded, y_train_encoded_onehot, epochs=10, batch_size=16, validation_split=0.2, callbacks=[early_stopping])

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 5: early stopping


In [13]:
# Evaluate accuracy on test set
test_loss, test_acc = model.evaluate(X_test_padded, y_test_encoded_onehot)
print("Test accuracy:", test_acc)

Test accuracy: 0.42352741956710815


---
# Predict on Kaggle test set

In [14]:
# Loading kaggle test file
file_path = "C:/Users/neilr/Documents/BUAS year 2/Block C/2023-24c-fai2-adsai-neildaniel221270/Kaggle/Datasets multiclass/test (kaggle).csv"
test_kaggle = pd.read_csv(file_path, sep='\t')
test_kaggle.head()

Unnamed: 0,id,sentence
0,0,Girls are happy when they get flowers
1,1,His jaw dropped in disbelief when he saw the p...
2,2,Sometimes the ugly stench makes me wanna throw...
3,3,The foul odor from the garbage bin was disgust...
4,4,"I can’t believe it, they lost the game in the ..."


In [15]:
# Preprocess the text data from Kaggle test set
X_test_kaggle = test_kaggle['sentence']
sequences_kaggle = tokenizer.texts_to_sequences(X_test_kaggle)
padded_sequences_kaggle = pad_sequences(sequences_kaggle, padding='post', maxlen=max_sequence_length)

In [16]:
# Predict emotions on Kaggle test set
y_pred_kaggle = model.predict(padded_sequences_kaggle)



In [17]:
y_pred_kaggle

array([[0.02568394, 0.00366002, 0.09627504, 0.4285193 , 0.40245146,
        0.04341031],
       [0.02568394, 0.00366002, 0.09627504, 0.4285193 , 0.40245146,
        0.04341031],
       [0.02568394, 0.00366002, 0.09627504, 0.4285193 , 0.40245146,
        0.04341031],
       ...,
       [0.02568394, 0.00366002, 0.09627504, 0.4285193 , 0.40245146,
        0.04341031],
       [0.02568394, 0.00366002, 0.09627504, 0.4285193 , 0.40245146,
        0.04341031],
       [0.02568394, 0.00366002, 0.09627504, 0.4285193 , 0.40245146,
        0.04341031]], dtype=float32)

In [18]:
# Define a function to convert predicted probabilities to emotions
def get_emotion(pred):
    emotions = ['happiness', 'surprise', 'sadness', 'anger', 'fear', 'disgust']
    return emotions[np.argmax(pred)]


# Convert predicted probabilities to emotions for Kaggle test set
predicted_emotions = [get_emotion(pred) for pred in y_pred_kaggle]

In [19]:
# Create a DataFrame for submission
submission_df = pd.DataFrame({'id': test_kaggle['id'], 'emotion': predicted_emotions})
submission_df.head()

Unnamed: 0,id,emotion
0,0,anger
1,1,anger
2,2,anger
3,3,anger
4,4,anger


In [20]:
# Count occurrences of each emotion
emotion_counts = submission_df['emotion'].value_counts()
emotion_counts

emotion
anger    1436
Name: count, dtype: int64

In [21]:
# Save predictions to CSV
submission_df.to_csv('./Kaggle/Datasets multiclass/created_embeddings_rnn_predictions.csv', index=False)
print("RNN predictions have been saved to created_embeddings_rnn_predictions.csv")

RNN predictions have been saved to created_embeddings_rnn_predictions.csv
