In [2]:
import nltk
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from keras import Sequential
from keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import KeyedVectors
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from keras.callbacks import EarlyStopping




In [None]:
# Read the pickle file
features = pd.read_pickle('./Datasets/feature_extraction_full.pkl')

# Filter out rows where the emotion is "neutral"
filtered_features = features[features['emotion'] != 'neutral']

# Display the filtered DataFrame
filtered_features.head()

In [8]:
# Load pre-trained Word2Vec model
word2vec_model = KeyedVectors.load_word2vec_format('C:/Users/neilr/Documents/BUAS year 2/Block C/GoogleNews-vectors-negative300.bin', binary=True)

In [9]:
# Tokenize the sentences
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts(filtered_features['sentence'])
vocab_size = len(tokenizer.word_index)

# Convert the tokenized sentences to sequences
sequences = tokenizer.texts_to_sequences(filtered_features['sentence'])

# Pad the sequences
padded_sequences = pad_sequences(sequences, padding='post', maxlen=30)

In [10]:
X = padded_sequences
y = filtered_features['emotion']

In [11]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform labels
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [13]:
# Map words to their corresponding word vectors
embedding_matrix = np.zeros((vocab_size + 1, 300))
for word, i in tokenizer.word_index.items():
    if word in word2vec_model:
        embedding_matrix[i] = word2vec_model[word]

In [14]:
num_classes = len(np.unique(y_train_encoded))

# Define the model
model = Sequential()
model.add(Embedding(input_dim=vocab_size + 1, output_dim=300, input_length=30, weights=[embedding_matrix], trainable=False))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

# Print model summary
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 30, 300)           30327000  
                                                                 
 flatten (Flatten)           (None, 9000)              0         
                                                                 
 dense (Dense)               (None, 128)               1152128   
                                                                 
 dense_1 (Dense)             (None, 6)                 774       
                                                                 
Total params: 31479902 (120.09 MB)
Trainable params: 1152902 (4.40 MB)
Non-trainable params: 30327000 (115.69 MB)
_________________________________________________________________


In [15]:
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'],  run_eagerly=True)




In [17]:
# Train the model
y_train_encoded_onehot = to_categorical(y_train_encoded, num_classes=num_classes)

early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
history = model.fit(X_train, y_train_encoded_onehot, epochs=10, batch_size=32, validation_split=0.2, callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 4: early stopping


In [18]:
# Predict on the testing set
y_pred = model.predict(X_test)



In [19]:
# Convert target labels to one-hot encoding
y_test_encoded_onehot = to_categorical(y_test_encoded, num_classes=num_classes)

# Evaluate accuracy
test_loss, test_acc = model.evaluate(X_test, y_test_encoded_onehot)
print("Test accuracy:", test_acc)

Test accuracy: 0.8111016750335693


---
## Predict it on the kaggle test set.

In [20]:
# Loading kaggle test file
file_path = "C:/Users/neilr/Documents/BUAS year 2/Block C/2023-24c-fai2-adsai-neildaniel221270/Kaggle/Datasets multiclass/test (kaggle).csv"
test_kaggle = pd.read_csv(file_path, sep='\t')
test_kaggle.head()

Unnamed: 0,id,sentence
0,0,Girls are happy when they get flowers
1,1,His jaw dropped in disbelief when he saw the p...
2,2,Sometimes the ugly stench makes me wanna throw...
3,3,The foul odor from the garbage bin was disgust...
4,4,"I can’t believe it, they lost the game in the ..."


In [21]:
# Preprocess the text data
X_test_kaggle = test_kaggle['sentence']

In [22]:
# Preprocess the text data from Kaggle test set
sequences_kaggle = tokenizer.texts_to_sequences(X_test_kaggle)
padded_sequences_kaggle = pad_sequences(sequences_kaggle, padding='post', maxlen=30)

# Predict emotions on Kaggle test set
y_pred_kaggle = model.predict(padded_sequences_kaggle)



In [23]:
y_pred_kaggle

array([[2.1862409e-04, 4.1155081e-06, 1.9114469e-04, 9.7730941e-01,
        1.8328106e-05, 2.2258334e-02],
       [1.2005935e-08, 9.7923156e-11, 1.7694929e-05, 1.4838902e-04,
        1.3997644e-03, 9.9843413e-01],
       [9.7705859e-01, 5.1050254e-05, 4.5764165e-07, 1.5528039e-10,
        2.2887509e-02, 2.4075566e-06],
       ...,
       [1.7402707e-02, 8.6030877e-06, 2.2064932e-04, 9.8221934e-01,
        2.1838543e-06, 1.4650705e-04],
       [1.2159633e-02, 4.8530779e-05, 9.7986650e-01, 7.5915601e-04,
        5.5714227e-06, 7.1605509e-03],
       [1.2240220e-06, 5.4748898e-07, 1.2485098e-08, 9.9999285e-01,
        2.3698701e-06, 3.0843416e-06]], dtype=float32)

In [24]:
# Define a function to convert predicted probabilities to emotions
def get_emotion(pred):
    emotions = ['happiness', 'surprise', 'sadness', 'anger', 'fear', 'disgust', 'neutral']
    return emotions[np.argmax(pred)]

In [25]:
# Convert predicted probabilities to emotions for Kaggle test set
predicted_emotions = [get_emotion(pred) for pred in y_pred_kaggle]

# Create a DataFrame for submission
submission_df = pd.DataFrame({'id': test_kaggle['id'], 'emotion': predicted_emotions})

In [27]:
submission_df.head()

Unnamed: 0,id,emotion
0,0,anger
1,1,disgust
2,2,happiness
3,3,fear
4,4,fear


In [28]:
# Count occurrences of each emotion
emotion_counts = submission_df['emotion'].value_counts()
emotion_counts

emotion
anger        421
fear         416
happiness    263
sadness      158
disgust      140
surprise      38
Name: count, dtype: int64

In [29]:
model.save("./Models/pre_trained_embeddings.h1")

INFO:tensorflow:Assets written to: ./Models/pre_trained_embeddings.h1\assets


INFO:tensorflow:Assets written to: ./Models/pre_trained_embeddings.h1\assets


In [30]:
# Save predictions to CSV
submission_df.to_csv('./Kaggle/Datasets multiclass/pretrained_predictions_2.csv', index=False)
print("Logistic Regression predictions have been saved to pretrained_predictions_2.csv")

Logistic Regression predictions have been saved to pretrained_predictions_2.csv
