In [14]:
import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from keras.layers import Dense
from keras.layers import Input
from keras.models import Sequential
from nltk.corpus import stopwords
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

# Read the emotions dataset (path is relative to the working directory)
data = pd.read_csv(r"emotions.csv")

# Separate the raw text (model input) from the emotion labels (target)
X, y = data['text'], data['label']

# Make sure NLTK's stopword corpus is available locally, then keep the
# English list as a set so that membership tests during tokenization are cheap
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_and_tokenize(text):
    """Split *text* on whitespace and drop stopwords.

    The stopword comparison is case-insensitive (each token is lowercased
    before the lookup in the module-level ``stop_words`` set), but the tokens
    that survive are returned with their original casing.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    kept = []
    for token in text.split():
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return kept

# Tokenize every document in the corpus (one token list per row)
X_tokenized = X.map(clean_and_tokenize)

# Fit Word2Vec embeddings on the tokenized corpus; min_count=1 keeps every
# token in the vocabulary, including ones that occur only once
model_Word2Vec = Word2Vec(
    sentences=X_tokenized,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Function to create sentence vectors by averaging word vectors
def sentence_vector(sentence):
    """Return the mean Word2Vec embedding of the tokens in *sentence*.

    Tokens that are not present in ``model_Word2Vec.wv`` are skipped. When no
    token has an embedding (including an empty *sentence*), a zero vector of
    length ``model_Word2Vec.vector_size`` is returned instead.
    """
    embeddings = model_Word2Vec.wv
    known = [embeddings[token] for token in sentence if token in embeddings]
    if not known:
        return np.zeros(model_Word2Vec.vector_size)
    return np.mean(known, axis=0)

# Convert text data to fixed-length vectors (one averaged embedding per row)
X_features = np.array([sentence_vector(sentence) for sentence in X_tokenized])

# Convert labels to one-hot encoding
y_encoded = pd.get_dummies(y)
# Take the class count from the encoded columns rather than from
# data['label'].unique(): get_dummies drops missing labels while unique()
# counts NaN as a value, so the two can disagree and the softmax layer would
# then not match the target width.
num_classes = y_encoded.shape[1]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_features, y_encoded, test_size=0.2, random_state=42)

# Build the model: one hidden ReLU layer followed by a softmax over the classes.
# The input shape is declared with an explicit Input layer; passing
# input_shape= to the first Dense layer of a Sequential model raises a
# UserWarning in Keras 3.
model = Sequential()
model.add(Input(shape=(X_train.shape[1],)))
model.add(Dense(128, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model (20% of the training split is held out for validation)
model.fit(X_train, y_train, validation_split=0.2, epochs=10, batch_size=32)

# Evaluate the model once on the held-out test set; the result is reused for
# the summary below instead of running a second, identical evaluation pass
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

# Predict labels: argmax over the class axis yields column positions of
# y_encoded for both the predictions and the one-hot ground truth
y_pred_prob = model.predict(X_test)
y_pred = np.argmax(y_pred_prob, axis=1)
y_true = np.argmax(np.array(y_test), axis=1)

# Calculate precision, recall, and F1-score (weighted by class support)
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')
f1 = f1_score(y_true, y_pred, average='weighted')

# Full summary; loss/accuracy are repeated so all metrics appear together
print('Precision:', precision)
print('Recall:', recall)
print('F1-score:', f1)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\einst\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


8337/8337 ━━━━━━━━━━━━━━━━━━━━ 16s 2ms/step - accuracy: 0.6639 - loss: 0.8986 - val_accuracy: 0.7243 - val_loss: 0.7160
Epoch 2/10
8337/8337 ━━━━━━━━━━━━━━━━━━━━ 14s 2ms/step - accuracy: 0.7343 - loss: 0.6954 - val_accuracy: 0.7435 - val_loss: 0.6645
Epoch 3/10
8337/8337 ━━━━━━━━━━━━━━━━━━━━ 17s 2ms/step - accuracy: 0.7509 - loss: 0.6494 - val_accuracy: 0.7530 - val_loss: 0.6397
Epoch 4/10
8337/8337 ━━━━━━━━━━━━━━━━━━━━ 14s 2ms/step - accuracy: 0.7601 - loss: 0.6236 - val_accuracy: 0.7558 - val_loss: 0.6314
Epoch 5/10
8337/8337 ━━━━━━━━━━━━━━━━━━━━ 12s 1ms/step - accuracy: 0.7663 - loss: 0.6050 - val_accuracy: 0.7633 - val_loss: 0.6107
Epoch 6/10
8337/8337 ━━━━━━━━━━━━━━━━━━━━ 10s 1ms/step - accuracy: 0.7701 - loss: 0.5936 - val_accuracy: 0.7644 - val_loss: 0.6130
Epoch 7/10
[1m8337/8