# Training a Machine Learning Model to Detect Emotion in Text

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the labelled emotion dataset from the working directory and preview
# the first rows to check which columns came through.
dataset_path = "emotion_sentimen_dataset.csv"
ed = pd.read_csv(dataset_path)
ed.head()

Unnamed: 0.1,Unnamed: 0,text,Emotion
0,0,i seriously hate one subject to death but now ...,hate
1,1,im so full of life i feel appalled,neutral
2,2,i sit here to write i start to dig out my feel...,neutral
3,3,ive been really angry with r and i feel like a...,anger
4,4,i feel suspicious if there is no one outside l...,neutral


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from keras.layers import Dense
from keras.models import Sequential

In [4]:
# Remove the leftover index-like columns that read_csv labelled "Unnamed: ...".
# Filtering by prefix (rather than dropping one hard-coded label) removes all
# of them at once and keeps the cell idempotent: re-running it after the
# columns are already gone is a no-op instead of a KeyError.
ed = ed.loc[:, ~ed.columns.str.startswith("Unnamed")]
ed.head()

Unnamed: 0,text,Emotion
0,i seriously hate one subject to death but now ...,hate
1,im so full of life i feel appalled,neutral
2,i sit here to write i start to dig out my feel...,neutral
3,ive been really angry with r and i feel like a...,anger
4,i feel suspicious if there is no one outside l...,neutral


In [5]:
# Separate the frame into model input (the sentences) and target (the emotion names)
X, y = ed['text'], ed['Emotion']

In [6]:
# Hold out 20% of the rows for testing. stratify=y keeps the share of every
# emotion the same in both splits, so a rare label cannot end up only in the
# test split (where the label encoder, fitted on y_train, would reject it)
# and the test accuracy is measured on the same class mix as training.
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [7]:
# Turn the raw sentences into fixed-length integer sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# hyperparams
vocab_size = 10000  # tokenizer vocabulary cap; change based on dataset size
max_length = 100  # every sequence is padded/truncated to this many tokens
trunc_type = 'post'  # sentences that are too long are cut at the end
padding_type = 'post'  # sentences that are too short are padded at the end
oov_token = "<OOV>"  # stand-in for words the tokenizer has not seen

# The word -> integer vocabulary is learned from the training texts only
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(X_train)


def _to_sequences_and_padded(texts):
    """Return (integer sequences, padded matrix) for `texts` using the fitted tokenizer."""
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(
        sequences,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )
    return sequences, padded


# Both splits go through the same tokenizer and the same padding settings
X_train_seq, X_train_pad = _to_sequences_and_padded(X_train)
X_test_seq, X_test_pad = _to_sequences_and_padded(X_test)

In [8]:
# Encode Emotional Labels
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

# Learn the emotion-name -> integer id mapping from the training labels.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)

y_train_enc = label_encoder.transform(y_train)
y_test_enc = label_encoder.transform(y_test)

# Pin the one-hot width to the number of classes the encoder knows. Without
# num_classes, to_categorical sizes each matrix from its own largest label id,
# so a test split that lacks the highest-numbered class would come out
# narrower than y_train_cat (and than the model's softmax layer), and
# model.evaluate would fail on the shape mismatch.
num_classes = len(label_encoder.classes_)
y_train_cat = to_categorical(y_train_enc, num_classes=num_classes)
y_test_cat = to_categorical(y_test_enc, num_classes=num_classes)

In [9]:
# Build the Keras Model
from keras.layers import Embedding, GlobalAveragePooling1D

# One softmax unit per column of the one-hot training labels
n_output_units = y_train_cat.shape[1]

# Embedding -> average over the token axis -> small dense head -> class probabilities
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64, input_length=max_length),
    GlobalAveragePooling1D(),
    Dense(32, activation="relu"),
    Dense(n_output_units, activation="softmax"),
])

# categorical_crossentropy pairs with the one-hot label matrices
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 64)           640000    
                                                                 
 global_average_pooling1d (G  (None, 64)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 32)                2080      
                                                                 
 dense_1 (Dense)             (None, 13)                429       
                                                                 
Total params: 642,509
Trainable params: 642,509
Non-trainable params: 0
_________________________________________________________________


In [10]:
# Train for ten epochs on the padded training sequences, then score the
# model on the held-out split and report its accuracy.
model.fit(
    x=X_train_pad,
    y=y_train_cat,
    batch_size=32,
    epochs=10,
)
loss, accuracy = model.evaluate(x=X_test_pad, y=y_test_cat)
print("Test accuracy:", accuracy)


Epoch 1/10


2025-06-02 18:43:07.482805: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.997308075428009


In [11]:
import joblib

# Persist the three objects needed to run predictions later: the trained
# network, the fitted tokenizer and the fitted label encoder.
model.save(filepath='emoModel.keras')
joblib.dump(value=tokenizer, filename="tokenizer.joblib")
joblib.dump(value=label_encoder, filename="label_encoder.joblib")


['label_encoder.joblib']

In [12]:
from EmoModelClass import EmoModelWrapper

# Rebuild the predictor from the files saved to disk. The trailing 100 equals
# the max_length used for padding during training; presumably the wrapper pads
# its inputs to that length (confirm against EmoModelClass).
new_inst = EmoModelWrapper(
    "emoModel.keras",
    "tokenizer.joblib",
    "label_encoder.joblib",
    100,
)

# Smoke-test the reloaded model on a single unseen sentence
new_data = ["you are annoying"]
prediction = new_inst.predict(new_data)
print(prediction)

['neutral']


In [34]:
# Predict several sentences in a single call to the wrapper
text_input = [
    "I'm fine",
    "I am excited about today",
]
predictions = new_inst.predict(text_input)

print(predictions)

['neutral' 'enthusiasm']
