In [1]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Activation
import numpy as np
from sklearn.metrics import classification_report, precision_recall_fscore_support
import tensorflow as tf


In [2]:
#dataset for Training 
train_data=pd.read_excel("Training_data.xlsx")
#dataset for Testing
test_data=pd.read_excel("Testing_data.xlsx")

train_df = pd.DataFrame(train_data, columns=["Sentence", "Label"])
test_df = pd.DataFrame(test_data, columns=["Sentence", "Label"])


In [3]:
#Exploratory Data Analysis (EDA) Performance
# Display the first few rows of the training data
print("First few rows of the training data:")
print(train_df.head()) 

# Display the first few rows of the testing data
print("First few rows of the testing data:")
print(test_df.head())  
# Display basic statistics of the training data
print("\nBasic statistics of the training data:")
print(train_df.describe()) 

# Display basic statistics of the testing data
print("\nBasic statistics of the testing data:")
print(test_df.describe())  

# Display information about the training data
print("\nInformation about the training data:")
print(train_df.info()) 

# Display information about the testing data
print("\nInformation about the testing data:")
print(test_df.info())  


First few rows of the training data:
                  Sentence  Label
0         I lost my wallet      3
1   You failed the midterm      3
2  Congrats on the new job      2
3            I want to eat      4
4       That catcher sucks      1
First few rows of the testing data:
                   Sentence  Label
0         he did not answer      3
1      she got me a present      0
2  ha ha ha it was so funny      2
3       he is a good friend      0
4                I am upset      3

Basic statistics of the training data:
           Label
count  27.000000
mean    2.037037
std     1.255189
min     0.000000
25%     1.500000
50%     2.000000
75%     3.000000
max     4.000000

Basic statistics of the testing data:
          Label
count  8.000000
mean   1.750000
std    1.581139
min    0.000000
25%    0.000000
50%    2.000000
75%    3.000000
max    4.000000

Information about the training data:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27 entries, 0 to 26
Data columns (total 2 columns

In [4]:
#data preprocessing
class data_cleaning:
    #function to remove duplicates
    def remove_duplicates(self,data):        
        data.drop_duplicates(inplace=True)
    #function to handle missing values
    def handling_empty_cells(self,data):
        data.dropna(inplace=True)
obj=data_cleaning()
obj.remove_duplicates(train_df)
obj.handling_empty_cells(train_df)
obj.remove_duplicates(test_df)
obj.handling_empty_cells(test_df) 

In [5]:
#Text Tokenization, Padding and  Sequencing:
max_len = 10  # maximum number of words in a sentence
#Tokenizing
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_df['Sentence'].values)
word_index = tokenizer.word_index
print("Tokenizing")
print(word_index)
vocab_size = len(word_index) + 1  
#Sequencing
print("Sequencing x_train")
x_train = tokenizer.texts_to_sequences(train_df['Sentence'].values)
print(x_train)
print("Sequencing x_test")
x_test = tokenizer.texts_to_sequences(test_df['Sentence'].values)
print(x_test)
#Padding
print("Padding")
x_train = pad_sequences(x_train,padding="post", maxlen=max_len)
print("Padding x_train")
print(x_train)
x_test = pad_sequences(x_test, padding="post",maxlen=max_len)
print("Padding x_test")
print(x_test)
# Verify the number of classes
num_classes = len(set(train_df['Label'].values))

# One-hot encode the labels
print("One-hot encode the labels")
y_train = to_categorical(train_df['Label'].values, num_classes=num_classes)
print("One-hot encode the  y-train")
print(y_train)
y_test = to_categorical(test_df['Label'].values, num_classes=num_classes)
print("One-hot encode the  y_test")
print(y_test)

Tokenizing
{'i': 1, 'you': 2, 'ha': 3, 'the': 4, 'this': 5, 'a': 6, 'is': 7, 'not': 8, 'joke': 9, 'am': 10, 'got': 11, 'he': 12, 'my': 13, 'for': 14, 'was': 15, 'such': 16, 'good': 17, 'it': 18, 'so': 19, 'happy': 20, 'did': 21, 'she': 22, 'lost': 23, 'wallet': 24, 'failed': 25, 'midterm': 26, 'congrats': 27, 'on': 28, 'new': 29, 'job': 30, 'want': 31, 'to': 32, 'eat': 33, 'that': 34, 'catcher': 35, 'sucks': 36, 'are': 37, 'qualified': 38, 'position': 39, 'love': 40, 'dad': 41, 'guy': 42, 'specialization': 43, 'great': 44, 'could': 45, 'solve': 46, 'proud': 47, 'of': 48, 'forever': 49, 'first': 50, 'baseman': 51, 'ball': 52, 'bad': 53, 'do': 54, 'your': 55, 'homework': 56, 'answer': 57, 'raise': 58, 'me': 59, 'present': 60, 'funny': 61, 'friend': 62, 'upset': 63, 'we': 64, 'had': 65, 'lovely': 66, 'dinner': 67, 'tonight': 68, 'where': 69, 'food': 70, 'stop': 71, 'making': 72}
Sequencing x_train
[[1, 23, 13, 24], [2, 25, 4, 26], [27, 28, 4, 29, 30], [1, 31, 32, 33], [34, 35, 36], [2, 37

In [6]:
# Load GloVe embeddings
embeddings_index = {}
embedding_dim = 300
with open(r'C:\Users\user\Downloads\glove.6B (1)\glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Create the embedding matrix
embedding_matrix = np.zeros((vocab_size, embedding_dim))
print(embedding_matrix)
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
print(embedding_matrix) 


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [-0.13292     0.16985001 -0.1436     ... -0.23778     0.14766
   0.62901998]
 [-0.28426999  0.047977   -0.15062    ... -0.090071    0.016922
   0.29278001]
 ...
 [ 0.38543999  0.34246999  0.29598999 ... -0.49944001 -0.27509999
  -0.61361003]
 [ 0.80465001  0.051994    0.078946   ... -0.23231     0.066039
   0.1154    ]
 [ 0.34911001  0.20057     0.2419     ... -0.46020001 -0.31044
   0.099268  ]]


In [8]:
# Build the model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False))
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.4))
model.add(LSTM(64))
model.add(Dropout(0.3))
model.add(Dense(num_classes))
model.add(Activation('softmax'))
model.summary()

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

 
# Train the model
model.fit(x_train, y_train, batch_size=32, epochs=20, validation_split=0.1)


Epoch 1/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - accuracy: 0.3043 - loss: 1.6228 - val_accuracy: 0.3333 - val_loss: 1.6135
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 109ms/step - accuracy: 0.3043 - loss: 1.5663 - val_accuracy: 0.3333 - val_loss: 1.5916
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step - accuracy: 0.4348 - loss: 1.5135 - val_accuracy: 0.3333 - val_loss: 1.5748
Epoch 4/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 219ms/step - accuracy: 0.5217 - loss: 1.4894 - val_accuracy: 0.3333 - val_loss: 1.5585
Epoch 5/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 156ms/step - accuracy: 0.3913 - loss: 1.4484 - val_accuracy: 0.3333 - val_loss: 1.5446
Epoch 6/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 156ms/step - accuracy: 0.4783 - loss: 1.4459 - val_accuracy: 0.3333 - val_loss: 1.5318
Epoch 7/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x2179a79ce60>

In [9]:

# Evaluate the model on test data
loss, accuracy = model.evaluate(x_test, y_test)



# Calculate other evaluation metrics
print(f'Test Accuracy: {accuracy * 100:.2f}%')
# Predict classes for test data
y_pred = np.argmax(model.predict(x_test), axis=-1)

# Calculate precision, recall, and F1-score using sklearn 
precision, recall, f1_score, _ = precision_recall_fscore_support(np.argmax(y_test, axis=-1), y_pred, average='weighted',zero_division=0)


print(f'Precision: {precision:.2f}%')
print(f'Recall: {recall:.2f}%')
print(f'F1-score: {f1_score:.2f}%')




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step - accuracy: 0.7500 - loss: 1.0277
Test Accuracy: 75.00%
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 381ms/step
Precision: 0.67%
Recall: 0.75%
F1-score: 0.70%


In [11]:
emoji_dictionary = {
    "0": "🎁",  # Assuming a set of emojis
    "1": "😡",
    "2": "😄",
    "3": "😓",
    "4": "🍴"
}

pred_probabilities = model.predict(x_test)
pred = np.argmax(pred_probabilities, axis=1)

for i in range(len(x_test)):
    sentence = test_df['Sentence'].values[i]
    actual_emoji = emoji_dictionary[str(np.argmax(y_test[i]))]
    predicted_emoji = emoji_dictionary[str(pred[i])]
    print(f'Sentence: {sentence}| Actual Emoji: {actual_emoji} |Predicted Emoji: {predicted_emoji}')


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
Sentence: he did not answer| Actual Emoji: 😓 |Predicted Emoji: 😓
Sentence: she got me a present| Actual Emoji: 🎁 |Predicted Emoji: 🎁
Sentence: ha ha ha it was so funny| Actual Emoji: 😄 |Predicted Emoji: 😄
Sentence: he is a good friend| Actual Emoji: 🎁 |Predicted Emoji: 🎁
Sentence: I am upset| Actual Emoji: 😓 |Predicted Emoji: 😄
Sentence: We had such a lovely dinner tonight| Actual Emoji: 🎁 |Predicted Emoji: 🎁
Sentence: where is the food| Actual Emoji: 🍴 |Predicted Emoji: 😓
Sentence: Stop making this joke ha ha ha| Actual Emoji: 😄 |Predicted Emoji: 😄
