In [1]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

[nltk_data] Downloading package stopwords to C:\Users\Bilal
[nltk_data]     Ahmad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Bilal
[nltk_data]     Ahmad\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
!pip install pyarrow
!pip install fastparquet

In [11]:
!pip install tensorflow




[notice] A new release of pip is available: 24.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Load the dataset from a local parquet file. The path is relative, so the file
# must sit in the notebook's working directory.
df = pd.read_parquet("train-00000-of-00001.parquet")

# Preview the first rows.
df.head()

Unnamed: 0,text,label
0,i feel awful about it too because it s my job ...,0
1,im alone i feel awful,0
2,ive probably mentioned this before but i reall...,1
3,i was feeling a little low few days back,0
4,i beleive that i am much more sensitive to oth...,2


In [3]:
# Preview the last rows of the frame.
df.tail()

Unnamed: 0,text,label
416804,that was what i felt when i was finally accept...,1
416805,i take every day as it comes i m just focussin...,4
416806,i just suddenly feel that everything was fake,0
416807,im feeling more eager than ever to claw back w...,1
416808,i give you plenty of attention even when i fee...,0


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(416809, 2)

In [5]:
# Column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 416809 entries, 0 to 416808
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    416809 non-null  object
 1   label   416809 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 6.4+ MB


In [6]:
# Summary statistics of the numeric column(s).
df.describe()

Unnamed: 0,label
count,416809.0
mean,1.554271
std,1.490453
min,0.0
25%,0.0
50%,1.0
75%,3.0
max,5.0


In [7]:
# Class balance: number of rows for each integer label.
print(df['label'].value_counts())

label
1    141067
0    121187
3     57317
4     47712
2     34554
5     14972
Name: count, dtype: int64


In [8]:
# Human-readable emotion name for each integer label in the dataset.
emotion_map = {0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'}
df['emotion'] = df['label'].map(emotion_map)

# Series.map yields NaN for any label that is missing from emotion_map. Fail
# loudly here instead of letting unnamed rows flow silently into later cells.
unmapped_labels = sorted(df.loc[df['emotion'].isna(), 'label'].unique().tolist())
assert not unmapped_labels, f"labels without an emotion name: {unmapped_labels}"

# Preview the updated dataframe
df[['text', 'emotion']].head()

Unnamed: 0,text,emotion
0,i feel awful about it too because it s my job ...,sadness
1,im alone i feel awful,sadness
2,ive probably mentioned this before but i reall...,joy
3,i was feeling a little low few days back,sadness
4,i beleive that i am much more sensitive to oth...,love


In [14]:
import pandas as pd
import numpy as np
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Embedding, LSTM, Dense, Dropout, Layer
)
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical

In [15]:
# Drop every row that contains a missing value and renumber the index from 0.
# NOTE: this rebinds `df`, so re-running earlier preview cells shows the cleaned frame.
df = df.dropna().reset_index(drop=True)

In [16]:
from sklearn.preprocessing import LabelEncoder

# Learn the emotion vocabulary first, then store each row's integer class id.
# The encoder assigns ids in sorted class order, so these ids are not the same
# as the values in the dataset's original `label` column.
le = LabelEncoder().fit(df["emotion"])
df["emotion_label"] = le.transform(df["emotion"])

# Number of distinct emotions; later cells use it to size the one-hot targets
# and the output layer.
num_classes = le.classes_.shape[0]
print("Classes:", le.classes_)

Classes: ['anger' 'fear' 'joy' 'love' 'sadness' 'surprise']


In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is stratified on the encoded
# emotion label, and random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["emotion_label"],
    test_size=0.2,
    random_state=42,
    stratify=df["emotion_label"]
)

In [18]:
# MAX_WORDS is the vocabulary cap handed to the tokenizer (and later the
# embedding's input dimension); MAX_LEN is the fixed length of every padded sequence.
MAX_WORDS = 20000
MAX_LEN = 100

# The vocabulary is fitted on the training texts only, so the test split does
# not influence it. "<OOV>" is the token used for out-of-vocabulary words.
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Convert each text into a list of integer word ids.
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq  = tokenizer.texts_to_sequences(X_test)

# Pad at the end ('post') up to MAX_LEN. No `truncating` argument is passed, so
# over-long sequences are cut on whichever side is the library default.
X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

In [19]:
# One-hot encode the integer class ids; the model is compiled with
# categorical_crossentropy, which expects targets in this form.
y_train_cat = to_categorical(y_train, num_classes)
y_test_cat  = to_categorical(y_test, num_classes)

In [20]:
class AttentionLayer(Layer):
    """Parameter-free attention pooling over the time axis.

    For each hidden unit independently, the activations are squashed with
    tanh, turned into weights by a softmax across the time steps (axis 1),
    and used to form a weighted sum of the original activations.

    The layer creates no trainable weights and applies no masking, so padded
    time steps take part in the softmax like any other step.

    Input shape:  (batch, time_steps, hidden_size)
    Output shape: (batch, hidden_size)
    """

    def __init__(self, **kwargs):
        # Forward the standard Layer arguments (name, dtype, trainable, ...).
        # Keras rebuilds a layer from its saved config by calling
        # cls(**config); a constructor that rejects those keywords makes a
        # saved model containing this layer impossible to reload.
        super().__init__(**kwargs)

    def call(self, inputs):
        # inputs shape: (batch, time_steps, hidden_size)
        score = tf.nn.tanh(inputs)
        # Normalise across time so every hidden unit's weights sum to 1.
        attention_weights = tf.nn.softmax(score, axis=1)
        # The weighted sum over time collapses the sequence axis.
        return tf.reduce_sum(attention_weights * inputs, axis=1)

In [21]:
embedding_dim = 128

# Seed Python, NumPy and TensorFlow before any layer is created so that weight
# initialisation, dropout and batch shuffling are as repeatable as the backend
# allows. 42 matches the random_state used for the train/test split.
tf.keras.utils.set_random_seed(42)

# Architecture: token ids -> embeddings -> LSTM (all time steps kept) ->
# attention pooling over time -> dropout -> softmax over the emotion classes.
inputs = Input(shape=(MAX_LEN,))
# `input_length` is intentionally not passed: the sequence length is already
# fixed by the Input layer, and current Keras releases flag that argument as
# deprecated.
x = Embedding(MAX_WORDS, embedding_dim)(inputs)
x = LSTM(128, return_sequences=True)(x)
x = AttentionLayer()(x)
x = Dropout(0.5)(x)
outputs = Dense(num_classes, activation='softmax')(x)

model = Model(inputs, outputs)
# categorical_crossentropy pairs with the one-hot encoded targets.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()






In [22]:
# Stop training once val_loss has gone 3 epochs without improving, and restore
# the weights from the best epoch seen.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)

# Train for at most 15 epochs in batches of 32, holding out 10% of the training
# arrays for validation. `history` keeps the per-epoch metrics.
history = model.fit(
    X_train_pad,
    y_train_cat,
    validation_split=0.1,
    epochs=15,
    batch_size=32,
    callbacks=[early_stop]
)

Epoch 1/15
[1m9379/9379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2905s[0m 309ms/step - accuracy: 0.8889 - loss: 0.2491 - val_accuracy: 0.9359 - val_loss: 0.1034
Epoch 2/15
[1m9379/9379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m951s[0m 101ms/step - accuracy: 0.9392 - loss: 0.1009 - val_accuracy: 0.9372 - val_loss: 0.0975
Epoch 3/15
[1m9379/9379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1109s[0m 118ms/step - accuracy: 0.9418 - loss: 0.0923 - val_accuracy: 0.9384 - val_loss: 0.0958
Epoch 4/15
[1m9379/9379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1860s[0m 198ms/step - accuracy: 0.9425 - loss: 0.0882 - val_accuracy: 0.9358 - val_loss: 0.0966
Epoch 5/15
[1m9379/9379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1136s[0m 121ms/step - accuracy: 0.9440 - loss: 0.0851 - val_accuracy: 0.9358 - val_loss: 0.0975
Epoch 6/15
[1m9379/9379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2444s[0m 261ms/step - accuracy: 0.9447 - loss: 0.0837 - val_accuracy: 0.9330 - val_

In [25]:
# Score the trained model on the held-out test split. evaluate returns the loss
# followed by the metric the model was compiled with (accuracy).
loss, accuracy = model.evaluate(X_test_pad, y_test_cat)
print("Test Accuracy:", accuracy)

[1m2606/2606[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 30ms/step - accuracy: 0.9402 - loss: 0.0908
Test Accuracy: 0.9401645660400391


In [26]:
def predict_emotion_lstm(text):
    """Return the predicted emotion name for one piece of text.

    The text goes through the same steps as the training data: it is
    tokenised with the fitted `tokenizer`, post-padded to MAX_LEN, scored by
    `model`, and the highest-scoring class id is mapped back to its name
    with the label encoder `le`.

    Args:
        text: A single string to classify.

    Returns:
        The emotion name of the most probable class.
    """
    padded = pad_sequences(
        tokenizer.texts_to_sequences([text]),
        maxlen=MAX_LEN,
        padding='post',
    )
    probabilities = model.predict(padded)
    # Only one row is scored, so the flat argmax is that row's class id.
    best_class = np.argmax(probabilities)
    (emotion,) = le.inverse_transform([best_class])
    return emotion

# Try the helper on one example sentence.
predict_emotion_lstm("I feel very depressed and hopeless today")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step


'sadness'