# Emotion Detection
Dataset: https://www.kaggle.com/datasets/praveengovi/emotions-dataset-for-nlp

In [1]:
#import pandas library
import pandas as pd

In [6]:
#read the dataset with name "Emotion_classify_Data.csv" and store it in a variable df
header = ['comment', 'emotion']
df = pd.read_csv('Emotion Detection/train.txt',delimiter = ';',names = header)

In [5]:
#print the shape of dataframe
df.shape

(16000, 2)

In [7]:
#print top 5 rows
df.head()

Unnamed: 0,comment,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [8]:
#check the distribution of Emotion
df.emotion.value_counts()

emotion
joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: count, dtype: int64

In [10]:
min_samples = 572

df_joy = df[df.emotion=="joy"].sample(min_samples, random_state=2022)
df_sadness = df[df.emotion=="sadness"].sample(min_samples, random_state=2022)
df_anger = df[df.emotion=="anger"].sample(min_samples, random_state=2022)
df_fear = df[df.emotion=="fear"].sample(min_samples, random_state=2022)
df_love = df[df.emotion=="love"].sample(min_samples, random_state=2022)
df_surprise = df[df.emotion=="surprise"].sample(min_samples, random_state=2022)

In [12]:
df_balanced = pd.concat([df_joy, df_sadness, df_anger,df_fear,df_love,df_surprise],axis=0)
df_balanced.emotion.value_counts()

emotion
joy         572
sadness     572
anger       572
fear        572
love        572
surprise    572
Name: count, dtype: int64

In [15]:
df_balanced.shape

(3432, 2)

In [23]:
#Add the new column "Emotion_num" which gives a unique number to each of these Emotions
df_balanced['emotion_num']=df_balanced['emotion'].map({
    'joy' : 0,
    'sadness' : 1,
    'anger' : 2,
    'fear' : 3,
    'love' : 4,
    'surprise' : 5
})

In [24]:
#checking the results 
df_balanced.head()

Unnamed: 0,comment,emotion,emotion_num
15478,i want to do with my life is an amazing feelin...,joy,0
552,i checked on you was a long time ago i can say...,joy,0
4021,i should do but i think it means that i should...,joy,0
13217,i feel the near and lively presence of the wel...,joy,0
2784,i am left tonight feeling so hopeful for the f...,joy,0


In [25]:
#Do the 'train-test' splitting with test size of 20%

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df_balanced.comment, 
    df_balanced.emotion_num, 
    test_size=0.2,
    random_state=2022,
    stratify=df_balanced.emotion_num
)

In [26]:
#print the shapes
print("Shape of X_train: ", X_train.shape)
print("Shape of X_test: ", X_test.shape)

Shape of X_train:  (2745,)
Shape of X_test:  (687,)


# Random Forest

In [27]:
#1. create a pipeline object

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

clf = Pipeline([
    ('vectorizer_tri_grams', CountVectorizer(ngram_range = (3, 3))),                        
    ('random_forest', (RandomForestClassifier()))         
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.53      0.08      0.14       114
           1       0.30      0.06      0.10       115
           2       0.17      0.06      0.09       114
           3       0.18      0.78      0.29       114
           4       0.29      0.22      0.25       115
           5       0.41      0.06      0.11       115

    accuracy                           0.21       687
   macro avg       0.31      0.21      0.16       687
weighted avg       0.31      0.21      0.16       687



In [29]:
#1. create a pipeline object
clf = Pipeline([
    ('vectorizer_bi_grams', CountVectorizer(ngram_range = (1, 2))),                       #using the ngram_range parameter 
    ('random_forest', (RandomForestClassifier()))         
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.61      0.66       114
           1       0.61      0.77      0.68       115
           2       0.92      0.75      0.83       114
           3       0.86      0.75      0.80       114
           4       0.85      0.93      0.89       115
           5       0.90      0.97      0.93       115

    accuracy                           0.80       687
   macro avg       0.81      0.80      0.80       687
weighted avg       0.81      0.80      0.80       687



In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer


#1. create a pipeline object
clf = Pipeline([
     ('vectorizer_tfidf',TfidfVectorizer()),        
     ('Random Forest', RandomForestClassifier())         
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.62      0.64      0.63       114
           1       0.62      0.65      0.64       115
           2       0.81      0.73      0.77       114
           3       0.85      0.71      0.78       114
           4       0.82      0.90      0.86       115
           5       0.89      0.99      0.94       115

    accuracy                           0.77       687
   macro avg       0.77      0.77      0.77       687
weighted avg       0.77      0.77      0.77       687



# MultinomialNB

In [28]:
from sklearn.naive_bayes import MultinomialNB


#1. create a pipeline object
clf = Pipeline([
    ('vectorizer_bigrams', CountVectorizer(ngram_range = (1, 2))),        #using the ngram_range parameter 
     ('Multi NB', MultinomialNB())         
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.62      0.60      0.61       114
           1       0.63      0.48      0.54       115
           2       0.67      0.58      0.62       114
           3       0.74      0.61      0.67       114
           4       0.58      0.75      0.65       115
           5       0.67      0.87      0.76       115

    accuracy                           0.65       687
   macro avg       0.65      0.65      0.64       687
weighted avg       0.65      0.65      0.64       687



# Text Preprocessing

In [31]:
import spacy

# load english language model and create nlp object from it
nlp = spacy.load("en_core_web_sm") 


#use this utility function to get the preprocessed text data
def preprocess(text):
    # remove stop words and lemmatize the text
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    
    return " ".join(filtered_tokens) 

In [32]:
# create a new column "preprocessed_comment" and use the utility function above to get the clean data
# this will take some time, please be patient
df_balanced['preprocessed_comment'] = df_balanced['comment'].apply(preprocess) 

In [33]:
#Do the 'train-test' splitting with test size of 20%

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df_balanced.comment, 
    df_balanced.emotion_num, 
    test_size=0.2,
    random_state=2022,
    stratify=df_balanced.emotion_num
)

# Random Forest

In [34]:
#1. create a pipeline object
clf = Pipeline([
    ('vectorizer_bi_grams', CountVectorizer(ngram_range = (1, 2))),                       #using the ngram_range parameter 
    ('random_forest', (RandomForestClassifier()))         
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.68      0.73       114
           1       0.65      0.79      0.71       115
           2       0.90      0.79      0.84       114
           3       0.89      0.77      0.83       114
           4       0.85      0.92      0.88       115
           5       0.90      0.98      0.94       115

    accuracy                           0.82       687
   macro avg       0.83      0.82      0.82       687
weighted avg       0.83      0.82      0.82       687



In [35]:
#1. create a pipeline object
clf = Pipeline([
     ('vectorizer_tfidf',TfidfVectorizer()),        #using the ngram_range parameter 
     ('Random Forest', RandomForestClassifier())         
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred)) 

              precision    recall  f1-score   support

           0       0.63      0.68      0.66       114
           1       0.64      0.64      0.64       115
           2       0.84      0.72      0.77       114
           3       0.88      0.75      0.81       114
           4       0.84      0.90      0.87       115
           5       0.88      0.99      0.93       115

    accuracy                           0.78       687
   macro avg       0.78      0.78      0.78       687
weighted avg       0.78      0.78      0.78       687

