# Emotion Detection using NLP

### Importing libraries

In [1]:
import pandas as pd

In [3]:
emt=pd.read_csv(r"C:\Users\REAL\Documents\Datasets\NLP\emotion_nlp.csv")

### Analyzing Dataset

In [4]:
emt.head()

Unnamed: 0,Text,Emotions
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [5]:
emt.shape

(17999, 2)

### Data cleaning and Preprocessing

In [6]:
emt.isnull().sum()>0

Text        False
Emotions    False
dtype: bool

In [7]:
emt=emt.rename(columns={'Emotions':'y','Text':'x'})

In [8]:
emt.head()

Unnamed: 0,x,y
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [9]:
emt.y.unique()

array(['sadness', 'anger', 'love', 'surprise', 'fear', 'joy'],
      dtype=object)

In [10]:
emt=emt.replace({'sadness':2, 'anger':1, 'love':5, 'surprise':3, 'fear':0, 'joy':4})

In [11]:
emt.y=emt.y.replace({2:1,3:2,4:2,5:3})

In [12]:
emt.y.value_counts()

1    7681
2    6694
0    2161
3    1463
Name: y, dtype: int64

In [13]:
emt.x=emt.x.str.lower()

In [14]:
from nltk.corpus import stopwords
l1=stopwords.words("english")

In [15]:
import string

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [16]:
def text_process(mess):
    """
    1. remove the punctuation
    2. remove the stopwords
    3. return the list of clean textwords
    
    """
    nopunc=[char for char in mess if char not in string.punctuation]
    nopunc="".join(nopunc)
    
    return [word for word in nopunc.split() if word not in l1]

In [17]:
# create a TDM 
from sklearn.feature_extraction.text import CountVectorizer  # this is used to count each and every unique word...

In [18]:
import timeit
start=timeit.default_timer()

bow_transformer=CountVectorizer(analyzer=text_process).fit(emt['x'])

stop=timeit.default_timer()
execution_time=stop-start
print("Program executed in ",execution_time)

Program executed in  0.9887347000039881


In [19]:
bow_transformer.vocabulary_

{'didnt': 3834,
 'feel': 5205,
 'humiliated': 6786,
 'go': 5997,
 'feeling': 5209,
 'hopeless': 6687,
 'damned': 3392,
 'hopeful': 6685,
 'around': 726,
 'someone': 13096,
 'cares': 2062,
 'awake': 974,
 'im': 6920,
 'grabbing': 6058,
 'minute': 8934,
 'post': 10731,
 'greedy': 6120,
 'wrong': 15877,
 'ever': 4830,
 'nostalgic': 9576,
 'fireplace': 5342,
 'know': 7832,
 'still': 13463,
 'property': 11032,
 'grouchy': 6160,
 'ive': 7442,
 'little': 8241,
 'burdened': 1870,
 'lately': 7950,
 'wasnt': 15482,
 'sure': 13811,
 'taking': 13975,
 'milligrams': 8890,
 'times': 14361,
 'recommended': 11455,
 'amount': 484,
 'fallen': 5093,
 'asleep': 798,
 'lot': 8351,
 'faster': 5140,
 'also': 429,
 'like': 8164,
 'funny': 5747,
 'confused': 2837,
 'life': 8137,
 'teenager': 14091,
 'jaded': 7458,
 'year': 15934,
 'old': 9767,
 'man': 8527,
 'petronas': 10378,
 'years': 15942,
 'performed': 10304,
 'well': 15573,
 'made': 8463,
 'huge': 6763,
 'profit': 10985,
 'romantic': 11990,
 'make': 8501

In [20]:
tdm=bow_transformer.transform(emt['x'])

### Building the models

In [21]:
# tdm is like our x variable 
from sklearn.model_selection import train_test_split
tdm_train,tdm_test,train_y,test_y=train_test_split(tdm,emt['y'],test_size=.2)

In [22]:
from sklearn.naive_bayes import MultinomialNB

nb=MultinomialNB()
nb.fit(tdm_train,train_y)

In [23]:
pred_nb=nb.predict(tdm_test)

In [24]:
from sklearn.metrics import confusion_matrix

In [25]:
tab_nb=confusion_matrix(test_y,pred_nb)
tab_nb

array([[ 188,  159,   67,    2],
       [  18, 1479,   43,    8],
       [  15,   92, 1229,    8],
       [   4,   61,  129,   98]], dtype=int64)

In [26]:
tab_nb.diagonal().sum()/tab_nb.sum()*100

83.16666666666667

In [27]:
from sklearn.linear_model import LogisticRegression
logreg=LogisticRegression()

In [28]:
logreg.fit(tdm_train,train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [29]:
pred_log=logreg.predict(tdm_test)

In [30]:
tab_log=confusion_matrix(test_y,pred_log)
tab_log

array([[ 341,   46,   25,    4],
       [  24, 1493,   28,    3],
       [  13,   37, 1256,   38],
       [   1,   10,   63,  218]], dtype=int64)

In [31]:
tab_log.diagonal().sum()/tab_log.sum()*100

91.88888888888889

In [33]:
from sklearn.tree import DecisionTreeClassifier
dec=DecisionTreeClassifier()

dec.fit(tdm_train,train_y)

pred_dec=dec.predict(tdm_test)

tab_dec=confusion_matrix(test_y,pred_dec)
tab_dec



array([[ 346,   38,   29,    3],
       [  46, 1363,  124,   15],
       [  21,   51, 1216,   56],
       [   3,    9,   57,  223]], dtype=int64)

In [34]:
tab_dec.diagonal().sum()/tab_dec.sum()*100

87.44444444444444

In [35]:
from sklearn.ensemble import RandomForestClassifier
rfc=RandomForestClassifier()

rfc.fit(tdm_train,train_y)

pred_rfc=rfc.predict(tdm_test)

tab_rfc=confusion_matrix(test_y,pred_rfc)
tab_rfc



array([[ 338,   43,   31,    4],
       [  31, 1441,   65,   11],
       [  14,   31, 1254,   45],
       [   2,    7,   63,  220]], dtype=int64)

In [36]:
tab_rfc.diagonal().sum()/tab_rfc.sum()*100

90.36111111111111

### Conclusion:
Random forest performs well in terms of accuracy as compared to Logistic regression, Naive Bayes and Decision tree.