Libraries

In [93]:
import numpy as np
import pandas as pd 
import emoji
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from textblob import Word
import re
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from textblob import TextBlob
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

In [94]:
# Fetch every NLTK resource the preprocessing functions below rely on.
nltk.download('punkt')                       # tokenizer models for word_tokenize
nltk.download('averaged_perceptron_tagger')  # tagger behind nltk.pos_tag
nltk.download('wordnet')                     # lexical database for WordNetLemmatizer
nltk.download('stopwords')                   # stop-word lists read by remove_stopwords

[nltk_data] Downloading package punkt to C:\Users\Disha
[nltk_data]     Chavan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Disha Chavan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to C:\Users\Disha
[nltk_data]     Chavan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Dataset

In [176]:
# Read the labelled text/emoji corpus from disk and preview its first ten rows.
data = pd.read_csv(filepath_or_buffer="emoji_dataset.csv")
data.head(n=10)

Unnamed: 0,Text,Emotion
0,i didnt feel humiliated 😟,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wro...,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy 😠,anger
5,ive been feeling a little burdened lately wasn...,sadness
6,ive been taking or milligrams or times recomme...,surprise
7,i feel as confused about life as a teenager or...,fear
8,i have been with petronas for years i feel tha...,joy
9,i feel romantic too 💓,love


In [3]:
# Keep only the sentence and its label; the CSV's unnamed index column is dropped.
data = data.loc[:, ['Text', 'Emotion']]

In [5]:
# Shuffle once with a fixed seed, then cut the rows 80 % / 10 % / 10 % into
# train / validate / test (same order and sizes as before).
# Positional slicing is used instead of np.split, which on a DataFrame goes
# through the deprecated DataFrame.swapaxes; .copy() makes each partition an
# independent frame so later column assignments do not target a slice of the
# shuffled frame.
shuffled = data.sample(frac=1, random_state=42)
train_end, validate_end = int(.8*len(data)), int(0.9*len(data))
train = shuffled.iloc[:train_end].copy()
validate = shuffled.iloc[train_end:validate_end].copy()
test = shuffled.iloc[validate_end:].copy()

In [6]:
# (rows, columns) of the training partition.
train.shape

(16000, 2)

In [7]:
# (rows, columns) of the test partition.
test.shape

(2000, 2)

In [8]:
# (rows, columns) of the validation partition.
validate.shape

(2000, 2)

Preprocessing

In [157]:
def convert_emoji(text):
    """Return ``text`` after passing it through ``emoji.demojize``."""
    demojized = emoji.demojize(text)
    return demojized

def convert_to_lower(text):
    """Lower-case every whitespace-separated token of ``text``.

    Tokens are re-joined with single spaces, so runs of whitespace collapse
    and leading/trailing whitespace is dropped.
    """
    return " ".join(token.lower() for token in text.split())

_ENGLISH_STOPWORDS = None  # lazily filled cache, see _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop-word list as a frozenset, loading it only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def remove_stopwords(text):
    """Drop English stop words from ``text``.

    ``text`` is split on whitespace; a token is removed when its lower-cased
    form is in NLTK's English stop-word list. The surviving tokens keep their
    original casing and are re-joined with single spaces.

    The stop-word list is loaded once and held as a set, instead of being
    re-read and scanned as a list for every single token.
    """
    stop_set = _english_stopwords()
    kept_words = [word for word in text.split() if word.lower() not in stop_set]
    return ' '.join(kept_words)

def remove_non_alphanum(text):
    """Replace each character that is neither a word character nor whitespace with a space.

    Underscores count as word characters for the regex and are therefore kept.
    """
    non_word_or_space = re.compile(r'[^\w\s]')
    return non_word_or_space.sub(' ', text)

def word_check(word):
    """Shorten stretched letters in ``word`` and return TextBlob's corrected spelling.

    Any character repeated three or more times in a row is cut down to two
    occurrences before the text is handed to ``TextBlob(...).correct()``.
    """
    squeezed = re.sub(r"(.)\1{2,}", r"\1\1", word)
    return str(TextBlob(squeezed).correct())

def remove_extendedwords(text):
    """Apply ``word_check`` to each whitespace-separated token of ``text``.

    The processed tokens are re-joined with single spaces.
    """
    return " ".join(word_check(token) for token in text.split())

def lemmatize_text(text):
    """Lemmatize every token of ``text`` using its part-of-speech tag.

    The text is tokenized with ``word_tokenize`` and tagged with
    ``nltk.pos_tag``. Each token is passed to ``WordNetLemmatizer.lemmatize``
    with the WordNet POS looked up from the first letter of its tag; tags
    that map to nothing are treated as nouns.

    Returns:
        The lemmas joined by single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    tag_map = {'N': wordnet.NOUN, 'V': wordnet.VERB, 'R': wordnet.ADV, 'J': wordnet.ADJ}

    # No manual suffix stripping here: chopping a trailing "ed" off verbs before
    # lemmatizing mangles words that merely end in those letters ("need" -> "ne")
    # and turns inflected forms into non-words the lemmatizer cannot resolve
    # ("humiliated" -> "humiliat"). The lemmatizer is given the full token and
    # its verb POS instead.
    lemmatized_words = [
        lemmatizer.lemmatize(word, pos=tag_map.get(tag[0].upper(), wordnet.NOUN))
        for word, tag in nltk.pos_tag(word_tokenize(text))
    ]

    # join the lemmatized words back into a sentence
    return ' '.join(lemmatized_words)



In [119]:
# Clean the training texts step by step: emoji -> names, lower-case,
# drop stop words, blank out punctuation, lemmatize.
train['Text'] = (
    train['Text']
    .apply(convert_emoji)
    .apply(convert_to_lower)
    .apply(remove_stopwords)
    .apply(remove_non_alphanum)
    .apply(lemmatize_text)
)
train.Text

10650    noticed several month ago start feel resentful...
2041     love lot different kind sport love hang friend...
8668      feel even kill agonize extent loudly_crying_face
1114     feel numb way wound really start hurt slightly...
13902    feel happy inspire little si love read write g...
                               ...                        
7382     pay month month feel shame every time grill ho...
13492         feel determine go get face_with_tears_of_joy
10394    remember feeling bit confuse really question s...
16865                feel helpless look world fearful_face
5047     believe happy healthy relationship likely feel...
Name: Text, Length: 16000, dtype: object

In [120]:
# Clean the test texts with the same ordered steps used for the training texts.
test['Text'] = (
    test['Text']
    .apply(convert_emoji)
    .apply(convert_to_lower)
    .apply(remove_stopwords)
    .apply(remove_non_alphanum)
    .apply(lemmatize_text)
)
test.Text

3716     didnt want lazy feel groggy keep drinking red ...
10837    thought good idea give time recover feel nervo...
6140     feel like didnt really care alexis irritate ev...
9956     feel stress free heading holiday rolling_on_th...
1549     keep feeling sometimes one fake till make cryi...
                               ...                        
11284    want savor feel ecstatic anticipation abide da...
11964    im feel puppy dog rainbows im exhaust yes beli...
5390                                 feel delicate bouquet
860                start feel little stressed broken_heart
15795    feel stress tired worn shape neglect worried_face
Name: Text, Length: 2000, dtype: object

In [121]:
# Clean the validation texts with the same ordered steps used for the training texts.
validate['Text'] = (
    validate['Text']
    .apply(convert_emoji)
    .apply(convert_to_lower)
    .apply(remove_stopwords)
    .apply(remove_non_alphanum)
    .apply(lemmatize_text)
)
validate.Text

1262     go detail long night wake bus feel like could ...
19010        im feeling torture write today grimacing_face
7212            still feel like tragic waste pleading_face
975      feel humiliate body husband make advance towar...
2566                   feel horribly insecure fearful_face
                               ...                        
10900      feel helpless make real difference pensive_face
7758     feel impatient much thanks nic know calm face_...
4837     feel outrage life easy bless angry_face_with_h...
6548     feel like witness birth really amazing dm face...
4481     flip guy feel terrible today flip guy feel ter...
Name: Text, Length: 2000, dtype: object

Model

In [122]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fit the vocabulary on the training texts only; 'UNK' is the out-of-vocabulary token.
train_texts = train['Text']
tokenizer = Tokenizer(num_words=15212, lower=True, oov_token='UNK')
tokenizer.fit_on_texts(train_texts)

print('Found %d unique words.' % len(tokenizer.word_index))

# texts_to_sequences: encode every text as the list of integer ids of its words,
# looked up in the tokenizer's word_index dictionary.
train_texts_sequences = tokenizer.texts_to_sequences(train_texts)

# pad_sequences: give all sequences the same length (80), padding at the end ('post').
train_texts_pad_sequences = pad_sequences(
    train_texts_sequences,
    maxlen=80,
    padding='post',
)

Found 12126 unique words.


In [199]:
# Distinct label values in the training partition.
# NOTE(review): the recorded output lists integer codes, so this cell was
# apparently executed after the label-encoding cell further down; on a fresh
# top-to-bottom run it would presumably list the string labels - confirm.
train['Emotion'].unique()

array([4, 1, 0, 5, 3, 2], dtype=int64)

In [123]:
from tensorflow.keras.utils import to_categorical
# Fixed label -> class-index mapping, also used for the validation and test labels.
emotions = {'sadness': 0, 'joy': 1, 'surprise': 2, 'love': 3, 'anger': 4, 'fear': 5}

# Step 1: Replace all emotion values with integers
train['Emotion'] = train.Emotion.replace(emotions)
train_emotion_integers = train['Emotion'].values

# Step 2: One-hot encode the integers. num_classes pins the width to the number
# of emotions so the matrix always matches the model's output layer, even if
# the highest-numbered class were absent from this partition.
train_emotion_categorical = to_categorical(train_emotion_integers, num_classes=len(emotions))
train_emotion_categorical[:6]

array([[0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.]], dtype=float32)

In [124]:
# Encode the validation texts with the tokenizer fitted on the training texts
# and pad them to the same length (80) as the training sequences.
validate_texts = validate['Text']
validate_emotion_integers = validate.Emotion.replace(emotions)
validate_texts_sequences = tokenizer.texts_to_sequences(validate_texts)
validate_texts_pad_sequences = pad_sequences(validate_texts_sequences, maxlen=80, padding='post')
# num_classes keeps the one-hot width equal to the number of emotions even if
# the highest-numbered class does not occur in this smaller partition.
validate_emotion_categorical = to_categorical(validate_emotion_integers.values, num_classes=len(emotions))
validate_emotion_categorical[:6]

array([[0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0.]], dtype=float32)

In [125]:
import tensorflow as tf
try:
    # Connect to and initialise a TPU when the runtime provides one.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    print("All devices: ", tf.config.list_logical_devices('TPU'))
    # tf.distribute.TPUStrategy is the stable API; the `experimental` alias is deprecated.
    tpu_strategy = tf.distribute.TPUStrategy(tpu)
except ValueError:
    # Presumably raised when no TPU can be resolved: fall back to the default strategy.
    tpu_strategy = tf.distribute.get_strategy()

In [126]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Bidirectional,Dense,Embedding,Dropout

# instantiating the model in the strategy scope creates the model on the
# device(s) managed by tpu_strategy
with tpu_strategy.scope():
    model = Sequential()
    # 15212 matches the tokenizer's num_words, 80 the padded sequence length.
    model.add(Embedding(15212, 64, input_length=80))
    model.add(Dropout(0.6))
    model.add(Bidirectional(LSTM(80, return_sequences=True)))
    model.add(Bidirectional(LSTM(160)))
    # One softmax unit per emotion class.
    model.add(Dense(len(emotions), activation='softmax'))
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only appended a stray "None" line.
    model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 80, 64)            973568    
                                                                 
 dropout (Dropout)           (None, 80, 64)            0         
                                                                 
 bidirectional (Bidirectiona  (None, 80, 160)          92800     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 320)              410880    
 nal)                                                            
                                                                 
 dense (Dense)               (None, 6)                 1926      
                                                                 
Total params: 1,479,174
Trainable params: 1,479,174
Non-

In [127]:
# Categorical cross-entropy pairs with the one-hot encoded targets; accuracy is tracked as a metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [128]:
# Train for 10 epochs, scoring on the validation partition after each one.
hist = model.fit(
    x=train_texts_pad_sequences,
    y=train_emotion_categorical,
    epochs=10,
    validation_data=(validate_texts_pad_sequences, validate_emotion_categorical),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [192]:
# Encode the held-out test texts with the tokenizer fitted on the training
# texts and pad them to the same length (80) as the training sequences.
test_texts = test['Text']
test_emotion_integers = test.Emotion.replace(emotions)
test_texts_sequences = tokenizer.texts_to_sequences(test_texts)
test_texts_pad_sequences = pad_sequences(test_texts_sequences, maxlen=80, padding='post')
# num_classes keeps the one-hot width equal to the number of emotions even if
# the highest-numbered class does not occur in this smaller partition.
test_emotion_categorical = to_categorical(test_emotion_integers.values, num_classes=len(emotions))
test_emotion_categorical[:7]

array([[1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.]], dtype=float32)

In [193]:
# Score the trained model on the held-out test partition; the recorded output
# is a two-element list (presumably [loss, accuracy], given the compile settings).
x = model.evaluate(
    x=test_texts_pad_sequences,
    y=test_emotion_categorical,
)
print(x)

[0.051815781742334366, 0.984499990940094]


In [201]:
def get_key(value):
    """Return the first label in ``emotions`` whose class index equals ``value``.

    Returns ``None`` when no label maps to ``value``.
    """
    return next((label for label, index in emotions.items() if index == value), None)


        
def predict(sentence):
    """Print the model's confidence per emotion for ``sentence`` and its top guess.

    The sentence is run through the cleaning helpers (emoji conversion,
    lower-casing, stop-word removal, punctuation removal, stretched-word
    correction, lemmatization) and the cleaned text is printed. It is then
    encoded with ``tokenizer``, padded to 80 ids and passed to ``model``.
    Nothing is returned; all results are printed.
    """
    cleaning_steps = (
        convert_emoji,
        convert_to_lower,
        remove_stopwords,
        remove_non_alphanum,
        remove_extendedwords,
        lemmatize_text,
    )
    for step in cleaning_steps:
        sentence = step(sentence)
    print(sentence)

    # The tokenizer and model expect a batch, so wrap the single sentence in a list.
    encoded = tokenizer.texts_to_sequences([sentence])
    padded = pad_sequences(encoded, maxlen=80, padding='post')
    certaintyprediction = model.predict(padded)[0]

    # One line per emotion, in the order of the `emotions` mapping.
    for key, val in emotions.items():
        print(key + ': ' + str(round(certaintyprediction[val]*100, 2)) + ' %')

    bestpredictionindex = np.argmax(certaintyprediction)
    certainty = str(round(certaintyprediction[bestpredictionindex]*100, 2))
    print('\nI am '+ certainty + ' % sure the emotion is ' + get_key(bestpredictionindex) + '.')

In [202]:
# Try the classifier on a hand-written sentence that contains no emoji.
predict("You are being very rude.")

rude
sadness: 42.2 %
joy: 2.12 %
surprise: 1.81 %
love: 2.93 %
anger: 47.7 %
fear: 3.25 %

I am 47.7 % sure the emotion is anger.


In [207]:
# NOTE(review): the recorded output for this call ranks sadness highest (69.7 %)
# for an evidently positive sentence. Possibly the model leans on the emoji
# token, which this input lacks, or on words removed as stop words - TODO confirm.
predict("I am very happy")

happy
sadness: 69.7 %
joy: 8.07 %
surprise: 1.94 %
love: 2.37 %
anger: 16.44 %
fear: 1.48 %

I am 69.7 % sure the emotion is sadness.
