Libraries

In [67]:
import numpy as np
import pandas as pd 
import contractions
import emoji
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from textblob import Word
import re
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from textblob import TextBlob
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

Dataset

In [68]:
data = pd.read_csv(os.path.join(os.getcwd(),"../static/dataset/emoji_dataset.csv"))
data

Unnamed: 0,Text,Emotion
0,im feeling groggy and horrid,sadness
1,i could feel the muscles in my arches ankles a...,joy
2,i feel like but im not very fond of that word 💙,love
3,i have to move stop staring at the other ladie...,joy
4,i have this kind of life so my girlfriend woul...,sadness
...,...,...
19995,im feeling ive resolved to live a life of love...,joy
19996,i used feel frustrated all the time,anger
19997,im starting to feel more sociable again i actu...,joy
19998,i am feeling devastated the inner voice within...,sadness


In [69]:
data = data[['Text','Emotion']]

In [70]:
from sklearn.model_selection import train_test_split
train, validate, test = np.split(data.sample(frac=1, random_state=42), [int(.8*len(data)), int(0.9*len(data))])

In [71]:
train.shape

(16000, 2)

Preprocessing

In [72]:
abbreviations = {
    "$" : " dollar ",
    "€" : " euro ",
    "4ao" : "for adults only",
    "a.m" : "before midday",
    "a3" : "anytime anywhere anyplace",
    "aamof" : "as a matter of fact",
    "acct" : "account",
    "adih" : "another day in hell",
    "afaic" : "as far as i am concerned",
    "afaict" : "as far as i can tell",
    "afaik" : "as far as i know",
    "afair" : "as far as i remember",
    "afk" : "away from keyboard",
    "app" : "application",
    "approx" : "approximately",
    "apps" : "applications",
    "asap" : "as soon as possible",
    "asl" : "age, sex, location",
    "atk" : "at the keyboard",
    "ave." : "avenue",
    "aymm" : "are you my mother",
    "ayor" : "at your own risk", 
    "b&b" : "bed and breakfast",
    "b+b" : "bed and breakfast",
    "b.c" : "before christ",
    "b2b" : "business to business",
    "b2c" : "business to customer",
    "b4" : "before",
    "b4n" : "bye for now",
    "b@u" : "back at you",
    "bae" : "before anyone else",
    "bak" : "back at keyboard",
    "bbbg" : "bye bye be good",
    "bbc" : "british broadcasting corporation",
    "bbias" : "be back in a second",
    "bbl" : "be back later",
    "bbs" : "be back soon",
    "be4" : "before",
    "bfn" : "bye for now",
    "blvd" : "boulevard",
    "bout" : "about",
    "brb" : "be right back",
    "bros" : "brothers",
    "brt" : "be right there",
    "bsaaw" : "big smile and a wink",
    "btw" : "by the way",
    "bwl" : "bursting with laughter",
    "c/o" : "care of",
    "cet" : "central european time",
    "cf" : "compare",
    "cia" : "central intelligence agency",
    "csl" : "can not stop laughing",
    "cu" : "see you",
    "cul8r" : "see you later",
    "cv" : "curriculum vitae",
    "cwot" : "complete waste of time",
    "cya" : "see you",
    "cyt" : "see you tomorrow",
    "dae" : "does anyone else",
    "dbmib" : "do not bother me i am busy",
    "diy" : "do it yourself",
    "dm" : "direct message",
    "dwh" : "during work hours",
    "e123" : "easy as one two three",
    "eet" : "eastern european time",
    "eg" : "example",
    "embm" : "early morning business meeting",
    "encl" : "enclosed",
    "encl." : "enclosed",
    "etc" : "and so on",
    "faq" : "frequently asked questions",
    "fawc" : "for anyone who cares",
    "fb" : "facebook",
    "fc" : "fingers crossed",
    "fig" : "figure",
    "fimh" : "forever in my heart", 
    "ft." : "feet",
    "ft" : "featuring",
    "ftl" : "for the loss",
    "ftw" : "for the win",
    "fwiw" : "for what it is worth",
    "fyi" : "for your information",
    "g9" : "genius",
    "gahoy" : "get a hold of yourself",
    "gal" : "get a life",
    "gcse" : "general certificate of secondary education",
    "gfn" : "gone for now",
    "gg" : "good game",
    "gl" : "good luck",
    "glhf" : "good luck have fun",
    "gmt" : "greenwich mean time",
    "gmta" : "great minds think alike",
    "gn" : "good night",
    "g.o.a.t" : "greatest of all time",
    "goat" : "greatest of all time",
    "goi" : "get over it",
    "gps" : "global positioning system",
    "gr8" : "great",
    "gratz" : "congratulations",
    "gyal" : "girl",
    "h&c" : "hot and cold",
    "hp" : "horsepower",
    "hr" : "hour",
    "hrh" : "his royal highness",
    "ht" : "height",
    "ibrb" : "i will be right back",
    "ic" : "i see",
    "icq" : "i seek you",
    "icymi" : "in case you missed it",
    "idc" : "i do not care",
    "idgadf" : "i do not give a damn fuck",
    "idgaf" : "i do not give a fuck",
    "idk" : "i do not know",
    "ie" : "that is",
    "i.e" : "that is",
    "ifyp" : "i feel your pain",
    "IG" : "instagram",
    "iirc" : "if i remember correctly",
    "ilu" : "i love you",
    "ily" : "i love you",
    "imho" : "in my humble opinion",
    "imo" : "in my opinion",
    "imu" : "i miss you",
    "iow" : "in other words",
    "irl" : "in real life",
    "j4f" : "just for fun",
    "jic" : "just in case",
    "jk" : "just kidding",
    "jsyk" : "just so you know",
    "l8r" : "later",
    "lb" : "pound",
    "lbs" : "pounds",
    "ldr" : "long distance relationship",
    "lmao" : "laugh my ass off",
    "lmfao" : "laugh my fucking ass off",
    "lol" : "laughing out loud",
    "ltd" : "limited",
    "ltns" : "long time no see",
    "m8" : "mate",
    "mf" : "motherfucker",
    "mfs" : "motherfuckers",
    "mfw" : "my face when",
    "mofo" : "motherfucker",
    "mph" : "miles per hour",
    "mr" : "mister",
    "mrw" : "my reaction when",
    "ms" : "miss",
    "mte" : "my thoughts exactly",
    "nagi" : "not a good idea",
    "nbc" : "national broadcasting company",
    "nbd" : "not big deal",
    "nfs" : "not for sale",
    "ngl" : "not going to lie",
    "nhs" : "national health service",
    "nrn" : "no reply necessary",
    "nsfl" : "not safe for life",
    "nsfw" : "not safe for work",
    "nth" : "nice to have",
    "nvr" : "never",
    "nyc" : "new york city",
    "oc" : "original content",
    "og" : "original",
    "ohp" : "overhead projector",
    "oic" : "oh i see",
    "omdb" : "over my dead body",
    "omg" : "oh my god",
    "omw" : "on my way",
    "p.a" : "per annum",
    "p.m" : "after midday",
    "pm" : "prime minister",
    "poc" : "people of color",
    "pov" : "point of view",
    "pp" : "pages",
    "ppl" : "people",
    "prw" : "parents are watching",
    "ps" : "postscript",
    "pt" : "point",
    "ptb" : "please text back",
    "pto" : "please turn over",
    "qpsa" : "what happens", #"que pasa",
    "ratchet" : "rude",
    "rbtl" : "read between the lines",
    "rlrt" : "real life retweet", 
    "rofl" : "rolling on the floor laughing",
    "roflol" : "rolling on the floor laughing out loud",
    "rotflmao" : "rolling on the floor laughing my ass off",
    "rt" : "retweet",
    "ruok" : "are you ok",
    "sfw" : "safe for work",
    "sk8" : "skate",
    "smh" : "shake my head",
    "sq" : "square",
    "srsly" : "seriously", 
    "ssdd" : "same stuff different day",
    "tbh" : "to be honest",
    "tbs" : "tablespooful",
    "tbsp" : "tablespooful",
    "tfw" : "that feeling when",
    "thks" : "thank you",
    "tho" : "though",
    "thx" : "thank you",
    "tia" : "thanks in advance",
    "til" : "today i learned",
    "tl;dr" : "too long i did not read",
    "tldr" : "too long i did not read",
    "tmb" : "tweet me back",
    "tntl" : "trying not to laugh",
    "ttyl" : "talk to you later",
    "u" : "you",
    "u2" : "you too",
    "u4e" : "yours for ever",
    "utc" : "coordinated universal time",
    "w/" : "with",
    "w/o" : "without",
    "w8" : "wait",
    "wassup" : "what is up",
    "wb" : "welcome back",
    "wtf" : "what the fuck",
    "wtg" : "way to go",
    "wtpa" : "where the party at",
    "wuf" : "where are you from",
    "wuzup" : "what is up",
    "wywh" : "wish you were here",
    "yd" : "yard",
    "ygtr" : "you got that right",
    "ynk" : "you never know",
    "zzz" : "sleeping bored and tired"
}

In [73]:
def convert_emoji(text):
    return emoji.demojize(text)

def convert_abbrev(word):
    return abbreviations[word.lower()] if word.lower() in abbreviations.keys() else word 

def convert_abbrev_in_text(text):
    tokens = word_tokenize(text)
    tokens=[convert_abbrev(word) for word in tokens]
    text = ' '.join(tokens)
    return text

def expand_contractions(text):
    expanded_words = []   
    for word in text.split():
        expanded_words.append(contractions.fix(word))  
    expanded_text = ' '.join(expanded_words)
    return expanded_text

def chat_words_conversion(text):
    new_text = []
    for w in text.split():
        if w.upper() in chat_words_list:
            new_text.append(chat_words_map_dict[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

def convert_to_lower(text):
    words = text.split()
    for i in range(len(words)):
        words[i] = words[i].lower()
    sentence = " ".join(words)
    return sentence

def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stopwords.words('english')]
    filtered_sentence = ' '.join(filtered_words)
    return filtered_sentence

def remove_non_alphanum(text):
    text = re.sub(r'[^\w\s]', ' ', text)
    return text

def word_check(word):
    pattern = re.compile(r"(.)\1{2,}")
    sentence = pattern.sub(r"\1\1", word)
    b = TextBlob(sentence)
    return str(b.correct())

def remove_extendedwords(text):
    words = text.split()
    for i in range(len(words)):
        words[i] = word_check(words[i])
    sentence = " ".join(words)
    return sentence

def lemmatize_text(text):
    tokens = word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens)
    lemmatizer = WordNetLemmatizer()
    tag_map = {'N': wordnet.NOUN, 'V': wordnet.VERB, 'R': wordnet.ADV, 'J': wordnet.ADJ}

    lemmatized_words = []
    for word, tag in pos_tags:
        wn_tag = tag_map.get(tag[0].upper(), wordnet.NOUN)
        if wn_tag == wordnet.VERB and word.endswith('ed'):
            wn_tag = wordnet.VERB
            word = word[:-2]
        lemmatized_word = lemmatizer.lemmatize(word, pos=wn_tag)
        lemmatized_words.append(lemmatized_word)

    # join the lemmatized words back into a sentence
    lemmatized_sentence = ' '.join(lemmatized_words)

    return (lemmatized_sentence)

In [74]:
train['Text'] = train['Text'].apply(lambda x: convert_emoji(x))
train['Text'] = train['Text'].apply(lambda x: expand_contractions(x))
train['Text'] = train['Text'].apply(lambda x: convert_to_lower(x))
train['Text'] = train['Text'].apply(lambda x: remove_stopwords(x))
train['Text'] = train['Text'].apply(lambda x: remove_non_alphanum(x))
train['Text'] = train['Text'].apply(lambda x: lemmatize_text(x))
train.Text

10650             read blog year feel like faithful reader
2041                 feel crave naughty sweet snack choose
8668     hop like new draft good time end feeling devas...
1114           crappy week still feel agitat like day want
13902    easily feel quite pressured routine really not...
                               ...                        
7382                               feel bit funny actually
13492    meet great people feel may unintentionally offend
10394                            feel must remain faithful
16865          felt like real good feeling welcom open arm
5047                             feel good run feel normal
Name: Text, Length: 16000, dtype: object

In [75]:
test['Text'] = test['Text'].apply(lambda x: convert_emoji(x))
test['Text'] = test['Text'].apply(lambda x: expand_contractions(x))
test['Text'] = test['Text'].apply(lambda x: convert_to_lower(x))
test['Text'] = test['Text'].apply(lambda x: remove_stopwords(x))
test['Text'] = test['Text'].apply(lambda x: remove_non_alphanum(x))
test['Text'] = test['Text'].apply(lambda x: lemmatize_text(x))
test.Text

3716     think many may dislike still feel impressed ed...
10837                                    feel smart though
6140                                 feel desperately fond
9956     also able get appointment osteopath freak awes...
1549           feel taste dessert sweet suit many customer
                               ...                        
11284     id get past whole oh gawd humiliat feel humiliat
11964    look see stare feel also know sympathetic glan...
5390     sound desperate pathetic feel frantic need anx...
860                  worried feeling suppos church rich dr
15795    exercise feel energetic able perform task good...
Name: Text, Length: 2000, dtype: object

In [76]:
validate['Text'] = validate['Text'].apply(lambda x: convert_emoji(x))
validate['Text'] = validate['Text'].apply(lambda x: expand_contractions(x))
validate['Text'] = validate['Text'].apply(lambda x: convert_to_lower(x))
validate['Text'] = validate['Text'].apply(lambda x: remove_stopwords(x))
validate['Text'] = validate['Text'].apply(lambda x: remove_non_alphanum(x))
validate['Text'] = validate['Text'].apply(lambda x: lemmatize_text(x))
validate.Text

1262                              feel offend think justly
19010    spend two week zombie mode two week feel feeli...
7212     love idea white blouse jumper feel jumper woul...
975               could help feel infuriated left building
2566     think notice prone feel jealous right help sho...
                               ...                        
10900                            angry feeling disillusion
7758     feel like someone need invest money could gorg...
4837     id let kill matter fact feel frightfully well ...
6548     feel though people find quite pleasant smiling...
4481     even think would ready fuck buddys emotion wou...
Name: Text, Length: 2000, dtype: object

Model

In [77]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

train_texts = train['Text']
tokenizer = Tokenizer(15212,lower=True,oov_token='UNK')
tokenizer.fit_on_texts(train_texts)

print('Found %d unique words.' % len(tokenizer.word_index))

# texts_to_sequences: Transforms each text in texts to a sequence of integers. 
# It basically takes each word in the text and replaces it with its corresponding integer value from the word_index dictionary.

train_texts_sequences = tokenizer.texts_to_sequences(train_texts)

# pad_sequences: Ensure that all sequences in a list have the same length. 
train_texts_pad_sequences = pad_sequences(train_texts_sequences, maxlen=80, padding='post') 

Found 12704 unique words.


In [78]:
from tensorflow.keras.utils import to_categorical
emotions = {'sadness': 0, 'joy': 1, 'surprise': 2, 'love': 3, 'anger': 4, 'fear': 5}

# Step 1: Replace all emotion values with integers
train['Emotion'] = train.Emotion.replace(emotions)
train_emotion_integers = train['Emotion'].values

# Step 2: Changing the integers to binary
train_emotion_categorical = to_categorical(train_emotion_integers)
train_emotion_categorical[:6] 

array([[0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0.]], dtype=float32)

In [79]:
validate_texts = validate['Text']
validate_emotion_integers = validate.Emotion.replace(emotions)
validate_texts_sequences = tokenizer.texts_to_sequences(validate_texts)
validate_texts_pad_sequences = pad_sequences(validate_texts_sequences, maxlen=80, padding='post')
validate_emotion_categorical = to_categorical(validate_emotion_integers.values)
validate_emotion_categorical[:6]

array([[0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0.]], dtype=float32)

In [80]:
import tensorflow as tf
try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
  tf.config.experimental_connect_to_cluster(tpu)
  tf.tpu.experimental.initialize_tpu_system(tpu)
  print("All devices: ", tf.config.list_logical_devices('TPU'))
  tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
  tpu_strategy = tf.distribute.get_strategy() 

In [81]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Bidirectional,Dense,Embedding,Dropout

# instantiating the model in the strategy scope creates the model on the TPU
with tpu_strategy.scope():
    model=Sequential()
    model.add(Embedding(15212,64,input_length=80))
    model.add(Dropout(0.6))
    model.add(Bidirectional(LSTM(80,return_sequences=True)))
    model.add(Bidirectional(LSTM(160)))
    model.add(Dense(len(emotions),activation='softmax'))
    print(model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 80, 64)            973568    
                                                                 
 dropout_1 (Dropout)         (None, 80, 64)            0         
                                                                 
 bidirectional_2 (Bidirectio  (None, 80, 160)          92800     
 nal)                                                            
                                                                 
 bidirectional_3 (Bidirectio  (None, 320)              410880    
 nal)                                                            
                                                                 
 dense_1 (Dense)             (None, 6)                 1926      
                                                                 
Total params: 1,479,174
Trainable params: 1,479,174
No

In [82]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [83]:
hist=model.fit(train_texts_pad_sequences, train_emotion_categorical, epochs=10, validation_data = (validate_texts_pad_sequences, validate_emotion_categorical))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [84]:
test_texts = test['Text']
test_emotion_integers = test.Emotion.replace(emotions)
test_texts_sequences = tokenizer.texts_to_sequences(test_texts)
test_texts_pad_sequences = pad_sequences(test_texts_sequences, maxlen=80, padding='post')
test_emotion_categorical = to_categorical(test_emotion_integers.values)
test_emotion_categorical[:7]

array([[0., 0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.]], dtype=float32)

In [94]:
x = model.evaluate(test_texts_pad_sequences, test_emotion_categorical)
print(x)

[0.26521509885787964, 0.9150000214576721]


In [86]:
model.save("../static/model/m2.hdf5")

In [87]:
from tensorflow import keras
model = keras.models.load_model("../static/model/m2.hdf5")

In [88]:
def get_key(value):
    for key,val in emotions.items():
          if (val==value):
            return key
        
def predict(sentence):
    sentence = convert_emoji(sentence)
    sentence = expand_contractions(sentence)
    sentence = chat_words_conversion(sentence)
    sentence = convert_to_lower(sentence)
    sentence = remove_stopwords(sentence)
    sentence = remove_non_alphanum(sentence)
    sentence = remove_extendedwords(sentence)
    sentence = lemmatize_text(sentence)
    print(sentence)
    sentence_lst=[]
    sentence_lst.append(sentence)
    sentence_seq=tokenizer.texts_to_sequences(sentence_lst)
    sentence_padded=pad_sequences(sentence_seq,maxlen=80,padding='post')
    certaintyprediction = model.predict(sentence_padded)[0]
    for key,val in emotions.items():
          print(key + ': ' + str(round(certaintyprediction[val]*100, 2)) + ' %')
    bestpredictionindex = np.argmax(certaintyprediction)
    certainty = str(round(certaintyprediction[bestpredictionindex]*100, 2))
    print('\nI am '+ certainty + ' % sure the emotion is ' + get_key(bestpredictionindex) + '.')

In [96]:
sentence = "🎁"
predict(sentence)

wrapped_gift
sadness: 2.21 %
joy: 8.74 %
surprise: 62.51 %
love: 7.13 %
anger: 0.89 %
fear: 18.52 %

I am 62.51 % sure the emotion is surprise.


In [103]:
predict("this is niceee😁")

nice beaming_face_with_smiling_eyes
sadness: 0.03 %
joy: 98.75 %
surprise: 0.02 %
love: 1.15 %
anger: 0.03 %
fear: 0.01 %

I am 98.75 % sure the emotion is joy.


In [104]:
sentence = "A wedding can be a highly emotional event."
predict(sentence)

wed highly emotional event
sadness: 99.95 %
joy: 0.01 %
surprise: 0.0 %
love: 0.0 %
anger: 0.01 %
fear: 0.03 %

I am 99.95 % sure the emotion is sadness.


In [105]:
sentence = "You are being very rude."
predict(sentence)

rude
sadness: 0.52 %
joy: 0.08 %
surprise: 0.04 %
love: 0.11 %
anger: 98.7 %
fear: 0.55 %

I am 98.7 % sure the emotion is anger.


In [107]:
predict("Today I went to Bangalore railway station, Yeshwantpura, to receive my uncle and aunt who were coming from Mumbai. It was a bright sunny day. Sun was shining like a star. While I and my father were crossing the Orion mall, we saw three elephants that made me reminded of my Kerala trip.Last year I went on a Kerala trip, where we visited around 5 cities like Cochin, Wayanad, Munnar, Kovalam, and Alappuzha. All the places were really awesome and beautiful. Then we went to Elephant junction Thekkady, Kumily, where people go for elephant rides. I rode sitting above the elephant around for 2 and half hours. Then we have also done elephant bath and feeding. We took a lot of pictures with elephants. It was a nice trip and I still can’t get over it.")

today go bangalore railway station yeshwantpura receive uncle aunt come lumbar bright sunny day sun shin like star father cross onion mall saw three elephant make remind herald trip last year go herald trip visit around 5 city like cochon wayanad manner kovalam alappuzha place really awesome beautiful go elephant junction thekkady family people go elephant ride ride sit elephant around 2 half hour also do elephant bath feed take lot picture elephant nice trip still can not get it
sadness: 0.11 %
joy: 99.73 %
surprise: 0.01 %
love: 0.08 %
anger: 0.06 %
fear: 0.02 %

I am 99.73 % sure the emotion is joy.
