In [77]:
import numpy as np
import pandas as pd 
import os
from nltk.corpus import stopwords
from textblob import Word
import re
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')
import emoji

In [78]:
# Read the labelled text/emotion dataset. The relative path is resolved against
# the current working directory of the kernel.
dataset_path = os.path.join(os.getcwd(), "../static/dataset/emoji_dataset.csv")
data = pd.read_csv(dataset_path)
data

Unnamed: 0,Text,Emotion
0,i didnt feel humiliated 😟,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wro...,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy 😠,anger
...,...,...
19995,i just keep feeling like someone is being unki...,anger
19996,im feeling a little cranky negative after this...,anger
19997,i feel that i am useful to my people and that ...,joy
19998,im feeling more comfortable with derby i feel ...,joy


In [79]:
# Keep only the two columns used downstream: the raw text and its emotion label.
selected_columns = ['Text', 'Emotion']
data = data[selected_columns]

In [80]:
# Shuffle once with a fixed seed, then cut the shuffled frame by position into
# 80% train / 10% validation / 10% test.
#
# Plain iloc slices are used instead of np.split(DataFrame, ...): the NumPy route
# goes through DataFrame.swapaxes, which recent pandas releases deprecate, and it
# returns slices of the shuffled frame, so the column assignments made on
# train/validate/test in later cells raise SettingWithCopyWarning. The .copy()
# calls make every split an independent frame that is safe to modify.
shuffled = data.sample(frac=1, random_state=42)
train_end = int(.8 * len(data))
validate_end = int(0.9 * len(data))

train = shuffled.iloc[:train_end].copy()
validate = shuffled.iloc[train_end:validate_end].copy()
test = shuffled.iloc[validate_end:].copy()

In [81]:
# Show the (rows, columns) size of the training split as a quick check of the split.
train.shape

(16000, 2)

In [82]:
def de_repeat(text):
    """Shorten every run of three or more identical characters to exactly two.

    Runs of one or two characters are left untouched, so "soooo" becomes "soo"
    while "too" stays "too". The pattern's `.` does not match a newline, so
    repeated newlines are never collapsed.

    Args:
        text: the string to normalise.

    Returns:
        A new string with the long character runs squeezed.
    """
    return re.sub(r"(.)\1{2,}", r"\1\1", text)

# def preprocess(x):
#     x = ' '.join([item for item in str(x).split() if item not in stopwords.words('english')])
#     x = ' '.join(x.lower() for x in x.split())
#     x = emoji.demojize(x)
#     x = x.replace('[^\w\s]',' ')
#     x = ' '.join([Word(word).lemmatize() for word in x.split()])
#     x = ' '.join(de_repeat(x) for x in x.split())
#     return x

In [83]:
from nltk.corpus import stopwords

# Clean the training text. The steps run in this fixed order:
#   1. drop English stop words
#   2. lowercase every token
#   3. turn emoji into their text aliases
#   4. replace every character that is neither a word character nor whitespace with a space
#   5. lemmatize each token
#   6. squeeze runs of 3+ identical characters down to 2 (de_repeat)
#
# The stop-word list is loaded once into a set; evaluating stopwords.words('english')
# inside the comprehension re-ran the call for every single token.
# NOTE(review): the stop-word match is case-sensitive and happens before lowercasing,
# so a capitalised stop word would survive. Order kept as-is to preserve the existing
# vocabulary - confirm before changing.
english_stopwords = set(stopwords.words('english'))

train['Text'] = train['Text'].apply(
    lambda text: ' '.join(token for token in str(text).split() if token not in english_stopwords)
)
train['Text'] = train['Text'].apply(lambda text: " ".join(token.lower() for token in text.split()))
train['Text'] = train['Text'].apply(emoji.demojize)
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a literal
# string by default, which would silently leave all punctuation in place.
train['Text'] = train['Text'].str.replace(r'[^\w\s]', ' ', regex=True)
train['Text'] = train['Text'].apply(lambda text: " ".join(Word(token).lemmatize() for token in text.split()))
train['Text'] = train['Text'].apply(lambda text: " ".join(de_repeat(token) for token in text.split()))
train


Unnamed: 0,Text,Emotion
10650,noticed several month ago start feeling resent...,anger
2041,love lot different kind sport love hanging fri...,joy
8668,feel even killed agonized extent loudly_crying...,sadness
1114,feel numb way wound really start hurt slightly...,sadness
13902,feel happy inspired little si love reading wri...,joy
...,...,...
7382,pay month month feel shame every time grill ho...,love
13492,feeling determined going get face_with_tears_o...,joy
10394,remember feeling bit confused really questione...,fear
16865,feel helpless look world fearful_face,fear


In [84]:
# Clean the validation text with the same ordered steps as the training split:
# stop-word removal, lowercasing, emoji -> text alias, punctuation -> space,
# lemmatization, and squeezing of repeated characters (de_repeat).
#
# The stop-word list is loaded once into a set; evaluating stopwords.words('english')
# inside the comprehension re-ran the call for every single token.
# NOTE(review): the stop-word match is case-sensitive and happens before lowercasing;
# order kept as-is so all splits are processed identically - confirm before changing.
english_stopwords = set(stopwords.words('english'))

validate['Text'] = validate['Text'].apply(
    lambda text: ' '.join(token for token in str(text).split() if token not in english_stopwords)
)
validate['Text'] = validate['Text'].apply(lambda text: " ".join(token.lower() for token in text.split()))
validate['Text'] = validate['Text'].apply(emoji.demojize)
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a literal
# string by default, which would silently leave all punctuation in place.
validate['Text'] = validate['Text'].str.replace(r'[^\w\s]', ' ', regex=True)
validate['Text'] = validate['Text'].apply(lambda text: " ".join(Word(token).lemmatize() for token in text.split()))
validate['Text'] = validate['Text'].apply(lambda text: " ".join(de_repeat(token) for token in text.split()))
validate



Unnamed: 0,Text,Emotion
1262,go detail long night woke bus feeling like cou...,joy
19010,im feeling tortured write today grimacing_face,fear
7212,still feel like tragic waste pleading_face,sadness
975,feel humiliated body husband make advance towa...,sadness
2566,feel horribly insecure fearful_face,fear
...,...,...
10900,feel helpless make real difference pensive_face,sadness
7758,feel impatient much thanks nic know calm face_...,anger
4837,feel outraged life easy blessed angry_face_wit...,anger
6548,feel like witnessing birth really amazing dm f...,joy


In [85]:
# Clean the test text with the same ordered steps as the training split:
# stop-word removal, lowercasing, emoji -> text alias, punctuation -> space,
# lemmatization, and squeezing of repeated characters (de_repeat).
#
# The stop-word list is loaded once into a set; evaluating stopwords.words('english')
# inside the comprehension re-ran the call for every single token.
# NOTE(review): the stop-word match is case-sensitive and happens before lowercasing;
# order kept as-is so all splits are processed identically - confirm before changing.
english_stopwords = set(stopwords.words('english'))

test['Text'] = test['Text'].apply(
    lambda text: ' '.join(token for token in str(text).split() if token not in english_stopwords)
)
test['Text'] = test['Text'].apply(lambda text: " ".join(token.lower() for token in text.split()))
test['Text'] = test['Text'].apply(emoji.demojize)
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a literal
# string by default, which would silently leave all punctuation in place.
test['Text'] = test['Text'].str.replace(r'[^\w\s]', ' ', regex=True)
test['Text'] = test['Text'].apply(lambda text: " ".join(Word(token).lemmatize() for token in text.split()))
test['Text'] = test['Text'].apply(lambda text: " ".join(de_repeat(token) for token in text.split()))
test



Unnamed: 0,Text,Emotion
3716,didnt want lazy feel groggy kept drinking red ...,sadness
10837,thought good idea gave time recover feeling ne...,fear
6140,feel like didnt really care alexis irritated e...,anger
9956,feel stress free heading holiday rolling_on_th...,joy
1549,keep feeling sometimes one fake till make cryi...,sadness
...,...,...
11284,want savor feeling ecstatic anticipation abide...,joy
11964,im feeling puppy dog rainbow im exhausted yes ...,sadness
5390,feel delicate bouquet,love
860,starting feel little stressed broken_heart,sadness


In [86]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Fit the vocabulary on the training text only. At most 15212 word ids are kept,
# and any word outside the vocabulary is mapped to the 'UNK' token.
train_texts = train['Text']
tokenizer = Tokenizer(num_words=15212, lower=True, oov_token='UNK')
tokenizer.fit_on_texts(train_texts)

unique_word_count = len(tokenizer.word_index)
print('Found %d unique words.' % unique_word_count)

# texts_to_sequences swaps every word of every text for its integer id
# taken from the tokenizer's word_index dictionary.
train_texts_sequences = tokenizer.texts_to_sequences(train_texts)

# pad_sequences brings every sequence to exactly 80 ids, appending the
# padding at the end of the sequence (padding='post').
train_texts_pad_sequences = pad_sequences(
    train_texts_sequences,
    maxlen=80,
    padding='post',
)

Found 13470 unique words.


In [87]:
from tensorflow.keras.utils import to_categorical

# Fixed mapping from emotion label to class index; the index is the position
# of the 1 in the one-hot rows produced below.
emotions = {
    'sadness': 0,
    'joy': 1,
    'surprise': 2,
    'love': 3,
    'anger': 4,
    'fear': 5,
}

# Step 1: swap every emotion name in the training frame for its integer id.
# This overwrites the Emotion column of `train` in place.
train['Emotion'] = train['Emotion'].replace(emotions)
train_emotion_integers = train['Emotion'].values

# Step 2: expand the integer ids into one-hot rows and preview the first six.
train_emotion_categorical = to_categorical(train_emotion_integers)
train_emotion_categorical[:6]

array([[0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.]], dtype=float32)

In [88]:
# Inputs: encode the validation text with the tokenizer fitted on the training
# split, then pad every sequence at the end to a length of 80.
validate_texts = validate['Text']
validate_texts_sequences = tokenizer.texts_to_sequences(validate_texts)
validate_texts_pad_sequences = pad_sequences(
    validate_texts_sequences,
    maxlen=80,
    padding='post',
)

# Targets: map emotion names to integer ids, then to one-hot rows.
# Unlike the training cell, the `validate` frame itself is not modified.
validate_emotion_integers = validate['Emotion'].replace(emotions)
validate_emotion_categorical = to_categorical(validate_emotion_integers.values)
validate_emotion_categorical[:6]

array([[0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0.]], dtype=float32)

In [89]:
import tensorflow as tf

# Pick the distribution strategy: try to connect to and initialise a TPU; if any
# of the setup calls raises ValueError, fall back to the strategy returned by
# tf.distribute.get_strategy().
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    print("All devices: ", tf.config.list_logical_devices('TPU'))
    tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    tpu_strategy = tf.distribute.get_strategy()

In [90]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Bidirectional,Dense,Embedding,Dropout

# Build the classifier inside the strategy scope so the model is created under
# the selected distribution strategy (the TPU when one was found).
# Architecture: embedding -> dropout -> two stacked bidirectional LSTMs -> softmax
# over the emotion classes.
with tpu_strategy.scope():
    model = Sequential()
    # 15212 matches the tokenizer's num_words and 80 matches the padded sequence length.
    model.add(Embedding(input_dim=15212, output_dim=64, input_length=80))
    model.add(Dropout(rate=0.6))
    # The first recurrent layer returns the full sequence so the second one can consume it.
    model.add(Bidirectional(LSTM(units=80, return_sequences=True)))
    model.add(Bidirectional(LSTM(units=160)))
    # One output unit per entry in the `emotions` mapping.
    model.add(Dense(units=len(emotions), activation='softmax'))
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only appended a stray "None" line to the output.
    model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 80, 64)            973568    
                                                                 
 dropout_5 (Dropout)         (None, 80, 64)            0         
                                                                 
 bidirectional_10 (Bidirecti  (None, 80, 160)          92800     
 onal)                                                           
                                                                 
 bidirectional_11 (Bidirecti  (None, 320)              410880    
 onal)                                                           
                                                                 
 dense_5 (Dense)             (None, 6)                 1926      
                                                                 
Total params: 1,479,174
Trainable params: 1,479,174
No

In [91]:
# Configure training: Adam optimizer, categorical cross-entropy loss, and
# accuracy as the reported metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [92]:
# Train for 10 epochs on the padded training sequences, evaluating on the
# validation split via validation_data. The value returned by fit() is kept in `hist`.
hist = model.fit(
    train_texts_pad_sequences,
    train_emotion_categorical,
    epochs=10,
    validation_data=(validate_texts_pad_sequences, validate_emotion_categorical),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10

In [97]:
# Inputs: encode the test text with the tokenizer fitted on the training split,
# then pad every sequence at the end to a length of 80.
test_texts = test['Text']
test_texts_sequences = tokenizer.texts_to_sequences(test_texts)
test_texts_pad_sequences = pad_sequences(
    test_texts_sequences,
    maxlen=80,
    padding='post',
)

# Targets: map emotion names to integer ids, then to one-hot rows.
# The `test` frame itself is not modified.
test_emotion_integers = test['Emotion'].replace(emotions)
test_emotion_categorical = to_categorical(test_emotion_integers.values)
test_emotion_categorical[:7]

In [96]:
# Score the trained model on the held-out test split and print the result.
# With the compile settings used in this notebook the result should be
# [loss, accuracy] - TODO confirm against the printed output.
x = model.evaluate(
    test_texts_pad_sequences,
    test_emotion_categorical,
)
print(x)

In [None]:
# from tensorflow.keras.models import Sequential, model_from_json
# from tensorflow.keras.layers import Dense
# import numpy
# import os
# model_json = model.to_json()
# with open("../static/model/model.json", "w") as json_file:
#     json_file.write(model_json)
# # serialize weights to HDF5
# model.save_weights("../static/model/model.h5")
# print("Saved model to disk")

In [98]:
# import pickle
# # dump information to that file
# pickle.dump(tokenizer, open('../static/model/tokenizer.pkl', 'wb'))