# Emojify
## Map an emoji to a text according to its context

## What is covered?
1. Data Engineering
2. Load Embedding Vectors
3. Train the model
4. Test & Results

In [2]:
import pandas as pd
from util import Utils
import numpy as np
import emoji

## 1. Data Engineering

In [3]:
# Load the train & test CSV files.
# Both files are read with header=None, so the column names are supplied
# explicitly: "Doc" and "Label" are the two columns consumed by the
# tokenisation step; "c3"/"c4" simply name the remaining columns.
cols = ["Doc", "Label", "c3", "c4"]
df = pd.read_csv("emojify_data.csv", header=None, names = cols)
df2 = pd.read_csv("test_emoji.csv", header=None, names = cols)
# Preview the first rows of the training frame.
df.head()

Unnamed: 0,Doc,Label,c3,c4
0,French macaroon is so tasty,4,,
1,work is horrible,3,,
2,I am upset,3,,[3]
3,throw the ball,1,,[2]
4,Good joke,2,,


### Labels to Emoji
<p>Each text is labeled with an integer ranging from 0 to 4. Each integer corresponds to a specific emoji.</p>

In [4]:
# Lookup table from class label (as a string key) to the text handed to
# emoji.emojize: label "0" is stored as literal code points, the others as
# ":alias:" names.
emoji_dictionary = {"0": "\u2764\uFE0F",
                    "1": ":baseball:",
                    "2": ":smile:",
                    "3": ":disappointed:",
                    "4": ":fork_and_knife:"}

def label_to_emoji(label):
    """Convert a class label into a printable emoji string.

    Args:
        label: Class label; it is converted with ``str()`` and looked up in
            ``emoji_dictionary``.

    Returns:
        The string produced by ``emoji.emojize`` for that label.

    Raises:
        KeyError: If ``str(label)`` is not a key of ``emoji_dictionary``.
    """
    alias = emoji_dictionary[str(label)]
    try:
        # Older releases of the emoji package take ``use_aliases``.
        return emoji.emojize(alias, use_aliases=True)
    except TypeError:
        # emoji >= 2.0 removed ``use_aliases``; alias names are selected
        # through the ``language`` argument instead.
        return emoji.emojize(alias, language="alias")

# Print each label next to its emoji
for i in range(5):
    print("label", i,label_to_emoji(i))

label 0 ❤️
label 1 ⚾
label 2 😄
label 3 😞
label 4 🍴


In [5]:
# Pull the sentence and label columns out of the train and test frames
docs, labels = df["Doc"], df["Label"]
docs_test, labels_test = df2["Doc"], df2["Label"]

# Tokenise every sentence on whitespace and collect the label found at the
# same position in the label column
X = [sentence.split() for sentence in docs]
y = [labels[pos] for pos in range(len(docs))]

X_test = [sentence.split() for sentence in docs_test]
y_test = [labels_test[pos] for pos in range(len(docs_test))]

# Show one tokenised example from each split together with its emoji
print(X[0],label_to_emoji(y[0]))
print(X_test[1],label_to_emoji(y_test[1]))

['French', 'macaroon', 'is', 'so', 'tasty'] 🍴
['he', 'did', 'not', 'answer'] 😞


## 2. Load GloVe Embedding Vectors

In [6]:
util = Utils()
# Raw string: the Windows path contains backslashes followed by letters
# (\R, \G, \g) which are invalid escape sequences in a normal string literal
# and trigger a warning on recent Python versions. The resulting value is
# unchanged.
# NOTE(review): this is an absolute, machine-specific path; adjust it to
# wherever the GloVe file lives locally.
emb_file = r'D:\Resources\Glove_Embeddings\glove.6B.50d.txt'
# Passed to read_emb_vec as `dimention`; the file name suggests 50-dimensional vectors.
dimention = 50
word_to_index, index_to_word, word_to_vec_map = util.read_emb_vec(file_name=emb_file, dimention = dimention)

## 3. Train the Keras Model

In [7]:
import os
# Silence TensorFlow's native logging. The variable has to be in the
# environment before TensorFlow is first imported (Keras imports it as its
# backend), so it is set ahead of the Keras/TensorFlow imports below.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
import tensorflow as tf

# Seed NumPy's global RNG once, after the imports. An earlier seed(0) call was
# immediately overridden by this one, so only the effective seed is kept.
np.random.seed(1)

Using TensorFlow backend.


In [8]:
def sentences_to_indices(X, word_to_index, max_len):
    """Convert tokenised documents to rows of embedding-vocabulary indices.

    Each token is lower-cased and looked up in ``word_to_index``. Rows shorter
    than ``max_len`` are right-padded with zeros; rows longer than ``max_len``
    are truncated to their first ``max_len`` tokens (previously such a row
    raised ``IndexError``).

    Args:
        X: Sequence of documents, each a list of string tokens.
        word_to_index: Mapping from lower-case word to its integer index.
        max_len: Number of columns in the returned array.

    Returns:
        A float ``numpy`` array of shape ``(len(X), max_len)``.

    Raises:
        KeyError: If a token within the first ``max_len`` positions is not in
            ``word_to_index``.
    """
    X_indices = np.zeros((len(X), max_len))
    for row, tokens in enumerate(X):
        # Slice so that an over-long document cannot index past the last column
        for col, token in enumerate(tokens[:max_len]):
            X_indices[row, col] = word_to_index[token.lower()]
    return X_indices

In [9]:
# Sanity check: index the first two training sentences, padded to 5 columns
example = [X[0], X[1]]
example_indices = sentences_to_indices(example, word_to_index, max_len=5)
print("X1 =", example)
print("X1_indices =", example_indices)

X1 = [['French', 'macaroon', 'is', 'so', 'tasty'], ['work', 'is', 'horrible']]
X1_indices = [[153730. 229211. 192973. 336115. 353731.]
 [389837. 192973. 181872.      0.      0.]]


In [10]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """Create a frozen Keras ``Embedding`` layer loaded with pre-trained vectors.

    Args:
        word_to_vec_map: Mapping from word to its embedding vector (an array
            exposing ``.shape``).
        word_to_index: Mapping from word to its row index in the embedding
            matrix.

    Returns:
        A built, non-trainable ``Embedding`` layer whose weight matrix has
        ``len(word_to_index) + 1`` rows, with row ``index`` holding the vector
        of the word mapped to ``index``.

    Raises:
        ValueError: If ``word_to_index`` is empty, so no dimension can be inferred.
    """
    # One extra row, as in the original implementation ("to fit Keras embedding").
    # NOTE(review): presumably indices start at 1 so row 0 stays all-zero for
    # padding - confirm against Utils.read_emb_vec.
    vocab_len = len(word_to_index) + 1

    # Infer the vector size from any vocabulary word instead of requiring the
    # specific word "cucumber" to be present in the embedding file.
    probe_word = next(iter(word_to_index), None)
    if probe_word is None:
        raise ValueError("word_to_index is empty; cannot infer the embedding dimension")
    emb_dim = word_to_vec_map[probe_word].shape[0]

    emb_matrix = np.zeros((vocab_len, emb_dim))
    print(emb_matrix.shape)
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    embedding_layer = Embedding(input_dim=vocab_len, output_dim=emb_dim, trainable=False)
    # The layer must be built before its weights can be replaced
    embedding_layer.build((None,))
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

In [5]:
def emojify_model(input_shape, word_to_vec_map, word_to_index, num_classes=5):
    """Build the Emojify classifier: embedding -> 2 stacked LSTMs -> softmax.

    Args:
        input_shape: Shape of one input example, e.g. ``(max_len,)``.
        word_to_vec_map: Mapping from word to its embedding vector.
        word_to_index: Mapping from word to its embedding-matrix index.
        num_classes: Number of output classes (defaults to the 5 emoji labels).

    Returns:
        An uncompiled Keras ``Model`` mapping int32 index sequences to a
        ``num_classes``-way softmax output.
    """
    sentence_indices = Input(input_shape, dtype='int32')

    # Look the indices up in the pre-trained embedding layer
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    embeddings = embedding_layer(sentence_indices)

    # First LSTM returns the full sequence so the second LSTM can consume it;
    # the second returns only its final output.
    hidden = LSTM(128, return_sequences=True)(embeddings)
    hidden = Dropout(0.5)(hidden)
    hidden = LSTM(128, return_sequences=False)(hidden)
    hidden = Dropout(0.5)(hidden)

    scores = Dense(num_classes)(hidden)
    probabilities = Activation('softmax')(scores)

    # `inputs`/`outputs` are the Keras 2 keyword names; the legacy
    # `input`/`output` spellings are deprecated.
    model = Model(inputs=sentence_indices, outputs=probabilities)
    return model


In [4]:
def getMaxLen(X):
    """Return the length of the longest document in ``X``.

    Args:
        X: Iterable of sized items (e.g. lists of tokens).

    Returns:
        The largest ``len(x)`` over all items, or 0 when ``X`` is empty.
    """
    # Use the builtin instead of a hand-rolled loop whose local variable
    # shadowed the name `max`.
    return max((len(x) for x in X), default=0)

# Size the model input to the longest sentence in X (the training sentences).
# NOTE(review): the same maxLen is later used to index X_test, so it is
# derived from the training split only.
maxLen = getMaxLen(X)
print("Max length of doc is ", maxLen)

# Build the model for inputs of shape (maxLen,) and print its layer summary.
model = emojify_model((maxLen,), word_to_vec_map, word_to_index)
model.summary()

NameError: name 'X' is not defined

In [15]:
# Configure training: categorical cross-entropy loss, Adam optimizer, and
# accuracy reported as the metric.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [16]:
# Training split: token lists -> index matrix with maxLen columns,
# labels -> one-hot encoding over C = 5 classes.
X_train_indices = sentences_to_indices(X, word_to_index, maxLen)
Y_train_oh = util.convert_to_one_hot(np.array(y), C = 5)

# Test split: the same conversion, using the same maxLen.
X_test_indices = sentences_to_indices(X_test, word_to_index, maxLen)
Y_test_oh = util.convert_to_one_hot(np.array(y_test), C = 5)

In [1]:
# Train for 35 epochs with one example per batch, shuffling the training data.
# NOTE(review): the test split is passed as validation_data, so the validation
# metrics are computed on the same data that is evaluated in the results section.
model.fit(X_train_indices, Y_train_oh, epochs = 35, batch_size = 1, shuffle=True, validation_data=(X_test_indices, Y_test_oh))

NameError: name 'model' is not defined

## 4. Test & Results

In [21]:
# Print every test sentence with its expected and predicted emoji, so that
# mislabelled examples can be spotted by comparing the two emoji on each line.

Y_test = y_test
pred = model.predict(X_test_indices)
for i in range(len(X_test)):
    # Predicted class = position of the largest score in this example's output
    num = np.argmax(pred[i])
    # Rebuild the sentence with the same spacing as before: one leading space
    # and one space after every token
    sentence = " " + "".join(w + " " for w in X_test[i])
    print('Text: ' + sentence + ', Expected emoji:' + label_to_emoji(Y_test[i]) + ', predicted ' + label_to_emoji(num).strip())

Text:  I want to eat , Expected emoji:🍴, predicted 🍴
Text:  he did not answer , Expected emoji:😞, predicted 😞
Text:  he got a raise , Expected emoji:😄, predicted 😄
Text:  she got me a present , Expected emoji:❤️, predicted ❤️
Text:  ha ha ha it was so funny , Expected emoji:😄, predicted 😄
Text:  he is a good friend , Expected emoji:❤️, predicted ❤️
Text:  I am upset , Expected emoji:❤️, predicted 😞
Text:  We had such a lovely dinner tonight , Expected emoji:❤️, predicted ❤️
Text:  where is the food , Expected emoji:🍴, predicted 🍴
Text:  Stop making this joke ha ha ha , Expected emoji:😄, predicted 😄
Text:  where is the ball , Expected emoji:⚾, predicted ⚾
Text:  work is hard , Expected emoji:😞, predicted 😞
Text:  This girl is messing with me , Expected emoji:😞, predicted 😞
Text:  are you serious ha ha , Expected emoji:😄, predicted 😄
Text:  Let us go play baseball , Expected emoji:⚾, predicted ⚾
Text:  This stupid grader is not working , Expected emoji:😞, predicted 😞
Text:  work is horri