In [21]:
import numpy as np
import emoji
import matplotlib.pyplot as plt
import csv
from keras.layers.embeddings import Embedding
from keras.models import Model
from keras.layers import Input, LSTM, Dropout, Dense, Activation



%matplotlib inline

In [3]:
def _read_sentences_and_labels(csv_path):
    """Read a two-column CSV of (sentence, integer label) rows.

    Parameters
    ----------
    csv_path : str
        Path to a CSV file whose first column is the sentence text and
        whose second column is the class label.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Array of sentences (strings) and array of labels converted to int.
    """
    sentences, labels = [], []
    # newline='' is what the csv module expects so that it can handle line
    # endings (and quoted embedded newlines) itself.
    with open(csv_path, newline='') as csv_file:
        for row in csv.reader(csv_file):
            if not row:
                # csv.reader yields an empty list for blank lines; skip them
                # instead of failing on row[0].
                continue
            sentences.append(row[0])
            labels.append(row[1])
    return np.asarray(sentences), np.asarray(labels, dtype=int)


X_train, y_train = _read_sentences_and_labels('emoji_train.csv')
X_test, y_test = _read_sentences_and_labels('emoji_test.csv')




In [4]:
# Sequence length used for padding: the largest *word* count of any training
# sentence. (Picking the sentence with the most characters and then counting
# its words can under-estimate this, because the longest string is not
# necessarily the one with the most words.)
max_len = max(len(sentence.split()) for sentence in X_train)

# Maps a class label (as a string) to what is displayed for it. Label "0" is a
# literal Unicode heart; the others are emoji aliases that are expanded with
# emoji.emojize(..., use_aliases=True) when results are printed.
emoji_dict = {"0": "\u2764\uFE0F",
              "1": ":baseball:",
              "2": ":smile:",
              "3": ":disappointed:",
              "4": ":fork_and_knife:"}

In [5]:
# word -> embedding vector, as read from the GloVe text file.
word_vec_dict = {}
# Every word seen in the GloVe file.
vocab = set()

with open('glove.6B.50d.txt', 'r', encoding='utf-8') as f:
    for raw_line in f:
        fields = raw_line.strip().split()
        if not fields:
            # Tolerate blank lines rather than failing on fields[0].
            continue
        token = fields[0]
        vocab.add(token)
        word_vec_dict[token] = np.array(fields[1:], dtype=np.float64)

# Index <-> word lookups, created once (not once per file line) so they exist
# even when the file is empty. Indices start at 1 and follow the sorted
# vocabulary; index 0 is left unused by this mapping.
word_dict = {}           # index -> word
inverse_word_dict = {}   # word  -> index
for idx, word in enumerate(sorted(vocab), start=1):
    word_dict[idx] = word
    inverse_word_dict[word] = idx


In [6]:
def convert_to_indices(X, inverse_word_dict, max_len):
    """Turn sentences into a zero-padded matrix of word indices.

    Each sentence is lower-cased and split on whitespace; every word is
    replaced by its index from ``inverse_word_dict``.

    Parameters
    ----------
    X : sequence of str
        The sentences to convert (a NumPy array or a list).
    inverse_word_dict : dict
        Mapping from word to integer index.
    max_len : int
        Number of columns in the result. Shorter sentences are padded with
        zeros on the right; longer sentences are truncated to ``max_len``
        words instead of raising IndexError.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(X), max_len)``. Words missing from
        ``inverse_word_dict`` are encoded as 0 (the same value used for
        padding) instead of raising KeyError.
    """
    num_examples = len(X)
    X_indices = np.zeros((num_examples, max_len))

    for row, sentence in enumerate(X):
        # Slicing caps the sentence at max_len words so the column index can
        # never run past the end of the row.
        words = sentence.lower().split()[:max_len]
        for col, word in enumerate(words):
            X_indices[row, col] = inverse_word_dict.get(word, 0)

    return X_indices
    
  

In [10]:
def create_embedding_layer(word_vec_dict, inverse_word_dict):
    """Create a frozen Keras Embedding layer initialised from word vectors.

    Parameters
    ----------
    word_vec_dict : dict
        Mapping from word to its embedding vector (1-D NumPy array). All
        vectors are assumed to have the same length.
    inverse_word_dict : dict
        Mapping from word to its row index in the embedding matrix.

    Returns
    -------
    Embedding
        A built, non-trainable Embedding layer whose weights are the vectors
        from ``word_vec_dict`` placed at the rows given by
        ``inverse_word_dict``.

    Raises
    ------
    ValueError
        If ``word_vec_dict`` is empty, since the embedding dimension cannot
        be inferred.
    """
    if not word_vec_dict:
        raise ValueError('word_vec_dict is empty; cannot infer the embedding dimension')

    # One extra row: the matrix has len(inverse_word_dict) + 1 rows, and any
    # row that no word maps to stays all-zero.
    vocab_size = len(inverse_word_dict) + 1
    # Read the dimension from an arbitrary vector rather than from a
    # hard-coded word that a given vocabulary might not contain.
    embed_dim = next(iter(word_vec_dict.values())).shape[0]

    embed_matrix = np.zeros((vocab_size, embed_dim))
    for word, idx in inverse_word_dict.items():
        embed_matrix[idx, :] = word_vec_dict[word]

    embed_layer = Embedding(vocab_size, embed_dim, trainable=False)
    # The layer must be built before its weights can be assigned.
    embed_layer.build((None, ))
    embed_layer.set_weights([embed_matrix])

    return embed_layer
        


In [22]:
def model(in_shape, word_vec_dict, inverse_word_dict,
          num_classes=5, lstm_units=128, dropout_rate=0.5):
    """Build the (uncompiled) stacked-LSTM sentence classifier.

    Architecture: input of word indices -> embedding layer from
    ``create_embedding_layer`` -> LSTM (returning sequences) -> Dropout ->
    LSTM -> Dropout -> Dense -> softmax.

    Parameters
    ----------
    in_shape : tuple
        Shape of one input example, passed to ``Input(shape=...)``.
    word_vec_dict : dict
        Mapping from word to embedding vector, forwarded to
        ``create_embedding_layer``.
    inverse_word_dict : dict
        Mapping from word to index, forwarded to ``create_embedding_layer``.
    num_classes : int, optional
        Number of units in the final Dense layer (default 5).
    lstm_units : int, optional
        Number of units in each of the two LSTM layers (default 128).
    dropout_rate : float, optional
        Rate used by both Dropout layers (default 0.5).

    Returns
    -------
    Model
        A Keras ``Model`` mapping the index input to the softmax output.
    """
    sent_indices = Input(shape=in_shape)
    embed_layer = create_embedding_layer(word_vec_dict, inverse_word_dict)
    in_embed = embed_layer(sent_indices)

    # The first LSTM returns the full sequence so that it can feed the
    # second LSTM, which returns only its final output.
    X = LSTM(lstm_units, return_sequences=True)(in_embed)
    X = Dropout(dropout_rate)(X)
    X = LSTM(lstm_units)(X)
    X = Dropout(dropout_rate)(X)
    X = Dense(num_classes)(X)
    X = Activation('softmax')(X)

    return Model(inputs=sent_indices, outputs=X)


In [23]:
# Instantiate the classifier for inputs of max_len word indices and print
# its layer-by-layer summary.
m = model(
    in_shape=(max_len,),
    word_vec_dict=word_vec_dict,
    inverse_word_dict=inverse_word_dict,
)
m.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 10)                0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 10, 50)            20000050  
_________________________________________________________________
lstm_5 (LSTM)                (None, 10, 128)           91648     
_________________________________________________________________
dropout_5 (Dropout)          (None, 10, 128)           0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_6 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 5)                 645       
__________

In [24]:
# Labels are fed as one-hot rows, hence categorical cross-entropy.
m.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [25]:
# Sentences -> zero-padded index matrix.
X_train_in = convert_to_indices(X_train, inverse_word_dict, max_len)
# Integer labels -> one-hot rows, by indexing into a 5x5 identity matrix.
one_hot_rows = np.eye(5)
y_train_in = one_hot_rows[y_train.ravel()]



In [26]:
# Train for 50 epochs with mini-batches of 32 and shuffling enabled.
m.fit(
    X_train_in,
    y_train_in,
    batch_size=32,
    epochs=50,
    shuffle=True,
)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x18c8637a6d8>

In [28]:
# Pad the test sentences to the same max_len the model was built with,
# rather than a hard-coded 10 that silently breaks if the training data (and
# therefore the model's input width) changes.
X_test_in = convert_to_indices(X_test, inverse_word_dict, max_len=max_len)
# Integer labels -> one-hot rows, by indexing into a 5x5 identity matrix.
y_test_in = np.eye(5)[y_test.reshape(-1)]
# evaluate() returns [loss, accuracy] for the metrics given at compile time;
# only the accuracy is reported here.
_, accuracy = m.evaluate(X_test_in, y_test_in)
print('Test accuracy = %f' % (accuracy))

Test accuracy = 0.686275


In [29]:
# Print every test sentence the model got wrong, together with the expected
# and the predicted emoji.
y_predict = m.predict(X_test_in)
# The most probable class for each test example.
predicted_labels = np.argmax(y_predict, axis=1)

for sentence, expected, predicted in zip(X_test, y_test, predicted_labels):
    if predicted == expected:
        continue
    # emoji_dict is keyed by the label as a string.
    expected_emoji = emoji.emojize(emoji_dict[str(expected)], use_aliases=True)
    predicted_emoji = emoji.emojize(emoji_dict[str(predicted)], use_aliases=True)
    # Label each part explicitly and separate them, so the sentence is not
    # shown under the "Prediction" heading.
    print('Sentence: ' + sentence +
          ' | Expected: ' + expected_emoji +
          ' | Prediction: ' + predicted_emoji)





Expected😄Predictionhe got a raise⚾
Expected❤️PredictionI am upset😞
Expected❤️PredictionWe had such a lovely dinner tonight😄
Expected😞Predictionwork is hard😄
Expected😞PredictionThis girl is messing with me❤️
Expected😄Predictionare you serious ha ha😞
Expected😞Predictionwork is horrible😄
Expected🍴Predictionany suggestions for dinner😄
Expected😄Predictionyou brighten my day❤️
Expected😞Predictionshe is a bully❤️
Expected😞PredictionI worked during my birthday😄
Expected😄Predictionenjoy your break⚾
Expected❤️Predictionvalentine day is near😄
Expected😄PredictionI will go dance⚾
Expected😄PredictionI like your jacket❤️
Expected❤️PredictionI love to the stars and back😄
