In [2]:
import numpy as np 
import pandas as pd 
import emoji
import matplotlib.pyplot as plt
import random
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers import Embedding
from keras.callbacks import ReduceLROnPlateau

In [3]:
# Read the raw train/test CSVs. Neither file has a header row, so pandas
# numbers the columns 0..n; columns 2 and 3 of the training file are discarded.
# NOTE(review): these are absolute, machine-specific paths.
train_ds = pd.read_csv(
    '/Users/alfahwun/Downloads/train_emoji.csv',
    header=None,
).drop(columns=[2, 3])
test_ds = pd.read_csv(
    '/Users/alfahwun/Downloads/test_emoji.csv',
    header=None,
)

In [4]:
# Training split: column 0 is used as the model input (sentences),
# column 1 as the target (labels).
X_train, Y_train = train_ds[0], train_ds[1]

In [5]:
# Test split: same column layout as the training split
# (column 0 -> inputs, column 1 -> labels).
X_test, Y_test = test_ds[0], test_ds[1]

In [6]:
from keras.utils import to_categorical

# One-hot encode the integer labels. The class count is pinned explicitly so the
# train and test matrices always have the same width; when it is inferred
# (max label + 1) a split that happens to lack the highest label comes out narrower.
NUM_CLASSES = 5  # labels 0-4: one per emoji in the dictionary / one per output unit
Y_train_hot = to_categorical(Y_train, num_classes=NUM_CLASSES)
Y_test_hot = to_categorical(Y_test, num_classes=NUM_CLASSES)

In [7]:
# Maps a class label (as a string) to the emoji shortcode displayed for it.
emoji_dictionary = {"0": ":heart:",
                    "1": ":baseball:",
                    "2": ":smile:",
                    "3": ":disappointed:",
                    "4": ":fork_and_knife:"}


def label_to_emoji(label):
    """Return the emoji character for a class label.

    Args:
        label: Class label (int or str) that is a key of ``emoji_dictionary``
            once converted with ``str()``.

    Returns:
        The string produced by ``emoji.emojize`` for the label's shortcode.

    Raises:
        KeyError: If ``str(label)`` is not in ``emoji_dictionary``.
    """
    shortcode = emoji_dictionary[str(label)]
    # Several shortcodes above (":heart:", ":smile:", ":disappointed:") are
    # alias names, so alias lookup must be enabled or emojize() returns the
    # text unchanged. The keyword for that differs between emoji releases.
    try:
        return emoji.emojize(shortcode, language="alias")
    except (TypeError, KeyError):
        # Older emoji releases: no `language` keyword, or no 'alias' language.
        return emoji.emojize(shortcode, use_aliases=True)


In [8]:
# Reads GloVe word vectors from a text file. NOTE: the file loaded below is glove.6B.100d.txt, i.e. 100-d vectors, not 50-d.
def read_glove_vecs(glove_file):
    """Parse a GloVe text file into vocabulary lookups and a word -> vector map.

    Each non-empty line is split on whitespace: the first field is the word,
    the remaining fields are its vector components.

    Args:
        glove_file: Path to the GloVe text file.

    Returns:
        A tuple ``(words_to_index, index_to_words, word_to_vec_map)``:
        ``words_to_index`` maps each word to its 1-based position in the
        sorted vocabulary, ``index_to_words`` is the inverse mapping, and
        ``word_to_vec_map`` maps each word to a ``float64`` NumPy vector.
    """
    word_to_vec_map = {}  # word -> embedding vector; keys double as the vocabulary

    # The distributed GloVe files are UTF-8 and contain non-ASCII tokens, so
    # do not rely on the platform's default encoding.
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Skip blank lines (e.g. a trailing newline at end of file).
                continue
            word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float64)

    words_to_index = {}   # word -> index in the sorted vocabulary
    index_to_words = {}   # index -> word
    # Indices start at 1, leaving 0 unassigned.
    for i, w in enumerate(sorted(word_to_vec_map), start=1):
        words_to_index[w] = i
        index_to_words[i] = w
    return words_to_index, index_to_words, word_to_vec_map


In [9]:
# Build the vocabulary lookups and the word -> vector map from the GloVe file
# (the file name indicates 100-dimensional vectors).
words_to_index, index_to_words, word_to_vec_map = read_glove_vecs('/Users/alfahwun/Downloads/glove.6B.100d.txt')

In [15]:
# Summarise the embedding table instead of rendering every entry as cell output:
# (number of words, shape of one vector).
len(word_to_vec_map), next(iter(word_to_vec_map.values())).shape

In [12]:
def sentences_to_indices(X, word_to_index, max_len):
    """Convert sentences into a zero-padded matrix of vocabulary indices.

    Each sentence is lower-cased and split on whitespace; every word found in
    ``word_to_index`` is written, in order, into that sentence's row.

    Args:
        X: One-dimensional collection of sentence strings (NumPy array,
            pandas Series, ...).
        word_to_index: Mapping from word to integer index.
        max_len: Number of columns in the output. Sentences with more known
            words than this are truncated.

    Returns:
        NumPy array of shape ``(len(X), max_len)``. Unused trailing positions
        are left at 0.
    """
    m = len(X)
    X_indices = np.zeros((m, max_len))

    # Iterate by position rather than with X[i]: on a pandas Series, X[i] is a
    # label lookup and fails when the index is not exactly 0..m-1.
    for i, sentence in enumerate(X):
        j = 0
        for w in sentence.lower().split():
            if j >= max_len:
                # Row is full; drop the rest of the sentence instead of
                # writing past the last column.
                break
            index = word_to_index.get(w)
            if index is None:
                # Word is not in the vocabulary: skip it rather than abort.
                continue
            X_indices[i, j] = index
            j += 1

    return X_indices


In [None]:
# Pad every sentence to the length (in words) of the longest sentence in either split.
maxWords = max(len(sentence.split()) for sentence in pd.concat([X_train, X_test]))
X_train_index = sentences_to_indices(X_train, words_to_index, maxWords)

In [None]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """Create a non-trainable Keras ``Embedding`` layer loaded with the given vectors.

    Args:
        word_to_vec_map: Mapping from word to its embedding vector. All
            vectors are expected to have the same length.
        word_to_index: Mapping from word to its row index in the embedding
            matrix. Every word here must also be in ``word_to_vec_map``.

    Returns:
        A built ``Embedding`` layer whose weights are the embedding matrix;
        rows with no word assigned stay all-zero.

    Raises:
        KeyError: If a word in ``word_to_index`` has no vector.
    """
    # One extra row so that the largest index is valid; with the 1-based
    # indices built by read_glove_vecs, row 0 is left as zeros.
    vocab_len = len(word_to_index) + 1
    # Take the dimensionality from the vectors themselves instead of
    # hard-coding it, so any GloVe file (50-d, 100-d, ...) works.
    emb_dim = len(next(iter(word_to_vec_map.values())))

    emb_matrix = np.zeros((vocab_len, emb_dim))

    # Row `index` of the matrix holds the vector of the word with that index.
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    # trainable=False keeps the pretrained vectors fixed during training.
    embedding_layer = Embedding(vocab_len, emb_dim, trainable=False)

    # The layer must be built before its weights can be assigned.
    embedding_layer.build((None,))
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

In [None]:
def Emojify(input_shape, word_to_vec_map, word_to_index):
    """Build the sentence -> emoji classifier: embeddings, two LSTMs, softmax.

    Args:
        input_shape: Shape of one input sample, e.g. ``(max_len,)``.
        word_to_vec_map: Mapping from word to its embedding vector.
        word_to_index: Mapping from word to its vocabulary index.

    Returns:
        An uncompiled Keras ``Model`` that maps a batch of index sequences to
        a probability distribution over the 5 classes.
    """
    # Integer word indices in, pretrained embedding vectors out.
    sentence_indices = Input(shape=input_shape, dtype='int32')
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    embeddings = embedding_layer(sentence_indices)

    # First LSTM returns the full sequence so the second LSTM can consume it;
    # the second returns only its final state.
    X = LSTM(128, return_sequences=True)(embeddings)
    X = Dropout(0.5)(X)
    X = LSTM(128, return_sequences=False)(X)
    X = Dropout(0.5)(X)
    # Linear projection to the 5 class scores, then a single softmax.
    # (Applying softmax twice flattens the output distribution.)
    X = Dense(5)(X)
    X = Activation('softmax')(X)

    model = Model(inputs=sentence_indices, outputs=X)

    return model



In [None]:
# The model's input length must equal the padded width of the index matrix.
maxWords = X_train_index.shape[1]
emojifier = Emojify((maxWords,), word_to_vec_map, words_to_index)
# Halve the learning rate when the training loss has not improved for 3 epochs.
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.5, patience=3, min_lr=0.00001, verbose=1)
emojifier.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
emojifier.fit(X_train_index, Y_train_hot, epochs = 100, batch_size = 16, shuffle=True, 
                               callbacks=[reduce_lr])
    