# Emojifier V1

In [20]:
import numpy as np
import emoji
import pandas as pd
import matplotlib.pyplot as plt

In [21]:
train_df = pd.read_csv('data/train_emoji.csv', header=None, names=['phrase','emoji'], usecols=[0,1])
test_df = pd.read_csv('data/test_emoji.csv', header=None, names=['phrase','emoji'], usecols=[0,1])

### Train data frame

In [22]:
train_df.head()

Unnamed: 0,phrase,emoji
0,never talk to me again,3
1,I am proud of your achievements,2
2,It is the worst day in my life,3
3,Miss you so much,0
4,food is life,4


### Test data frame

In [23]:
test_df.head()

Unnamed: 0,phrase,emoji
0,I want to eat\t,4
1,he did not answer\t,3
2,he got a very nice raise\t,2
3,she got me a nice present\t,2
4,ha ha ha it was so funny\t,2


In [24]:
X_train = train_df['phrase'].to_numpy()
Y_train = train_df['emoji'].to_numpy()

X_test = test_df['phrase'].to_numpy()
Y_test = test_df['emoji'].to_numpy()

In [25]:
emoji_dictionary = {"0": "\u2764\uFE0F",    # :heart: prints a black instead of red heart depending on the font
                    "1": ":baseball:",
                    "2": ":smile:",
                    "3": ":disappointed:",
                    "4": ":fork_and_knife:"}

def label_to_emoji(label):
    """
    Converts a label (int or string) into the corresponding emoji code (string) ready to be printed
    """
    return emoji.emojize(emoji_dictionary[str(label)], use_aliases=True)

In [26]:
index = 1
print(X_train[index], label_to_emoji(Y_train[index]))

I am proud of your achievements 😄


In [27]:
Y_oh_train = pd.get_dummies(Y_train).to_numpy()
Y_oh_test = pd.get_dummies(Y_test).to_numpy()

In [28]:
index = 50
print(Y_train[index], "is converted into one hot", Y_oh_train[index])

0 is converted into one hot [1 0 0 0 0]


### Reading GloVe file

In [29]:
def read_glove_vecs(glove_file):
    with open(glove_file, 'r') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
        
        i = 1
        words_to_index = {}
        index_to_words = {}
        for w in sorted(words):
            words_to_index[w] = i
            index_to_words[i] = w
            i = i + 1
    return words_to_index, index_to_words, word_to_vec_map

In [30]:
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('data/glove.6B.50d.txt')

In [31]:
word = "cucumber"
index = 289846
print("the index of", word, "in the vocabulary is", word_to_index[word])
print("the", str(index) + "th word in the vocabulary is", index_to_word[index])

the index of cucumber in the vocabulary is 113317
the 289846th word in the vocabulary is potatos


In [32]:
index = 289846
print("Embending vector for word ", index_to_word[index])
word_to_vec_map[index_to_word[index]]

Embending vector for word  potatos


array([-0.77033 , -0.57765 , -1.4086  ,  0.28488 ,  1.3224  ,  0.025708,
       -0.39524 ,  0.1565  ,  0.73926 ,  0.07319 ,  0.39573 ,  0.81557 ,
        1.7829  , -1.1323  , -1.2555  , -0.31784 , -1.0604  , -0.19973 ,
        2.2133  , -0.13883 ,  0.50197 ,  1.5084  ,  0.58036 ,  0.3317  ,
        0.56073 ,  2.767   ,  0.22387 , -1.1203  ,  1.3909  ,  0.51904 ,
        0.91258 ,  1.2222  ,  1.3356  , -0.01816 ,  1.855   ,  0.87767 ,
       -0.51949 , -0.60569 , -0.33336 , -0.68257 ,  0.18867 , -1.2253  ,
       -0.3839  , -0.61717 ,  1.311   ,  1.0898  ,  0.23086 ,  1.3371  ,
       -0.060975,  1.4675  ])

### Sentence to avg
1. Convert every sentence to lower-case, then split the sentence into a list of words. X.lower() and X.split() might be useful.
2. For each word in the sentence, access its GloVe representation. Then, average all these values.

In [33]:
def sentence_to_avg(sentence, word_to_vec_map):
    words = sentence.lower().split()

    avg = np.zeros((50,))
    for w in words:
        avg += word_to_vec_map[w]
    
    avg = avg / len(words)
    return avg

In [34]:
avg = sentence_to_avg("Morrocan couscous is my favorite dish", word_to_vec_map)
print("avg = ", avg)
print(avg.shape)

avg =  [-0.008005    0.56370833 -0.50427333  0.258865    0.55131103  0.03104983
 -0.21013718  0.16893933 -0.09590267  0.141784   -0.15708967  0.18525867
  0.6495785   0.38371117  0.21102167  0.11301667  0.02613967  0.26037767
  0.05820667 -0.01578167 -0.12078833 -0.02471267  0.4128455   0.5152061
  0.38756167 -0.898661   -0.535145    0.33501167  0.68806933 -0.2156265
  1.797155    0.10476933 -0.36775333  0.750785    0.10282583  0.348925
 -0.27262833  0.66768    -0.10706167 -0.283635    0.59580117  0.28747333
 -0.3366635   0.23393817  0.34349183  0.178405    0.1166155  -0.076433
  0.1445417   0.09808667]
(50,)


In [35]:
def softmax(x):
    e_x = np.exp(x)
    return e_x/e_x.sum()

In [36]:
def cross_entropy_loss(y_hat, y):
    return - np.sum(y * np.log(y_hat))

In [40]:
def predict(X, Y, W, b, word_to_vec_map):
    """
    Given X (sentences) and Y (emoji indices), predict emojis and compute the accuracy of your model over the given set.
    
    Arguments:
    X -- input data containing sentences, numpy array of shape (m, None)
    Y -- labels, containing index of the label emoji, numpy array of shape (m, 1)
    
    Returns:
    pred -- numpy array of shape (m, 1) with your predictions
    """
    m = X.shape[0]
    pred = np.zeros((m, 1))
    
    for j in range(m):                       # Loop over training examples
        
        avg = sentence_to_avg(X[j], word_to_vec_map)

        # Forward propagation
        Z = np.dot(W, avg) + b
        A = softmax(Z)
        pred[j] = np.argmax(A)
        
    print("Accuracy: "  + str(np.mean((pred[:] == Y.reshape(Y.shape[0],1)[:]))))
    
    return pred

In [41]:
def model(X, Y, word_to_vec_map, learning_rate = 0.01, num_iterations = 400):
    """
    Model to train word vector representations in numpy.
    
    Arguments:
    X -- input data, numpy array of sentences as strings, of shape (m, 1)
    Y -- labels, numpy array of integers between 0 and 7, numpy-array of shape (m, 1)
    word_to_vec_map -- dictionary mapping every word in a vocabulary into its 50-dimensional vector representation
    learning_rate -- learning_rate for the stochastic gradient descent algorithm
    num_iterations -- number of iterations
    
    Returns:
    pred -- vector of predictions, numpy-array of shape (m, 1)
    W -- weight matrix of the softmax layer, of shape (n_y, n_h)
    b -- bias of the softmax layer, of shape (n_y,)
    """
    
    np.random.seed(1)

    m = Y.shape[0]          # number of training examples
    n_y = 5                 # number of classes
    n_h = 50                # dimension of GloVe vectors

    # Initialize parameters using Xavier initialization
    W = np.random.randn(n_y, n_h) / np.sqrt(n_h)    # shape (5, 50)
    b = np.zeros((n_y,))                            # bias: shape (5)

    # convert Y to one-hot
    Y_oh = pd.get_dummies(Y).to_numpy()

    # Optimization loop
    for t in range(num_iterations):                       # Loop over the number of iterations
        for i in range(m):                                # Loop over the training examples
            # Average the word vectors of the words from the i'th training example
            avg = sentence_to_avg(X[i], word_to_vec_map)

            # Forward propagate the avg through the softmax layer
            z = np.dot(W, avg) + b                        # (5, 50) * (50, 1) + (5, 1) = (5, 1)
            a = softmax(z)

            # Compute cost using the i'th training label's one hot representation and "A" (the output of the softmax)
            cost = cross_entropy_loss(a, Y_oh[i])     

            # Compute gradients
            dz = a - Y_oh[i]            # (5, 1)     
            dw = np.dot(dz.reshape(n_y, 1), avg.reshape(n_h, 1).T)      # (5, 1) * (1, 50) = (5, 50)
            db = dz

            # Update parameters with Stochastic Gradient Descent
            W = W - learning_rate * dw
            b = b - learning_rate * db

        if t % 100 == 0:
            print("Epoch: " + str(t) + " --- cost = " + str(cost))
            pred = predict(X, Y, W, b, word_to_vec_map)
            
    return pred, W, b



In [42]:
pred, W, b = model(X_train, Y_train, word_to_vec_map)
print(pred)

Epoch: 0 --- cost = 1.9520498812810074
Accuracy: 0.3484848484848485
Epoch: 100 --- cost = 0.07971818726014807
Accuracy: 0.9318181818181818
Epoch: 200 --- cost = 0.04456369243681379
Accuracy: 0.9545454545454546
Epoch: 300 --- cost = 0.0343226737878607
Accuracy: 0.9696969696969697
[[3.]
 [2.]
 [3.]
 [0.]
 [4.]
 [0.]
 [3.]
 [2.]
 [3.]
 [1.]
 [3.]
 [3.]
 [1.]
 [3.]
 [2.]
 [3.]
 [2.]
 [3.]
 [1.]
 [2.]
 [3.]
 [0.]
 [2.]
 [2.]
 [2.]
 [1.]
 [4.]
 [3.]
 [3.]
 [4.]
 [0.]
 [3.]
 [4.]
 [2.]
 [0.]
 [3.]
 [2.]
 [2.]
 [3.]
 [4.]
 [2.]
 [2.]
 [0.]
 [2.]
 [3.]
 [0.]
 [3.]
 [2.]
 [4.]
 [3.]
 [0.]
 [3.]
 [3.]
 [3.]
 [4.]
 [2.]
 [1.]
 [1.]
 [1.]
 [2.]
 [3.]
 [1.]
 [0.]
 [0.]
 [0.]
 [3.]
 [4.]
 [4.]
 [2.]
 [2.]
 [1.]
 [2.]
 [0.]
 [3.]
 [2.]
 [2.]
 [0.]
 [3.]
 [3.]
 [1.]
 [2.]
 [1.]
 [2.]
 [2.]
 [4.]
 [3.]
 [3.]
 [2.]
 [4.]
 [0.]
 [0.]
 [3.]
 [3.]
 [3.]
 [3.]
 [2.]
 [0.]
 [1.]
 [2.]
 [3.]
 [0.]
 [2.]
 [2.]
 [2.]
 [3.]
 [2.]
 [2.]
 [2.]
 [4.]
 [1.]
 [1.]
 [3.]
 [3.]
 [4.]
 [1.]
 [2.]
 [1.]
 [1.]
 [3.]
 [1.]


### Examining test set performance

In [43]:
print("Training set:")
pred_train = predict(X_train, Y_train, W, b, word_to_vec_map)
print('Test set:')
pred_test = predict(X_test, Y_test, W, b, word_to_vec_map)

Training set:
Accuracy: 0.9772727272727273
Test set:
Accuracy: 0.8571428571428571


In [45]:
def print_predictions(X, pred):
    print()
    for i in range(X.shape[0]):
        print(X[i], label_to_emoji(int(pred[i])))

In [46]:
X_my_sentences = np.array(["i adore you", "i love you", "funny lol", "lets play with a ball", "food is ready", "not feeling happy"])
Y_my_labels = np.array([[0], [0], [2], [1], [4],[3]])

pred = predict(X_my_sentences, Y_my_labels , W, b, word_to_vec_map)
print_predictions(X_my_sentences, pred)

Accuracy: 0.8333333333333334

i adore you ❤️
i love you ❤️
funny lol 😄
lets play with a ball ⚾
food is ready 🍴
not feeling happy 😄


In [47]:
pred = predict(X_test, Y_test , W, b, word_to_vec_map)
print_predictions(X_test, pred)

Accuracy: 0.8571428571428571

I want to eat	 🍴
he did not answer	 😞
he got a very nice raise	 😄
she got me a nice present	 😄
ha ha ha it was so funny	 😄
he is a good friend	 😄
I am upset	 😞
We had such a lovely dinner tonight	 😄
where is the food	 🍴
Stop making this joke ha ha ha	 😄
where is the ball	 ⚾
work is hard	 😄
This girl is messing with me	 ❤️
are you serious 😞
Let us go play baseball	 ⚾
This stupid grader is not working 	 😞
work is horrible	 😄
Congratulation for having a baby	 😄
stop pissing me off 😞
any suggestions for dinner	 😄
I love taking breaks	 😞
you brighten my day	 ❤️
I boiled rice	 🍴
she is a bully	 😞
Why are you feeling bad	 😞
I am upset	 😞
give me the ball ⚾
My grandmother is the love of my life	 ❤️
enjoy your game ⚾
valentine day is near	 😄
I miss you so much	 ❤️
throw the ball	 ⚾
My life is so boring	 😞
she said yes	 😄
will you be my valentine	 ❤️
he can pitch really well	 ⚾
dance with me	 😄
I am hungry 🍴
See you at the restaurant	 🍴
I like to laugh	 😄
I will  ru