# PREDICCIÓN DE EMOJI

## 1. Librerías a usar

In [1]:
import pandas as pd
import numpy as np
import emoji
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from tabulate import tabulate

## 2. Carga y lectura de datos

In [2]:
# Load the training and test sets from their CSV files (read without a header row)
names_cols_train = ["Text", "Label", "C3", "C4"]
names_cols_test = ["Text", "Label"]
train = pd.read_csv(
    "DATOS_PROYECTO_4/train_emoji.csv",
    names=names_cols_train,
    header=None,
)
test = pd.read_csv(
    "DATOS_PROYECTO_4/test_emoji.csv",
    names=names_cols_test,
    header=None,
)

In [3]:
# Show the first 5 rows of the training data
train.head()

Unnamed: 0,Text,Label,C3,C4
0,never talk to me again,3,,
1,I am proud of your achievements,2,,
2,It is the worst day in my life,3,,
3,Miss you so much,0,,[0]
4,food is life,4,,


In [4]:
# Show the first 5 rows of the test data
test.head()

Unnamed: 0,Text,Label
0,I want to eat\t,4
1,he did not answer\t,3
2,he got a raise\t,2
3,she got me a present\t,0
4,ha ha ha it was so funny\t,2


## 3. De etiquetas a Emojis
<p>El texto está etiquetado con números enteros que van del 0 al 4. Cada número entero corresponde a un emoji específico.</p>

In [5]:
# Emoji alias for each class label (keys are the integer labels as strings).
# Note: ":heart:" may render as a black heart instead of a red one, depending on the font.
emoji_dictionary = {"0": ":heart:",
                    "1": ":baseball:",
                    "2": ":smile:",
                    "3": ":disappointed:",
                    "4": ":fork_and_knife:"}


def label_to_emoji(label):
    """Convert an integer class label into its printable emoji.

    Parameters
    ----------
    label : int or str
        Class label; it is converted with ``str()`` and looked up in
        ``emoji_dictionary``.

    Returns
    -------
    str
        The emoji character(s) for the label's alias.

    Raises
    ------
    KeyError
        If ``str(label)`` is not a key of ``emoji_dictionary``.
    """
    alias = emoji_dictionary[str(label)]
    try:
        # Keyword accepted by emoji < 2.0.
        return emoji.emojize(alias, use_aliases=True)
    except TypeError:
        # emoji >= 2.0 removed `use_aliases`; alias names are selected through `language`.
        return emoji.emojize(alias, language='alias')

# Print every label next to the emoji it maps to
for label in range(5):
    rendered = label_to_emoji(label)
    print("Label", label, " :", rendered)

Label 0  : ❤️
Label 1  : ⚾
Label 2  : 😄
Label 3  : 😞
Label 4  : 🍴


In [6]:
# Print the first 5 training sentences next to their emoji
data = train.values
for row in range(5):
    sentence, label = data[row][0], data[row][1]
    print(sentence, " :", label_to_emoji(label))

never talk to me again  : 😞
I am proud of your achievements  : 😄
It is the worst day in my life  : 😞
Miss you so much  : ❤️
food is life  : 🍴


## 4. Creating training and testing data

In [7]:
# Model inputs: the raw sentence column of each split
X_train = train["Text"]
X_test = test["Text"]

In [8]:
# Longest training sentence measured in words. Taking the sentence with the most
# characters and then counting its words can under-estimate this value.
maxLen = max(len(sentence.split()) for sentence in X_train)
print('Maximum words in sentence are:',maxLen)

Maximum words in sentence are: 10


Convert Y's to one-hot vectors

In [9]:
# One-hot encode the labels. The test columns are aligned to the training columns so
# that both matrices share the same class order and width even if a class is missing
# from the test split (a label that appears only in the test split is dropped).
Y_train = pd.get_dummies(train["Label"])
Y_test = pd.get_dummies(test["Label"]).reindex(columns=Y_train.columns, fill_value=0)

In [10]:
# Report the shapes of the inputs and one-hot targets for each split
for title, features, targets in (
    ("Tamaño de los datos de entrenamiento:", X_train, Y_train),
    ("Tamaño de los datos de test:", X_test, Y_test),
):
    print(title, features.shape, targets.shape)

Tamaño de los datos de entrenamiento: (132,) (132, 5)
Tamaño de los datos de test: (56,) (56, 5)


## 5. Load GloVe Embedding Vectors

Each word in a sentence is represented by its pre-trained GloVe word vector. Earlier iterations suggested that 100-d vectors work best for this case; note, however, that the code below loads the 50-d file (`glove.6B.50d.txt`).

In [11]:
def read_glove_vecs(glove_file):
    """Read a GloVe text file into index and vector lookup tables.

    Each non-blank line is split on whitespace; the first field is the word
    and the remaining fields are parsed as its float32 vector.

    Parameters
    ----------
    glove_file : str
        Path to the UTF-8 encoded GloVe text file.

    Returns
    -------
    words_to_index : dict
        Maps each word to its 1-based position in the sorted vocabulary.
    index_to_words : dict
        Inverse mapping of ``words_to_index``.
    word_to_vec_map : dict
        Maps each word to its vector as a float32 ``np.ndarray``.
    """
    word_to_vec_map = {}
    with open(glove_file, encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Blank lines (e.g. a trailing newline) carry no word; skip them.
                continue
            word_to_vec_map[fields[0]] = np.array(fields[1:], dtype='float32')

    words_to_index = {}   # word -> index in the sorted vocabulary
    index_to_words = {}   # index -> word
    # Indices start at 1, so index 0 is never assigned to a word.
    for i, w in enumerate(sorted(word_to_vec_map), start=1):
        words_to_index[w] = i
        index_to_words[i] = w
    return words_to_index, index_to_words, word_to_vec_map

# Load the pre-trained GloVe vectors (per its file name, the 50-d glove.6B set)
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('Glove_Embeddings/glove.6B.50d.txt')

We will be using Keras for implementation of the LSTM. We thus need to create an 'embedding layer'.

In [12]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """Build a frozen Keras Embedding layer initialised with pre-trained vectors.

    Parameters
    ----------
    word_to_vec_map : dict
        Maps each word to its vector (1-D ``np.ndarray``).
    word_to_index : dict
        Maps each word to its row index in the embedding matrix.

    Returns
    -------
    Embedding
        A non-trainable embedding layer whose weights are the word vectors.

    Raises
    ------
    ValueError
        If ``word_to_vec_map`` is empty.
    """
    if not word_to_vec_map:
        raise ValueError("word_to_vec_map is empty; cannot infer the embedding dimension")

    vocab_len = len(word_to_index) + 1               # +1 for Keras
    # Read the dimensionality from any vector instead of relying on one specific
    # word being present in the vocabulary.
    emb_dim = next(iter(word_to_vec_map.values())).shape[0]

    emb_matrix = np.zeros((vocab_len, emb_dim))      # rows without a word stay all-zero

    # Row `index` of the matrix holds the vector of the word mapped to `index`.
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    # Frozen layer: the pre-trained vectors are not updated during training.
    embedding_layer = Embedding(vocab_len, emb_dim, trainable=False)

    # The layer must be built before its weights can be set.
    embedding_layer.build((None,))
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

We now need to convert all training sentences into lists of indices, and then zero-pad all these lists so that their length is the length of the longest sentence.

In [13]:
def sentences_to_indices(X, word_to_index, max_len):
    """Convert sentences into a zero-padded matrix of vocabulary indices.

    Parameters
    ----------
    X : sequence of str
        Sentences (e.g. a pandas Series or NumPy array). They are read
        positionally, so a Series with a non-default index is supported.
    word_to_index : dict
        Maps a lower-cased word to its vocabulary index.
    max_len : int
        Number of columns of the result. Longer sentences are truncated,
        shorter ones are right-padded with 0.

    Returns
    -------
    np.ndarray
        Array of shape ``(len(X), max_len)``. Words missing from
        ``word_to_index`` are encoded as 0, the same value used for padding.
    """
    sentences = np.asarray(X)                        # positional access regardless of index labels
    X_indices = np.zeros((len(sentences), max_len))  # zeros act as padding
    for i, sentence in enumerate(sentences):
        # Lower-case, split on whitespace, and keep at most max_len words.
        sentence_words = sentence.lower().split()[:max_len]
        for j, w in enumerate(sentence_words):
            X_indices[i, j] = word_to_index.get(w, 0)

    return X_indices

# Convert both splits into index matrices with maxLen columns each
X_train_indices = sentences_to_indices(X_train, word_to_index, maxLen)
X_test_indices = sentences_to_indices(X_test, word_to_index, maxLen)

## 6. Keras emojify LSTM Model

In [14]:
# Input: one row of maxLen integer word indices per sentence
sentence_indices = Input((maxLen,), dtype = 'int32')
# Look up the word vectors through the pre-trained embedding layer
embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
embeddings = embedding_layer(sentence_indices)   
# First LSTM returns the full sequence so that it can feed the second LSTM
X = LSTM(128, return_sequences=True)(embeddings)
X = Dropout(0.5)(X)
# Second LSTM returns only its final output
X = LSTM(128, return_sequences=False)(X)
X = Dropout(0.5)(X)
# 5 output units followed by softmax: one probability per emoji class
X = Dense(5)(X)
X = Activation('softmax')(X)
    
model = Model(sentence_indices, X)


In [15]:
# Print the layer-by-layer summary of the model
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 10)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 10, 50)            20000050  
_________________________________________________________________
lstm (LSTM)                  (None, 10, 128)           91648     
_________________________________________________________________
dropout (Dropout)            (None, 10, 128)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 5)                 645   

## 7. Train the Keras Model

In [16]:
# Categorical cross-entropy against the one-hot labels, Adam optimizer, accuracy as the reported metric
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [17]:
# Sanity check: shape of the training index matrix
X_train_indices.shape

(132, 10)

In [18]:
# Sanity check: shape of the one-hot training labels
Y_train.shape

(132, 5)

In [19]:
# Train for 50 epochs with mini-batches of 16, reshuffling the training data.
# NOTE(review): the test split is passed as validation data, so the accuracy reported
# on it later is not measured on data that was unseen during model monitoring.
model.fit(X_train_indices, Y_train, epochs = 50, batch_size = 16, shuffle=True, validation_data=(X_test_indices, Y_test))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1ea149654c0>

## 8. Test & Results

In [20]:
# Evaluate the trained model on the test split and report its accuracy
loss, acc = model.evaluate(X_test_indices, Y_test)
print()
print("Test accuracy = ", acc)


Test accuracy =  0.6785714030265808


In [21]:
# True labels of the test split, as integers and one-hot encoded
Y_test_lbl = test["Label"]
Y_test_oh = pd.get_dummies(Y_test_lbl)  # not used in the visible cells; kept for any later cell
X_test_indices = sentences_to_indices(test["Text"], word_to_index, maxLen)
pred = model.predict(X_test_indices)

# One row per test sentence: text, true emoji, predicted emoji.
# The three sequences are paired positionally, so the result does not depend on
# the index labels of the Series. argmax picks the most probable class.
tabla_predict = [
    [text, label_to_emoji(true_label), label_to_emoji(np.argmax(probs)).strip()]
    for text, true_label, probs in zip(X_test, Y_test_lbl, pred)
]

In [22]:
# Render the prediction table as plain text (columns: text, true label, prediction)
print(tabulate(tabla_predict, headers=["Texto", "Label", "Predicción"]))

Texto                                  Label    Predicción
-------------------------------------  -------  ------------
I want to eat                          🍴       🍴
he did not answer                      😞       😞
he got a raise                         😄       😄
she got me a present                   ❤️        😄
ha ha ha it was so funny               😄       😄
he is a good friend                    ❤️        😄
I am upset                             ❤️        😞
We had such a lovely dinner tonight    ❤️        😄
where is the food                      🍴       🍴
Stop making this joke ha ha ha         😄       😄
where is the ball                      ⚾       ⚾
work is hard                           😞       😄
This girl is messing with me           😞       ❤️
are you serious ha ha                  😄       😞
Let us go play baseball                ⚾       ⚾
This stupid grader is not working      😞       😞
work is horrible                       😞       😄
Congratulation for having a baby      