# PREDICCIÓN DE EMOJI

## 1. Librerías a usar

In [6]:
import pandas as pd
import numpy as np
import emoji
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from tabulate import tabulate

## 2. Carga y lectura de datos

In [7]:
# Load the training and test splits from their CSV files.
# header=None is passed, so the column names are supplied explicitly.
names_cols_train = ["Text", "Label", "C3", "C4"]
names_cols_test = names_cols_train[:2]
train = pd.read_csv(
    "DATOS_PROYECTO_4/train_emoji.csv",
    header=None,
    names=names_cols_train,
)
test = pd.read_csv(
    "DATOS_PROYECTO_4/test_emoji.csv",
    header=None,
    names=names_cols_test,
)

In [8]:
# Show the first 5 rows of the training data
train.head()

Unnamed: 0,Text,Label,C3,C4
0,never talk to me again,3,,
1,I am proud of your achievements,2,,
2,It is the worst day in my life,3,,
3,Miss you so much,0,,[0]
4,food is life,4,,


In [9]:
# Show the first 5 rows of the test data
test.head()

Unnamed: 0,Text,Label
0,I want to eat\t,4
1,he did not answer\t,3
2,he got a raise\t,2
3,she got me a present\t,0
4,ha ha ha it was so funny\t,2


## 3. De etiquetas a Emojis
<p>El texto está etiquetado con números enteros que van del 0 al 4. Cada número entero corresponde a un emoji específico.</p>

In [10]:
# Maps each integer label (as a string key) to the emoji alias code it stands for.
emoji_dictionary = {"0": ":heart:",  # may print a black heart instead of a red one, depending on the font
                    "1": ":baseball:",
                    "2": ":smile:",
                    "3": ":disappointed:",
                    "4": ":fork_and_knife:"}


def label_to_emoji(label):
    """Convert an integer label into its printable emoji.

    Args:
        label: Class label; ``str(label)`` must be a key of ``emoji_dictionary``.

    Returns:
        The string produced by ``emoji.emojize`` for the label's alias code.

    Raises:
        KeyError: If ``str(label)`` is not a key of ``emoji_dictionary``.
    """
    alias = emoji_dictionary[str(label)]
    try:
        return emoji.emojize(alias, use_aliases=True)
    except TypeError:
        # emoji >= 2.0 removed the `use_aliases` keyword; alias codes are
        # selected with language="alias" instead.
        return emoji.emojize(alias, language="alias")

# Print every label next to the emoji it maps to
for label in range(5):
    print("Label", label, " :", label_to_emoji(label))

Label 0  : ❤️
Label 1  : ⚾
Label 2  : 😄
Label 3  : 😞
Label 4  : 🍴


In [11]:
# Print the first 5 training texts next to the emoji of their label
data = train.values
for row_idx in range(5):
    text, label = data[row_idx][:2]
    print(text, " :", label_to_emoji(label))

never talk to me again  : 😞
I am proud of your achievements  : 😄
It is the worst day in my life  : 😞
Miss you so much  : ❤️
food is life  : 🍴


## 4. Creamos el training y test

In [12]:
# The model inputs are the raw sentences of each split
X_train, X_test = train["Text"], test["Text"]

In [13]:
# Largest number of words in any training sentence.
# Count the words of every sentence: taking the longest string by character
# count and then counting its words can under-count, because a shorter string
# made of short words may contain more words.
maxLen = max(len(sentence.split()) for sentence in X_train)
print('Maximum words in sentence are:',maxLen)

Maximum words in sentence are: 10


Convert Y's to one-hot vectors

In [15]:
# One-hot encode the integer labels (one indicator column per class).
Y_train = pd.get_dummies(train["Label"])
# Align the test columns with the training columns so both matrices expose the
# same classes in the same order, even when a class is absent from the test
# split (its column is then filled with zeros).
Y_test = pd.get_dummies(test["Label"]).reindex(columns=Y_train.columns, fill_value=0)

In [16]:
# Report the shapes of the inputs and one-hot targets of each split
print(f"Tamaño de los datos de entrenamiento: {X_train.shape} {Y_train.shape}")
print(f"Tamaño de los datos de test: {X_test.shape} {Y_test.shape}")

Tamaño de los datos de entrenamiento: (132,) (132, 5)
Tamaño de los datos de test: (56,) (56, 5)


## 5. Cargar los vectores GloVe

Representaremos cada palabra de las oraciones mediante vectores de palabras preentrenados de GloVe. Según algunas iteraciones, los vectores de 100 dimensiones parecían funcionar mejor para este caso; sin embargo, el código de este cuaderno carga los vectores de 50 dimensiones (`glove.6B.50d.txt`).


In [18]:
def read_glove_vecs(glove_file):
    """Load word vectors from a GloVe-style text file and build index maps.

    Every non-blank line is split on whitespace: the first token is the word
    and the remaining tokens are parsed as its float32 vector. Blank lines
    are skipped instead of raising IndexError.

    Args:
        glove_file: Path to a UTF-8 text file with one word and its vector
            components per line.

    Returns:
        A tuple ``(words_to_index, index_to_words, word_to_vec_map)``:
        ``words_to_index`` maps each word to its 1-based position in sorted
        word order, ``index_to_words`` is the inverse mapping, and
        ``word_to_vec_map`` maps each word to its numpy float32 vector.
    """
    word_to_vec_map = {}  # word -> embedding vector
    with open(glove_file, encoding='utf-8') as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                continue  # blank line: nothing to parse
            word_to_vec_map[tokens[0]] = np.array(tokens[1:], dtype='float32')

    words_to_index = {}   # word -> index
    index_to_words = {}   # index -> word
    # Indices start at 1, so index 0 is never assigned to a word.
    for i, w in enumerate(sorted(word_to_vec_map), start=1):
        words_to_index[w] = i
        index_to_words[i] = w
    return words_to_index, index_to_words, word_to_vec_map

# Load the vocabulary index maps and the word vectors from the GloVe file
# (50-dimensional vectors, per the file name).
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('gloveEmbeddings/glove.6B.50d.txt')

Usaremos Keras para la implementación del LSTM. Por lo tanto, necesitamos crear una 'capa de incrustación'.

In [19]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """Build a non-trainable Keras Embedding layer from pretrained word vectors.

    Args:
        word_to_vec_map: Dict mapping each word to its numpy vector.
        word_to_index: Dict mapping each word to its row index in the
            embedding matrix.

    Returns:
        An ``Embedding`` layer with ``trainable=False`` whose weights are the
        matrix assembled from ``word_to_vec_map``.

    Raises:
        StopIteration: If ``word_to_vec_map`` is empty.
        KeyError: If a word of ``word_to_index`` is missing from
            ``word_to_vec_map``.
    """
    # One extra row beyond the vocabulary; rows with no word assigned stay zero.
    vocab_len = len(word_to_index) + 1
    # Read the vector dimension from any entry instead of relying on one
    # specific word being present in the vocabulary.
    emb_dim = next(iter(word_to_vec_map.values())).shape[0]

    emb_matrix = np.zeros((vocab_len, emb_dim))

    # Copy every word vector into the row given by the word's index
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    # Build the layer first so its weights exist, then load the matrix
    embedding_layer = Embedding(vocab_len, emb_dim, trainable=False)
    embedding_layer.build((None,))
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

Convertimos las oraciones en índices

In [20]:
def sentences_to_indices(X, word_to_index, max_len):
    """Convert sentences into a zero-padded matrix of word indices.

    Each sentence is lower-cased and split on whitespace. Sentences with more
    than ``max_len`` words are truncated instead of raising IndexError, and
    words missing from ``word_to_index`` are encoded as 0 (the padding value)
    instead of raising KeyError. Sentences are read positionally, so a pandas
    Series with any index works.

    Args:
        X: Sequence of sentence strings (e.g. numpy array or pandas Series).
        word_to_index: Dict mapping each lower-case word to its index.
        max_len: Number of columns of the output, i.e. words kept per sentence.

    Returns:
        A numpy float array of shape ``(len(X), max_len)``; row ``i`` holds
        the indices of the words of sentence ``i``, padded with zeros.
    """
    X_indices = np.zeros((len(X), max_len))
    for i, sentence in enumerate(X):
        # Keep at most max_len words so the row never overflows
        sentence_words = sentence.lower().split()[:max_len]
        for j, w in enumerate(sentence_words):
            X_indices[i, j] = word_to_index.get(w, 0)

    return X_indices

# Encode both splits as padded word-index matrices
X_train_indices, X_test_indices = (
    sentences_to_indices(sentences, word_to_index, maxLen)
    for sentences in (X_train, X_test)
)

## 6. Keras emojify LSTM Model

In [21]:
# Input: one row of maxLen integer word indices per sentence
sentence_indices = Input((maxLen,), dtype = 'int32')
# Look up the pretrained vector of every word index
embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
embeddings = embedding_layer(sentence_indices)   
# First LSTM returns the full sequence so the second LSTM can consume it
X = LSTM(128, return_sequences=True)(embeddings)
X = Dropout(0.5)(X)
# Second LSTM returns only its last output
X = LSTM(128, return_sequences=False)(X)
X = Dropout(0.5)(X)
# 5 output units followed by softmax: one probability per emoji class
X = Dense(5)(X)
X = Activation('softmax')(X)
    
model = Model(sentence_indices, X)


In [22]:
# Print the layers of the model with their output shapes and parameter counts
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 10)]              0         
                                                                 
 embedding (Embedding)       (None, 10, 50)            20000050  
                                                                 
 lstm (LSTM)                 (None, 10, 128)           91648     
                                                                 
 dropout (Dropout)           (None, 10, 128)           0         
                                                                 
 lstm_1 (LSTM)               (None, 128)               131584    
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense (Dense)               (None, 5)                 645   

## 7. Entrenamiento del modelo

In [23]:
# Categorical cross-entropy matches the one-hot targets and the softmax output
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [24]:
# Sanity check: shape of the training index matrix
X_train_indices.shape

(132, 10)

In [25]:
# Sanity check: shape of the one-hot training targets
Y_train.shape

(132, 5)

In [31]:
# Train the model. NOTE(review): the test split is passed as validation data,
# so the per-epoch validation metrics are computed on the same data that is
# later used for the final evaluation.
model.fit(X_train_indices, Y_train, epochs = 150, batch_size = 16, shuffle=True, validation_data=(X_test_indices, Y_test))

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

<keras.callbacks.History at 0x1e2b35ee370>

## 8. Test & Results

In [30]:
# Evaluate on the test split and report the accuracy on its own line
loss, acc = model.evaluate(X_test_indices, Y_test)
print("\nTest accuracy = ", acc)


Test accuracy =  0.6071428656578064


In [32]:
# Re-encode the test sentences and get the predicted class probabilities
Y_test_lbl = test["Label"]
Y_test_oh = pd.get_dummies(Y_test_lbl)
X_test_indices = sentences_to_indices(test["Text"], word_to_index, maxLen)
pred = model.predict(X_test_indices)

# Most probable class of every test sentence, computed in one call
pred_labels = np.argmax(pred, axis=1)

# One row per test sentence: [text, true emoji, predicted emoji].
# Rows are paired positionally, so the result does not depend on the index
# labels of the Series.
tabla_predict = [
    [text, label_to_emoji(true_label), label_to_emoji(pred_label).strip()]
    for text, true_label, pred_label in zip(X_test, Y_test_lbl, pred_labels)
]

In [34]:
# Render the prediction rows as a plain-text table
print(tabulate(tabla_predict, headers=["Texto", "Label", "Predicción"]))

Texto                                  Label    Predicción
-------------------------------------  -------  ------------
I want to eat                          🍴       🍴
he did not answer                      😞       😞
he got a raise                         😄       😞
she got me a present                   ❤️        😄
ha ha ha it was so funny               😄       😄
he is a good friend                    ❤️        😄
I am upset                             ❤️        😞
We had such a lovely dinner tonight    ❤️        😄
where is the food                      🍴       🍴
Stop making this joke ha ha ha         😄       😄
where is the ball                      ⚾       ⚾
work is hard                           😞       😄
This girl is messing with me           😞       ❤️
are you serious ha ha                  😄       😞
Let us go play baseball                ⚾       ⚾
This stupid grader is not working      😞       😞
work is horrible                       😞       😞
Congratulation for having a baby      