# Emojitranslate project

## Part one - learning to predict emojis from tweets

The following code adapted from the Osinga deep learning cookbook - using the Twitter API to sample EN language tweets that contain exactly one emoji

In [114]:
import random
import twitter
import emoji
# import itertools
import pandas as pd
from itertools import chain
from keras.preprocessing.sequence import pad_sequences
import numpy as np
#from sklearn.model_selection import train_test_split
from keras import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot
from keras.utils import to_categorical
import keras.callbacks
#import json

import os
# import nb_utils
from keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, LSTM, Embedding, GlobalMaxPooling1D#, Merge 
from keras.models import Model
from keras.layers.merge import Concatenate, Average

# from gensim.models import Word2Vec

In [30]:
all_tweets = pd.read_csv('data/emojis_homemade.csv')
all_tweets['emoji'].value_counts()

üòÇ        35746
üò≠        14450
‚ù§        11139
emoji     8154
üòç         7890
üî•         7636
ü§£         4974
ü§î         3923
üôè         3607
üò©         3584
üíï         3578
üòä         3392
üëÄ         3127
üíú         2936
‚ú®         2924
üëè         2887
üôÑ         2825
üñ§         2638
üíÄ         2628
üòè         2554
üéâ         2374
üôå         2310
üòò         2265
üíØ         2147
üëç         2026
üíô         2013
üòâ         1925
üíñ         1878
üö®         1873
üëá         1860
         ...  
‚óª            1
üóÉ            1
üÄÑ            1
üöõ            1
üõ§            1
‚è∏            1
üà¥            1
üö∏            1
üìè            1
‚õé            1
üéç            1
üïñ            1
üà≤            1
üè®            1
‚ûó            1
‚óÄ            1
üâê            1
üßô            1
üóÇ            1
üíπ            1
üöÉ            1
üö°            1
üñ≤            1
üõã            1
üë∑            1
üè¨ 

In [31]:
all_tweets.shape

(281488, 3)

In [32]:
all_tweets.head

<bound method NDFrame.head of         Unnamed: 0                                               text emoji
0              0.0  RT @mydear_bangtan: [VID] 181023 - Foi adicion...     ¬©
1              1.0  RT @WuYiFan_KrisBar: 181023 Kris Wu Studio upd...     üí´
2              2.0  RT @TrendsPrabhas: Now you are watching Indian...     üòé
3              3.0                                    dats for keeps      üíõ
4              4.0  RT @xruiztru: WHO ARMS THE SAUDIS?\n\n  üá∫üá∏US 5...     üè≥
5              5.0  RT @xxxfreaknasty2: interracial couple go at i...     üåù
6              6.0               @Eric_Deshaun Holy shit no I think.      üò©
7              7.0     RT @fanmutuals: army, follow who retweet this      üëë
8              8.0  RT @_Simplykpop: Simply K-Pop harddrive dump\n...     ü§©
9              9.0  Happy birthday nellie @thelittlegend, hope you...     ‚ù§
10            10.0  RT @montparnasty: "I have passed through fire"...     üíï
11            11.0  RT

In [33]:
tweets = all_tweets.groupby('emoji').filter(lambda c:len(c) > 1000)
tweets['emoji'].value_counts()

üòÇ        35746
üò≠        14450
‚ù§        11139
emoji     8154
üòç         7890
üî•         7636
ü§£         4974
ü§î         3923
üôè         3607
üò©         3584
üíï         3578
üòä         3392
üëÄ         3127
üíú         2936
‚ú®         2924
üëè         2887
üôÑ         2825
üñ§         2638
üíÄ         2628
üòè         2554
üéâ         2374
üôå         2310
üòò         2265
üíØ         2147
üëç         2026
üíô         2013
üòâ         1925
üíñ         1878
üö®         1873
üëá         1860
         ...  
üò≥         1797
üòÖ         1670
üí™         1603
üòÅ         1600
üò±         1571
‚Äº         1505
üíõ         1498
üëå         1489
üó£         1454
üôÉ         1393
ü§™         1353
üò¢         1304
‚ò∫         1287
üò§         1278
‚úå         1240
ü§ó         1213
ü§ß         1160
üòí         1138
üí•         1131
üòà         1121
¬©         1119
üíî         1114
üò´         1108
üòå         1082
‚òπ         1079
üíó  

In [34]:
max(tweets['text'], key=lambda t:len(t))

'RT @DingDongLive: LMAO  @TexasEDMFamily\n@FreakyDeakyFam\nGrab my hard/hybrid trap mix&gt;&gt;&gt;&gt;&gt;&gt;&gt;&gt; https://t.co/34rcxAVbgk https://t.co/ArQeh23L‚Ä¶'

In [35]:
chars = list(sorted(set(chain(*tweets['text']))))
char_to_idx = {ch: idx for idx, ch in enumerate(chars)}
max_sequence_len = max(len(x) for x in tweets['text'])

emojis = list(sorted(set(tweets['emoji'])))
emoji_to_idx = {em: idx for idx, em in enumerate(emojis)}
emojis[:10]

#train_tweets, test_tweets = train_test_split(tweets, test_size=0.1)

['emoji', '¬©', '‚Äº', '‚òπ', '‚ò∫', '‚ô•', '‚úå', '‚ú®', '‚ù§', 'üéÉ']

Currently we've collected quite a lot of tweet data, so to speed up the prototyping of the model we're going to create a train/dev/test datasets of 10,000 tweets each.

In [36]:
train_tweets = tweets[0:10000]
dev_tweets = tweets[10000:20000]
test_tweets = tweets[20000:30000]

In [37]:
def data_generator(tweets, batch_size):
    while True:
        if batch_size is None:
            batch = tweets
            batch_size = batch.shape[0]
        else:
            batch = tweets.sample(batch_size)
        X = np.zeros((batch_size, max_sequence_len, len(chars)))
        y = np.zeros((batch_size,))
        for row_idx, (_, row) in enumerate(batch.iterrows()):
            y[row_idx] = emoji_to_idx[row['emoji']]
            for ch_idx, ch in enumerate(row['text']):
                X[row_idx, ch_idx, char_to_idx[ch]] = 1
        yield X, y

next(data_generator(tweets, 10))

(array([[[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 1., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],
 
        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 1., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],
 
        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 1., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],
 
        ...,
 
        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],


In [38]:
def create_char_cnn_model(num_chars, max_sequence_len, num_labels):
    char_input = Input(shape=(max_sequence_len, num_chars), name='char_cnn_input')
    
    conv_1x = Conv1D(128, 6, activation='relu', padding='valid')(char_input)
    max_pool_1x = MaxPooling1D(4)(conv_1x)
    conv_2x = Conv1D(256, 6, activation='relu', padding='valid')(max_pool_1x)
    max_pool_2x = MaxPooling1D(4)(conv_2x)

    flatten = Flatten()(max_pool_2x)
    dense = Dense(128, activation='relu')(flatten)
    preds = Dense(num_labels, activation='softmax', name='char_cnn_predictions')(dense)

    model = Model(char_input, preds)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])
    return model

char_cnn_model = create_char_cnn_model(len(char_to_idx), max_sequence_len, len(emojis))
char_cnn_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
char_cnn_input (InputLayer)  (None, 163, 2222)         0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 158, 128)          1706624   
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 39, 128)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 34, 256)           196864    
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 8, 256)            0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 2048)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               262272    
__________

In [46]:
early = keras.callbacks.EarlyStopping(monitor='loss',
                              min_delta=0.03,
                              patience=2,
                              verbose=1, mode='auto')

checkpoint = keras.callbacks.ModelCheckpoint(filepath='emoji_cnn.h5',
                                             monitor='val_acc',
                                             save_best_only = True)

#tensorboard = keras.callbacks.TensorBoard(log_dir='tensorboard_log',
#                                         histogram_freq=1,
#                                         embeddings_freq=1)

BATCH_SIZE = 512
char_cnn_model.fit_generator(
    data_generator(train_tweets, batch_size=BATCH_SIZE),
    validation_data = data_generator(dev_tweets, batch_size=BATCH_SIZE),
    validation_steps=int(dev_tweets.shape[0]/BATCH_SIZE),
    epochs=5,
    steps_per_epoch=len(train_tweets) / BATCH_SIZE,
    verbose=1, # was: verbose=2
    callbacks=[early, checkpoint]
)

Epoch 1/5


ValueError: If printing histograms, validation_data must be provided, and cannot be a generator.

In [45]:
pd.options.display.max_colwidth = 128
inspect_tweets = dev_tweets.sample(100)
predicted = char_cnn_model.predict_generator(data_generator(inspect_tweets, batch_size=None), steps=1)
show = pd.DataFrame({
    'text': inspect_tweets['text'],
    'true': inspect_tweets['emoji'],
    'pred': [emojis[np.argmax(x)] for x in predicted],
})
show = show[['text', 'true', 'pred']]
show.head(10)

Unnamed: 0,text,true,pred
23120,RT @PGATOUR: This is as unlucky as golf gets. \n\n#TOURVault https://t.co/1lyERpKge8,üôÑ,‚ù§
23904,The worst champions league atmosphere I‚Äôve ever heard at old Trafford #manu #overated,üôÑ,üòÇ
24903,i think i go to school to get sick not smart,üòÇ,üòÇ
21950,"Someone said the Cake Island Arc in One Piece has made them hate cake...\nFor me, it makes me want to stuff my face with swe...",üòÇ,üòÇ
18236,RT @tribranchvo: she did NOT just do the ‚Äúokurrrrr‚Äù ASMR style no she did not https://t.co/t1FtqLXUkD,üíÄ,üòÇ
16296,RT @wisconsinGodsPl: THE 1980 CUBAN BOAT LIFT WAS A IMMIGRATION CRISIS\n\nNOTICE THE HEADLINES IN THE PAPER\n\nPRESIDENT TRU...,ü§î,‚ù§
22613,@Humor_Silly Actually BJPWon Main Inka Haath Zyada Hai,ü§£,üòÇ
17512,text,emoji,emoji
21778,"RT @BillHai61993017: @RealSaavedra Imagine that, a DYING party trying to register DEAD people!!!!",üòÖ,‚ù§
19871,RT @Soumz: Today is Marshmallow appreciation day. Because every day is Marshmallow appreciation day \nÿ£ÿπÿ∑Ÿà ÿ≠ÿ®ŸÉŸÖ ŸÑŸÑŸÖÿßÿ±ÿ¥ŸÖŸÑŸà h...,üíô,‚ù§


# Simple neural network - single Dense layer

Benchmark performance with the simplest neural network we can get

Let's divide up the train/dev/test sets so we're not relying on a generator

In [118]:
x_train = train_tweets["text"]
y_train = np.asarray(train_tweets["emoji"])
x_dev = dev_tweets["text"]
y_dev = np.asarray(dev_tweets["emoji"])
x_test = test_tweets["text"]
y_test = np.asarray(test_tweets["emoji"])

all_emojis = np.concatenate((y_train, y_dev, y_test), axis=0)

emoji_to_idx = {em: idx for idx, em in enumerate(emojis)}
#emojis[:10]

all_emojis_idx = np.zeros(all_emojis.shape[0])

for i in range (all_emojis.shape[0]):
    all_emojis_idx[i] = emoji_to_idx[all_emojis[i]]    

all_emojis_one_hot = to_categorical (all_emojis_idx)
    
y_train_idx = all_emojis_one_hot[0:10000,:]
y_dev_idx = all_emojis_one_hot[10000:20000,:]
y_test_idx = all_emojis_one_hot[20000:30000,:]

In [119]:
y_train_idx[0:10]

array([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0

Now let's start by one-hot encoding the text samples

In [120]:
tokenizer = Tokenizer(num_words = 1000)
tokenizer.fit_on_texts (x_train)

#x_train_sequences = tokenizer.texts_to_sequences(x_train)
#x_dev_sequences = tokenizer.texts_to_sequences(x_dev)
#x_test_sequences = tokenizer.texts_to_sequences(x_test)

x_train_one_hot = tokenizer.texts_to_matrix(x_train, mode='binary')
x_dev_one_hot = tokenizer.texts_to_matrix(x_dev, mode='binary')
x_test_one_hot = tokenizer.texts_to_matrix(x_test, mode='binary')




# remember to pad sequences

In [121]:
x_train_one_hot.shape

(10000, 1000)

In [122]:
y_train_idx.shape

(10000, 64)

In [126]:
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(1000,)))
model.add(Dense(64, activation = 'softmax'))
model.compile(optimizer = 'rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

history = model.fit (x_train_one_hot, y_train_idx,
                    epochs = 10,
                    batch_size = 512,
                    validation_data = (x_dev_one_hot, y_dev_idx))

Train on 10000 samples, validate on 10000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [71]:
model = Sequential()
model.add(Embedding(max_features, 32))
model.add(SimpleRNN(32))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
history = model.fit (input_train, y_train,
                     epochs = 10,
                     batch_size = 128,
                     validation_split = 0.2)

NameError: name 'max_features' is not defined