# A Seq2seq model for generating tweets

In [205]:
import numpy as np
import pandas as pd
import data_load_seq2seq_utils as s2s_util
import data_load_utils as util
from importlib import reload

util = reload(util)
s2s_util = reload(s2s_util)

In [206]:
tweets = util.filter_tweets_min_count(util.read_tweet_data('data/emojis_homemade.csv'), min_count=1000)
tweets['text'] = util.filter_text_for_handles(tweets['text'])

# After the filtering, remember to append a \n character to each tweet

In [207]:
tweets.head()

                                                text emoji
0  RT [VID] 181023 - Foi adicionada a letra D no ...     ©
1  RT 181023 Kris Wu Studio update (3/3)Legendary...     💫
2  RT Now you are watching Indian SuperStar with ...     😎
3                                    dats for keeps      💛
6                             Holy shit no I think.      😩

In [208]:
tweets.shape

(445474, 2)

Define the set of characters that we'll use to encode our text data:

In [209]:
# Create dicts for character/emoji to index conversion

chars_univ, chars_univ_idx = util.get_universal_chars_list()

emojis = sorted(list(set(tweets['emoji'])))
emoji_idx = dict((emoji, emojis.index(emoji)) for emoji in emojis)

In [210]:
TRAIN_SIZE = 2**13 # 8192  try 131072 = 2**18 for production
DEV_SIZE = 2**13   # 8192  try 8192 = 2**13 for production

TWEETS_PER_BATCH = 64
MAX_TWEET_LENGTH = 160
n_train_batches = TRAIN_SIZE / TWEETS_PER_BATCH
n_dev_batches = DEV_SIZE / TWEETS_PER_BATCH


tweets_train = tweets.iloc[0:TRAIN_SIZE] # 8192 = 2**13
tweets_dev = tweets.iloc[TRAIN_SIZE:TRAIN_SIZE+DEV_SIZE] # 2048 = 2**11

In [211]:
tweets_train.shape

(8192, 2)

In [212]:
train_generator = s2s_util.xy_generator(tweets_train, emoji_indices=emoji_idx)
dev_generator = s2s_util.xy_generator(tweets_dev, emoji_indices=emoji_idx)

In [218]:
([emoj, x], y) = train_generator.__next__()
e = emoj.reshape(64, 1, 111)
e.shape

(64, 1, 111)

Now we're going to use the algorithm from the Keras example of a seq2seq model.
We'll supply the emoji to the encoder LSTM which will encode it into two state vectors,
and the decoder LSTM will be trained on the tweets using teacher forcing.



# Summary of the algorithm

- We start with input sequences from a domain (e.g. English sentences)
    and corresponding target sequences from another domain
    (e.g. French sentences).
- An encoder LSTM turns input sequences to 2 state vectors
    (we keep the last LSTM state and discard the outputs).
- A decoder LSTM is trained to turn the target sequences into
    the same sequence but offset by one timestep in the future,
    a training process called "teacher forcing" in this context.
    Is uses as initial state the state vectors from the encoder.
    Effectively, the decoder learns to generate `targets[t+1...]`
    given `targets[...t]`, conditioned on the input sequence.
- In inference mode, when we want to decode unknown input sequences, we:
    - Encode the input sequence into state vectors
    - Start with a target sequence of size 1
        (just the start-of-sequence character)
    - Feed the state vectors and 1-char target sequence
        to the decoder to produce predictions for the next character
    - Sample the next character using these predictions
        (we simply use argmax).
    - Append the sampled character to the target sequence
    - Repeat until we generate the end-of-sequence character or we
        hit the character limit.


# Building the model

In [182]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense


ENCODER_HIDDEN_SIZE = 256

In [183]:
# Define an input sequence and process it.
encoder_inputs = Input(shape=(None, len(emoji_idx)))
encoder = LSTM(ENCODER_HIDDEN_SIZE, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]


In [184]:
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, len(chars_univ)))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(ENCODER_HIDDEN_SIZE, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(len(chars_univ), activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)


In [185]:
# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Run training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

model.summary()


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, None, 111)    0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            (None, None, 93)     0                                            
__________________________________________________________________________________________________
lstm_7 (LSTM)                   [(None, 256), (None, 376832      input_7[0][0]                    
__________________________________________________________________________________________________
lstm_8 (LSTM)                   [(None, None, 256),  358400      input_8[0][0]                    
                                                                 lstm_7[0][1]                     
          

In [190]:

model.fit_generator(train_generator,
                    steps_per_epoch=n_train_batches,
                    epochs=100,
                    validation_data=dev_generator,
                    validation_steps=n_dev_batches,
                    verbose=1)

# Save model
model.save('emoji_s2s.h5')


Epoch 1/100


StopIteration: 