# A Seq2seq model for generating tweets

In [16]:
import numpy as np
import pandas as pd
import data_load_seq2seq_utils as s2s_util
import data_load_utils as util
from importlib import reload

util = reload(util)
s2s_util = reload(s2s_util)

In [17]:
tweets_orig = util.read_tweet_data('data/emojis_homemade.csv')
tweets_additional_0 = util.read_tweet_data('data/emojis_additional.csv')

tweets=pd.DataFrame.append(tweets_orig, tweets_additional_0)


In [52]:
tweets = util.filter_tweets_min_count(tweets, min_count=1000)
tweets.reset_index()


tweets['text'] = util.filter_text_for_handles(tweets['text'])

# After the filtering, remember to append a \n character to each tweet

In [53]:
tweets.head()

Unnamed: 0,text,emoji
0,RT [VID] 181023 - Foi adicionada a letra D no ...,©
1,RT 181023 Kris Wu Studio update (3/3)Legendary...,💫
2,RT Now you are watching Indian SuperStar with ...,😎
3,dats for keeps,💛
6,Holy shit no I think.,😩


In [54]:
tweets.shape

(645016, 2)

Define the set of characters that we'll use to encode our text data:

In [55]:
# Create dicts for character/emoji to index conversion

chars_univ, chars_univ_idx = s2s_util.get_universal_chars_list()

emojis = sorted(list(set(tweets['emoji'])))
emoji_idx = dict((emoji, emojis.index(emoji)) for emoji in emojis)

In [56]:
TRAIN_SIZE = 2**19 # 8192  try 131072 = 2**18 for production
DEV_SIZE = 2**14   # 8192  try 8192 = 2**13 for production

TWEETS_PER_BATCH = 2048
MAX_TWEET_LENGTH = 160
n_train_batches = TRAIN_SIZE / TWEETS_PER_BATCH
n_dev_batches = DEV_SIZE / TWEETS_PER_BATCH

print ("n_train_batches:", n_train_batches)
print ("n_dev_batches:", n_dev_batches)

tweets_train = tweets.iloc[0:TRAIN_SIZE] # 8192 = 2**13
tweets_dev = tweets.iloc[TRAIN_SIZE:TRAIN_SIZE+DEV_SIZE] # 2048 = 2**11

n_train_batches: 256.0
n_dev_batches: 8.0


In [57]:
tweets_train.shape

(524288, 2)

In [58]:
train_generator = s2s_util.xy_generator(tweets_train, emoji_indices=emoji_idx)
dev_generator = s2s_util.xy_generator(tweets_dev, emoji_indices=emoji_idx)

In [59]:
([emoj, x], y) = train_generator.__next__()
#e = emoj.reshape(64, 1, 111)
x.shape


(64, 161, 94)

Now we're going to use the algorithm from the Keras example of a seq2seq model.
We'll supply the emoji to the encoder LSTM which will encode it into two state vectors,
and the decoder LSTM will be trained on the tweets using teacher forcing.



# Summary of the algorithm

- We start with input sequences from a domain (e.g. English sentences)
    and corresponding target sequences from another domain
    (e.g. French sentences).
- An encoder LSTM turns input sequences to 2 state vectors
    (we keep the last LSTM state and discard the outputs).
- A decoder LSTM is trained to turn the target sequences into
    the same sequence but offset by one timestep in the future,
    a training process called "teacher forcing" in this context.
    Is uses as initial state the state vectors from the encoder.
    Effectively, the decoder learns to generate `targets[t+1...]`
    given `targets[...t]`, conditioned on the input sequence.
- In inference mode, when we want to decode unknown input sequences, we:
    - Encode the input sequence into state vectors
    - Start with a target sequence of size 1
        (just the start-of-sequence character)
    - Feed the state vectors and 1-char target sequence
        to the decoder to produce predictions for the next character
    - Sample the next character using these predictions
        (we simply use argmax).
    - Append the sampled character to the target sequence
    - Repeat until we generate the end-of-sequence character or we
        hit the character limit.


# Building the model

In [60]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense


ENCODER_HIDDEN_SIZE = 256

In [61]:
# Define an input sequence and process it.
encoder_inputs = Input(shape=(None, len(emoji_idx)))
encoder = LSTM(ENCODER_HIDDEN_SIZE, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]


In [32]:
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, len(chars_univ)))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(ENCODER_HIDDEN_SIZE, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(len(chars_univ), activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)


In [33]:
# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Run training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

model.summary()


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, None, 129)    0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, None, 94)     0                                            
__________________________________________________________________________________________________
lstm_3 (LSTM)                   [(None, 256), (None, 395264      input_3[0][0]                    
__________________________________________________________________________________________________
lstm_4 (LSTM)                   [(None, None, 256),  359424      input_4[0][0]                    
                                                                 lstm_3[0][1]                     
          

In [34]:

model.fit_generator(train_generator,
                    steps_per_epoch=n_train_batches,
                    epochs=100,
                    validation_data=dev_generator,
                    validation_steps=n_dev_batches,
                    verbose=1)

# Save model
model.save('emoji_s2s.h5')


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
  8/256 [..............................] - ETA: 26s - loss: 0.0126

KeyboardInterrupt: 

In [35]:
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(ENCODER_HIDDEN_SIZE,))
decoder_state_input_c = Input(shape=(ENCODER_HIDDEN_SIZE,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

In [36]:
reverse_chars_idx = dict(
(i, char) for char, i in chars_univ_idx.items())

In [37]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, len(chars_univ)))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, chars_univ_idx['\n']] = 1.

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_chars_idx[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '\n' or
           len(decoded_sentence) > MAX_TWEET_LENGTH):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, len(chars_univ)))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = [h, c]

    return decoded_sentence


In [62]:
def emoji_to_oh(emoji, emoji_idx):
    emoji_arr = np.zeros(shape=(1, 1, len(emoji_idx)))
    emoji_arr[0, 0, emoji_idx[emoji]] = 1
    return emoji_arr

In [65]:
tweets_train.iloc[3].loc['emoji']

'💛'

In [66]:
# make one hot vector for emoji input
emoji = tweets_train.iloc[3].loc['emoji']
print (emoji)
#emoji_arr = np.zeros(shape=(1, 1, len(emoji_idx)))
#emoji_arr[0, 0, emoji_idx[emoji]] = 1


decode_sequence (emoji_to_oh(emoji, emoji_idx))

💛


'tsto/re belips \n'

In [67]:
for i in range (100):
    emoji = tweets_train.iloc[i].loc['emoji']
    generated_tweet = decode_sequence (emoji_to_oh(emoji, emoji_idx))
    print (emoji, generated_tweet)

© T [VID] 181023 - Foi adicionada a letra D no outdoor misterioso do #BTS em Hollywood.Formando: BTS AND...  ILOVEPAR

💫 T 181023 Kris Wu Studio update (3/3)Legendary creator at The Next Top Bang press conference #KrisWu #Wuyifa

😎  tho gha  apes pinging bay uthresthert hiter ood .co int sen che bostpe whth igh top and chonders Athtis ght. o/It.co/itthins 

💛 tsto/re belips 

😩 ol thes man I whet ill of  houp s atpder hen. Iush thes herties aod ohe pire hhare pear.co/ht.Or I

👑 ar your folow ever you he als 

🤩  Time  Epppaps h thricek deblk 

❤ T Heilk  fortht me and I ihin eleyo fa hitrin  tome bo wing on peming alk no  hetrs://t.co/UDf7h3oD

💕 T Happy 23rd Birthday to Duckie Thot  https://t.co/pae7x6tfHz

😱  Incredible tor invorive in ite iad a pop thime  hat se tcol 4ever un  ithas. 

😢 T Toou hee lles #BT MPNi #BTS Met thest ane #BTOr #TM #MPARMYPhe tis  https://t.co/In99zRKwab

🔥 ored you hitp ia 

🔗 T [DEPASCAL] RUSSEL HOODIE BLUE https://t.co/CR4QrnCiLn https://t.co/TgPX6G9r0k
