Notebook that trains an LSTM to generate tweets based on a given emojis

First we load the data

In [2]:
import numpy as np
import pandas as pd
import data_load_utils as util

from importlib import reload
util = reload (util)


tweets = util.filter_tweets_min_count(
    util.read_tweet_data('data/emojis_homemade.csv'),
    min_count=1000)

tweets['text'] = util.filter_text_for_handles(tweets['text'])

  if self.run_code(code, result):


In [3]:
util.filter_text_for_handles(tweets.iloc[0:5]['text'])

0    RT [VID] 181023 - Foi adicionada a letra D no ...
1    RT 181023 Kris Wu Studio update (3/3)Legendary...
2    RT Now you are watching Indian SuperStar with ...
3                                      dats for keeps 
6                               Holy shit no I think. 
Name: text, dtype: object

In [4]:
tweets

Unnamed: 0,text,emoji
0,RT [VID] 181023 - Foi adicionada a letra D no ...,©
1,RT 181023 Kris Wu Studio update (3/3)Legendary...,💫
2,RT Now you are watching Indian SuperStar with ...,😎
3,dats for keeps,💛
6,Holy shit no I think.,😩
7,"RT army, follow who retweet this",👑
8,RT Simply K-Pop harddrive dump# #BerryGood #Me...,🤩
9,Happy birthday nellie hope you have a fantasti...,❤
10,"RT ""I have passed through fire""Thank you for b...",💕
12,RT Incredible to be involved in the making of ...,😱


In [5]:
tweets.iloc[0,:]

text     RT [VID] 181023 - Foi adicionada a letra D no ...
emoji                                                    ©
Name: 0, dtype: object

In [6]:
tweets.iloc[1]

text     RT 181023 Kris Wu Studio update (3/3)Legendary...
emoji                                                    💫
Name: 1, dtype: object

In [7]:
tweets.shape

(460771, 2)

In [8]:
MAX_TWEET_LENGTH = 160
WINDOW_SIZE = 40
STEP = 3

chars_univ, chars_univ_idx = util.get_universal_chars_list()

In [15]:
tweets_train = tweets.iloc[0:3000] # 100 just to test the model works
tweets_dev = tweets.iloc[3000:3200] # 100 just to test the model works
# tweets_test = tweets.iloc[2000:3000]

In [16]:
%timeit

import time
tic = time.time()

train_x, train_y = util.convert_tweet_to_xy(tweets_train)
dev_x, dev_y = util.convert_tweet_to_xy(tweets_train)

print ("completed in", time.time()-tic, "s")

completed in 4.318266868591309 s


In [17]:
train_x

array([[[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 1., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0.

In [18]:
train_x.shape

(120000, 40, 93)

In [19]:
for i in range (50):
    print (util.x_y_bool_array_to_sentence(train_x, train_y, chars_univ, position=i, separator=True))

                                        : 
                                        : 
                                      RT: 
                                   RT [V:I
                                RT [VID]: 
                             RT [VID] 18:1
                          RT [VID] 18102:3
                       RT [VID] 181023 -: 
                    RT [VID] 181023 - Fo:i
                 RT [VID] 181023 - Foi a:d
              RT [VID] 181023 - Foi adic:i
           RT [VID] 181023 - Foi adicion:a
        RT [VID] 181023 - Foi adicionada: 
     RT [VID] 181023 - Foi adicionada a :l
  RT [VID] 181023 - Foi adicionada a let:r
T [VID] 181023 - Foi adicionada a letra :D
VID] 181023 - Foi adicionada a letra D n:o
] 181023 - Foi adicionada a letra D no o:u
81023 - Foi adicionada a letra D no outd:o
23 - Foi adicionada a letra D no outdoor: 
- Foi adicionada a letra D no outdoor mi:s
oi adicionada a letra D no outdoor miste:r
adicionada a letra D no outdoor misterio:s
cionada a l

# Building a network
Intially, let's try generating tweets by training a network on just the tweet data. Once we have an idea how well we can get a network to generate tweets (remember, character by character), we'll compare it to a network that learns to generate tweets by predicting the next chracter jointly from the preceding text and an overall emoji. (remember, this dataset is tweets that all contain exactly one emoji).

## Simple network - a single LSTM into a Dense softmax classifier.

In [21]:
import keras
from keras import layers
from keras.models import Sequential
model = keras.models.Sequential()
model.add(layers.LSTM(128, input_shape=(WINDOW_SIZE, len(chars_univ))))
model.add(layers.Dense(len(chars_univ), activation='softmax'))

# loss function - targets are one-hot encoded
optimizer = keras.optimizers.RMSprop(lr=0.001)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

## Training the model and sampling from it using a standard character-by-character method
1. Draw a probability distribution for the next character
2. Reweight the distribution using a temperature parameter
3. Sample the next character at random using the reweighted distribution
4. Add the new character at the end of the available list

In [22]:
def sample (preds, temperature = 1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

## train the model, generate text
Use a range of temeratures after every epoch

In [23]:
tweets.iloc[0]['text'][0:10]

'RT [VID] 1'

In [None]:
import random
import sys

n_seed_chars = 10 # number of characters to use as a seed for text generation

model.optimizer.lr.assign(0.001) # to reset the learning rate if running additional training

# train for 60 epochs
for epoch in range (1, 60):
    print ('epoch', epoch)

    # fit the model for one iteration
    model.fit (train_x, train_y,
               batch_size=1024, epochs=1,
               validation_data=(dev_x, dev_y), #initial_epoch=epoch,
               verbose=1)

    # select a text seed at random
    seed_tweet = tweets.iloc[random.randint(0, len(tweets))]
    generated_text = seed_tweet['text'][0:n_seed_chars]
    print ('--- Generating with seed: "' + generated_text + '"')

    # try a range of sampling temperatures
    for temperature in [0.2, 0.5, 1.0, 1.2]:
        print ('--------- temperature:', temperature)
        sys.stdout.write(generated_text)

        for i in range (MAX_TWEET_LENGTH - n_seed_chars):
            # one-hot encode the characters generated so far
            sampled = np.zeros((1, WINDOW_SIZE, len(chars_univ)))
            for t, char in enumerate (generated_text):
                sampled[0, t, chars_univ_idx[char]] = 1

            # sample the next character
            preds = model.predict(sampled, verbose=0)[0]
            next_index = sample(preds, temperature)
            next_char = chars_univ[next_index]

            generated_text += next_char
            generated_text = generated_text[1:]

            sys.stdout.write(next_char)

        print ("\n")    

epoch 1
Train on 120000 samples, validate on 120000 samples
Epoch 1/1
--- Generating with seed: "RT So exci"
--------- temperature: 0.2
RT So exci  too  oo   o    t t      o                            o  e     oe  oo  to os               o        o  o   to o teo  t   e    tao  o      o    t   t 

--------- temperature: 0.5
    t   t :oooy te  l t tt  s    tud :hnw  g u  sr otts   oieo ethee  oo o au   eooatt  e rptog ooBt    lra  h   ehto eh taodoiioylptit roeskltRm  a rnoleotfem 

--------- temperature: 1.0
noleotfem a: w. tioneinhileew3ktey eisnyE yein Viwr*f.u:% o:l.aQdGZ vNp1teMKrIoesP  gt!sozyceetuon3fRetoJ0di9skRnqwSoIi kvgr9gloosac Foo VbfbS9hay:RfWeth G:ooo 

--------- temperature: 1.2
eth G:ooo rtQBiB XtMHeix]i5cqkB#I]1iKaXy7~HlxMdaweifS:gslgat epU MnVke0yNct! at6sJK:qO6FKM#@X,y347Ohvthvs:snuah&ur/ef7S kTiJBxIc14 ep8g8uooi t?;icurettllal0sh U

epoch 2
Train on 120000 samples, validate on 120000 samples
Epoch 1/1
--- Generating with seed: "RT i look "
--------- temperature: 0.2

In [61]:
char_univ_idx

NameError: name 'char_univ_idx' is not defined