Notebook that trains an LSTM to generate tweets based on a given emojis

First we load the data

In [25]:
import numpy as np
import pandas as pd
import data_load_utils as util

from importlib import reload
util = reload (util)


tweets = util.filter_tweets_min_count(
    util.read_tweet_data('data/emojis_homemade.csv'),
    min_count=1000)

tweets['text'] = util.filter_text_for_handles(tweets['text'])

  if self.run_code(code, result):


In [26]:
util.filter_text_for_handles(tweets.iloc[0:5]['text'])

0    RT [VID] 181023 - Foi adicionada a letra D no ...
1    RT 181023 Kris Wu Studio update (3/3)Legendary...
2    RT Now you are watching Indian SuperStar with ...
3                                      dats for keeps 
6                               Holy shit no I think. 
Name: text, dtype: object

In [27]:
tweets

                                                     text  emoji
0       RT [VID] 181023 - Foi adicionada a letra D no ...      ©
1       RT 181023 Kris Wu Studio update (3/3)Legendary...      💫
2       RT Now you are watching Indian SuperStar with ...      😎
3                                         dats for keeps       💛
6                                  Holy shit no I think.       😩
7                       RT army, follow who retweet this       👑
8       RT Simply K-Pop harddrive dump# #BerryGood #Me...      🤩
9       Happy birthday nellie hope you have a fantasti...      ❤
10      RT "I have passed through fire"Thank you for b...      💕
12      RT Incredible to be involved in the making of ...      😱
13      RT Too much feels #BTS #MPN #BTSARMY https://t...      😢
14      RT #PL TOP SCORERS 7 - 6 - the goods  https://...      🔥
15      [DEPASCAL] RUSSEL HOODIE BLUE https://t.co/CR4...      🔗
16      The joys of having a make up artist as your be...      😂
18                      R

In [28]:
tweets.iloc[0,:]

text     RT [VID] 181023 - Foi adicionada a letra D no ...
emoji                                                    ©
Name: 0, dtype: object

In [29]:
tweets.iloc[1]

text     RT 181023 Kris Wu Studio update (3/3)Legendary...
emoji                                                    💫
Name: 1, dtype: object

In [30]:
tweets.shape

(460478, 2)

In [31]:
MAX_TWEET_LENGTH = 160
WINDOW_SIZE = 40
STEP = 3

chars_univ, chars_univ_idx = util.get_universal_chars_list()

In [32]:
tweets_train = tweets.iloc[0:1000]
tweets_dev = tweets.iloc[1000:2000]
tweets_test = tweets.iloc[2000:3000]

In [33]:
%timeit
train_x, train_y = util.convert_tweet_to_xy(tweets_train)

In [34]:
train_x

array([[[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0.

In [35]:
train_x.shape

(40000, 40, 93)

In [40]:
for i in range (50):
    print (util.x_y_bool_array_to_sentence(train_x, train_y, chars_univ, position=i, separator=True))

                                        : 
                                        : 
                                      RT: 
                                   RT [V:I
                                RT [VID]: 
                             RT [VID] 18:1
                          RT [VID] 18102:3
                       RT [VID] 181023 -: 
                    RT [VID] 181023 - Fo:i
                 RT [VID] 181023 - Foi a:d
              RT [VID] 181023 - Foi adic:i
           RT [VID] 181023 - Foi adicion:a
        RT [VID] 181023 - Foi adicionada: 
     RT [VID] 181023 - Foi adicionada a :l
  RT [VID] 181023 - Foi adicionada a let:r
T [VID] 181023 - Foi adicionada a letra :D
VID] 181023 - Foi adicionada a letra D n:o
] 181023 - Foi adicionada a letra D no o:u
81023 - Foi adicionada a letra D no outd:o
23 - Foi adicionada a letra D no outdoor: 
- Foi adicionada a letra D no outdoor mi:s
oi adicionada a letra D no outdoor miste:r
adicionada a letra D no outdoor misterio:s
cionada a l

# Building a network
Intially, let's try generating tweets by training a network on just the tweet data. Once we have an idea how well we can get a network to generate tweets (remember, character by character), we'll compare it to a network that learns to generate tweets by predicting the next chracter jointly from the preceding text and an overall emoji. (remember, this dataset is tweets that all contain exactly one emoji).

## Simple network - a single SLTM into a Dense softmax classifier.

In [41]:
from keras import layers
model = keras.models.Sequential()
model.add(layers.LSTM(128, input_shape=(WINDOW_SIZE, len(chars_univ))))
model.add(layers.Dense(len(chars_univ), activation='softmax'))

# loss function - targets are one-hot encoded
optimizer = keras.optimizers.RMSprop(lr=0.001)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

NameError: name 'keras' is not defined