Notebook that trains an LSTM to generate tweets based on a given emojis

First we load the data

In [41]:
import numpy as np
import pandas as pd
import data_load_utils as util

from importlib import reload
util = reload (util)


tweets = util.filter_tweets_min_count(
    util.read_tweet_data('data/emojis_homemade.csv'),
    min_count=1000)

tweets['text'] = util.filter_text_for_handles(tweets['text'])

  if self.run_code(code, result):


In [42]:
util.filter_text_for_handles(tweets.iloc[0:5]['text'])

0    RT [VID] 181023 - Foi adicionada a letra D no ...
1    RT 181023 Kris Wu Studio update (3/3)Legendary...
2    RT Now you are watching Indian SuperStar with ...
3                                      dats for keeps 
6                               Holy shit no I think. 
Name: text, dtype: object

In [43]:
tweets

                                                     text  emoji
0       RT [VID] 181023 - Foi adicionada a letra D no ...      ©
1       RT 181023 Kris Wu Studio update (3/3)Legendary...      💫
2       RT Now you are watching Indian SuperStar with ...      😎
3                                         dats for keeps       💛
6                                  Holy shit no I think.       😩
7                       RT army, follow who retweet this       👑
8       RT Simply K-Pop harddrive dump# #BerryGood #Me...      🤩
9       Happy birthday nellie hope you have a fantasti...      ❤
10      RT "I have passed through fire"Thank you for b...      💕
12      RT Incredible to be involved in the making of ...      😱
13      RT Too much feels #BTS #MPN #BTSARMY https://t...      😢
14      RT #PL TOP SCORERS 7 - 6 - the goods  https://...      🔥
15      [DEPASCAL] RUSSEL HOODIE BLUE https://t.co/CR4...      🔗
16      The joys of having a make up artist as your be...      😂
18                      R

In [44]:
tweets.iloc[0,:]

text     RT [VID] 181023 - Foi adicionada a letra D no ...
emoji                                                    ©
Name: 0, dtype: object

In [45]:
tweets.iloc[1]

text     RT 181023 Kris Wu Studio update (3/3)Legendary...
emoji                                                    💫
Name: 1, dtype: object

In [46]:
tweets.shape

(460771, 2)

In [58]:
MAX_TWEET_LENGTH = 160
WINDOW_SIZE = 40
STEP = 3

chars_univ, chars_univ_idx = util.get_universal_chars_list()

In [48]:
tweets_train = tweets.iloc[0:100] # 100 just to test the model works
tweets_dev = tweets.iloc[1100:2100] # 100 just to test the model works
tweets_test = tweets.iloc[2000:3000]

In [49]:
%timeit

train_x, train_y = util.convert_tweet_to_xy(tweets_train)
dev_x, dev_y = util.convert_tweet_to_xy(tweets_train)

In [50]:
train_x

array([[[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0.

In [51]:
train_x.shape

(4000, 40, 93)

In [52]:
for i in range (50):
    print (util.x_y_bool_array_to_sentence(train_x, train_y, chars_univ, position=i, separator=True))

                                        : 
                                        : 
                                      RT: 
                                   RT [V:I
                                RT [VID]: 
                             RT [VID] 18:1
                          RT [VID] 18102:3
                       RT [VID] 181023 -: 
                    RT [VID] 181023 - Fo:i
                 RT [VID] 181023 - Foi a:d
              RT [VID] 181023 - Foi adic:i
           RT [VID] 181023 - Foi adicion:a
        RT [VID] 181023 - Foi adicionada: 
     RT [VID] 181023 - Foi adicionada a :l
  RT [VID] 181023 - Foi adicionada a let:r
T [VID] 181023 - Foi adicionada a letra :D
VID] 181023 - Foi adicionada a letra D n:o
] 181023 - Foi adicionada a letra D no o:u
81023 - Foi adicionada a letra D no outd:o
23 - Foi adicionada a letra D no outdoor: 
- Foi adicionada a letra D no outdoor mi:s
oi adicionada a letra D no outdoor miste:r
adicionada a letra D no outdoor misterio:s
cionada a l

# Building a network
Intially, let's try generating tweets by training a network on just the tweet data. Once we have an idea how well we can get a network to generate tweets (remember, character by character), we'll compare it to a network that learns to generate tweets by predicting the next chracter jointly from the preceding text and an overall emoji. (remember, this dataset is tweets that all contain exactly one emoji).

## Simple network - a single SLTM into a Dense softmax classifier.

In [53]:
import keras
from keras import layers
from keras.models import Sequential
model = keras.models.Sequential()
model.add(layers.LSTM(128, input_shape=(WINDOW_SIZE, len(chars_univ))))
model.add(layers.Dense(len(chars_univ), activation='softmax'))

# loss function - targets are one-hot encoded
optimizer = keras.optimizers.RMSprop(lr=0.001)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

## Training the model and sampling from it using a standard character-by-character method
1. Draw a probability distribution for the next character
2. Reweight the distribution using a temperature parameter
3. Sample the next character at random using the reweighted distribution
4. Add the new character at the end of the available list

In [54]:
def sample (preds, temperature = 1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

## train the model, generate text
Use a range of temeratures after every epoch

In [55]:
tweets.iloc[0]['text'][0:10]

'RT [VID] 1'

In [60]:
import random
import sys

n_seed_chars = 10 # number of characters to use as a seed for text generation

model.optimizer.lr.assign(0.001) # to reset the learning rate if running additional training

# train for 60 epochs
for epoch in range (1, 60):
    print ('epoch', epoch)

    # fit the model for one iteration
    model.fit (train_x, train_y,
               batch_size=1024, epochs=1,
               validation_data=(dev_x, dev_y), initial_epoch=epoch)

    # select a text seed at random
    seed_tweet = tweets.iloc[random.randint(0, len(tweets))]
    generated_text = seed_tweet['text'][0:n_seed_chars]
    print ('--- Generating with seed: "' + generated_text + '"')

    # try a range of sampling temperatures
    for temperature in [0.2, 0.5, 1.0, 1.2]:
        print ('--------- temperature:', temperature)
        sys.stdout.write(generated_text)

        for i in range (MAX_TWEET_LENGTH - n_seed_chars):
            # one-hot encode the characters generated so far
            sampled = np.zeros((1, WINDOW_SIZE, len(chars_univ)))
            for t, char in enumerate (generated_text):
                sampled[0, t, chars_univ_idx[char]] = 1

            # sample the next character
            preds = model.predict(sampled, verbose=0)[0]
            next_index = sample(preds, temperature)
            next_char = chars_univ[next_index]

            generated_text += next_char
            generated_text = generated_text[1:]

            sys.stdout.write(next_char)

        print ("\n")    

epoch 1
Train on 4000 samples, validate on 4000 samples
--- Generating with seed: "RT my hear"
--------- temperature: 0.2
RT my hearY.L*Jg5#_-Sxqeho(R^D+T^

+ 6Y5gc^)g%Rk$+Dr-{|nrLn|0^}1H5z

1Ik69J|Jh8a.IW&VTwZJ}]z AyCd8frBM

ZS}-x8=G&M-GvKwQNJ_'u+Rg

't'J=EF~;4F,s6Z5lR

ka0\Od2(UD@J^kL)6[Re

--------- temperature: 0.5
@J^kL)6[Re

[rG||,'jD3(tB}_eB'Fz)

YS8rkv5KVfsu|u~w!$;~Eb+-k

e8jq/UXTw T£Rh.&

A{@\IPrZ2Q'

{%/BT"9lnmQImAXA,!RtQ

H~|1Q.5$!0/:ELEI)?W"wu,{

6PkbCM=w&P+XRfj@E;ZX=KA

Eq02C7£(j

--------- temperature: 1.0
AEq02C7£(jtJ]FoXI]Np2OX

brj_h#?yZ(M7kc\i.m%u4Fy

m+ksqY~xt3Y/1Kub@%Qt%

ln@^d:I\R^+(K"Mkbjc[MM:P&O

U6T^5bK^ H!bnHG.%mD}

3)~?x,+)CQY}:t?\-!MS

}~N_:~C4RbDJ=,

cr-Z3VJhlF}jZ

--------- temperature: 1.2
Z3VJhlF}jZJ*QQ0D[:CM.

VsM)r?RbZX79'r;m#VzWVL

Td}FyXbb/.k w(9O.(qVk

£U'l/egK[,h=U2[zKj2ZhuU

lQlKc[Pp81fh6ja#^06@NhU=

/mbek2-yXV3}w1w9Uym6z~z,CR0-

£DLB$UeB.MMFp56!W$3tB

epoch 2
Train on 4000 samples, validate on 4000 samples
--- Generating with seed: "RT RT if Y"
--------- temperature: 0.2
RT RT if YA,j

wff£B~IVAZY w~[]*rQ7@Qr

RCUW].Vx!Up=X

.NY.x92WUojK4(QS7Jd5s

q~9c9j"C[v" }lTOo\f.\R;X

zq;EX1Y;qN:|'ZE90VA

l4igTS8Gg9vC&XVcHQwsB

[R£a=bOT2]+e:mAKROtH7/yaNF

--------- temperature: 0.5
ROtH7/yaNFG7C9F_

(N9PIXO6 3Gx@Hj7=C7'oD@$(I8oG

)^RDA]hkQzYdgb£{dpd5x@b

t$hyFmR}:5"Ef&97Q|8TRA)%]E

e+k2*p.Efp34£g (x5)y,w%=;=

W#m!/GdxK

+6iSRqHtaZ$pj oM$,[EWzaUyc*

G'eX

--------- temperature: 1.0
zaUyc*G'eX/nXLAC~AB+Hgdo\sw+hC|

,XW/^bpW{Gc4K &n+\hH#r[GC

j*(bIFE8yv^K)zhfPFAf1kZwyH

X.Hjl? :4e5#d~1QEME+E?sp+3

hPB$9jY)k*#2KlQ£y

1(8'Dn"iGIk&6reW7N*'a"o#=w|tt8

8uX+z

--------- temperature: 1.2
w|tt88uX+zRO4sW=Bucf]{RTZ5dy*uO£

~Xvi:Wdmkv{&Nk)S,L@jbU--jY

;eK fwvcU$

CxH]|['0'?yo^bq9|Yq,@m?ji6M$

6mj9^W29J]hk$E5P(dBc=*)=|:

Rgm£ClL?g4Yf Hl,kqYl~6s.EY5B

f*L[/)xzAn

epoch 3
Train on 4000 samples, validate on 4000 samples
--- Generating with seed: "RT Fam, he"
--------- temperature: 0.2
RT Fam, heg(x41F$$NN~)!r£k5_

W\+j73%U=pEc#_zL BmA

f\v25ut"0yTuZ\kIwkpa(V!uUu

df_lG/!4--c}R£r2iZ$M!pmLH^+

_(f~FVi2i?Gv.o£.Pzt-kt7y"'AcK

nx}4Lt)\1b1qi?%Qpek~Vy|r

)#X8oM

--------- temperature: 0.5
Vy|r)#X8oM*WuB

4@}W*h%f E%tg"dHw,zG,0GTFK

5uw~65ETlZWC73sI[m&=)oW8

SPlIe!N8%d)O  b,Cpp]PdbnoU=T!Iato

DFuszVMk.[3CP?-x+]Vv

9f_Z{'@z-.*CJ,£ASyZW/u

aAaDM&f+W£{k2Ki)[Rx!4

--------- temperature: 1.0
k2Ki)[Rx!4i&$![

|NufG a(eLI%O@rwTJigZ .=Rn

e^7mu9gR~g|@~~6i£Ec£z~xgNNOMF:

$p[Y* -X461qp7iWrz!M

w3wc}_xb;m)(I

 hn#X7Y^Zz$DbBPx8Ck&yTcz

NW#WE1Qx2h9|VX|7£uF_z,nkYnJD4b$

M

--------- temperature: 1.2
nkYnJD4b$M2I,gNe7jORMD0/N686LsCRQG

I2_*$se7~ar|2yyQ6$xgd|K,}r

MU}}]L N8+^h;*ub=RsMF{%{V~sA|H&W"

d':gIND%c;lW[sGOu ^@o|~-u)

r^LmONO"48Tw$}6^5j|$Gj? ~

z!&Q:PH/tE1?\.v0

epoch 4
Train on 4000 samples, validate on 4000 samples
--- Generating with seed: "hopefully "
--------- temperature: 0.2
hopefully _Hr|_4aRH

vVhw-.7M/hME]Foz

vs|M-)?9xMQ=0£f@

d%zOBCQ=,{IT(V:ze/}/z5ur!cx;(HtcR|!

LZv;^+1D8z;qm\&@n^S6xCAy-T3l

6Q,j.ue;s}^fnb3qA@:Vd*F6%A8b

X"H-zcJkyV%TQB&H)T

--------- temperature: 0.5
yV%TQB&H)TXCI4@|YwEW

C4xyO8RmSKo;oeL{h%s@,QETD1

,O£w=]G0:?["qG3[gzOr}8QoZ

S0bYG-A&7]K)s;xw4D-w3p,V

(l6'K#Mx~m~\fAAKZ&4y ];**J%D mgh

^wR1L%Vz +

c[w7q6OV}a;{JBrBN5(

=W_c

--------- temperature: 1.0
BrBN5(=W_cM£j5T_+(YzT=8wt\

o1z;~lZ_xk\^A]vGy@-&!Xqk}eLsg

5&m^f#2p8G}yXyOAlCO5:kLxo8B\(n7*$

% =10jzuWfkKi+PEm]*-^f5[Ra7DF

|#N1An)'ds?]J6S ZS&X{ogX/J

fICuKZbIHe£:U-&Dh

--------- temperature: 1.2
IHe£:U-&Dh\TUP/+ 

(.8M_G;Ql'U'KbSM

?i}]Y=baf{jyrqUb"Ry'/B

-"ex,J6[\5[M1

4{w!8VLhku4

yER9p]u+'MfvB^O/O*Dqo-:/h

Sy :[5[3jWKQ.a}-k)aIjn

_q&?J:OZa;0\?rK-t|h0AV

p:W?$h8Ff-k|

epoch 5
Train on 4000 samples, validate on 4000 samples
--- Generating with seed: "RT Isn't t"
--------- temperature: 0.2
RT Isn't t~d:hB3]%GA,"l

_D JBY4Nbc"5X?£vN81k/+bv~!@

-_xGI09HqBBvum7M},qSmf^p~o

9qW+%BHa+1d6Y1:$|[)uokA*r

RXMItnZ@;z;g9u9U^ecw]m

%8Q/wyW:1^)/(P4£1-

2eEudu&dVP1Qe

VcHlgT

--------- temperature: 0.5
P1QeVcHlgTt(l.zV?jU'2UD(

Z[?I1OoFZMgit.G:Ww1[h

jF£A+;' 02I!TAPgOoD=NH=x:cC

.M,-6@3x^m:RZO?PbG%+OHK&Bpj@'[oxz

jvF7jD~5u #]81I/"5IkS)o$^£

;!EQ@+Z!@]K@4YK}!m0/D0Im+\#52

--------- temperature: 1.0
/D0Im+\#52i#r

F;FA&UIg(~ KUm! Eg(a6q%

[R7r4dY.8XA"C4v*ecKe4=m,0rMN!

ZC=+bP}/|(o5\D4"2V

£Y,JN_cm']LSP

@@.0^LmMRat+TfIlWS1\|

tuw--H^k)Gg+1w)TT,Z-G

)OyLn3S m4^LDR63,'PHpW

--------- temperature: 1.2
DR63,'PHpWtOsO

Jj@_n-p0q%Af;cpN&]A/][:y

%DBwlpIE8OTAdE|=EAh'{:mC0s

=ne;LFAj;"k4\i+A4As|5y6?=f

FG07onaJO"6nxG%"kK{u-}=R^G

e&5 D.CV=£'m77j$8'\_#wh

6TC-\[(wEdq?"Z2+buTp|

epoch 6
Train on 4000 samples, validate on 4000 samples
--- Generating with seed: "I like how"
--------- temperature: 0.2
I like how

g"pWU*XX%D#[

!7TsR.c4FKJLi+MEFul,hU

N:cjZ;£{(]p)TTaR-F9@:(@PE4

Mzjp\YV7~?Ur.lFc68n20R:B

FgtWwidT".T2'W*XgP|~[?4Ll'~

240+£q;yIoqqsAg]G1n\b;}Et179T]$

?1N*nY%m

--------- temperature: 0.5
]$?1N*nY%mA&fV^q*|T7;aZT|!a\F?;

09hF4J"VDBr[e!T) q6z'8

C::gFf)~H9S9Bl{}wiEc\N

&5F1eb;h|F#ls;GLj

]yV\$F';F:"N6

S?VsO.vDs];q8r6p[0@vBAmCW{T

]eN&4u&#|c7ZM£=ZxOu,f4oX0tV*

--------- temperature: 1.0
u,f4oX0tV*

uGjz.s8xjpG^NdxyjeVHN~Jss

3{G@*m6c_;Kh3N+C%7*UGXzs6H

??a5l|C{9£IGv~&&~H+;4Ew

^G_,XxSr f7B:I%4C/~~NH9pO

5qK&Wy]PVJWhbbDf%"eZg

^m)_:d9we}0h{Z8'Z{j\*E

{:98Re:0

--------- temperature: 1.2
*E{:98Re:0/0lz\jk'IQ

Io'A~ZN#an&tV02

mckw/SO:l$L=[iwD[5VJLfPk

#$;{Wkt0L/4$h^1ZzN^l%Cp

E-(\un#};0NO^?()D^*E"m

?nB?O(=kf:Pom\4$V=] uv?

}B£2tjCYIL;7Vakn{8U1*,4NM

.bU~NA!K

epoch 7
Train on 4000 samples, validate on 4000 samples
--- Generating with seed: "Im a grown"
--------- temperature: 0.2
Im a grownEnq$H$}v3=DoGBM/B

:A£0@goq6JdkM[0Ymbx?&$,z

hUI^'pjPad£}g)P6)vy2ljv}

c3xsrh% 

pi|%qfnUa==,mKw5

ckHr:V1''s8jeqkydtD!

Fx\yKw*D(bXjO.]Bek@x!kf]2

O)&2wS./' &"7\n~

--------- temperature: 0.5
./' &"7\n~cVfDz0Av

y+PGGV£4a_yS5u0e)ZgW4E_EC64@C#92v

+Ci[[{c9^HvP%@Ij8l~57

Z6#x.y[;8Um^+H9-7- |n{/'}y6uDj

{^3Zi}o~pNz;LQU=wH~L(^-Q$

e]X/£+@mST0'\g"G\U4"v£A

*MkuzejG|(

--------- temperature: 1.0
*MkuzejG|(

X96U{m!t7bmP.Cb65vTR9;h

X[l[sY,\NZkC}NJZExBGrDzp[

tSCvoc(RUHaqZ'B,Z:WYNN_

£S'7u]+hrfQwbS^mMY0]40r

z1 &NC*MVfPDS_b%U+snzV

]k:AxAX8@_lUS,b]xD~t~Cqc£m\

3CZe[wj

--------- temperature: 1.2
£m\3CZe[wjb+^^Udg{(#]+H}&w9^

I'4#?,%AN£{Yza=ECQ=T$A{;£%S

;Z.[\4£ Hp,(AA83+a8mif%u]

&VqW;£$

$=1KdP]H?8T,£~fkA;aDBg8h KS-

9P/I1d^' m£E.DT=s9A82%,x

{'LKW3h!QZ=@Ty}LA-SzR

epoch 8
Train on 4000 samples, validate on 4000 samples
--- Generating with seed: "Yes oh..."
--------- temperature: 0.2
Yes oh...],

=0KhV7./)#[${Oa#XYr%

k-55mvHf5G£wt/']CBf.N,

^p=g|MQ_B9Yj29i$UU\7Fyaa+

?6 zz$(Euih=lGJIAems54o

x3/p2L(b&D}_$gW(/kTg1T$z9@B/#

X]k LB8K(0Y3yHEN]J_c},~OC

Q;;b

--------- temperature: 0.5
},~OCQ;;bK:\d}

1y0$^0%5Xpru_4qDV.diJaAZI[R

8]aI^;9;3+7&W"1FRH.}fBMgYv

E@ALXC?Q+YX^TmXuDTw

m5Vx]CVZe5H5'JWF +

gDrVAPEec7!G]TSMcckwofy~Sm'Hz_N

{5#5B.46z:[%qfao-C_vVeq3

--------- temperature: 1.0
o-C_vVeq3G=7!3bc

w526{p&{Q£IW~d1x?q@Got

=E!:"_fO&jkRp3-11I2!deKQR"?

X[{HqIM8Q2{5l&Z]f

hJOGE%"j 

&+YH(t&TWW(eaUCw:EB-PV?£2

9{ ]NnFU7z+xBB#E#b*U

H%-'*8HCD g=*m:T=pN8}JZ

--------- temperature: 1.2
:T=pN8}JZvGT!

n;XHav}£s$V#fAf|x&B "4

gQW^=eK]I*Ent*!d2p;4Em5]}@b*QG

0za|:&,PU5v8icVdWX4Q%qJ

qhJV+%66,&N1aZhGFx(

p?L8;pUQ\~M0#1WNzdx

nT:"U[N5kdAuo}9iQ"fFJ1

v$3BO=}@

n51

epoch 9
Train on 4000 samples, validate on 4000 samples
--- Generating with seed: "Me &amp; m"
--------- temperature: 0.2
Me &amp; mb7e91,.8v£)\s:+';M

?!fzklT7@is3=@UED;"AQl;'CeJ#+P

PR+d/214&EG8JS$3]V~Y5:96%r

7|sVqn+D.@u+..A8|pIS**G,

=b3|[9' ],S#;+£j 6K.De-£(

)OV/RcjD[2GiVkoRFg"iN

#]l{-P

--------- temperature: 0.5
g"iN#]l{-P9gl#W)5O4}]sq+[M

,W8U@zXsK5d7A~P5lE*9SX

l+/V=uPT @K-rZ|^tc"'7(bE

RYKV

)kdqO£mKWbg[2rW{vnMy6zph

LiDo'ivU;(x87dDJX;

N),%$mSK_[!7J8[kGwNy~gV"

QdRWIU@}U*"^7seJ@%

--------- temperature: 1.0
U*"^7seJ@%Iu_wbZQ

I=UP(@9sAs5XGhRtaCj(-k

iz;b!2SX|k#jO_);tpQP=thMB\

vKyl]{[fK Oe% z2sAx0ZbaREnn*T+

*LLn,POWU%vnP1{!d,W~*EAux

S~8}J[287y[JqJe[}O£-c|z5

Em(e0LJo

lD87'oMy

--------- temperature: 1.2
JolD87'oMyecXS-L&J1

Ue*e{bu*#=G!IW'B:870I,-Q"

qb,H'A8ZdV#o\,SkN{?vGYE[h

e1QsWE}D*O,b£+F{^

a,rKWe

KeyboardInterrupt: 

In [61]:
char_univ_idx

NameError: name 'char_univ_idx' is not defined