# Text generation with an LSTM and Keras

Redo with characters instead of word tokens.  Also, step by 3 through the characters when extracting windows (I didn't do this for tokens; it might make a big difference, so go back and try it).

In [95]:
import codecs
import os
import re
import string
import numpy as np
import pandas as pd
from typing import Sequence
from sklearn.model_selection import train_test_split

import tensorflow_addons as tfa
from keras.datasets import mnist
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras import models, layers, callbacks, optimizers, Sequential, losses
import tqdm
from tqdm.keras import TqdmCallback

def get_text(filename:str) -> str:
    """
    Load and return the text of a text file, assuming latin-1 encoding as that
    is what the BBC corpus uses.  Use codecs.open() function not open().

    The file is opened in a `with` block so the handle is closed even when
    reading raises.
    """
    with codecs.open(filename, encoding='latin-1', mode='r') as f:
        return f.read()

def compress_whitespace(s):
    """Replace every run of whitespace (e.g. "\\n   \\t  ") with a single space."""
    whitespace_run = re.compile(r"(\s+)")
    return whitespace_run.sub(' ', s)

## Load corpus

Let's use [Alexander Hamilton's federalist papers 1-10](https://guides.loc.gov/federalist-papers/text-1-10#s-lg-box-wrapper-25493264) as our corpus.

Try with https://s3.amazonaws.com/text-datasets/nietzsche.txt which is 6x bigger.

In [96]:
# Corpus choice: the larger Nietzsche text is active. To use the Federalist
# papers instead, load "data/federalist-papers.txt".
text = compress_whitespace(get_text("data/nietzsche.txt"))
text[:300]

'PREFACE SUPPOSING that Truth is a woman--what then? Is there not ground for suspecting that all philosophers, in so far as they have been dogmatists, have failed to understand women--that the terrible seriousness and clumsy importunity with which they have usually paid their addresses to Truth, have'

In [97]:
# TESTING
#text = text[:1000]

In [98]:
# Character-level model: split the corpus string into a list of single characters.
tokens = list(text)

## Get vocab and get X, y 

In [99]:
# Vocabulary: the distinct tokens in sorted order; its size is displayed below.
V = sorted(set(tokens))
len(V)

84

In [100]:
# Peek at the first 15 vocabulary entries.
V[0:15]

[' ', '!', '"', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5']

In [101]:
# Lookup table from each vocabulary character to its position in V.
index = dict(zip(V, range(len(V))))
def ctoi(c):
    """Return the integer index of character `c` in V (KeyError if `c` is not in the vocabulary)."""
    return index[c]

In [102]:
k = 60     # window width: number of characters of context per sample
step = 1   # stride between the starts of consecutive windows

def _window_pair(i):
    """Return a length-2 object array: (the k tokens before position i, the token at i)."""
    # A (window, char) tuple is a ragged input; recent NumPy refuses to build
    # an array from it implicitly, so fill an object array by hand instead.
    pair = np.empty(2, dtype=object)
    pair[0] = np.array(tokens[i-k:i], dtype=object)
    pair[1] = tokens[i]
    return pair

# NOTE: the upper bound len(tokens)-1 means the very last character of the
# corpus is never used as a target; kept as-is so sample counts are unchanged.
Xy = [_window_pair(i) for i in range(k,len(tokens)-1,step)]

In [103]:
# Inspect the first five (window, next-character) pairs.
Xy[:5]

[array([array(['P', 'R', 'E', 'F', 'A', 'C', 'E', ' ', 'S', 'U', 'P', 'P', 'O',
        'S', 'I', 'N', 'G', ' ', 't', 'h', 'a', 't', ' ', 'T', 'r', 'u',
        't', 'h', ' ', 'i', 's', ' ', 'a', ' ', 'w', 'o', 'm', 'a', 'n',
        '-', '-', 'w', 'h', 'a', 't', ' ', 't', 'h', 'e', 'n', '?', ' ',
        'I', 's', ' ', 't', 'h', 'e', 'r', 'e'], dtype=object),
        ' '], dtype=object),
 array([array(['R', 'E', 'F', 'A', 'C', 'E', ' ', 'S', 'U', 'P', 'P', 'O', 'S',
        'I', 'N', 'G', ' ', 't', 'h', 'a', 't', ' ', 'T', 'r', 'u', 't',
        'h', ' ', 'i', 's', ' ', 'a', ' ', 'w', 'o', 'm', 'a', 'n', '-',
        '-', 'w', 'h', 'a', 't', ' ', 't', 'h', 'e', 'n', '?', ' ', 'I',
        's', ' ', 't', 'h', 'e', 'r', 'e', ' '], dtype=object),
        'n'], dtype=object),
 array([array(['E', 'F', 'A', 'C', 'E', ' ', 'S', 'U', 'P', 'P', 'O', 'S', 'I',
        'N', 'G', ' ', 't', 'h', 'a', 't', ' ', 'T', 'r', 'u', 't', 'h',
        ' ', 'i', 's', ' ', 'a', ' ', 'w', 'o', 'm', 'a', 'n', 

In [104]:
# Turn the list of pairs into one array so it can be sliced by column in the next cell.
Xy = np.array(Xy)

In [105]:
# Column 0 holds the context windows, column 1 the character that follows each window.
X, y = Xy[:,0], Xy[:,1]

In [106]:
# Stack the per-sample window arrays into a single 2D array, one row per window.
X = np.vstack(X)
X[0:2]

array([['P', 'R', 'E', 'F', 'A', 'C', 'E', ' ', 'S', 'U', 'P', 'P', 'O',
        'S', 'I', 'N', 'G', ' ', 't', 'h', 'a', 't', ' ', 'T', 'r', 'u',
        't', 'h', ' ', 'i', 's', ' ', 'a', ' ', 'w', 'o', 'm', 'a', 'n',
        '-', '-', 'w', 'h', 'a', 't', ' ', 't', 'h', 'e', 'n', '?', ' ',
        'I', 's', ' ', 't', 'h', 'e', 'r', 'e'],
       ['R', 'E', 'F', 'A', 'C', 'E', ' ', 'S', 'U', 'P', 'P', 'O', 'S',
        'I', 'N', 'G', ' ', 't', 'h', 'a', 't', ' ', 'T', 'r', 'u', 't',
        'h', ' ', 'i', 's', ' ', 'a', ' ', 'w', 'o', 'm', 'a', 'n', '-',
        '-', 'w', 'h', 'a', 't', ' ', 't', 'h', 'e', 'n', '?', ' ', 'I',
        's', ' ', 't', 'h', 'e', 'r', 'e', ' ']], dtype=object)

## Label encode tokens in X, y

In [107]:
# Apply ctoi elementwise so X and y hold integer vocabulary indices instead of characters.
encode = np.vectorize(ctoi)
X = encode(X)
y = encode(y)

In [108]:
targets = np.unique(y)   # distinct target classes; not every char in V necessarily occurs as a target

In [109]:
# Sanity check: number of samples and window width.
X.shape, y.shape

((598808, 60), (598808,))

In [110]:
# First window after integer encoding.
X[0]

array([38, 40, 27, 28, 23, 25, 27,  0, 41, 43, 38, 38, 37, 41, 31, 36, 29,
        0, 71, 59, 52, 71,  0, 42, 69, 72, 71, 59,  0, 60, 70,  0, 52,  0,
       74, 66, 64, 52, 65,  7,  7, 74, 59, 52, 71,  0, 71, 59, 56, 65, 22,
        0, 31, 70,  0, 71, 59, 56, 69, 56])

Convert X to shape (num sequences, window width k, len(V))

In [111]:
# Compare the vocabulary size with the number of distinct target classes.
y.shape, len(V), len(targets)

((598808,), 84, 84)

In [112]:
# One-hot encode the integer targets, as expected by the categorical
# cross-entropy loss the model is compiled with.
y = pd.get_dummies(y)
y.shape

(598808, 84)

## One hot the tokens (optionally)

In [113]:
# Toggle: True feeds one-hot vectors straight into the LSTM; False keeps the
# integer indices and the model adds an Embedding layer in front of the LSTM.
do_onehot = True
#do_onehot = False

In [114]:
def onehot(X, width=None, vocab_size=None):
    """
    One-hot encode a 2D array of integer token indices.

    Parameters
    ----------
    X : array-like of int, shape (num records, window width)
        Integer-encoded windows.
    width : int, optional
        Size of the second output axis; defaults to the notebook-level `k`.
    vocab_size : int, optional
        Size of the last output axis; defaults to `len(V)`.

    Returns
    -------
    Boolean array of shape (len(X), width, vocab_size) where
    out[i, j, X[i, j]] is True and every other entry is False.
    """
    if width is None:
        width = k
    if vocab_size is None:
        vocab_size = len(V)
    X = np.asarray(X)
    # Builtin `bool` instead of the `np.bool` alias, which NumPy 1.24 removed.
    X_onehot = np.zeros((len(X), width, vocab_size), dtype=bool)
    if X.size:
        # One indexed assignment sets out[i, j, X[i, j]] for every (i, j).
        rows = np.arange(X.shape[0])[:, None]
        cols = np.arange(X.shape[1])[None, :]
        X_onehot[rows, cols, X] = True
    return X_onehot

In [115]:
# Replace the integer-index X with its one-hot form when the toggle is on.
if do_onehot:
    X = onehot(X)

## Train

In [116]:
# Hold out 20% of the samples for validation.
# NOTE(review): no random_state is passed, so the split is not reproducible across runs.
# NOTE(review): with step = 1 the windows overlap heavily, so near-duplicate
# windows can land on both sides of the split; validation scores are likely optimistic.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20)

In [117]:
# Single-layer LSTM followed by a softmax over the target classes.
model = Sequential()
if do_onehot:
    # Must one hot X as num records x k x len(V)
    model.add(layers.LSTM(units=128, input_shape=(k,len(V))))
else:
    # If you don't want to onehot, you can leave X as 2D num records x k.
    model.add(layers.Embedding(input_dim=len(V), output_dim=10, input_length=k))
    # No input_shape here: this LSTM is not the first layer, and the Embedding
    # emits (k, 10) vectors, not the (k, 1) that was previously declared.
    model.add(layers.LSTM(units=128))
# model.add(layers.Dropout(0.4))
#model.add(layers.BatchNormalization())
model.add(layers.Dense(len(targets), activation='softmax'))
#model.add(layers.Lambda(lambda x: tf.cast(K.argmax(x, axis=-1),dtype=float)))

# opt = optimizers.Adam(learning_rate=0.001)
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras releases.
opt = optimizers.RMSprop(learning_rate=0.01) # keras book uses this

model.compile(loss=losses.categorical_crossentropy, optimizer=opt, metrics=['accuracy'])
#model.summary()

In [89]:
def myfit(epochs, batch_size=128, verbose=0):
    """
    Train the notebook-level `model` on (X_train, y_train) for `epochs` more
    epochs, validating on (X_valid, y_valid) after each one.

    Training continues from the model's current weights, so repeated calls
    accumulate epochs.

    Returns the History object produced by `model.fit` (previously it was
    assigned and then discarded) so callers can inspect the loss/accuracy curves.
    """
    history = model.fit(X_train, y_train,
                        shuffle=True,
                        epochs=epochs,
                        validation_data=(X_valid, y_valid),
                        batch_size=batch_size,
                        verbose=verbose
#                         , callbacks=[tfa.callbacks.TQDMProgressBar(show_epoch_progress=True)]
                        )
    return history

In [90]:
# Initial training run: 19 epochs with per-epoch progress output.
myfit(19, verbose=1)

Epoch 1/19
Epoch 2/19
Epoch 3/19
Epoch 4/19
Epoch 5/19
Epoch 6/19
Epoch 7/19
Epoch 8/19
Epoch 9/19
Epoch 10/19
Epoch 11/19
Epoch 12/19
Epoch 13/19
Epoch 14/19
Epoch 15/19
Epoch 16/19
Epoch 17/19
Epoch 18/19
Epoch 19/19


## Generate

In [118]:
# Adapted from Deep Learning with Python by François Chollet
def sample(probs, temperature=1.0):
    """
    Draw a single int target class from the distribution described by `probs`
    (e.g. the output of a softmax).

    `temperature` reweights the distribution before sampling: values below 1
    sharpen it toward the most likely class, values above 1 flatten it.
    `temperature <= 0` means always pick the most likely class (this
    previously divided by zero).

    Returns the index of the sampled class.
    """
    probs = np.asarray(probs).astype('float64')
    if temperature <= 0:
        # Greedy limit: no randomness, just the arg max.
        return np.argmax(probs)
    # log(0) = -inf is the right value for a zero-probability class (it maps
    # back to exactly 0 below), so silence the divide-by-zero warning.
    with np.errstate(divide='ignore'):
        logits = np.log(probs) / temperature
    # Shift so the largest logit is 0: the normalised result is unchanged, but
    # exp() can no longer underflow every entry to 0 at small temperatures.
    logits = logits - np.max(logits)
    exp_probs = np.exp(logits)
    probs = exp_probs / np.sum(exp_probs)
    draw = np.random.multinomial(1, probs, 1)
    return np.argmax(draw)

Seed the text with k characters

In [119]:
# Pick a random start offset and take the k characters from there as the seed text.
start = np.random.randint(0, len(tokens) - k - 1)
generated_words = tokens[start: start + k]
print(''.join(generated_words))
# Integer-encode the seed so it can be fed to the model.
generated_tokens = [ctoi(w) for w in generated_words]
generated_tokens[0:10]

 have betrayed their own secret; it has been for the sake of


[0, 59, 52, 73, 56, 0, 53, 56, 71, 69]

In [120]:
# Train one more epoch at a time and, after each, generate 400 characters from
# the same seed so the samples can be compared across epochs.
for epochs in range(1,40):
    print(f"-------- {epochs} epochs --------------------------------")
    myfit(epochs=1, verbose=1) # fits one more epoch
    print('-'.join(generated_words), end=' ') # same seed
    # Work on a copy of the seed: the rolling window used to overwrite
    # generated_tokens, so every epoch after the first started from the tail of
    # the previous epoch's output instead of the seed printed above.
    window = list(generated_tokens)
    for i in range(400):
        if do_onehot:
            # Built directly into X1; the old temporary was named `onehot`,
            # which replaced the onehot() function at notebook scope.
            X1 = np.zeros((1,k,len(V)), dtype=bool)
            X1[0, np.arange(k), window] = True
        else:
            X1 = np.array(window).reshape(1,k)
        y_prob = model.predict(X1, verbose=0)[0]
        next_token = sample(y_prob, temperature=0.5)
        print(V[next_token], end='')
        # Slide the window: drop the oldest token, append the new one.
        window = window[1:] + [next_token]
    print()

-------- 1 epochs --------------------------------
 -h-a-v-e- -b-e-t-r-a-y-e-d- -t-h-e-i-r- -o-w-n- -s-e-c-r-e-t-;- -i-t- -h-a-s- -b-e-e-n- -f-o-r- -t-h-e- -s-a-k-e- -o-f  the philosophy the contend the become soul at in the world of the other out is the content of the a regard to human as the general former the will be the attaciments of the difficult and the more precisely the obselves of the latter of the seem that which hand and in the his as in the contentic things the in the besper the respection of states to the contempine hone in the erronger and the full a
-------- 2 epochs --------------------------------
 -h-a-v-e- -b-e-t-r-a-y-e-d- -t-h-e-i-r- -o-w-n- -s-e-c-r-e-t-;- -i-t- -h-a-s- -b-e-e-n- -f-o-r- -t-h-e- -s-a-k-e- -o-f  for the subjection the love the poisonismical course seek to so the personality and themselves "bild with it should be count and the the thereon in consciention of all and subjectic and seek the desires the weaken of the far be deprectable, who love. As of

  import sys


of such superficial religion of the severes and sense of oney be superiority of the sense shame and appear to any spirit" and individual soul, and the conceal with one self-the feeling and human and life of the sensations and point of the an excatiomed t
-------- 26 epochs --------------------------------
 -h-a-v-e- -b-e-t-r-a-y-e-d- -t-h-e-i-r- -o-w-n- -s-e-c-r-e-t-;- -i-t- -h-a-s- -b-e-e-n- -f-o-r- -t-h-e- -s-a-k-e- -o-f he sensuioning and impartic description of the surment of the artists and are spectacles and may become obscure and the life of which we will not the prisonfulness and the same discouraged in the same man who has best the saint, the distorous are not in the siner the standard of the conscience of the fastimant of the thing of a philosopher as a hence and utility and strigible to the opposite actio
-------- 27 epochs --------------------------------
 -h-a-v-e- -b-e-t-r-a-y-e-d- -t-h-e-i-r- -o-w-n- -s-e-c-r-e-t-;- -i-t- -h-a-s- -b-e-e-n- -f-o-r- -t-h-e- -s-a-k-e- -o-f 

  import sys


on the conduct and exalted the condition and self in the philosophers as a simplical case. shat present in its solitude of the familiar and said the supersious men, that it is also the former so love of man; and the conscious to the soul of everything itself of striving with a 
-------- 29 epochs --------------------------------
 -h-a-v-e- -b-e-t-r-a-y-e-d- -t-h-e-i-r- -o-w-n- -s-e-c-r-e-t-;- -i-t- -h-a-s- -b-e-e-n- -f-o-r- -t-h-e- -s-a-k-e- -o-f origh the us of stult of trite the schollert iness of the its in th

  import sys


e the art and the morating the sent and the prediges and erse of the whi-(f for spreeanarl and and his facle ine of ingence, and a orgerly and a reviles of tat whisetyole mad ?lue iness ande had and sally, andes whic thead the stal of the the ALmand nbger and medWlved yme whise beaGk, dompt of tranom hor inglees in st dest benes of
-------- 30 epochs --------------------------------
 -h-a-v-e- -b-e-t-r-a-y-e-d- -t-h-e-i-r- -o-w-n- -s-e-c-r-e-t-;- -i-t- -h-a-s- -b-e-e-n- -f-o-r- -t-h-e- -s-a-k-e- -o-f n 

  import sys


-tengr, and phandontbout, anorerile tf whe Bllesnan C on the be wh coahil c earn be wh che her hent ghintb bep once consitm doss en Friongom it of the ttht ive an] and cherindbelic acomtze,--and tob dob and )un of en tof whute rard the prelwh ofen to The oh the snd thon the where thomat it te arin: ouns migl cabeen thut an ne the sher tha i drerlan the impta atethes perm cnase the whe , cQ imime
-------- 31 epochs --------------------------------
 -h-a-v-e- -b-e-t-r-a-y-e-d- -t-h-e-i-r- -o-w-n- -s-e-c-r-e-t-;- -i-t- -h-a-s- -b-e-e-n- -f-o-r- -t-h-e- -s-a-k-e- -o-f  

  import sys


mas a they sme it the kre irehces ofhee na hle th t Thio at thiwr aah both of af aeo be wofrwe here shro ofot rgao mnss ese-wart ounene fe t eone wer imobed.=or ert rn ion dic  be hl it h une thes -Te caet t FteOln -We thin  to phve c dhiince ingive in The tof athe (wat e anodd it the he tit te the igiosse mor toN ale tor ter tiling) risbt aine te oeed the ps at iris thet o2nio tor ofes the fr th
-------- 32 epochs --------------------------------
 -h-a-v-e- -b-e-t-r-a-y-e-d- -t-h-e-i-r- -o-w-n- -s-e-c-r-e-t-;- -i-t- -h-a-s- -b-e-e-n- -f-o-r- -t-h-e- -s-a-k-e- -o-f e nan

  import sys


szs  orr ene aon vome c ontea th e pheeeer sthe; to hinsKin soh2ate tun ss ofes itipil th bote e GEhre or Hht tu t" a w wh oo n it are of ofte o sin ane dathe: t t,e n ouhs dha ple esz wl win ofre t"ce tslyn tne of fe mie h of ood ofheee nb ae the t ooe ca meneto ao t third- al anhirti i thenare  ts  E hs ig of h uniae,t  hib pn pKedridvce th ce of oede geant ma mndo ba he btc "l sc ef t aali
-------- 33 epochs --------------------------------
 -h-a-v-e- -b-e-t-r-a-y-e-d- -t-h-e-i-r- -o-w-n- -s-e-c-r-e-t-;- -i-t- -h-a-s- -b-e-e-n- -f-o-r- -t-h-e- -s-a-k-e- -o-f  an 

  import sys


an nothoeu  Oian tiae ofe  RANa the of a oe mLOE uatisits "t aie anu efs thee mon the oit,oo an toe oth: aor mebe t  hasse mie o h Tn too  of she t ofe teor o songert it ohes ore sos aheinhye inssDhE hebe r oftr o or the de Thee arie thatererha thernito h, ah thit ahe  Onf aontaneane asx, th ined e hen to harsvecof  ir teg  aheiss t hech ine a  icerte t rsiehanete ooe aomKe tneree it at aisero
-------- 34 epochs --------------------------------
 -h-a-v-e- -b-e-t-r-a-y-e-d- -t-h-e-i-r- -o-w-n- -s-e-c-r-e-t-;- -i-t- -h-a-s- -b-e-e-n- -f-o-r- -t-h-e- -s-a-k-e- -o-f 

  import sys


ile of th an then ote  octios pn tIe tn ole an h fawednes aliinng of s ofos Tre wh ex iheom of ihe an t eerte h no c an o hinh  eol fs in h thhe)m ice of tht iighe tie e s hanichenluex-soat th-h w h oheoneaia  the eenis au aon Danose o n toh alies fl ie th e the xee in hcio t ihh t an an ah s seao nhesde chhanine an t. an t ahes iotne toa epth ph in ahe tes IWn the'tian ase tg se er f olLe beanis 
-------- 35 epochs --------------------------------
 -h-a-v-e- -b-e-t-r-a-y-e-d- -t-h-e-i-r- -o-w-n- -s-e-c-r-e-t-;- -i-t- -h-a-s- -b-e-e-n- -f-o-r- -t-h-e- -s-a-k-e- -o-f ondan

  import sys


 Hfphiegoadin  ite asp s 1SOPere aat ae arnn ishm t ath theh aou a alegen oeehairdotin, zof as on bnhode and  orhr th in te eoe Iihr(r te oo  tnonhe is oo i ite it prs th as theG txnu inst Gth oeeali s he in shhe the th ofeeve ht  heene ac wh  Is t ih lecee re bee thev fua te o ad cl it ceis th nonct dt«ceiss, ar sh in the wov as al steonellt ere pt h on   ee me io wr inee orirdas peatou tana
-------- 36 epochs --------------------------------
 -h-a-v-e- -b-e-t-r-a-y-e-d- -t-h-e-i-r- -o-w-n- -s-e-c-r-e-t-;- -i-t- -h-a-s- -b-e-e-n- -f-o-r- -t-h-e- -s-a-k-e- -o-f the t

  import sys


h ay the meanseon varanete t s s ant eenjereLv otherieaan ASte  bar se aote earid t haaehh h ir toae  Hhre ind the, ohpo e etu ra lte ah whalicte ta peinh t n oo ath  the a tes  oenn n af e ntor, t oepaun t e t hnnetonee be li:tees ano p aa eretd mis tae thtsnn hn ten  ist i leanje af une t hy at ofi  an thee the tnio hco tore mererethaane s the asn Dho teuow hamer the ite t momun at oa whe  
-------- 37 epochs --------------------------------
 -h-a-v-e- -b-e-t-r-a-y-e-d- -t-h-e-i-r- -o-w-n- -s-e-c-r-e-t-;- -i-t- -h-a-s- -b-e-e-n- -f-o-r- -t-h-e- -s-a-k-e- -o-f t

  import sys


 S-ooel  onte ei t eth texith d th  his te ro  t sf in cher hea theth att  thdt the ph tH re the thteeall fneecmnl tho t 5thttes ihero tas as co an io thy maltis arh hntsse tend aite tu pertee mnos h.ts ate th theleiheos tosereion or ph thr fh ishann ofitsee tha neyeene tt Ihesnth tictoe thesa -n Iiso e ere lha  ANoeder vepleithe ieedeeiih lfenis v- a aaend in at ere it th  ehntn the  me aans t e
-------- 38 epochs --------------------------------
 -h-a-v-e- -b-e-t-r-a-y-e-d- -t-h-e-i-r- -o-w-n- -s-e-c-r-e-t-;- -i-t- -h-a-s- -b-e-e-n- -f-o-r- -t-h-e- -s-a-k-e- -o-f t

  import sys


ed mheo l ex ha wh Lan so re pueodd tst aroineqofe are oronme theies hn wt b potmayr s felol whe whrc wiop f Rd al e ae ant r ahsat t os hesan inane the anton Lide cot tonee eihis ons ofs  it t rse a e wre w t otu it w t fot t asoi th ohof hele fO hn  oreins icoon ("nss,  ho hamant ee an oer ouholoheeclive Eth  csaioe enmeed fon so linge pien Sasnon ef i lsrfe t fus rae ar ohdo  t at fnily rnise
-------- 39 epochs --------------------------------
 -h-a-v-e- -b-e-t-r-a-y-e-d- -t-h-e-i-r- -o-w-n- -s-e-c-r-e-t-;- -i-t- -h-a-s- -b-e-e-n- -f-o-r- -t-h-e- -s-a-k-e- -o-f  sa

  import sys


nt  ase te thio o  ae oin th h a Ifindtthin  s eatero e a Ai  wifsiirse is th iot  issr eis s thet, we sa =h oto anlin h  aone he te t d Wh th cha  aein lhsr ondi agaet  atha athye hhs e ay aft  ntt h st tert inh anit  bs tehe lonscv hhre eate ae haee an ngnrelos, n aiafinh  Ohr wn (ahuas thh t  int ah th h in ant oe a he antei heteg t  rntede d aht a, asn as then gdan of den rnov gngistir of o


## Notes:

* gotta use a lot of data. started working well with the 6x larger Nietzsche corpus, not the Federalist Papers.
* hmm...step seems to be just an efficiency issue
* what about batch size vs max len? Seems like we gotta line up sentences so they line up across batches, unless it resets h each batch. fastai book for LMModel3 inits h in `__init__` not `forward` but then uses truncated backprop (of length equal to the sequence length k). It also then has to line up the batches.
* what is the effect of onehot vs embedding layer? With the same setup but with len(V)-sized embeddings for chars going into the LSTM rather than one hot: got weird div by zero errors and valid accuracy maxed out at .49 with loss 2.0, whereas with no embedding before the LSTM, got valid accuracy .56 and loss 1.59.  Maybe a function of embedding size? `layers.Embedding(input_dim=len(V), output_dim=len(V), input_length=k)`

W/o embeddings at about epoch 60:

```
1248/1248 [==============================] - 15s 12ms/step - loss: 1.1914 - accuracy: 0.6397 - val_loss: 1.5969 - val_accuracy: 0.5601
r-d-e-r-)-,- -r-e-l-i-g-i-o-n- -i-t-s-e-l-f- -m-a-y- -b-e- -u-s-e-d- -a-s- -a- -m-e-a-n-s- -f-o-r- -o-b-t-a-i-n-i-n-g-  nce and simultage perseined to do a desire, that he understand of the best to the world of the contemplation of the so and at the desiress and strength, and accuiration to from the his esseced to such as a stronger man and worst of the soul in a soully of the best to cause the recognized in the sense of any constant their literal, and so much man of the problems to the self-explained by the sight 
```

With embeddings:

```
1248/1248 [==============================] - 17s 14ms/step - loss: 1.8657 - accuracy: 0.5002 - val_loss: 2.0086 - val_accuracy: 0.4913
a-t-e- -o-f- -h-i-s- -s-o-u-l-,- -h-e- -w-i-s-h-e-d- -t-o- -b-e- -d-o-u-b-t-f-u-l- -o-f- -h-i-s- -o-w-n- -c-a-p-a-c-i-t e bei
dency of who the is a pain of world and the present the regariss. The now to constinh-all alon a not or the possible and the powerful maken usfections of the under skecoflune and the makes of the sociement: in the to the greates all all the laid the should respection to a very the subject and that all the repxing the world of the sothing in the because bet the being bess that really of the ma
```

which looks much worse.

Accuracy is higher for char than for tokens likely due to much larger token space than char space.