# Text generation with an LSTM and Keras

In [196]:
import codecs
import os
import re
import string
import numpy as np
import pandas as pd
from typing import Sequence
from sklearn.model_selection import train_test_split

import tensorflow_addons as tfa
from keras.datasets import mnist
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras import models, layers, callbacks, optimizers, Sequential, losses
import tqdm
from tqdm.keras import TqdmCallback

def get_text(filename:str) -> str:
    """
    Load and return the entire contents of a text file decoded as latin-1.

    codecs.open() is used rather than the builtin open() so that the bytes
    are decoded without newline translation ('\\r\\n' is returned as-is).

    :param filename: path of the text file to read.
    :return: the decoded file contents as a single string.
    :raises OSError: if the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises.
    with codecs.open(filename, encoding='latin-1', mode='r') as f:
        return f.read()

def words(text:str) -> list:
    """
    Given a string, return its words as a list of lowercase tokens.

    Every ASCII control character (\\x00-\\x1f, which includes \\r, \\t and \\n)
    and every character in string.punctuation is replaced by a single space,
    the result is split on spaces, and empty fragments are dropped.

    Digits, short words and stop words are all kept: the token sequence is
    meant to mirror the running text.

    :param text: the raw text to tokenize.
    :return: the list of lowercase tokens, in order of appearance.
    """
    # re.escape() keeps every punctuation character literal inside the class.
    # Concatenating string.punctuation unescaped turns its '\]' into an
    # escaped bracket, which silently leaves backslashes in the tokens.
    separators = re.compile(r'[\x00-\x1f' + re.escape(string.punctuation) + r']')
    # Substitute a space (not the empty string) so neighbouring words do not clump together.
    spaced = separators.sub(" ", text)
    return [w.lower() for w in spaced.split(" ") if w]

def compress_whitespace(s): # squeeze runs such as "\n   \t  " down to " "
    """Return `s` with every run of whitespace characters replaced by one space."""
    whitespace_run = re.compile(r"(\s+)")
    return whitespace_run.sub(' ', s)

## Load corpus

Let's use [The Federalist Papers, Nos. 1-10](https://guides.loc.gov/federalist-papers/text-1-10#s-lg-box-wrapper-25493264) (written by Alexander Hamilton, John Jay, and James Madison) as our corpus.

In [197]:
# Read the raw corpus from disk (get_text decodes it as latin-1).
text = get_text("data/federalist-papers.txt")
# Collapse every run of whitespace, including newlines, to a single space.
text = compress_whitespace(text)
text[:300]  # display the first 300 characters as a sanity check

'FEDERALIST NO. 1 General Introduction For the Independent Journal. Author: Alexander Hamilton To the People of the State of New York: AFTER an unequivocal experience of the inefficiency of the subsisting federal government, you are called upon to deliberate on a new Constitution for the United State'

In [198]:
import spacy

In [199]:
import en_core_web_sm
# Load the spaCy "en_core_web_sm" pipeline through its installed package module.
# NOTE(review): `nlp` is not referenced by any of the cells visible here -- confirm it is still needed.
nlp = en_core_web_sm.load()
# The following fails on paperspace gradient platform
#nlp = spacy.load("en_core_web_sm") # When I use plain English() it doesn't seem to give POS info

In [200]:
# Tokenize the whitespace-compressed corpus into lowercase words.
tokens = words(text)
len(tokens), tokens[:10]  # display the token count and the first few tokens

(19263,
 ['federalist',
  'no',
  '1',
  'general',
  'introduction',
  'for',
  'the',
  'independent',
  'journal',
  'author'])

In [201]:
# TESTING: work on only the first 10,000 tokens (presumably to keep the later
# one-hot encoding and training cells small while experimenting).
tokens = tokens[:10_000]   # total is about 19.2k

## Get vocab and get X, y 

In [202]:
# Vocabulary: the distinct tokens in sorted order; a word's position in V
# becomes its integer id in the cells below.
V = sorted(set(tokens))
len(V)  # display the vocabulary size

2091

In [203]:
V[:15]

['1',
 '10',
 '11',
 '1685',
 '1706',
 '1774',
 '1st',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9']

In [204]:
# Lookup table from each vocabulary word to its position in V.
index = dict(zip(V, range(len(V))))

def wtoi(w):
    """Return the integer id of word `w`; raises KeyError if `w` is not in the vocabulary."""
    return index[w]

In [205]:
k = 50  # context window: number of preceding words used to predict the next word

def _window_target_pair(i):
    """Return a length-2 object array: (the k words before position i, the word at position i)."""
    # Allocate the object array explicitly. Letting np.array() infer a ragged
    # (array, str) pair raises ValueError on NumPy >= 1.24.
    pair = np.empty(2, dtype=object)
    pair[0] = np.array(tokens[i-k:i], dtype=object)
    pair[1] = tokens[i]
    return pair

# One (context, target) pair per position that has k words of history.
# NOTE(review): the upper bound `len(tokens)-1` means the final token is never
# used as a target; this looks like an off-by-one -- confirm before changing,
# since the recorded shapes below depend on it.
Xy = [_window_target_pair(i) for i in range(k,len(tokens)-1)]

In [206]:
Xy[:5]

[array([array(['federalist', 'no', '1', 'general', 'introduction', 'for', 'the',
        'independent', 'journal', 'author', 'alexander', 'hamilton', 'to',
        'the', 'people', 'of', 'the', 'state', 'of', 'new', 'york',
        'after', 'an', 'unequivocal', 'experience', 'of', 'the',
        'inefficiency', 'of', 'the', 'subsisting', 'federal', 'government',
        'you', 'are', 'called', 'upon', 'to', 'deliberate', 'on', 'a',
        'new', 'constitution', 'for', 'the', 'united', 'states', 'of',
        'america', 'the'], dtype=object),
        'subject'], dtype=object),
 array([array(['no', '1', 'general', 'introduction', 'for', 'the', 'independent',
        'journal', 'author', 'alexander', 'hamilton', 'to', 'the',
        'people', 'of', 'the', 'state', 'of', 'new', 'york', 'after', 'an',
        'unequivocal', 'experience', 'of', 'the', 'inefficiency', 'of',
        'the', 'subsisting', 'federal', 'government', 'you', 'are',
        'called', 'upon', 'to', 'deliberate', 'on',

In [207]:
# Stack the list of (context, target) pairs into one array so the two parts
# can be split off by column in the next cell.
Xy = np.array(Xy)

In [208]:
# Column 0 holds the k-word context windows, column 1 the word that follows each window.
X, y = Xy[:,0], Xy[:,1]

In [209]:
# X is a 1-D array whose elements are themselves k-word arrays; vstack turns
# it into a single 2-D (num windows, k) array of words.
X = np.vstack(X)
X[0:2]  # display the first two windows

array([['federalist', 'no', '1', 'general', 'introduction', 'for', 'the',
        'independent', 'journal', 'author', 'alexander', 'hamilton',
        'to', 'the', 'people', 'of', 'the', 'state', 'of', 'new', 'york',
        'after', 'an', 'unequivocal', 'experience', 'of', 'the',
        'inefficiency', 'of', 'the', 'subsisting', 'federal',
        'government', 'you', 'are', 'called', 'upon', 'to', 'deliberate',
        'on', 'a', 'new', 'constitution', 'for', 'the', 'united',
        'states', 'of', 'america', 'the'],
       ['no', '1', 'general', 'introduction', 'for', 'the',
        'independent', 'journal', 'author', 'alexander', 'hamilton',
        'to', 'the', 'people', 'of', 'the', 'state', 'of', 'new', 'york',
        'after', 'an', 'unequivocal', 'experience', 'of', 'the',
        'inefficiency', 'of', 'the', 'subsisting', 'federal',
        'government', 'you', 'are', 'called', 'upon', 'to', 'deliberate',
        'on', 'a', 'new', 'constitution', 'for', 'the', 'united',
   

## Label encode tokens in X, y

In [210]:
# Apply the word -> integer id lookup (wtoi) element-wise.
encode = np.vectorize(wtoi)
X = encode(X)  # context windows as integer ids, same shape as before
y = encode(y)  # target words as integer ids

In [211]:
targets = np.unique(y)   # not every word in V will be in target classes (words)

In [212]:
X.shape, y.shape

((9949, 50), (9949,))

In [213]:
X[0]

array([ 791, 1291,    0,  882, 1076,  831, 1874, 1015, 1106,  213,  111,
        917, 1908, 1874, 1401, 1323, 1874, 1799, 1323, 1289, 2085,   94,
        140, 1951,  742, 1323, 1874, 1027, 1323, 1874, 1817,  790,  899,
       2086,  175,  284, 1975, 1908,  512, 1336,   15, 1289,  424,  831,
       1874, 1960, 1800, 1323,  130, 1874])

In [214]:
def onehot(X, vocab_size=None):
    """
    One-hot encode a 2-D array of integer word ids.

    :param X: array of shape (num records, window width) holding word ids in
              the range [0, vocab_size).
    :param vocab_size: length of each one-hot vector; defaults to len(V), the
                       notebook's vocabulary size.
    :return: boolean array of shape (num records, window width, vocab_size)
             in which out[i, j, X[i, j]] is True and every other entry is False.
    :raises ValueError: if X is not 2-D.
    :raises IndexError: if an id falls outside [-vocab_size, vocab_size).
    """
    X = np.asarray(X)
    if X.ndim != 2:
        raise ValueError(f"onehot expects a 2-D array of word ids, got shape {X.shape}")
    if vocab_size is None:
        vocab_size = len(V)
    n, width = X.shape
    # Use the builtin bool: the np.bool alias was removed in NumPy 1.24.
    X_onehot = np.zeros((n, width, vocab_size), dtype=bool)
    # A single indexed assignment sets out[i, j, X[i, j]] for every (i, j) at once.
    X_onehot[np.arange(n)[:, None], np.arange(width)[None, :], X] = True
    return X_onehot

# One-hot encode every context window; this is the array the LSTM is trained on.
X_onehot = onehot(X)

Convert X to shape (num sequences, window width k, len(V))

In [215]:
y.shape, len(V), len(targets)

((9949,), 2091, 2087)

In [216]:
# One-hot encode the targets. Only ids that actually occur in y get a column,
# so the width matches len(targets) (2087 in the recorded run), not len(V).
# NOTE(review): presumably column j corresponds to targets[j] -- confirm
# before mapping predictions back to words.
y = pd.get_dummies(y)
y.shape

(9949, 2087)

In [217]:
# Hold out 20% of the windows for validation.
# NOTE(review): consecutive windows share k-1 words, so a split over individual
# windows places near-duplicate contexts on both sides; also no random_state is
# passed, so the split presumably differs from run to run -- confirm this is intended.
X_onehot_train, X_onehot_valid, y_train, y_valid = train_test_split(X_onehot, y, test_size=0.20)

In [220]:
# Next-word classifier: one LSTM layer over the one-hot windows, then dropout,
# batch normalization and a softmax over the observed target words.
model = Sequential()
# If you don't want to onehot, you can leave X as 2D num records x k and use this:
#model.add(layers.Embedding(input_dim=len(V), output_dim=100, input_length=k))
# else have to one hot X as num records x k x len(V)
model.add(layers.LSTM(units=256, input_shape=(k,len(V))))
# Dropout is applied before batch normalization here (see the notes at the end
# of the notebook about the ordering).
model.add(layers.Dropout(0.4))
model.add(layers.BatchNormalization())
# One output unit per distinct target word, matching the width of the one-hot y.
model.add(layers.Dense(len(targets), activation='softmax'))
#model.add(layers.Lambda(lambda x: tf.cast(K.argmax(x, axis=-1),dtype=float)))

opt = optimizers.Adam(learning_rate=0.001)

# Categorical cross-entropy pairs with the one-hot encoded targets.
model.compile(loss=losses.categorical_crossentropy, optimizer=opt, metrics=['accuracy'])
#model.summary()

batch_size = 128
# verbose=0 turns off Keras's own logging; progress is reported by the
# tensorflow_addons TQDM progress-bar callback instead.
history = model.fit(X_onehot_train, y_train,
                    shuffle=True,
                    epochs=10,
                    validation_data=(X_onehot_valid, y_valid),
                    batch_size=batch_size,
                    verbose=0
                    , callbacks=[tfa.callbacks.TQDMProgressBar(show_epoch_progress=True)]
                    )

HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=10.0, style=Progr…

Epoch 1/10


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=63.0), HTML(value='')), layout=Layout(dis…


Epoch 2/10


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=63.0), HTML(value='')), layout=Layout(dis…


Epoch 3/10


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=63.0), HTML(value='')), layout=Layout(dis…


Epoch 4/10


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=63.0), HTML(value='')), layout=Layout(dis…


Epoch 5/10


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=63.0), HTML(value='')), layout=Layout(dis…


Epoch 6/10


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=63.0), HTML(value='')), layout=Layout(dis…


Epoch 7/10


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=63.0), HTML(value='')), layout=Layout(dis…


Epoch 8/10


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=63.0), HTML(value='')), layout=Layout(dis…


Epoch 9/10


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=63.0), HTML(value='')), layout=Layout(dis…


Epoch 10/10


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=63.0), HTML(value='')), layout=Layout(dis…





## Notes:

* BatchNormalization seems to help training accuracy converge faster. Without an embedding layer, batch norm makes a massive difference.
* Having trouble getting validation accuracy beyond 7 or 8%.
* Moved to no embedding layer and used a separate Dropout layer rather than the dropout argument on the LSTM. Dropout followed by batch norm made accuracy increase slowly, but the reverse order does no good. Weird.