# LSTM poem generation for Shakespeare's sonnets

In [78]:
import numpy as np
import itertools

Reference: https://machinelearningmastery.com/text-generation-lstm-recurrent-neural-networks-python-keras/

# Data preprocessing for LSTM

In [79]:
# Function to get Shakespeare's poems from file
def getPoems(path="./data/shakespeare.txt", header_len=20):
    """Read the sonnets file and return a list of lower-cased poem strings.

    Parameters
    ----------
    path : str
        Location of the text file. Defaults to the path this notebook
        has always used.
    header_len : int
        Number of leading characters dropped from every poem chunk. The
        default of 20 is the value previously hard-coded to strip the
        sonnet-number header; note that it removes a fixed character
        count, not "everything up to the first newline".

    Returns
    -------
    list of str
        One string per chunk obtained by splitting the lower-cased file
        contents on two consecutive blank lines.
    """
    with open(path, "r") as f:
        data = f.read().lower()
    # Poems are separated from each other by two blank lines
    poems = data.split("\n\n\n")
    # Drop the fixed-width header at the start of each poem
    return [poem[header_len:] for poem in poems]

# Build the character <-> integer lookup tables used for one hot encoding
def getChardict(poems):
    """Return ``(charint, intchar)`` lookup tables for the given poems.

    The vocabulary is every distinct character that occurs in any poem,
    sorted, and numbered from 0. ``charint`` maps character -> index and
    ``intchar`` is the inverse mapping index -> character.
    """
    # Concatenate everything so the vocabulary covers all poems
    vocabulary = sorted(set("".join(poems)))
    charint = {}
    intchar = {}
    for index, symbol in enumerate(vocabulary):
        charint[symbol] = index
        intchar[index] = symbol
    return charint, intchar

# Integer encode the poems
def getIntPoems(charint,poems):
    """Encode every poem as a sequence of integer character ids.

    Parameters
    ----------
    charint : dict
        Mapping from character to integer id.
    poems : iterable of str
        Poems to encode; every character must be a key of ``charint``.

    Returns
    -------
    numpy.ndarray
        If all poems have the same length, the regular array produced by
        ``np.array`` on the encoded lists. Otherwise a 1-D ``object``
        array with one Python list of ids per poem.

    Raises
    ------
    KeyError
        If a poem contains a character missing from ``charint``.
    """
    encoded = [[charint[char] for char in poem] for poem in poems]
    # Equal-length (or empty) input converts cleanly to a regular array
    if len({len(codes) for codes in encoded}) <= 1:
        return np.array(encoded)
    # Ragged input: recent NumPy refuses to build this implicitly, so
    # fill a 1-D object array one poem at a time.
    out = np.empty(len(encoded), dtype=object)
    for position, codes in enumerate(encoded):
        out[position] = codes
    return out

# Load the poems as a list of lower-cased strings
poems = getPoems()
# Build the lookup tables: charint (character -> id) and intchar (id -> character)
charint,intchar = getChardict(poems)
# Encode every poem as integer ids using charint
IntPoems = getIntPoems(charint,poems)

# Generate training data

In [107]:
# Slice one poem into fixed-length input windows and their next-item targets
def getCharacters(poem,n=40,skip=10):
    """Return ``(windows, targets)`` taken from a single poem.

    Starting at offset 0 and advancing ``skip`` items at a time, each
    window is ``poem[start:start+n]`` and its target is the item that
    immediately follows it, ``poem[start+n]``. Only offsets strictly
    below ``len(poem) - n`` are used, so every window has a target.
    """
    windows = []
    targets = []
    for start in range(0, len(poem) - n, skip):
        end = start + n
        windows.append(poem[start:end])
        targets.append(poem[end])
    return windows, targets

# Generate training data
Ntime = 40  # length of each input window, in characters
skip = 10   # offset between the starts of consecutive windows
Xtrain = []
Ytrain = []
for poem in IntPoems:
    # Pass Ntime explicitly: the previous call used `n`, which is not
    # defined in any visible cell and fails on a fresh kernel.
    Xt,Yt = getCharacters(poem,Ntime,skip)
    # A poem that is not longer than Ntime yields no windows; appending
    # its empty list would break np.concatenate below.
    if len(Xt) == 0:
        continue
    Xtrain.append(Xt)
    Ytrain.append(Yt)

# Flatten the per-poem lists into one array of targets and one of windows
Ytrain = np.concatenate(Ytrain)
Xtrain = np.concatenate(Xtrain)

# One hot encode the training vectors
import keras
# Use the public keras.utils.to_categorical entry point and fix the number
# of classes to the vocabulary size, so the encoded width always matches
# len(charint) even if the highest character id never occurs in the data.
Yt = keras.utils.to_categorical(Ytrain, num_classes=len(charint))
Xt = keras.utils.to_categorical(Xtrain, num_classes=len(charint))

print(Xt.shape)
print(Yt.shape)

(8911, 40, 48)
(8911, 48)


# LSTM Model

In [108]:
from keras.layers import LSTM, Dense, Activation, BatchNormalization, Dropout
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint

# The input shape is (window length, vocabulary size)
Ntime = 40
Nchars = len(charint)

# One LSTM layer feeding a softmax over the vocabulary.
# Dropout is present but configured with rate 0.
model = Sequential([
    LSTM(200, input_shape=(Ntime, Nchars)),
    Dropout(0),
    Dense(Nchars),
    Activation('softmax'),
])
model.summary()

model.compile(optimizer='adam', loss='categorical_crossentropy')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_7 (LSTM)                (None, 200)               199200    
_________________________________________________________________
dropout_4 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 48)                9648      
_________________________________________________________________
activation_4 (Activation)    (None, 48)                0         
Total params: 208,848
Trainable params: 208,848
Non-trainable params: 0
_________________________________________________________________


In [None]:
# define the checkpoint
# The file name template embeds the epoch number and the loss value.
fname="./data/Data_LSTM-{epoch:02d}-{loss:.4f}.hdf5"
# Monitor the training loss and only save when it reaches a new minimum.
checkpoint = ModelCheckpoint(fname, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]
# Train on the one-hot encoded windows (Xt) and targets (Yt).
model.fit(Xt, Yt, epochs=10, batch_size=128, callbacks=callbacks_list)

Epoch 1/10
Epoch 00001: loss improved from inf to 3.11937, saving model to ./data/Data_LSTM-01-3.1194.hdf5
Epoch 2/10
Epoch 00002: loss improved from 3.11937 to 2.95142, saving model to ./data/Data_LSTM-02-2.9514.hdf5
Epoch 3/10
1920/8911 [=====>........................] - ETA: 14s - loss: 2.8819

# Generate poems from training set

In [117]:
def CharToInt(charint,text):
    """Encode each character of ``text`` through ``charint`` and return the ids as a NumPy array."""
    codes = []
    for symbol in text:
        codes.append(charint[symbol])
    return np.array(codes)
    
def IntToChar(intchar,text):
    """Decode a sequence of integer ids through ``intchar`` and return the joined string."""
    pieces = []
    for code in text:
        pieces.append(intchar[code])
    return "".join(pieces)

# Seed the first line
seed = "shall i compare thee to a summer's day?\n"
print('Seed = ',seed)
IntSeed = CharToInt(charint,seed)
print('Integer Encoded Seed=',IntSeed)
# The model consumes exactly Ntime characters per prediction, so the seed
# has to provide at least that many.
if len(IntSeed) < Ntime:
    raise ValueError("seed must contain at least Ntime characters")

IntOut = IntSeed
# generate characters
for i in range(100):
    # Always feed the most recent Ntime characters. Slicing from i only
    # lines up with the end of the sequence when the seed is exactly
    # Ntime characters long.
    X = IntOut[-Ntime:]
    OneHot_X = keras.utils.to_categorical([X],num_classes=len(charint))
    Ypred = model.predict(OneHot_X)
    # Greedy decoding: take the most probable next character
    idx = np.argmax(Ypred)
    IntOut = np.concatenate((IntOut,[idx]))

Seed =  shall i compare thee to a summer's day?

Integer Encoded Seed= [40 29 22 33 33  1 30  1 24 36 34 37 22 39 26  1 41 29 26 26  1 41 36  1
 22  1 40 42 34 34 26 39  3 40  1 25 22 46 21  0]


In [126]:
# Show the full integer sequence (seed followed by the generated ids),
# the integer encoded seed on its own, and the decoded text.
print(IntOut)
print(IntSeed)
print(IntToChar(intchar,IntOut))

[40 29 22 33 33  1 30  1 24 36 34 37 22 39 26  1 41 29 26 26  1 41 36  1
 22  1 40 42 34 34 26 39  3 40  1 25 22 46 21  0 18 18 33 18 33 17 17 17
 17 39  7  7 13 13 13 11 11 11 11 11 11  7  7  7 11 11 11  7 11 11  7 11
 11  7 11 11  7  7 11 11 11  7 11 11  7  7 11 11 11  7 11 11  7  7 11 11
 11  7 11 11  7  7 11 11 11  7 11 11  7  7 11 11 11  7 11 11  7  7 11 11
 11  7 11 11  7  7 11 11 11  7 11 11  7  7 11 11 11  7 11 11]
[40 29 22 33 33  1 30  1 24 36 34 37 22 39 26  1 41 29 26 26  1 41 36  1
 22  1 40 42 34 34 26 39  3 40  1 25 22 46 21  0]
shall i compare thee to a summer's day?
99l9l8888r--444222222---222-22-22-22--222-22--222-22--222-22--222-22--222-22--222-22--222-22--222-22
