# LSTM poem generation for Shakespeare's sonnets

In [78]:
import numpy as np
import itertools

Reference: https://machinelearningmastery.com/text-generation-lstm-recurrent-neural-networks-python-keras/

# Data preprocessing for LSTM

In [79]:
# Function to get Shakespeare's poems from file
def getPoems(path="./data/shakespeare.txt", header_chars=20):
    """Read the sonnets file and return one lower-cased string per poem.

    Parameters
    ----------
    path : str, optional
        Text file to read. Defaults to the location this notebook has always
        used, so a bare ``getPoems()`` call behaves as before.
    header_chars : int, optional
        Number of leading characters dropped from every poem in order to
        remove its heading line. The default of 20 is the value that was
        previously hard-coded.
        NOTE(review): this assumes every heading occupies exactly 20
        characters -- confirm against the data file.

    Returns
    -------
    list of str
        The poems, in file order, lower-cased and with the heading stripped.
    """
    with open(path, "r") as f:
        data = f.read().lower()
    # Consecutive poems are delimited by three newline characters
    poems = data.split("\n\n\n")
    # Drop the fixed-width heading at the start of each poem
    return [poem[header_chars:] for poem in poems]

# Get character to integer dictionary for one hot encoding
def getChardict(poems):
    """Build the lookup tables used to integer-encode characters.

    All poems are joined into one string; each distinct character is then
    numbered by its position in sorted order.

    Returns
    -------
    (dict, dict)
        ``charint`` maps character -> integer code and ``intchar`` maps
        integer code -> character.
    """
    alphabet = sorted(set("".join(poems)))
    charint = {}
    intchar = {}
    for code, symbol in enumerate(alphabet):
        charint[symbol] = code
        intchar[code] = symbol
    return charint, intchar

# Integer encode the poems
def getIntPoems(charint,poems):
    """Integer-encode every poem with the ``charint`` lookup table.

    Parameters
    ----------
    charint : dict
        Mapping from character to integer code.
    poems : sequence of str
        Poems to encode.

    Returns
    -------
    numpy.ndarray
        1-D object array with one entry per poem; each entry is the list of
        integer codes for that poem.

    Raises
    ------
    KeyError
        If a poem contains a character that is missing from ``charint``.

    Notes
    -----
    Poems generally differ in length. Calling ``np.array`` directly on such
    a ragged list raises ``ValueError`` on recent NumPy releases, so the
    object array is allocated up front and filled one poem at a time.
    """
    encoded = np.empty(len(poems), dtype=object)
    for idx, poem in enumerate(poems):
        # Element-wise assignment keeps each poem as a single list entry,
        # even when all poems happen to have the same length.
        encoded[idx] = [charint[char] for char in poem]
    return encoded

# Load the sonnets as a list of strings
poems = getPoems()
# Build the character <-> integer lookup tables from the loaded text
charint, intchar = getChardict(poems)
# Encode every poem as a sequence of integer character codes
IntPoems = getIntPoems(charint, poems)

# Generate training data

In [154]:
# Generate X and Y training sets from each poem
def getCharacters(poem,n=40,skip=10):
    """Cut a poem into fixed-length windows and their next elements.

    A window of ``n`` consecutive elements is taken at every ``skip``-th
    start position for which a following element still exists.

    Returns
    -------
    (list, list)
        The windows, and in parallel the element immediately following
        each window.
    """
    windows = []
    targets = []
    for start in range(0, len(poem) - n, skip):
        stop = start + n
        windows.append(poem[start:stop])
        targets.append(poem[stop])
    return windows, targets

# Generate training data
import keras

Ntime = 40  # number of characters in each input window
skip = 5    # step between the starts of consecutive windows
Xtrain = []
Ytrain = []
for poem in IntPoems:
    # Pass Ntime as the window length (the name `n` is not defined in this
    # notebook, so a fresh kernel would raise NameError).
    windows, targets = getCharacters(poem, Ntime, skip)
    # extend() flattens across poems directly, so a poem that yields no
    # windows contributes nothing instead of breaking a later concatenate.
    Xtrain.extend(windows)
    Ytrain.extend(targets)

Ytrain = np.array(Ytrain)
Xtrain = np.array(Xtrain)

# One hot encode the training vectors. The class count is pinned to the
# vocabulary size so inputs and targets always share the same width, even
# if the highest character code is absent from one of them.
Yt = keras.utils.to_categorical(Ytrain, num_classes=len(charint))
Xt = keras.utils.to_categorical(Xtrain, num_classes=len(charint))

print(Xt.shape)
print(Yt.shape)

(17735, 40, 48)
(17735, 48)


# LSTM Model

In [156]:
from keras.layers import LSTM, Dense, Activation, BatchNormalization, Dropout
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint

Nchars = len(charint)
Ntime = 40

# Character-level next-character model: one LSTM layer followed by a
# softmax over the vocabulary.
model = Sequential()
# The LSTM returns only its final hidden state (return_sequences is left at
# its default of False). Each training target in Yt is a single one-hot
# character, so the network must emit one distribution per window rather
# than one per timestep.
model.add(LSTM(200, input_shape=(Xt.shape[1],Xt.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(Nchars))
model.add(Activation('softmax'))
model.summary()

model.compile(loss='categorical_crossentropy', optimizer='adam')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_15 (LSTM)               (None, 200)               199200    
_________________________________________________________________
dropout_9 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 48)                9648      
_________________________________________________________________
activation_9 (Activation)    (None, 48)                0         
Total params: 208,848
Trainable params: 208,848
Non-trainable params: 0
_________________________________________________________________


In [157]:
# Checkpoint callback: write the model to disk after each epoch in which
# the training loss reaches a new minimum
fname="./data/Data_LSTM-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    fname,
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)
callbacks_list = [checkpoint]
# Train on the one-hot encoded windows
model.fit(
    Xt,
    Yt,
    epochs=10,
    batch_size=128,
    callbacks=callbacks_list,
)

Epoch 1/10
Epoch 00001: loss improved from inf to 3.06760, saving model to ./data/Data_LSTM-01-3.0676.hdf5
Epoch 2/10
Epoch 00002: loss improved from 3.06760 to 2.84196, saving model to ./data/Data_LSTM-02-2.8420.hdf5
Epoch 3/10
Epoch 00003: loss improved from 2.84196 to 2.51404, saving model to ./data/Data_LSTM-03-2.5140.hdf5
Epoch 4/10
Epoch 00004: loss improved from 2.51404 to 2.36951, saving model to ./data/Data_LSTM-04-2.3695.hdf5
Epoch 5/10
Epoch 00005: loss improved from 2.36951 to 2.28834, saving model to ./data/Data_LSTM-05-2.2883.hdf5
Epoch 6/10
Epoch 00006: loss improved from 2.28834 to 2.23018, saving model to ./data/Data_LSTM-06-2.2302.hdf5
Epoch 7/10
Epoch 00007: loss improved from 2.23018 to 2.17389, saving model to ./data/Data_LSTM-07-2.1739.hdf5
Epoch 8/10
Epoch 00008: loss improved from 2.17389 to 2.13131, saving model to ./data/Data_LSTM-08-2.1313.hdf5
Epoch 9/10
Epoch 00009: loss improved from 2.13131 to 2.08813, saving model to ./data/Data_LSTM-09-2.0881.hdf5
Epoch

<keras.callbacks.History at 0x1249da438>

# Generate poems from training set

In [170]:
def CharToInt(charint,text):
    """Encode ``text`` as a NumPy array of integer codes.

    Each character is looked up in ``charint``; a character that is not a
    key of ``charint`` raises ``KeyError``.
    """
    codes = []
    for symbol in text:
        codes.append(charint[symbol])
    return np.array(codes)
    
def IntToChar(intchar,text):
    """Decode a sequence of integer codes into a string.

    Each code in ``text`` is looked up in ``intchar`` and the resulting
    characters are joined in order.
    """
    pieces = (intchar[code] for code in text)
    return "".join(pieces)

# helper function to sample an index from a probability array
def sample(a, temperature=1.0):
    """Draw one index from a probability vector, rescaled by a temperature.

    Parameters
    ----------
    a : array-like
        Probabilities over the candidate indices.
    temperature : float, optional
        Divisor applied to the log-probabilities before renormalising.
        Values below 1 sharpen the distribution; values above 1 flatten it.

    Returns
    -------
    int
        The sampled index (a NumPy integer).
    """
    # Work in float64: probabilities renormalised in float32 can sum to
    # slightly more than 1, which np.random.multinomial rejects.
    probs = np.asarray(a, dtype=np.float64)
    # Zero probabilities map to -inf, which is fine here; silence the warning.
    with np.errstate(divide="ignore"):
        logits = np.log(probs) / temperature
    # Subtracting the maximum leaves the softmax unchanged but prevents every
    # term from underflowing to zero (0/0 -> NaN) at low temperatures.
    logits = logits - np.max(logits)
    weights = np.exp(logits)
    probs = weights / np.sum(weights)
    return np.argmax(np.random.multinomial(1, probs, 1))

# Seed the first line
seed = "shall i compare thee to a summer's day?\n"
print('Seed = ',seed)
IntSeed = CharToInt(charint,seed)
print('Integer Encoded Seed=',IntSeed)

IntOut = IntSeed
temp = 1.0   # sampling temperature passed to sample()
lines = 13   # number of additional lines to generate after the seed line
# Look up the newline code instead of assuming it is 0, so line counting
# stays correct whatever order the vocabulary was built in.
newline_idx = charint['\n']
# generate characters
for i in range(1000):
    # Always feed the most recent Ntime characters. This matches the
    # original sliding window when the seed is exactly Ntime long and
    # remains correct for longer seeds.
    X = IntOut[-Ntime:]
    OneHot_X = keras.utils.to_categorical([X],num_classes=len(charint))
    Ypred = model.predict(OneHot_X)
    idx = sample(Ypred[0],temp)
    IntOut = np.concatenate((IntOut,[idx]))
    # Count number of poem lines generated
    if idx==newline_idx:
        lines-=1
    if lines==0:
        break

Seed =  shall i compare thee to a summer's day?

Integer Encoded Seed= [40 29 22 33 33  1 30  1 24 36 34 37 22 39 26  1 41 29 26 26  1 41 36  1
 22  1 40 42 34 34 26 39  3 40  1 25 22 46 21  0]


In [171]:
# Blank separator line, then the generated sequence decoded back to text
print('')
decoded = IntToChar(intchar, IntOut)
print(decoded)

shall i compare thee to a summer's day?
i by tor mm te ound she,
thou d corve shehore wheh thest farmurt gali's,
avem, on then thou tho love the htroy thou hur yean iw bein:y eie foren, and at the hatees'sum efingmeeddghets,
whone ttay thee mave the listhr chabse chave hy al:
a lout,
 eer ymer,
 have ibet ould thy ampey see dotgess bowel coris, ofauss nneredt drepriching mlrs'sung,
ay hats teec aawl
nomen dom hike tent ald damenouvent arleverds dilarw than awer alouy tromfroll my remand.
d eeling thiuu paranet thau hove.
 or thecy whatht tous frale g ay sumanyes ferrmy demenes dathis stoun, hargr wo hape in in eroudher core butle;
nhar cfourpyinge,
aut andes beade theats presigks 'sem tore,

