# LSTM poem generation for Shakespeare's sonnets

In [1]:
import numpy as np
import itertools

Reference: https://machinelearningmastery.com/text-generation-lstm-recurrent-neural-networks-python-keras/

# Data preprocessing for LSTM

In [22]:
# Function to get Shakespeare's poems from file
def getPoems():
    with open("./data/shakespeare.txt", "r") as f:
        data = f.read().lower()
    # Split by poems
    poems = data.split("\n\n\n")
    # Remove 1st line of each poem
    out = []
    for poem in poems:
        for i in range(len(poem)):
            if poem[i]=='\n':
                break
        out.append(poem[i+1:])
    return out

# Get character to integer dictionary for one hot encoding
def getChardict(poems):
    # merge all poems and get list of characters
    data = "".join(poems)
    # Get dictionary of characters for one hot encoding
    chars = sorted(list(set(data)))
    charint = dict((c, i) for i, c in enumerate(chars))
    intchar = dict((i, c) for i, c in enumerate(chars))
    return charint,intchar

# Integer encode the poems
def getIntPoems(charint,poems):
    out = []
    for poem in poems:
        out.append([charint[char] for char in poem])
    return np.array(out)

# Get array of poems
poems = getPoems()
# Get integer encoding dictionary
charint,intchar = getChardict(poems)
# Get Integer encoded poem array
IntPoems = getIntPoems(charint,poems)

print(charint)

{'\n': 0, ' ': 1, '!': 2, "'": 3, '(': 4, ')': 5, ',': 6, '-': 7, '.': 8, ':': 9, ';': 10, '?': 11, 'a': 12, 'b': 13, 'c': 14, 'd': 15, 'e': 16, 'f': 17, 'g': 18, 'h': 19, 'i': 20, 'j': 21, 'k': 22, 'l': 23, 'm': 24, 'n': 25, 'o': 26, 'p': 27, 'q': 28, 'r': 29, 's': 30, 't': 31, 'u': 32, 'v': 33, 'w': 34, 'x': 35, 'y': 36, 'z': 37}


# Generate training data

In [4]:
# Generate X and Y training sets from each poem
def getCharacters(poem,n=40,skip=10):
    Xtrain = [poem[i:i+n] for i in range(0,len(poem)-n,skip)]
    Ytrain = [poem[i+n] for i in range(0,len(poem)-n,skip)]
    return Xtrain,Ytrain

# Generate training data 
Ntime = 40
skip = 5
Xtrain = []
Ytrain = []
for poem in IntPoems:
    Xt,Yt = getCharacters(poem,Ntime,skip)
    Xtrain.append(Xt)
    Ytrain.append(Yt)

Ytrain = np.concatenate(Ytrain)
Xtrain = np.concatenate(Xtrain)

# One hot encode the training vectors
import keras
Yt = keras.utils.np_utils.to_categorical(Ytrain)
Xt = keras.utils.np_utils.to_categorical(Xtrain)

print(Xt.shape)
print(Yt.shape)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


(17735, 40, 48)
(17735, 48)


# LSTM Model

In [5]:
from keras.layers import LSTM, Dense, Activation, BatchNormalization, Dropout
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint

Nchars = len(charint)
Ntime = 40

model = Sequential()
model.add(LSTM(200, input_shape=(Xt.shape[1],Xt.shape[2])))
model.add(Dropout(0.0))
model.add(Dense(Nchars))
model.add(Activation('softmax'))
model.summary()

model.compile(loss='categorical_crossentropy', optimizer='adam')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 200)               199200    
_________________________________________________________________
dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 48)                9648      
_________________________________________________________________
activation_1 (Activation)    (None, 48)                0         
Total params: 208,848
Trainable params: 208,848
Non-trainable params: 0
_________________________________________________________________


In [6]:
# define the checkpoint
fname="./data/Data_LSTM-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(fname, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]
model.fit(Xt, Yt, epochs=20, batch_size=128, callbacks=callbacks_list)

Epoch 1/20
Epoch 00001: loss improved from inf to 3.04097, saving model to ./data/Data_LSTM-01-3.0410.hdf5
Epoch 2/20
Epoch 00002: loss improved from 3.04097 to 2.76346, saving model to ./data/Data_LSTM-02-2.7635.hdf5
Epoch 3/20
Epoch 00003: loss improved from 2.76346 to 2.46589, saving model to ./data/Data_LSTM-03-2.4659.hdf5
Epoch 4/20
Epoch 00004: loss improved from 2.46589 to 2.32910, saving model to ./data/Data_LSTM-04-2.3291.hdf5
Epoch 5/20
Epoch 00005: loss improved from 2.32910 to 2.25187, saving model to ./data/Data_LSTM-05-2.2519.hdf5
Epoch 6/20
Epoch 00006: loss improved from 2.25187 to 2.19591, saving model to ./data/Data_LSTM-06-2.1959.hdf5
Epoch 7/20
Epoch 00007: loss improved from 2.19591 to 2.14301, saving model to ./data/Data_LSTM-07-2.1430.hdf5
Epoch 8/20
Epoch 00008: loss improved from 2.14301 to 2.10054, saving model to ./data/Data_LSTM-08-2.1005.hdf5
Epoch 9/20
Epoch 00009: loss improved from 2.10054 to 2.05907, saving model to ./data/Data_LSTM-09-2.0591.hdf5
Epoch

<keras.callbacks.History at 0x11b472940>

# Generate poems from training set

In [13]:
def CharToInt(charint,text):
    return np.array([charint[char] for char in text])
    
def IntToChar(intchar,text):
    return "".join([intchar[char] for char in text])

# helper function to sample an index from a probability array
def sample(a, temperature=1.0):
    a = np.log(a) / temperature
    a = np.exp(a)
    a = a/np.sum(a)*.99
    return np.argmax(np.random.multinomial(1, a, 1))

def generatePoem(model,intchar,charint,seed,temp=1.0):
    print('Seed = ',seed)
    IntSeed = CharToInt(charint,seed)
    print('Integer Encoded Seed=',IntSeed)

    IntOut = IntSeed
    temp = 1.0
    lines = 13
    # generate characters
    for i in range(1000):
        X = IntOut[i:i+Ntime]
        OneHot_X = keras.utils.np_utils.to_categorical([X],num_classes=len(charint))
        Ypred = model.predict(OneHot_X)
        idx = sample(Ypred[0],temp)
        IntOut = np.concatenate((IntOut,[idx]))
        # Count number of poem lines generated
        if idx==0:
            lines-=1
        if lines==0:
            break
    return IntToChar(intchar,IntOut)

In [14]:
seed = "shall i compare thee to a summer's day?\n"
temp = [1.5,0.75,0.25]
for i in temp:
    print('Generated Poem at temp = ',i,':')
    print(generatePoem(model,intchar,charint,seed,temp=i))

Generated Poem at temp =  1.5 :
Seed =  shall i compare thee to a summer's day?

Integer Encoded Seed= [40 29 22 33 33  1 30  1 24 36 34 37 22 39 26  1 41 29 26 26  1 41 36  1
 22  1 40 42 34 34 26 39  3 40  1 25 22 46 21  0]
shall i compare thee to a summer's day?
and mas's hess arlofnturee io e-timessto gance,
bact on with traughtress eracterat to mich.
  ald toor owang in thy reaighs sa me, ary whire,
or of 'ee iw the sofllvest frormew in il sond,
theres storn thy  arveris wastworthinge in thenoumy
i jearindzedes to thet, cell-erantu hire,
than seef tebuty thean te be, agfoend tay so!
,owet wele verise snombangsszened frate:
afle searet thou hatr zagres aiten windaody.
  bu liss tham in thou sow, moke to hith, heregzst?
untheld yethy cornthings, tore ir my hzart beake blzert
 orethare withtheis im ereast, i gear the erweab, of tires roth ray me,
tho wrirg bale on in ouths swarte,

Generated Poem at temp =  0.75 :
Seed =  shall i compare thee to a summer's day?

Integer Encoded Seed=

# LSTM model 2

In [17]:
# 2 layer LSTM
Nchars = len(charint)
Ntime = 40

model2 = Sequential()
model2.add(LSTM(200, input_shape=(Xt.shape[1],Xt.shape[2]),return_sequences=True))
model2.add(Dropout(0.0))
model2.add(LSTM(200))
model2.add(Dropout(0.0))
model2.add(Dense(Nchars))
model2.add(Activation('softmax'))
model2.summary()

model2.compile(loss='categorical_crossentropy', optimizer='adam')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_6 (LSTM)                (None, 40, 200)           199200    
_________________________________________________________________
dropout_4 (Dropout)          (None, 40, 200)           0         
_________________________________________________________________
lstm_7 (LSTM)                (None, 200)               320800    
_________________________________________________________________
dropout_5 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 48)                9648      
_________________________________________________________________
activation_2 (Activation)    (None, 48)                0         
Total params: 529,648
Trainable params: 529,648
Non-trainable params: 0
_________________________________________________________________


In [18]:
# define the checkpoint
fname="./data/Data_LSTM-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(fname, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]
model2.fit(Xt, Yt, epochs=20, batch_size=128, callbacks=callbacks_list)

Epoch 1/20
Epoch 00001: loss improved from inf to 3.05924, saving model to ./data/Data_LSTM-01-3.0592.hdf5
Epoch 2/20

KeyboardInterrupt: 

In [11]:
seed = "shall i compare thee to a summer's day?\n"
temp = [1.5,0.75,0.25]
for i in temp:
    print('Generated Poem at temp = ',i,':')
    print(generatePoem(model,seed,temp=i))

Generated Poem at temp =  1.5 :
Seed =  shall i compare thee to a summer's day?

Integer Encoded Seed= [40 29 22 33 33  1 30  1 24 36 34 37 22 39 26  1 41 29 26 26  1 41 36  1
 22  1 40 42 34 34 26 39  3 40  1 25 22 46 21  0]


TypeError: IntToChar() missing 1 required positional argument: 'text'