# LSTM poem generation for Shakespeare's sonnets

In [1]:
import numpy as np
import itertools

Reference: https://machinelearningmastery.com/text-generation-lstm-recurrent-neural-networks-python-keras/

# Data preprocessing for LSTM

In [110]:
# Function to get Shakespeare's poems from file
def getPoems():
    """Load the sonnets and return each poem's text without its first line.

    Reads ``./data/shakespeare.txt``, lower-cases the whole file, splits it
    into poems on three consecutive newlines, and drops everything up to and
    including the first newline of every poem.

    Returns:
        list of str: one entry per poem. A chunk that contains no newline
        (including an empty chunk) yields an empty string.
    """
    with open("./data/shakespeare.txt", "r") as f:
        data = f.read().lower()
    # Split by poems
    poems = data.split("\n\n\n")
    # Remove 1st line of each poem. str.partition yields "" as the tail when
    # the chunk has no newline, so empty chunks are handled without relying on
    # a loop index that may never have been assigned.
    return [poem.partition("\n")[2] for poem in poems]

# Get character to integer dictionary for one hot encoding
def getChardict(poems):
    """Build the character <-> integer lookup tables used for one hot encoding.

    Args:
        poems: iterable of strings; all of them are joined to collect the
            set of distinct characters.

    Returns:
        (charint, intchar): ``charint`` maps each character to its rank in
        sorted order, ``intchar`` is the inverse mapping.
    """
    alphabet = sorted(set("".join(poems)))
    charint = {}
    intchar = {}
    for index, symbol in enumerate(alphabet):
        charint[symbol] = index
        intchar[index] = symbol
    return charint, intchar

# Integer encode the poems
def getIntPoems(charint,poems):
    """Integer-encode every poem with the ``charint`` lookup table.

    Args:
        charint: dict mapping a character to its integer code.
        poems: sequence of strings.

    Returns:
        numpy array with one entry per poem. When all poems have the same
        length this is a regular 2-D integer array; otherwise it is a 1-D
        object array whose elements are lists of integer codes.

    Raises:
        KeyError: if a poem contains a character missing from ``charint``.
    """
    encoded = [[charint[char] for char in poem] for poem in poems]
    lengths = {len(codes) for codes in encoded}
    if len(lengths) <= 1:
        return np.array(encoded)
    # Poems of different lengths form a ragged nested list. Recent NumPy
    # releases refuse to build an array from that implicitly, so fill an
    # object array one element at a time instead.
    out = np.empty(len(encoded), dtype=object)
    for position, codes in enumerate(encoded):
        out[position] = codes
    return out

def getWordlist():
    """Return the sorted array of distinct words found in the sonnets file.

    Reads ``./data/shakespeare.txt``, lower-cases it, splits it into poems on
    three consecutive newlines, skips the first line of every poem, splits
    the remaining lines on single spaces, and strips the characters
    ``,.:;?!()`` from both ends of every token.

    Returns:
        numpy array of unique tokens in sorted order.
    """
    with open("./data/shakespeare.txt", "r") as handle:
        text = handle.read().lower()

    # One flat list of cleaned tokens per poem
    poems_by_words = []
    for poem in text.split("\n\n\n"):
        tokens = []
        # Skip the poem's first line, then tokenise line by line
        for line in poem.split("\n")[1:]:
            for token in line.split(" "):
                # Strip punctuation : Optional
                tokens.append(token.strip(",.:;?!()").lower())
        poems_by_words.append(tokens)

    # Merge all poems, de-duplicate, and sort
    return np.sort(np.unique(np.concatenate(poems_by_words)))

# Get array of poems
poems = getPoems()
# Get integer encoding dictionary
charint,intchar = getChardict(poems)
# Get Integer encoded poem array
IntPoems = getIntPoems(charint,poems)
# Get sorted list of words from all shakespeare poems
wordlist=np.array(getWordlist()[1:])

# Generate training data

In [3]:
# Generate X and Y training sets from each poem
def getCharacters(poem,n=40,skip=10):
    """Cut one poem into fixed-length input windows and next-item targets.

    Args:
        poem: indexable, sliceable sequence (e.g. list of character codes).
        n: window length.
        skip: step between the start positions of consecutive windows.

    Returns:
        (Xtrain, Ytrain): ``Xtrain[k]`` is ``poem[s:s+n]`` and ``Ytrain[k]``
        is ``poem[s+n]`` for every start ``s`` in ``range(0, len(poem)-n, skip)``.
    """
    Xtrain, Ytrain = [], []
    for start in range(0, len(poem) - n, skip):
        stop = start + n
        Xtrain.append(poem[start:stop])
        Ytrain.append(poem[stop])
    return Xtrain, Ytrain

# Generate training data 
# Window length fed to the LSTM. It has to be 40 to agree with the model
# cells (which set Ntime = 40), the 40-character generation seed, and the
# recorded output shape (17652, 40, 38).
Ntime = 40
skip = 5
Xtrain = []
Ytrain = []
for poem in IntPoems:
    windows, targets = getCharacters(poem,Ntime,skip)
    # extend (not append) so a poem that yields no windows simply adds
    # nothing instead of leaving an empty list that cannot be concatenated
    Xtrain.extend(windows)
    Ytrain.extend(targets)

Ytrain = np.array(Ytrain)
Xtrain = np.array(Xtrain)

# One hot encode the training vectors
import keras
# Fix the number of classes to the vocabulary size so the encoding width
# always matches the model's output layer and the encoding used during
# generation, even if the largest character code never occurs in the data.
Yt = keras.utils.np_utils.to_categorical(Ytrain, num_classes=len(charint))
Xt = keras.utils.np_utils.to_categorical(Xtrain, num_classes=len(charint))

print(Xt.shape)
print(Yt.shape)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


(17652, 40, 38)
(17652, 38)


# LSTM Model

In [19]:
from keras.layers import LSTM, Dense, Activation, BatchNormalization, Dropout
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint

# Vocabulary size: width of the softmax output layer
Nchars = len(charint)
# Sliding-window length read by generatePoem / generatePoem2
Ntime = 40

# Single-layer LSTM -> dropout -> dense softmax over the characters
model = Sequential([
    LSTM(200, input_shape=Xt.shape[1:]),
    Dropout(0.2),
    Dense(Nchars),
    Activation('softmax'),
])
model.summary()

model.compile(optimizer='adam', loss='categorical_crossentropy')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_8 (LSTM)                (None, 200)               191200    
_________________________________________________________________
dropout_8 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 38)                7638      
_________________________________________________________________
activation_5 (Activation)    (None, 38)                0         
Total params: 198,838
Trainable params: 198,838
Non-trainable params: 0
_________________________________________________________________


In [20]:
# define the checkpoint
# The single-layer model writes "Data_LSTM-..." files (as in the recorded
# training log) so its checkpoints are not mixed up with the two-layer
# model's "Data_LSTM2-..." files.
fname="./data/Data_LSTM-{epoch:02d}-{loss:.4f}.hdf5"
# Keep only checkpoints that lower the training loss
checkpoint = ModelCheckpoint(fname, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]
model.fit(Xt, Yt, epochs=20, batch_size=128, callbacks=callbacks_list)

Epoch 1/20
Epoch 00001: loss improved from inf to 3.03423, saving model to ./data/Data_LSTM-01-3.0342.hdf5
Epoch 2/20
Epoch 00002: loss improved from 3.03423 to 2.78176, saving model to ./data/Data_LSTM-02-2.7818.hdf5
Epoch 3/20
Epoch 00003: loss improved from 2.78176 to 2.47586, saving model to ./data/Data_LSTM-03-2.4759.hdf5
Epoch 4/20
Epoch 00004: loss improved from 2.47586 to 2.33861, saving model to ./data/Data_LSTM-04-2.3386.hdf5
Epoch 5/20
Epoch 00005: loss improved from 2.33861 to 2.26043, saving model to ./data/Data_LSTM-05-2.2604.hdf5
Epoch 6/20
Epoch 00006: loss improved from 2.26043 to 2.20275, saving model to ./data/Data_LSTM-06-2.2027.hdf5
Epoch 7/20
Epoch 00007: loss improved from 2.20275 to 2.15045, saving model to ./data/Data_LSTM-07-2.1505.hdf5
Epoch 8/20
Epoch 00008: loss improved from 2.15045 to 2.10606, saving model to ./data/Data_LSTM-08-2.1061.hdf5
Epoch 9/20
Epoch 00009: loss improved from 2.10606 to 2.05821, saving model to ./data/Data_LSTM-09-2.0582.hdf5
Epoch

<keras.callbacks.History at 0x135ce3eb8>

# Generate poems from the trained model

In [278]:
def CharToInt(charint,text):
    """Encode ``text`` as a numpy array of the integer codes in ``charint``."""
    codes = []
    for symbol in text:
        codes.append(charint[symbol])
    return np.array(codes)
    
def IntToChar(intchar,text):
    """Decode a sequence of integer codes back into a string via ``intchar``."""
    pieces = []
    for code in text:
        pieces.append(intchar[code])
    return "".join(pieces)

# helper function to sample an index from a probability array
def sample(a, temperature=1.0):
    """Draw one index from probability vector ``a`` after temperature scaling.

    The probabilities are re-weighted as ``a ** (1 / temperature)`` (computed
    in log space) and renormalised before a single multinomial draw.

    Args:
        a: 1-D array-like of non-negative probabilities.
        temperature: values below 1 sharpen the distribution, values above 1
            flatten it.

    Returns:
        Index of the sampled entry.
    """
    # Work in float64: a float32 vector that sums to 1 in float32 can sum to
    # slightly more than 1 once multinomial casts it, which it rejects.
    probs = np.asarray(a, dtype=np.float64)
    # log(0) = -inf is intended here (zero probability stays zero), so
    # silence the divide-by-zero warning.
    with np.errstate(divide="ignore"):
        logits = np.log(probs) / temperature
    # Subtract the maximum before exponentiating to avoid underflow at low
    # temperatures; the shift cancels in the normalisation below.
    weights = np.exp(logits - np.max(logits))
    weights = weights / np.sum(weights)
    return np.argmax(np.random.multinomial(1, weights))

# If char is letter in word list
def IsWordLetter(char):
    """Return True when ``char`` equals an apostrophe or a lowercase a-z letter."""
    letters = '\'abcdefghijklmnopqrstuvwxyz'
    # Element-wise equality (not substring search) so that '' and
    # multi-character strings are rejected.
    return any(char == letter for letter in letters)

def validWords(text,wordlist,charint,intchar):
    """Mask of characters that keep the word currently being typed valid.

    Args:
        text: sequence of integer character codes generated so far.
        wordlist: iterable of allowed words (strings).
        charint: dict mapping character -> integer code.
        intchar: dict mapping integer code -> character.

    Returns:
        The scalar ``1`` (no restriction) when ``text`` is empty or its last
        character is not a word letter. Otherwise a float vector of length
        ``len(charint)`` holding 1.0 for every character that may follow:
        the next letter of every word in ``wordlist`` that the current
        fragment is a proper prefix of, plus the word-ending characters
        ``' '``, ``'\\n'``, ``':'``, ``';'`` when the fragment is itself a
        complete word or when no word continues it.

    Raises:
        KeyError: if an allowed character is missing from ``charint``.
    """
    # Characters that may terminate a word
    word_end = [' ','\n',':',';']
    strtext = IntToChar(intchar,text)
    # If the current letter is not in word list, then all character options are valid
    if not strtext or not IsWordLetter(strtext[-1]):
        return 1

    # Find current word we are building: walk back to the nearest non-letter,
    # or to the start of the text when the word begins there.
    start = len(strtext)
    while start > 0 and IsWordLetter(strtext[start-1]):
        start -= 1
    curword = strtext[start:]

    # Collect the possible next letters from every word the fragment prefixes
    allowed = set()
    is_complete_word = False
    for word in wordlist:
        if word.startswith(curword):
            if len(curword) < len(word):
                allowed.add(word[len(curword)])
            else:
                is_complete_word = True

    # A finished word must be allowed to end even if longer words share its
    # prefix (otherwise e.g. "the" could never be produced while "thee"
    # exists). If no word matches at all, the word must end.
    if is_complete_word or not allowed:
        allowed.update(word_end)

    # Build output vector
    out = np.zeros(len(charint))
    for char in allowed:
        out[charint[char]]=1.0
    return out

def WordSample(text, wordlist, charint,intchar, a, temperature=1.0):
    """Sample the next character index, restricted to dictionary-valid choices.

    The probabilities ``a`` are temperature-scaled, multiplied by the mask
    returned by ``validWords`` for the text generated so far, renormalised,
    and used for a single multinomial draw.

    Args:
        text: sequence of integer character codes generated so far.
        wordlist: iterable of allowed words.
        charint: dict mapping character -> integer code.
        intchar: dict mapping integer code -> character.
        a: 1-D array-like of predicted character probabilities.
        temperature: values below 1 sharpen, values above 1 flatten.

    Returns:
        Index of the sampled character.
    """
    # float64 avoids multinomial rejecting float32 vectors whose sum drifts
    # above 1 after casting.
    probs = np.asarray(a, dtype=np.float64)
    # log(0) = -inf is intended (zero probability stays zero)
    with np.errstate(divide="ignore"):
        weights = np.exp(np.log(probs) / temperature)
    mask = validWords(text,wordlist,charint,intchar)
    masked = np.multiply(weights, mask)
    total = np.sum(masked)
    if not total > 0:
        # The model put no mass on any allowed character: fall back to a
        # uniform choice among the allowed ones instead of dividing by zero.
        masked = np.array(np.broadcast_to(np.asarray(mask, dtype=np.float64), weights.shape))
        total = np.sum(masked)
    # Normalize
    return np.argmax(np.random.multinomial(1, masked / total))

def generatePoem(model,intchar,charint,seed,temp=1.0):
    """Generate a poem character by character, starting from ``seed``.

    Each step one-hot encodes the last ``Ntime`` characters (``Ntime`` is the
    notebook-level window length), asks ``model`` for the next-character
    distribution and draws from it with ``sample`` at temperature ``temp``.
    Generation stops after 13 newly generated newlines or 1000 characters,
    whichever comes first.

    Args:
        model: object with a ``predict`` method returning one probability
            row per input window.
        intchar: dict mapping integer code -> character.
        charint: dict mapping character -> integer code.
        seed: starting text; every character must be a key of ``charint``.
        temp: sampling temperature passed to ``sample``.

    Returns:
        The seed followed by the generated text, as one string.
    """
    print('Seed = ',seed)
    generated = list(CharToInt(charint,seed))
    # Look the newline code up instead of assuming it is 0
    newline_idx = charint.get('\n')
    lines = 13
    # generate characters
    for _ in range(1000):
        # Always condition on the most recent Ntime characters, regardless
        # of how long the seed was.
        X = np.array(generated[-Ntime:])
        OneHot_X = keras.utils.np_utils.to_categorical([X],num_classes=len(charint))
        Ypred = model.predict(OneHot_X)
        # Use the caller's temperature (it used to be overwritten with 1.0)
        idx = sample(Ypred[0],temp)
        generated.append(idx)
        # Count number of poem lines generated
        if idx == newline_idx:
            lines -= 1
            if lines == 0:
                break
    return IntToChar(intchar,generated)

# Generate poem constraining to real words
def generatePoem2(model,wordlist,intchar,charint,seed,temp=1.0):
    """Generate a poem from ``seed`` while constraining output to real words.

    Works like character-level generation, but every next character is drawn
    with ``WordSample``, which masks the model's distribution so that the
    word being built stays compatible with ``wordlist``. Each step encodes
    the last ``Ntime`` characters (``Ntime`` is the notebook-level window
    length). Generation stops after 13 newly generated newlines or 1000
    characters, whichever comes first.

    Args:
        model: object with a ``predict`` method returning one probability
            row per input window.
        wordlist: iterable of allowed words.
        intchar: dict mapping integer code -> character.
        charint: dict mapping character -> integer code.
        seed: starting text; every character must be a key of ``charint``.
        temp: sampling temperature passed to ``WordSample``.

    Returns:
        The seed followed by the generated text, as one string.
    """
    print('Seed = ',seed)
    generated = list(CharToInt(charint,seed))
    # Look the newline code up instead of assuming it is 0
    newline_idx = charint.get('\n')
    lines = 13
    # generate characters
    for _ in range(1000):
        # Always condition on the most recent Ntime characters, regardless
        # of how long the seed was.
        X = np.array(generated[-Ntime:])
        OneHot_X = keras.utils.np_utils.to_categorical([X],num_classes=len(charint))
        Ypred = model.predict(OneHot_X)
        # Use the caller's temperature (it used to be overwritten with 1.0)
        idx = WordSample(generated,wordlist,charint,intchar,Ypred[0],temp)
        generated.append(idx)
        # Count number of poem lines generated
        if idx == newline_idx:
            lines -= 1
            if lines == 0:
                break
    return IntToChar(intchar,generated)

In [236]:
# Seed line for generation
seed = "shall i compare thee to a summer's day?\n"
# Sampling temperatures to compare
temp = [1.5, 0.75, 0.25]
for i in temp:
    print(f'Generated Poem at temp =  {i} :')
    poem_text = generatePoem(model, intchar, charint, seed, temp=i)
    print(poem_text)

Generated Poem at temp =  1.5 :
Seed =  shall i compare thee to a summer's day?

shall i compare thee to a summer's day?
geeditly thing domi lelt dising thiths and those her.
, and gife can the pariefild hor hought?
not no ir bling nhcheerwhor her thin that sen' blowsttlenst gresed:
and with thee having not, and thie sthith,
and thy selmorns i kno, thei shiesung)?
for om whise thee sermy not io that seprecuse.
  you or  ond deatey taltiit memy fill-graingl?
mure worf i sbeet in this sipas fon tele,
for nothols not ound of the and leadsy hedweat.
n be to tay fomerend your shillk, and thy weae apy will,
unwirnigh    arnmengiget in ach,
all pooutir's on fon nowr chich my e,un yers
shringe since erenfuigh lioks now of thy ight be.

Generated Poem at temp =  0.75 :
Seed =  shall i compare thee to a summer's day?

shall i compare thee to a summer's day?
loth prussing of and of sich dovighn
 sall stalle prace him nithe tham that pevesuwer she,
 urt ruth the poodst pross caving my stall,
nore 

In [279]:
# Seed line for generation
seed = "shall i compare thee to a summer's day?\n"
# Sampling temperatures to compare
temp = [1.5, 0.75, 0.25]
for i in temp:
    print(f'Generated Poem at temp =  {i} :')
    # Word-constrained generation with the single-layer model
    poem_text = generatePoem2(model, wordlist, intchar, charint, seed, temp=i)
    print(poem_text)

Generated Poem at temp =  1.5 :
Seed =  shall i compare thee to a summer's day?

shall i compare thee to a summer's day?
edge wouldst youth's greater under messengers:
which chose sweet-bearer it share took four sullen sue sheaves
sheaves often my here's best,
by sullen inconstant often twain livery falls these world-dead cheeks rise eyed attainted:
audit lusty forlorn wolf thee aright:
but thence merits clay therein hours 'had hated verses:
and things decayed if fist that's more deeds:
approve votary thousand doom took history peace:
issueless issueless rearward hours poesy:
and simple sums orient roses blow selling nearly days
th' instant onwards forth termed messengers;
our indeed sheaves if one eased and imperfect:
chief forty semblance hill went one (poorly thee more nearly

Generated Poem at temp =  0.75 :
Seed =  shall i compare thee to a summer's day?

shall i compare thee to a summer's day?
shallowest my wills  fame ne'er-converted if years wert;eyed issueless:
entertain ne'er

# LSTM model 2

In [17]:
# 2 layer LSTM
# Vocabulary size: width of the softmax output layer
Nchars = len(charint)
# Sliding-window length read by generatePoem / generatePoem2
Ntime = 40

# The first LSTM returns its full output sequence so that the second LSTM
# receives one vector per time step.
model2 = Sequential([
    LSTM(200, input_shape=Xt.shape[1:], return_sequences=True),
    Dropout(0.2),
    LSTM(200),
    Dropout(0.2),
    Dense(Nchars),
    Activation('softmax'),
])
model2.summary()

model2.compile(optimizer='adam', loss='categorical_crossentropy')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_6 (LSTM)                (None, 40, 200)           191200    
_________________________________________________________________
dropout_6 (Dropout)          (None, 40, 200)           0         
_________________________________________________________________
lstm_7 (LSTM)                (None, 200)               320800    
_________________________________________________________________
dropout_7 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 38)                7638      
_________________________________________________________________
activation_4 (Activation)    (None, 38)                0         
Total params: 519,638
Trainable params: 519,638
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Checkpoint file pattern for the two-layer model (epoch and loss in the name)
fname = "./data/Data_LSTM2-{epoch:02d}-{loss:.4f}.hdf5"
# Save a checkpoint only when the training loss reaches a new minimum
checkpoint = ModelCheckpoint(
    fname,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]
model2.fit(Xt, Yt, batch_size=128, epochs=20, callbacks=callbacks_list)

Epoch 1/20
Epoch 00001: loss improved from inf to 3.03414, saving model to ./data/Data_LSTM-01-3.0341.hdf5
Epoch 2/20
Epoch 00002: loss improved from 3.03414 to 2.80788, saving model to ./data/Data_LSTM-02-2.8079.hdf5
Epoch 3/20
Epoch 00003: loss improved from 2.80788 to 2.44372, saving model to ./data/Data_LSTM-03-2.4437.hdf5
Epoch 4/20
Epoch 00004: loss improved from 2.44372 to 2.28120, saving model to ./data/Data_LSTM-04-2.2812.hdf5
Epoch 5/20
Epoch 00005: loss improved from 2.28120 to 2.16701, saving model to ./data/Data_LSTM-05-2.1670.hdf5
Epoch 6/20
Epoch 00006: loss improved from 2.16701 to 2.08202, saving model to ./data/Data_LSTM-06-2.0820.hdf5
Epoch 7/20
Epoch 00007: loss improved from 2.08202 to 2.00487, saving model to ./data/Data_LSTM-07-2.0049.hdf5
Epoch 8/20
Epoch 00008: loss improved from 2.00487 to 1.94232, saving model to ./data/Data_LSTM-08-1.9423.hdf5
Epoch 9/20
Epoch 00009: loss improved from 1.94232 to 1.88593, saving model to ./data/Data_LSTM-09-1.8859.hdf5
Epoch

<keras.callbacks.History at 0x131f42be0>

In [25]:
# Seed line for generation
seed = "shall i compare thee to a summer's day?\n"
# Sampling temperatures to compare
temp = [1.5, 0.75, 0.25]
for i in temp:
    print(f'Generated Poem at temp =  {i} :')
    # Unconstrained character-level generation with the two-layer model
    poem_text = generatePoem(model2, intchar, charint, seed, temp=i)
    print(poem_text)

Generated Poem at temp =  1.5 :
Seed =  shall i compare thee to a summer's day?

shall i compare thee to a summer's day?
u hack ene rund it befaired ouncengid,
thy beauty madk, and that shoust bestase make wnow,
which my searh's and vord no tellsey by fale,
  for ain with thy lovert's lose anther so love cride:
thou though i speeirs told their whisk chave rromomed,
put i tave remamy thou brsadest whem dooust,
'ush faving freit leave had tithe's fay thee more sele:
th'sefo gralice you bestory blought wat,
that som am thy dzadt, and that with this prowed deathen griead;
to the tan the somledzst so eir's with (bvave,
parist lakind him lover of mance incantube,
sifingut now, and moring well buind on ath's dowe.
by that reif this love thy porst to ostnele.

Generated Poem at temp =  0.75 :
Seed =  shall i compare thee to a summer's day?

shall i compare thee to a summer's day?
the may the frait doth the vourso of (ids.
the sweet farteringing to he porce are:
a simven's tel afd beadt pzoth b

In [280]:
# Seed line for generation
seed = "shall i compare thee to a summer's day?\n"
# Sampling temperatures to compare
temp = [1.5, 0.75, 0.25]
for i in temp:
    print(f'Generated Poem at temp =  {i} :')
    # Word-constrained generation with the two-layer model
    poem_text = generatePoem2(model2, wordlist, intchar, charint, seed, temp=i)
    print(poem_text)

Generated Poem at temp =  1.5 :
Seed =  shall i compare thee to a summer's day?

shall i compare thee to a summer's day?
thence shamed black mine eyes' bloody shaken
astonished my robe sang once crooked groans:
cunning are and my more destroys one:
one angry withering allow now steeled vilest
gates fairest gives although says forsworn being days:
correspondence send'st fairer fairest farther hindmost tells bears
  forwards merits beauty's doom feathered bower head
often theirs often tombed is't looks down-betraying tattered:
and my fearing amazeth silvered potions potions dully
methods march times nothing mountain nativity farther something:
thence that's after-torn mayst once worthiness amends blessed-cheap thine intents roses
doth incapable offices puts mine awards
rainy from thee bitterness often took twenty heart's painter's died:

Generated Poem at temp =  0.75 :
Seed =  shall i compare thee to a summer's day?

shall i compare thee to a summer's day?
that's my lusty ensconce issue