# LSTM poem generation for Shakespeare's sonnets

In [2]:
import numpy as np
import itertools

Reference: https://machinelearningmastery.com/text-generation-lstm-recurrent-neural-networks-python-keras/

# Data preprocessing for LSTM

In [3]:
# Function to get Shakespeare's poems from file
def getPoems():
    with open("./data/shakespeare.txt", "r") as f:
        data = f.read().lower()
    # Split by poems
    poems = data.split("\n\n\n")
    # Remove 1st line of each poem
    out = []
    for poem in poems:
        for i in range(len(poem)):
            if poem[i]=='\n':
                break
        out.append(poem[i+1:])
    return out

# Get character to integer dictionary for one hot encoding
def getChardict(poems):
    # merge all poems and get list of characters
    data = "".join(poems)
    # Get dictionary of characters for one hot encoding
    chars = sorted(list(set(data)))
    charint = dict((c, i) for i, c in enumerate(chars))
    intchar = dict((i, c) for i, c in enumerate(chars))
    return charint,intchar

# Integer encode the poems
def getIntPoems(charint,poems):
    out = []
    for poem in poems:
        out.append([charint[char] for char in poem])
    return np.array(out)

def getWordlist():
    with open("./data/shakespeare.txt", "r") as f:
        data = f.read().lower()
    # Split by poems
    poems = data.split("\n\n\n")
    # Split poem by line, remove 1st line
    poems = [poem.split("\n")[1:] for poem in poems]
    # Split each line into a list of words
    poems = [[line.split(" ") for line in poem] for poem in poems]
    # Strip punctuation : Optional
    poems_by_lines = [[[word.strip(",.:;?!()").lower() for word in line] for line in poem ] for poem in poems]
    # Combine all the lines in a single poem so that each pome is just a list of words
    poems_by_words = [list(itertools.chain.from_iterable(poem)) for poem in poems_by_lines]
    wordlist = np.concatenate(poems_by_words)
    wordlist = np.unique(wordlist)
    wordlist = np.sort(wordlist)
    return wordlist

# Get array of poems
poems = getPoems()
# Get integer encoding dictionary
charint,intchar = getChardict(poems)
# Get Integer encoded poem array
IntPoems = getIntPoems(charint,poems)
# Get sorted list of words from all shakespeare poems
wordlist=np.array(getWordlist()[1:])

# Generate training data

In [4]:
# Generate X and Y training sets from each poem
def getCharacters(poem,n=40,skip=10):
    Xtrain = [poem[i:i+n] for i in range(0,len(poem)-n,skip)]
    Ytrain = [poem[i+n] for i in range(0,len(poem)-n,skip)]
    return Xtrain,Ytrain

# Generate training data 
Ntime = 40
skip = 5
Xtrain = []
Ytrain = []
for poem in IntPoems:
    Xt,Yt = getCharacters(poem,Ntime,skip)
    Xtrain.append(Xt)
    Ytrain.append(Yt)

Ytrain = np.concatenate(Ytrain)
Xtrain = np.concatenate(Xtrain)

# One hot encode the training vectors
import keras
Yt = keras.utils.np_utils.to_categorical(Ytrain)
Xt = keras.utils.np_utils.to_categorical(Xtrain)

print(Xt.shape)
print(Yt.shape)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


(17652, 40, 38)
(17652, 38)


# LSTM Model

In [5]:
from keras.layers import LSTM, Dense, Activation, BatchNormalization, Dropout
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint

Nchars = len(charint)
Ntime = 40

model = Sequential()
model.add(LSTM(200, input_shape=(Xt.shape[1],Xt.shape[2])))
model.add(Dropout(0.0))
model.add(Dense(Nchars))
model.add(Activation('softmax'))
model.summary()

model.compile(loss='categorical_crossentropy', optimizer='adam')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 200)               191200    
_________________________________________________________________
dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 38)                7638      
_________________________________________________________________
activation_1 (Activation)    (None, 38)                0         
Total params: 198,838
Trainable params: 198,838
Non-trainable params: 0
_________________________________________________________________


In [7]:
# define the checkpoint
fname="./data/Data_LSTM1layer_100-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(fname, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]
model.fit(Xt, Yt, epochs=100, batch_size=128, callbacks=callbacks_list)

Epoch 1/100
Epoch 00001: loss improved from inf to 1.80976, saving model to ./data/Data_LSTM1layer_100-01-1.8098.hdf5
Epoch 2/100
Epoch 00002: loss improved from 1.80976 to 1.76835, saving model to ./data/Data_LSTM1layer_100-02-1.7684.hdf5
Epoch 3/100
Epoch 00003: loss improved from 1.76835 to 1.72537, saving model to ./data/Data_LSTM1layer_100-03-1.7254.hdf5
Epoch 4/100
Epoch 00004: loss improved from 1.72537 to 1.68369, saving model to ./data/Data_LSTM1layer_100-04-1.6837.hdf5
Epoch 5/100
Epoch 00005: loss improved from 1.68369 to 1.65060, saving model to ./data/Data_LSTM1layer_100-05-1.6506.hdf5
Epoch 6/100
Epoch 00006: loss improved from 1.65060 to 1.59198, saving model to ./data/Data_LSTM1layer_100-06-1.5920.hdf5
Epoch 7/100
Epoch 00007: loss improved from 1.59198 to 1.54159, saving model to ./data/Data_LSTM1layer_100-07-1.5416.hdf5
Epoch 8/100
Epoch 00008: loss improved from 1.54159 to 1.48859, saving model to ./data/Data_LSTM1layer_100-08-1.4886.hdf5
Epoch 9/100
Epoch 00009: los

Epoch 32/100
Epoch 00032: loss improved from 0.13699 to 0.11845, saving model to ./data/Data_LSTM1layer_100-32-0.1185.hdf5
Epoch 33/100
Epoch 00033: loss improved from 0.11845 to 0.10679, saving model to ./data/Data_LSTM1layer_100-33-0.1068.hdf5
Epoch 34/100
Epoch 00034: loss did not improve
Epoch 35/100
Epoch 00035: loss did not improve
Epoch 36/100
Epoch 00036: loss did not improve
Epoch 37/100
Epoch 00037: loss improved from 0.10679 to 0.10667, saving model to ./data/Data_LSTM1layer_100-37-0.1067.hdf5
Epoch 38/100
Epoch 00038: loss improved from 0.10667 to 0.08561, saving model to ./data/Data_LSTM1layer_100-38-0.0856.hdf5
Epoch 39/100
Epoch 00039: loss improved from 0.08561 to 0.06386, saving model to ./data/Data_LSTM1layer_100-39-0.0639.hdf5
Epoch 40/100
Epoch 00040: loss improved from 0.06386 to 0.04507, saving model to ./data/Data_LSTM1layer_100-40-0.0451.hdf5
Epoch 41/100
Epoch 00041: loss improved from 0.04507 to 0.02600, saving model to ./data/Data_LSTM1layer_100-41-0.0260.hdf

Epoch 68/100
Epoch 00068: loss improved from 0.00548 to 0.00499, saving model to ./data/Data_LSTM1layer_100-68-0.0050.hdf5
Epoch 69/100
Epoch 00069: loss improved from 0.00499 to 0.00454, saving model to ./data/Data_LSTM1layer_100-69-0.0045.hdf5
Epoch 70/100
Epoch 00070: loss improved from 0.00454 to 0.00415, saving model to ./data/Data_LSTM1layer_100-70-0.0042.hdf5
Epoch 71/100
Epoch 00071: loss improved from 0.00415 to 0.00381, saving model to ./data/Data_LSTM1layer_100-71-0.0038.hdf5
Epoch 72/100
Epoch 00072: loss improved from 0.00381 to 0.00349, saving model to ./data/Data_LSTM1layer_100-72-0.0035.hdf5
Epoch 73/100
Epoch 00073: loss improved from 0.00349 to 0.00321, saving model to ./data/Data_LSTM1layer_100-73-0.0032.hdf5
Epoch 74/100
Epoch 00074: loss improved from 0.00321 to 0.00296, saving model to ./data/Data_LSTM1layer_100-74-0.0030.hdf5
Epoch 75/100
Epoch 00075: loss improved from 0.00296 to 0.00273, saving model to ./data/Data_LSTM1layer_100-75-0.0027.hdf5
Epoch 76/100
Epo

<keras.callbacks.History at 0x125e3a240>

# Generate poems from training set

In [12]:
def CharToInt(charint,text):
    return np.array([charint[char] for char in text])
    
def IntToChar(intchar,text):
    return "".join([intchar[char] for char in text])

# helper function to sample an index from a probability array
def sample(a, temperature=1.0):
    A = np.log(a) / temperature
    A = np.exp(A)
    A = A/np.sum(A)*.99
    return np.argmax(np.random.multinomial(1, A))

# If char is letter in word list
def IsWordLetter(char):
    letters = '\'abcdefghijklmnopqrstuvwxyz'
    # If the current letter is in word list, then return 1
    for i in letters:
        if char==i:
            return True
    return False

def validWords(text,wordlist,charint,intchar):
    # Valid letters in wordlist
    letters = '\'abcdefghijklmnopqrstuvwxyz'
    strtext = IntToChar(intchar,text)
    # If the current letter is not in word list, then all character options are valid
    if IsWordLetter(strtext[-1])==False:
        return 1
    
    # Find current word we are building
    for i in range(1,200):
        curword = strtext[len(strtext)-i:]
        if IsWordLetter(curword[0])==False:
            curword = curword[1:]
            break
    # Find if current word is in list of words
    L = []
    for word in wordlist:
        x = word.find(curword)
        if x==0:
            if len(curword) < len(word):
                L.append(word[len(curword)])

    # If no words in dictionary are found then the word must end
    if L==[]:
        L = [' ','\n',':',';']
    L=np.unique(L)
    # Build output vector
    out = np.zeros(len(charint))
    for i in L:
        out[charint[i]]=1.0
    return out

def WordSample(text, wordlist, charint,intchar, a, temperature=1.0):
    A = np.log(a) / temperature
    A = np.exp(A)
    b = validWords(text,wordlist,charint,intchar)
    A = np.multiply(A,b)
    # Normalize
    A = A/np.sum(A)*.99
    return np.argmax(np.random.multinomial(1, A))

def generatePoem(model,intchar,charint,seed,temp=1.0):
    print('Seed = ',seed)
    IntSeed = CharToInt(charint,seed)
    IntOut = IntSeed
    temp = 1.0
    lines = 13
    # generate characters
    for i in range(1000):
        X = IntOut[i:i+Ntime]
        OneHot_X = keras.utils.np_utils.to_categorical([X],num_classes=len(charint))
        Ypred = model.predict(OneHot_X)
        idx = sample(Ypred[0],temp)
        IntOut = np.concatenate((IntOut,[idx]))
        # Count number of poem lines generated
        if idx==0:
            lines-=1
        if lines==0:
            break
    return IntToChar(intchar,IntOut)

# Generate poem constraining to real words
def generatePoem2(model,wordlist,intchar,charint,seed,temp=1.0):
    print('Seed = ',seed)
    IntSeed = CharToInt(charint,seed)
    IntOut = IntSeed
    temp = 1.0
    lines = 13
    # generate characters
    for i in range(1000):
        X = IntOut[i:i+Ntime]
        OneHot_X = keras.utils.np_utils.to_categorical([X],num_classes=len(charint))
        Ypred = model.predict(OneHot_X)
        idx = WordSample(IntOut,wordlist,charint,intchar,Ypred[0],temp)
        IntOut = np.concatenate((IntOut,[idx]))
        # Count number of poem lines generated
        if idx==0:
            lines-=1
        if lines==0:
            break
    return IntToChar(intchar,IntOut)

In [13]:
seed = "shall i compare thee to a summer's day?\n"
temp = [1.5,0.75,0.25]
for i in temp:
    print('Generated Poem at temp = ',i,':')
    print(generatePoem(model,intchar,charint,seed,temp=i))

Generated Poem at temp =  1.5 :
Seed =  shall i compare thee to a summer's day?

shall i compare thee to a summer's day?
the simf and day, and parsube a beermer;
o grast me in waring tand wall if lowe,
tho ghat the cemplaved white thy praine,
:secings the herpsto d to thes to be tofleds be ther,
whan ffor i love farrich you word's se fante,
lice momh mps mut this my purt cearn thiughtz.
  knowe in not self ar il besice to sing,
alt yout to theme wornd so cuncesing theez
them shaved arivery now,
hau douth thes my nerezt and you altherzing treen
cozess mijered she thy erpeode nce rhis hme,
loking grouty cheilt lach oundw things my ming,
whels offingrest sink oot as thy croult frown?

Generated Poem at temp =  0.75 :
Seed =  shall i compare thee to a summer's day?

shall i compare thee to a summer's day?
thet brouty bind you ast love spreppeace,
but in to in so this dpariing llase (pork,
why shough d ar in thy hade d rice genting?
but no gake clliee notpiris my nempendd.
s miner no hing, 

In [14]:
seed = "shall i compare thee to a summer's day?\n"
temp = [1.5,0.75,0.25]
for i in temp:
    print('Generated Poem at temp = ',i,':')
    print(generatePoem2(model,wordlist,intchar,charint,seed,temp=i))

Generated Poem at temp =  1.5 :
Seed =  shall i compare thee to a summer's day?

shall i compare thee to a summer's day?
thyself sooner indeed thence surly and pierced:
but merits outstripped tombed thence farther bearer:
but thoughts forth thousand shz loves dost;
matter thyself glowing loves my fell white faring:
late history theez diseased painting lastingz:
but thoughts beauty's sooner that's doom and loves former brainsz
thence thanks wolf issueless ornaments;ignorance vilest reeks
one interest thence tombed tombed thyself transgression surfeit hides
wolf most folly semblance highmost coz
touches thence took:
  sorrows theirs imitated themes wz;celez thez both
and that's borne telling to-themselves reeleth
thee wert ere varying ornaments often owes

Generated Poem at temp =  0.75 :
Seed =  shall i compare thee to a summer's day?

shall i compare thee to a summer's day?
these bereft rather under told touches ornaments:
fortune's songs;
and titles often inconstant under something th

# LSTM model 2

In [None]:
# 2 layer LSTM
Nchars = len(charint)
Ntime = 40

model2 = Sequential()
model2.add(LSTM(200, input_shape=(Xt.shape[1],Xt.shape[2]),return_sequences=True))
model2.add(Dropout(0.0))
model2.add(LSTM(200))
model2.add(Dropout(0.0))
model2.add(Dense(Nchars))
model2.add(Activation('softmax'))
model2.summary()

model2.compile(loss='categorical_crossentropy', optimizer='adam')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 40, 200)           191200    
_________________________________________________________________
dropout_2 (Dropout)          (None, 40, 200)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 200)               320800    
_________________________________________________________________
dropout_3 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 38)                7638      
_________________________________________________________________
activation_2 (Activation)    (None, 38)                0         
Total params: 519,638
Trainable params: 519,638
Non-trainable params: 0
_________________________________________________________________


In [None]:
# define the checkpoint
fname="./data/Data_LSTM2layer_100-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(fname, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]
model2.fit(Xt, Yt, epochs=100, batch_size=128, callbacks=callbacks_list)

Epoch 1/100
Epoch 00001: loss improved from inf to 3.02255, saving model to ./data/Data_LSTM2layer_100-01-3.0226.hdf5
Epoch 2/100
Epoch 00002: loss improved from 3.02255 to 2.73458, saving model to ./data/Data_LSTM2layer_100-02-2.7346.hdf5
Epoch 3/100
Epoch 00003: loss improved from 2.73458 to 2.37633, saving model to ./data/Data_LSTM2layer_100-03-2.3763.hdf5
Epoch 4/100
Epoch 00004: loss improved from 2.37633 to 2.22731, saving model to ./data/Data_LSTM2layer_100-04-2.2273.hdf5
Epoch 5/100
Epoch 00005: loss improved from 2.22731 to 2.11506, saving model to ./data/Data_LSTM2layer_100-05-2.1151.hdf5
Epoch 6/100
Epoch 00006: loss improved from 2.11506 to 2.02493, saving model to ./data/Data_LSTM2layer_100-06-2.0249.hdf5
Epoch 7/100
Epoch 00007: loss improved from 2.02493 to 1.94337, saving model to ./data/Data_LSTM2layer_100-07-1.9434.hdf5
Epoch 8/100
Epoch 00008: loss improved from 1.94337 to 1.87680, saving model to ./data/Data_LSTM2layer_100-08-1.8768.hdf5
Epoch 9/100
Epoch 00009: los

In [None]:
seed = "shall i compare thee to a summer's day?\n"
temp = [1.5,0.75,0.25]
for i in temp:
    print('Generated Poem at temp = ',i,':')
    print(generatePoem(model2,intchar,charint,seed,temp=i))

In [None]:
seed = "shall i compare thee to a summer's day?\n"
temp = [1.5,0.75,0.25]
for i in temp:
    print('Generated Poem at temp = ',i,':')
    print(generatePoem2(model2,wordlist,intchar,charint,seed,temp=i))