# LSTM poem generation for Shakespeare's sonnets

In [None]:
import numpy as np
import itertools

Reference: https://machinelearningmastery.com/text-generation-lstm-recurrent-neural-networks-python-keras/

# Data preprocessing for LSTM

In [None]:
# Function to get Shakespeare's poems from file
def getPoems():
    """Load the sonnets from ./data/shakespeare.txt as lower-cased strings.

    The file content is lower-cased, split wherever two blank lines occur
    (the separator "\\n\\n\\n"), and the first 20 characters of every chunk
    are dropped.

    NOTE(review): the 20-character cut presumably removes a fixed-width
    sonnet-number header line; confirm against the data file. The newline
    that follows those 20 characters is kept.

    Returns:
        list of str, one entry per chunk of the file.
    """
    with open("./data/shakespeare.txt", "r") as handle:
        text = handle.read()

    sonnets = []
    for chunk in text.lower().split("\n\n\n"):
        # Drop the leading header characters of this chunk
        sonnets.append(chunk[20:])
    return sonnets

# Get character to integer dictionary for one hot encoding
def getChardict(poems):
    """Build the character <-> integer lookup tables used for one hot encoding.

    Every distinct character that occurs in any of the poems gets an integer
    code; codes follow the sorted order of the characters.

    Args:
        poems: iterable of strings.

    Returns:
        (charint, intchar): dict mapping character -> code and the inverse
        dict mapping code -> character.
    """
    # Distinct characters over all poems, in sorted order
    alphabet = sorted(set("".join(poems)))

    charint = {}
    intchar = {}
    for code, symbol in enumerate(alphabet):
        charint[symbol] = code
        intchar[code] = symbol
    return charint, intchar

# Integer encode the poems
def getIntPoems(charint,poems):
    """Integer encode every poem with the character -> code dictionary.

    Args:
        charint: dict mapping each character to its integer code.
        poems: sequence of strings.

    Returns:
        numpy array with one entry per poem. When all poems have the same
        length this is a regular 2-D integer array; otherwise it is a 1-D
        object array whose elements are lists of integer codes.

    Raises:
        KeyError: if a poem contains a character missing from charint.
    """
    encoded = [[charint[char] for char in poem] for poem in poems]

    # Equal-length poems (or no poems at all) form a regular array.
    if len({len(codes) for codes in encoded}) <= 1:
        return np.array(encoded)

    # Poems of different lengths: np.array() on a ragged list raises a
    # ValueError on recent numpy versions, so build the object array
    # explicitly, one poem per slot.
    ragged = np.empty(len(encoded), dtype=object)
    for position, codes in enumerate(encoded):
        ragged[position] = codes
    return ragged

# Load the lower-cased poems from the data file (list of strings)
poems = getPoems()
# Build the character -> integer and integer -> character lookup tables
# over every character that occurs in the poems
charint,intchar = getChardict(poems)
# Encode every poem as a sequence of integer character codes
IntPoems = getIntPoems(charint,poems)

# Generate training data

In [None]:
# Generate X and Y training sets from each poem
def getCharacters(poem,n=40,skip=10):
    """Cut one poem into fixed-length input windows and next-character targets.

    A window of n consecutive items starts at positions 0, skip, 2*skip, ...
    for as long as an item exists right after the window; that following
    item is the target for the window.

    Args:
        poem: sliceable sequence (string, list or array).
        n: window length.
        skip: step between the starts of consecutive windows.

    Returns:
        (Xtrain, Ytrain): list of windows and list of matching targets.
        Both are empty when the poem has n items or fewer.
    """
    Xtrain = []
    Ytrain = []
    for start in range(0, len(poem) - n, skip):
        stop = start + n
        Xtrain.append(poem[start:stop])
        # The item right after the window is what has to be predicted
        Ytrain.append(poem[stop])
    return Xtrain, Ytrain

# Generate training data 
# Window length (number of characters the network sees per sample)
Ntime = 40
# Step between the starts of consecutive windows within a poem
skip = 5
Xtrain = []
Ytrain = []
for poem in IntPoems:
    # Use the window length defined above (the name `n` is not defined here)
    poem_X, poem_Y = getCharacters(poem, Ntime, skip)
    # extend() keeps the lists flat, so a poem that is too short to yield
    # any window simply contributes nothing
    Xtrain.extend(poem_X)
    Ytrain.extend(poem_Y)

# Ytrain: one target code per window; Xtrain: one row of Ntime codes per window
Ytrain = np.array(Ytrain)
Xtrain = np.array(Xtrain)

# One hot encode the training vectors
import keras
# One hot encode with an explicit number of classes so that the width of the
# encoding always equals the vocabulary size, even if the highest character
# code never shows up in the training windows or targets.
# keras.utils.to_categorical is the public entry point; the np_utils
# submodule is not available in recent Keras releases.
Yt = keras.utils.to_categorical(Ytrain, num_classes=len(charint))
Xt = keras.utils.to_categorical(Xtrain, num_classes=len(charint))

print(Xt.shape)
print(Yt.shape)

# LSTM Model

In [None]:
from keras.layers import LSTM, Dense, Activation, BatchNormalization, Dropout
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint

# Size of the vocabulary = number of output classes
Nchars = len(charint)
Ntime = 40

# Single LSTM layer followed by a softmax over the characters.
# The input shape (time steps, one hot width) is taken from the training data.
model = Sequential([
    LSTM(200, input_shape=(Xt.shape[1], Xt.shape[2])),
    Dropout(0.0),
    Dense(Nchars),
    Activation('softmax'),
])
model.summary()

model.compile(optimizer='adam', loss='categorical_crossentropy')

In [None]:
# define the checkpoint
# File name template for the checkpoints; it carries {epoch} and {loss}
# format placeholders that are handed to ModelCheckpoint.
fname="./data/Data_LSTM-{epoch:02d}-{loss:.4f}.hdf5"
# Watch the training loss (mode='min') and keep only improving checkpoints
# (save_best_only=True).
checkpoint = ModelCheckpoint(fname, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]
# Train the single layer model on the one hot encoded windows
model.fit(Xt, Yt, epochs=20, batch_size=128, callbacks=callbacks_list)

# Generate poems from training set

In [None]:
def CharToInt(charint,text):
    """Integer encode a string with the character -> code dictionary.

    Args:
        charint: dict mapping each character to its integer code.
        text: string (or any iterable of characters) to encode.

    Returns:
        numpy array holding the code of every character, in order.
    """
    codes = []
    for symbol in text:
        codes.append(charint[symbol])
    return np.array(codes)
    
def IntToChar(intchar,text):
    """Decode a sequence of integer codes back into a string.

    Args:
        intchar: dict mapping each integer code to its character.
        text: iterable of integer codes (despite the name, not a string).

    Returns:
        The decoded string.
    """
    pieces = []
    for code in text:
        pieces.append(intchar[code])
    return "".join(pieces)

# helper function to sample an index from a probability array
def sample(a, temperature=1.0):
    """Draw one index from a probability array after temperature scaling.

    The probabilities are raised to the power 1/temperature and renormalised,
    then a single index is drawn from the resulting distribution. Low
    temperatures sharpen the distribution, high temperatures flatten it.

    Args:
        a: 1-D array of non-negative probabilities (e.g. a softmax output).
        temperature: positive scaling factor.

    Returns:
        The sampled index.

    Raises:
        ValueError: if temperature is not positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # Work in float64: renormalising float32 network outputs can leave a sum
    # slightly above 1, which np.random.multinomial rejects. Scaling the
    # probabilities down to avoid that would hand the missing mass to the
    # last class, so the array is normalised exactly instead.
    probs = np.asarray(a, dtype=np.float64)
    with np.errstate(divide="ignore"):
        # log(0) = -inf is fine here: it turns back into probability 0
        logits = np.log(probs) / temperature
    # Shift by the maximum so that exp() cannot overflow at low temperature
    weights = np.exp(logits - np.max(logits))
    weights = weights / np.sum(weights)
    return np.argmax(np.random.multinomial(1, weights, 1))

def generatePoem(model,seed,temp=1.0):
    """Generate a poem character by character, starting from a seed string.

    The seed is integer encoded, then the model is asked repeatedly for the
    distribution of the next character given the last Ntime characters, and
    one character is sampled from it at the requested temperature.
    Generation stops after 13 newline characters have been produced or after
    1000 characters, whichever comes first.

    Relies on the module level names charint, intchar, Ntime and keras.

    Args:
        model: trained Keras model taking one hot encoded character windows.
        seed: string to start from; all of its characters must be in charint.
            NOTE(review): the model was built for windows of Ntime
            characters, so the seed should be at least that long.
        temp: sampling temperature passed on to sample().

    Returns:
        The seed followed by the generated text, as one string.
    """
    print('Seed = ',seed)
    IntSeed = CharToInt(charint,seed)
    print('Integer Encoded Seed=',IntSeed)

    # Plain list of codes: appending is cheap, unlike re-concatenating arrays
    generated = [int(code) for code in IntSeed]
    # Look the newline code up instead of assuming that it is code 0
    newline_idx = charint.get("\n")
    lines = 13
    # generate characters
    for _ in range(1000):
        # Always feed the most recent Ntime characters to the network
        window = generated[-Ntime:]
        OneHot_X = keras.utils.to_categorical([window], num_classes=len(charint))
        Ypred = model.predict(OneHot_X)
        # Honour the temperature requested by the caller
        idx = int(sample(Ypred[0], temp))
        generated.append(idx)
        # Count number of poem lines generated
        if idx == newline_idx:
            lines -= 1
            if lines == 0:
                break
    return IntToChar(intchar, generated)

In [None]:
# Seed text handed to the generator (40 characters including the newline,
# which equals the window length Ntime)
seed = "shall i compare thee to a summer's day?\n"
# Sampling temperatures to compare
temp = [1.5,0.75,0.25]
for i in temp:
    print('Generated Poem at temp = ',i,':')
    print(generatePoem(model,seed,temp=i))

# LSTM model 2

In [None]:
# 2 layer LSTM
# Size of the vocabulary = number of output classes
Nchars = len(charint)
Ntime = 40

model2 = Sequential()
# The first LSTM has to return its full output sequence: the next LSTM
# expects one vector per time step, not only the final state.
model2.add(LSTM(200, input_shape=(Xt.shape[1],Xt.shape[2]), return_sequences=True))
model2.add(Dropout(0.0))
# Second LSTM; its input shape follows from the layer above
model2.add(LSTM(200))
model2.add(Dropout(0.0))
model2.add(Dense(Nchars))
model2.add(Activation('softmax'))
model2.summary()

model2.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# define the checkpoint
# Use a file name prefix of its own for the 2 layer model, so that its
# checkpoints cannot be confused with those of the single layer model
# stored in the same directory.
fname="./data/Data_LSTM2-{epoch:02d}-{loss:.4f}.hdf5"
# Watch the training loss (mode='min') and keep only improving checkpoints
checkpoint = ModelCheckpoint(fname, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]
# Train the 2 layer model on the one hot encoded windows
model2.fit(Xt, Yt, epochs=20, batch_size=128, callbacks=callbacks_list)

In [None]:
# Seed text handed to the generator
seed = "shall i compare thee to a summer's day?\n"
# Sampling temperatures to compare
temp = [1.5,0.75,0.25]
for i in temp:
    print('Generated Poem at temp = ',i,':')
    # Generate with the 2 layer model trained in this section
    print(generatePoem(model2,seed,temp=i))