# LSTM test

In [1]:


# Copied from https://github.com/ChunML/text-generator/blob/master/RNN_utils.py as it does all necessary thing needed for 
# char-to-char data loading.
def load_data(data_dir, seq_length):
    data = open(data_dir, 'r', encoding="utf-8").read()  # Read data
    chars = list(set(data))  # get possible chars
    VOCAB_SIZE = len(chars)

    print('Data length: {} characters'.format(len(data)))
    print('Vocabulary size: {} characters'.format(VOCAB_SIZE))

    ix_to_char = {ix:char for ix, char in enumerate(chars)}  # index to char map
    char_to_ix = {char:ix for ix, char in enumerate(chars)}  # char to index map

    X = np.zeros((len(data)//seq_length, seq_length, VOCAB_SIZE))  # input data
    y = np.zeros((len(data)//seq_length, seq_length, VOCAB_SIZE))  # target data
    # Divide input to suitable sequance sizes and match with correct target
    for i in range(0, len(data)//seq_length):
        X_sequence = data[i*seq_length:(i+1)*seq_length]
        X_sequence_ix = [char_to_ix[value] for value in X_sequence]
        input_sequence = np.zeros((seq_length, VOCAB_SIZE))
        for j in range(seq_length):
            input_sequence[j][X_sequence_ix[j]] = 1.
            X[i] = input_sequence

        y_sequence = data[i*seq_length+1:(i+1)*seq_length+1]  # next character, as we want to predict next character
        y_sequence_ix = [char_to_ix[value] for value in y_sequence]
        target_sequence = np.zeros((seq_length, VOCAB_SIZE))
        for j in range(seq_length):
            target_sequence[j][y_sequence_ix[j]] = 1.
            y[i] = target_sequence
    return X, y, VOCAB_SIZE, ix_to_char

In [69]:
X, y, VOCAB_SIZE, ix_to_char = load_data(DATA_DIR, 100)

Data length: 114475 characters
Vocabulary size: 89 characters


In [2]:
def load_vocabulary(data_dir, seq_length):
    data = open(data_dir, 'r', encoding="utf-8").read()  # Read data
    chars = list(set(data))  # get possible chars
    VOCAB_SIZE = len(chars)

    print('Data length: {} characters'.format(len(data)))
    print('Vocabulary size: {} characters'.format(VOCAB_SIZE))

    ix_to_char = {ix:char for ix, char in enumerate(chars)}  # index to char map
    char_to_ix = {char:ix for ix, char in enumerate(chars)}  # char to index map
    
    return VOCAB_SIZE, ix_to_char, char_to_ix

In [3]:
def data_generator(data_dir, seq_length, batch_size, steps_per_epoch):
    data = open(data_dir, 'r', encoding="utf-8").read()  # Read data
    chars = list(set(data))  # get possible chars
    VOCAB_SIZE = len(chars)

    print('Data length: {} characters'.format(len(data)))
    print('Vocabulary size: {} characters'.format(VOCAB_SIZE))

    ix_to_char = {ix:char for ix, char in enumerate(chars)}  # index to char map
    char_to_ix = {char:ix for ix, char in enumerate(chars)}  # char to index map
    
    batch_nr = 0
    
    while True:
        
        X = np.zeros((batch_size, seq_length, VOCAB_SIZE))  # input data
        y = np.zeros((batch_size, seq_length, VOCAB_SIZE))
        
        pos_start = batch_nr*batch_size*seq_length  # Continue where left on from patch
        #print(pos_start)
        
        for i in range(0, batch_size):        
            
            X_sequence = data[pos_start + i*seq_length:pos_start + (i+1)*seq_length]
            X_sequence_ix = [char_to_ix[value] for value in X_sequence]
            input_sequence = np.zeros((seq_length, VOCAB_SIZE))

            for j in range(len(X_sequence)):  # Last sequence otherwise shorter
                input_sequence[j][X_sequence_ix[j]] = 1.
                X[i] = input_sequence

            y_sequence = data[pos_start+i*seq_length+1:pos_start + (i+1)*seq_length+1]  # next character, as we want to predict next character
            y_sequence_ix = [char_to_ix[value] for value in y_sequence]
            target_sequence = np.zeros((seq_length, VOCAB_SIZE))
            for j in range(len(y_sequence)):
                target_sequence[j][y_sequence_ix[j]] = 1.
                y[i] = target_sequence
        
        if batch_nr == (steps_per_epoch-1):  # Because we start from zero
            batch_nr = 0
        else:
            batch_nr += 1
                
        
        yield(X, y)

In [26]:
# Method copied from the same source to get initial stuff done.
def generate_text(model, length, vocab_size, ix_to_char, y_char = None):
    # starting with random character
    ix = [np.random.randint(vocab_size)]
    
    if not y_char:        
        print(ix)
        #ix = [37]
        y_char = [ix_to_char[ix[-1]]]
    print(y_char)
    X = np.zeros((1, length, vocab_size))
    for i in range(length):
        # appending the last predicted character to sequence
        X[0, i, :][ix[-1]] = 1
        print(ix_to_char[ix[-1]], end="")
        # TODO choose with probability (tempeture)
        ix = np.argmax(model.predict(X[:, :i+1, :])[0], 1)
        y_char.append(ix_to_char[ix[-1]])
    return ('').join(y_char)

TODO make the vocabulary smaller - has many weird symbols

### Model training

Initialize variables

In [5]:
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM, SimpleRNN
from keras.layers.wrappers import TimeDistributed
#from RNN_utils import *

DATA_DIR = "poems_test_small.txt"
BATCH_SIZE = 50
HIDDEN_DIM = 500
SEQ_LENGTH = 100
DROPOUT_RATE = 0.2
WEIGHTS = ''

GENERATE_LENGTH = 500
LAYER_NUM = 2

Using TensorFlow backend.


In [6]:
VOCAB_SIZE, ix_to_char, char_to_ix = load_vocabulary("poems_test_small.txt", SEQ_LENGTH)

Data length: 114475 characters
Vocabulary size: 89 characters


In [7]:
# Creating training data
#X, y, VOCAB_SIZE, ix_to_char = load_data(DATA_DIR, SEQ_LENGTH)
print(VOCAB_SIZE)
# Creating and compiling the Network
model = Sequential()
model.add(LSTM(HIDDEN_DIM, input_shape=(None, VOCAB_SIZE), return_sequences=True))
model.add(Dropout(rate = DROPOUT_RATE))
for i in range(LAYER_NUM - 1):
    model.add(LSTM(HIDDEN_DIM, return_sequences=True))
    model.add(Dropout(rate = DROPOUT_RATE))
model.add(TimeDistributed(Dense(VOCAB_SIZE)))
model.add(Activation('softmax'))
model.compile(loss="categorical_crossentropy", optimizer="rmsprop")

89


In [10]:
BATCH_SIZE = 100

data = open(DATA_DIR, 'r', encoding="utf-8").read()  # Read data
chars = list(set(data))  # get possible chars
VOCAB_SIZE = len(chars)

print(VOCAB_SIZE)

steps_per_epoch = len(data)//SEQ_LENGTH//BATCH_SIZE
print(steps_per_epoch)

epoch = 20

model.fit_generator(data_generator(DATA_DIR, SEQ_LENGTH, BATCH_SIZE, steps_per_epoch), steps_per_epoch=steps_per_epoch, verbose = 1, epochs = 10)
model.save_weights('test_checkpoint_layer_{}_hidden_{}_epoch_{}.hdf5'.format(LAYER_NUM, HIDDEN_DIM, epoch))

89
11
Epoch 1/10
Data length: 114475 characters
Vocabulary size: 89 characters
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [9]:
model.save_weights('test_checkpoint_layer_{}_hidden_{}_epoch_{}.hdf5'.format(LAYER_NUM, HIDDEN_DIM, 10))

In [75]:
# Training if there is no trained weights specified
nb_epoch = 0

while True:
    print('\n\nEpoch: {}\n'.format(nb_epoch))
    model.fit(X, y, batch_size=BATCH_SIZE, verbose=1, epochs=1)
    #model.fit_generator(data_generator(DATA_DIR, SEQ_LENGTH, BATCH_SIZE), steps_per_epoch=steps_per_epoch, verbose = 1, epochs = 10)
    nb_epoch += 1
    #print(generate_text(model, 500, VOCAB_SIZE, ix_to_char))
    if nb_epoch % 5 == 0:
        model.save_weights('checkpoint_layer_{}_hidden_{}_epoch_{}.hdf5'.format(LAYER_NUM, HIDDEN_DIM, nb_epoch))



Epoch: 0

Epoch 1/1


Epoch: 1

Epoch 1/1

KeyboardInterrupt: 

In [29]:
WEIGHTS = "test_checkpoint_layer_2_hidden_500_epoch_20.hdf5"

model.load_weights(WEIGHTS)
generate_text(model, 100, VOCAB_SIZE, ix_to_char)
print('\n\n')

[76]
['Y']
Your the sore the sore the sore the sore the sore the sore the sore the sore the sore the sore the s




In [85]:
#WEIGHTS = "checkpoint_layer_2_hidden_500_epoch_45.hdf5"

model.load_weights(WEIGHTS)
generate_text(model, GENERATE_LENGTH, VOCAB_SIZE, ix_to_char)
print('\n\n')

4
The song of the bit was hadred.

All caure has starf us were suem
Of sing that been pay
will be cheed and sone,
And every hand bean in like a bear
On a pursise were best and seed my freed.
Your move from the best by the see
the track of chuils out most is again,
We streng homen winds and made me to meat her wenders lave sead befich,
and slapped hit hand one and donely hear before
She past and slouds in be comes
frem by mands
Of what have been a selfrined more by the fellenge,
All should you ca




In [88]:
generate_text(model, GENERATE_LENGTH, VOCAB_SIZE, ix_to_char)
print('')

X and pulled have been paid.
Lefire your baces for me to your breast
And shill nome do mander.
You meak see mes mes weer with sermalles, it was breaks,
And lose though my frem shark before
She samber randen with the earth blood rand all
veise un and of your door,
And they seed marered, with a fright starf,
All the emelomer said was will become to read,
they are shall were droam.

They tore wand to me sour canded roush and day,
A woman of strength will she marbless
and the sunmeress are who was n

'X and pulled have been paid.\nLefire your baces for me to your breast\nAnd shill nome do mander.\nYou meak see mes mes weer with sermalles, it was breaks,\nAnd lose though my frem shark before\nShe samber randen with the earth blood rand all\nveise un and of your door,\nAnd they seed marered, with a fright starf,\nAll the emelomer said was will become to read,\nthey are shall were droam.\n\nThey tore wand to me sour canded roush and day,\nA woman of strength will she marbless\nand the sunmeress are who was no'

In [89]:
generate_text(model, GENERATE_LENGTH, VOCAB_SIZE, ix_to_char)
print('')

_ strange than shaded,
Farent this pred in is our last,
And say summarer, may many him sould be miderall,
Agamanted my deach sheet my ore of diem.
'Oh doint that sleeps who mall shar sunderst,
Sleep my langhas in my sade may
befored my flesh, and mised pay
That was not dume to did and dead
'Here is a cold mo tryeblace and bread,
And stall mesele me toor frammericals, and the were dreams.
They are so is asse breanthes
s werk out she was to coves the world,
A river sings a beautiful song,
Here bes


### Generate text

In [24]:
def load_vocabulary(data_dir, seq_length):
    data = open(data_dir, 'r', encoding="utf-8").read()  # Read data
    chars = list(set(data))  # get possible chars
    VOCAB_SIZE = len(chars)

    print('Data length: {} characters'.format(len(data)))
    print('Vocabulary size: {} characters'.format(VOCAB_SIZE))

    ix_to_char = {ix:char for ix, char in enumerate(chars)}  # index to char map
    char_to_ix = {char:ix for ix, char in enumerate(chars)}  # char to index map
    
    return VOCAB_SIZE, ix_to_char, char_to_ix

In [8]:
VOCAB_SIZE, ix_to_char, char_to_ix = load_vocabulary("poets_top_100k.txt", 100)

Data length: 3612464 characters
Vocabulary size: 126 characters


In [20]:
WEIGHTS = "models_100k/checkpoint_layer_2_hidden_500_epoch_10.hdf5"

model.load_weights(WEIGHTS)
generate_text(model, GENERATE_LENGTH, VOCAB_SIZE, ix_to_char)
print('\n\n')

ValueError: Dimension 0 in both shapes must be equal, but are 126 and 128 for 'Assign_4' (op: 'Assign') with input shapes: [126,2000], [128,2000].