In [4]:
import re

# import data from file
with open('../data/r_l_stevenson_poems.txt') as file:
    txt = file.read()[1:]

# split into single poems
poems = txt.split('<END_OF_THE_POEM>')
poems = list(map(lambda p: p.lower(), poems))
poems = list(map(lambda p: re.sub(' +', ' ', p).strip(), poems))
poems = list(map(lambda p: re.sub('\n ', '\n', p), poems))



poems

["in winter i get up at night,\nand dress by yellow candle light.\nin summer quite the other way,\ni have to go to bed by day.\n\ni have to go to bed and see\nthe birds still hopping on the tree,\nor hear the grown-up people's feet,\nstill going past me in the street.\n\nand does it not seem hard to you,\nwhen all the sky is clear and blue,\nand i should like so much to play,\nto have to go to bed by day?",
 'all night long and every night,\nwhen my mamma puts out the light\ni see the people marching by,\nas plain as day, before my eye.\n\narmies and emperors and kings,\nall carrying different kinds of things,\nand marching in so grand a way,\nyou never saw the like by day.\n\nso fine a show was never seen\nat the great circus on the green;\nfor every kind beast and man\nis marching in that caravan.\n\nat first they move a little slow,\nbut still the faster on they go,\nand still beside them close i keep\nuntil we reach the town of sleep.',
 "three of us afloat in the meadow by the swi

In [5]:
# create dictionary word|char -> number

def peel_alphanumerics(word):
    """
    If word contains some non-alpha characters then returns a list of word and single non-alpha characters.
    i.e.
    "bank," -> ["bank", ","]
    "bank,`" -> ["bank", ",", "`"]
    "bank" -> ["bank"]
    """
    return [re.sub(r'[^a-zA-Z]', '', str(word))] + list(re.sub(r'[a-zA-Z]', '', str(word)))

def paragraph_to_list(paragraph):
    """
    Maps paragraph to list of words or punctations. Puts the end of poem ('_EOP') sign at the end.
    :return: list of words/punctations
    """
    unflatten = list(map(lambda w: peel_alphanumerics(w), paragraph.split(' ')))
    return [item for sublist in unflatten for item in sublist] + ['_EOP'] # end of poem char

def get_word_series(paragraphs):
    """
    Maps list of paragraps into one list of words (series of words).
    """
    wordlist = []
    for p in paragraphs:
        wordlist += paragraph_to_list(p)
    return wordlist

def get_word_list(paragraphs):
    """
    :return: a sorted list of unique words or punctations in paragraphs
    """
    return sorted(set(get_word_series(paragraphs)))

word_list = get_word_list(poems)
word_list

['\n',
 '!',
 '"',
 "'",
 '(',
 ')',
 ',',
 '-',
 '.',
 ':',
 ';',
 '?',
 '_',
 '_EOP',
 'a',
 'abeating',
 'abed',
 'abeda',
 'abedi',
 'able',
 'ablowing',
 'aboard',
 'aboatingwhere',
 'about',
 'aboutare',
 'aboutbut',
 'abouti',
 'aboutin',
 'aboutthere',
 'aboutto',
 'aboutwhenever',
 'above',
 'abreakingin',
 'abroad',
 'abroadafar',
 'abroadand',
 'abroadtill',
 'abroadyou',
 'acharging',
 'across',
 'adventure',
 'afarwhere',
 'afloat',
 'afloatingcastles',
 'afloatwary',
 'africa',
 'after',
 'again',
 'againbefore',
 'againgreen',
 'againopen',
 'againthe',
 'againup',
 'age',
 'ages',
 'ageschildren',
 'air',
 'airhow',
 'airthe',
 'alert',
 'aliti',
 'alive',
 'aliveand',
 'all',
 'alland',
 'allies',
 'allo',
 'allover',
 'allroll',
 'alone',
 'alonehe',
 'alonethe',
 'along',
 'aloudand',
 'also',
 'although',
 'always',
 'am',
 'among',
 'an',
 'anchored',
 'ancient',
 'and',
 'another',
 'ants',
 'any',
 'apes',
 'apparelledhere',
 'appearedhow',
 'apple',
 'apples',
 

In [6]:
word_series = get_word_series(poems)
word_series

['in',
 'winter',
 'i',
 'get',
 'up',
 'at',
 'nightand',
 ',',
 '\n',
 'dress',
 'by',
 'yellow',
 'candle',
 'lightin',
 '.',
 '\n',
 'summer',
 'quite',
 'the',
 'other',
 'wayi',
 ',',
 '\n',
 'have',
 'to',
 'go',
 'to',
 'bed',
 'by',
 'dayi',
 '.',
 '\n',
 '\n',
 'have',
 'to',
 'go',
 'to',
 'bed',
 'and',
 'seethe',
 '\n',
 'birds',
 'still',
 'hopping',
 'on',
 'the',
 'treeor',
 ',',
 '\n',
 'hear',
 'the',
 'grownup',
 '-',
 'peoples',
 "'",
 'feetstill',
 ',',
 '\n',
 'going',
 'past',
 'me',
 'in',
 'the',
 'streetand',
 '.',
 '\n',
 '\n',
 'does',
 'it',
 'not',
 'seem',
 'hard',
 'to',
 'youwhen',
 ',',
 '\n',
 'all',
 'the',
 'sky',
 'is',
 'clear',
 'and',
 'blueand',
 ',',
 '\n',
 'i',
 'should',
 'like',
 'so',
 'much',
 'to',
 'playto',
 ',',
 '\n',
 'have',
 'to',
 'go',
 'to',
 'bed',
 'by',
 'day',
 '?',
 '_EOP',
 'all',
 'night',
 'long',
 'and',
 'every',
 'nightwhen',
 ',',
 '\n',
 'my',
 'mamma',
 'puts',
 'out',
 'the',
 'lighti',
 '\n',
 'see',
 'the',
 '

In [7]:
import numpy as np

def word_series_to_matrix(word_series, word_list, sample_length=5, word_step=1):
    """
    Transaltes series of words/punctations into metrix of 0's or 1's.
    
    In matrix of samples (X):
    - First dimension represents number of samples
    - Second dimension represents length of each sample
    - Third dimension respresents vector of all possible words (according to word_list vector)
      The word at given position is marked as 1.
    
    :sample_length: number of words located in each sample of X
    
    :word_step: step between each sample (take sample_length words in the first sample, 
                then move by word_step words, and take the second sample of length sample_length)
                
    :return: Matrix of boolean values. Each column contains exacly one '1' value on index 
             equivalent to given word index in word_list.
    """
    samples_count = int((len(word_series) - sample_length)/word_step)
    
    # contains sequence of words in each sample
    X = np.zeros((samples_count, sample_length, len(word_list)))
    
    # contains next single word after the sample
    y = np.zeros((samples_count, len(word_list)))
    
    for sample_start, idx in zip(range(0, len(word_series) - sample_length, word_step), 
                                 range(0, samples_count)):
        for x_word_idx in range(sample_start, sample_start + sample_length):
            word = word_series[x_word_idx]
            assert word in word_list, 'Word "{}" is not located in given word_list'.format(word)
            X[idx, x_word_idx - sample_start, word_list.index(word)] = 1
        y[idx, word_list.index(word_series[sample_start + sample_length])] = 1
    return X, y

word_series_to_matrix(['not', 'a', 'good', 'solution', ',', 'not', 'exactly'], ['a', 'not', 'good', 'exactly', 'solution', ','])

(array([[[0., 1., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0.],
         [0., 0., 1., 0., 0., 0.],
         [0., 0., 0., 0., 1., 0.],
         [0., 0., 0., 0., 0., 1.]],
 
        [[1., 0., 0., 0., 0., 0.],
         [0., 0., 1., 0., 0., 0.],
         [0., 0., 0., 0., 1., 0.],
         [0., 0., 0., 0., 0., 1.],
         [0., 1., 0., 0., 0., 0.]]]), array([[0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0.]]))

In [8]:
X, y = word_series_to_matrix(word_series, word_list, sample_length=20, word_step=2)

X.shape, y.shape

((3560, 20, 1925), (3560, 1925))

In [9]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop

samples_count = X.shape[0]
sample_length = X.shape[1]
word_set_size = X.shape[2]
    
model = Sequential()
model.add(LSTM(128, input_shape=(sample_length, word_set_size)))
model.add(Dense(word_set_size))
model.add(Activation('softmax'))

optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

Instructions for updating:
Colocations handled automatically by placer.


In [11]:
def pretty_print(word_series):
    """
    Takes a list of words/punctations and pretty prints it.
    
    Examples:
    ['however', ',', 'all', 'is', 'possible', '.', 'huh', '!'] -> 'However, all is possible. Huh!'
    """
    start = '__START__'
    end = '__END__'
    word_series = [start] + word_series + [end]
    printed = ''
    sentence_stops = ['!', '.', '?', start, '_EOP']
    whitespace_after = ['!', ')', ',', '.', ':', ';', '?', '_']
    others = ['', '!','(',')',',', '.', ':', ';', '?', '[', '_', '_EOP']
    
    for idx, current in enumerate(word_series[1:len(word_series)-1]):
        idx += 1
        before = word_series[idx-1]
        after = word_series[idx+1]
        
        if current in sentence_stops:
            quotes_started = False
        
        if before in sentence_stops and current not in sentence_stops:
            current = current.capitalize()
            
        if current == '_EOP':
            current = '\n\n<END_OF_POEM>\n\n'
            
        if after in whitespace_after or current == '\n\n':
            printed += current
        else:
            printed += current + ' '
        
        
    return printed

print(pretty_print(['however', ',', 'all', 'is', 'possible', '.', 'huh', '!', '_EOP', 'next', 'paragraph', '.']))

However, all is possible. Huh! 

<END_OF_POEM>

 Next paragraph. 


In [None]:
from keras.callbacks import LambdaCallback
from keras.callbacks import ReduceLROnPlateau
from keras.callbacks import ModelCheckpoint

import random
import sys


def sample(preds, temperature=1.0):
    """
    Takes a vector of probabilities and returns most probable solution depending on temperature
    """
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)


def on_epoch_end(epoch, logs):
    # Function invoked at end of each epoch. Prints generated text.
    
    if epoch % 10 != 0:
        return
    
    WORDS_TO_GENERATE = 200
    
    print()
    print('\n----- Generating text after Epoch: %d' % epoch)

    sentence = word_series[:sample_length]
    seed = sentence

    print('----- Generating with seed: "' + pretty_print(seed) + '"')

    for i in range(WORDS_TO_GENERATE):
        x_pred = np.zeros((1, sample_length, word_set_size))
        
        for idx, word in enumerate(seed):
            x_pred[0, idx, word_list.index(word)] = 1.

        preds = model.predict(x_pred, verbose=0)[0]
        next_word = word_list[sample(preds, 0.1)]

        sentence += [next_word]
        seed = sentence[-sample_length:]
        
    print(pretty_print(sentence))
    return
        
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.2,
                              patience=1, min_lr=0.001)

callbacks = [print_callback, reduce_lr]

model.fit(X, y, batch_size=128, epochs=1000, callbacks=callbacks)

Instructions for updating:
Use tf.cast instead.
Epoch 1/1000


----- Generating text after Epoch: 0
----- Generating with seed: "In winter i get up at nightand, 
 dress by yellow candle lightin. 
 summer quite the other "
In winter i get up at nightand, 
 dress by yellow candle lightin. 
 summer quite the other 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000


----- Generating text after Epoch: 10
----- Generating with seed: "In winter i get up at nightand, 
 dress by yellow candle lightin. 
 summer quite the 