# Data clearing and processing

In [1]:
import re

# import data from file
with open('../data/alice-adventures-in-wonderland.txt') as file:
    txt = file.read()[1:]

# split into paragraphs of text
paragraphs = txt.split('\n\n')
paragraphs = list(map(lambda p: p.replace('\n', ' '), paragraphs))
paragraphs = list(map(lambda p: p.replace('--', ' '), paragraphs))
paragraphs = list(map(lambda p: p.replace('-', ''), paragraphs))
paragraphs = list(map(lambda p: p.lower(), paragraphs))

# Text is splitted into paragraphs and all letters are lowered.
# Chars like dots and comas are left - rnn should place them in the right places!

paragraphs

['alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, ‘and what is the use of a book,’ thought alice ‘without pictures or conversations?’',
 'so she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisychain would be worth the trouble of getting up and picking the daisies, when suddenly a white rabbit with pink eyes ran close by her.',
 'there was nothing so very remarkable in that; nor did alice think it so very much out of the way to hear the rabbit say to itself, ‘oh dear! oh dear! i shall be late!’ (when she thought it over afterwards, it occurred to her that she ought to have wondered at this, but at the time it all seemed quite natural); but when the rabbit actually took a watch out of its waistcoatpocket, and looked at it

In [2]:
# create dictionary word|char -> number

def peel_alphanumerics(word):
    """
    If word contains some non-alpha characters then returns a list of word and single non-alpha characters.
    i.e.
    "bank," -> ["bank", ","]
    "bank,`" -> ["bank", ",", "`"]
    "bank" -> ["bank"]
    """
    return [re.sub(r'[^a-zA-Z]', '', str(word))] + list(re.sub(r'[a-zA-Z]', '', str(word)))

def paragraph_to_list(paragraph):
    """
    Maps paragraph to list of words or punctations. Puts the end of paragraph ('_EOP') sign at the end.
    :return: list of words/punctations
    """
    unflatten = list(map(lambda w: peel_alphanumerics(w), paragraph.split(' ')))
    return [item for sublist in unflatten for item in sublist] + ['_EOP'] # end of paragraph char

def get_word_series(paragraphs):
    """
    Maps list of paragraps into one list of words (series of words).
    """
    wordlist = []
    for p in paragraphs:
        wordlist += paragraph_to_list(p)
    return wordlist

def get_word_list(paragraphs):
    """
    :return: a sorted list of unique words or punctations in paragraphs
    """
    return sorted(set(get_word_series(paragraphs)))

word_list = get_word_list(paragraphs)
word_list

['',
 '!',
 '(',
 ')',
 ',',
 '.',
 ':',
 ';',
 '?',
 '[',
 '_',
 '_EOP',
 'a',
 'abide',
 'able',
 'about',
 'above',
 'absence',
 'absurd',
 'acceptance',
 'accident',
 'accidentally',
 'account',
 'accounting',
 'accounts',
 'accusation',
 'accustomed',
 'ache',
 'across',
 'act',
 'actually',
 'ada',
 'added',
 'adding',
 'addressed',
 'addressing',
 'adjourn',
 'adoption',
 'advance',
 'advantage',
 'adventures',
 'advice',
 'advisable',
 'advise',
 'affectionately',
 'afford',
 'afore',
 'afraid',
 'after',
 'aftertime',
 'afterwards',
 'again',
 'against',
 'age',
 'ago',
 'agony',
 'agree',
 'ah',
 'ahem',
 'air',
 'alarm',
 'alarmed',
 'alas',
 'alice',
 'alices',
 'alive',
 'all',
 'allow',
 'almost',
 'alone',
 'along',
 'aloud',
 'already',
 'also',
 'altered',
 'alternately',
 'altogether',
 'always',
 'am',
 'ambition',
 'among',
 'an',
 'ancient',
 'and',
 'anger',
 'angrily',
 'angry',
 'animal',
 'animals',
 'ann',
 'annoyed',
 'another',
 'answer',
 'answered',
 'answ

In [3]:
word_series = get_word_series(paragraphs)
word_series

['alice',
 'was',
 'beginning',
 'to',
 'get',
 'very',
 'tired',
 'of',
 'sitting',
 'by',
 'her',
 'sister',
 'on',
 'the',
 'bank',
 ',',
 'and',
 'of',
 'having',
 'nothing',
 'to',
 'do',
 ':',
 'once',
 'or',
 'twice',
 'she',
 'had',
 'peeped',
 'into',
 'the',
 'book',
 'her',
 'sister',
 'was',
 'reading',
 ',',
 'but',
 'it',
 'had',
 'no',
 'pictures',
 'or',
 'conversations',
 'in',
 'it',
 ',',
 'and',
 '‘',
 'what',
 'is',
 'the',
 'use',
 'of',
 'a',
 'book',
 ',',
 '’',
 'thought',
 'alice',
 'without',
 '‘',
 'pictures',
 'or',
 'conversations',
 '?',
 '’',
 '_EOP',
 'so',
 'she',
 'was',
 'considering',
 'in',
 'her',
 'own',
 'mind',
 'as',
 '(',
 'well',
 'as',
 'she',
 'could',
 ',',
 'for',
 'the',
 'hot',
 'day',
 'made',
 'her',
 'feel',
 'very',
 'sleepy',
 'and',
 'stupid',
 ')',
 ',',
 'whether',
 'the',
 'pleasure',
 'of',
 'making',
 'a',
 'daisychain',
 'would',
 'be',
 'worth',
 'the',
 'trouble',
 'of',
 'getting',
 'up',
 'and',
 'picking',
 'the',
 'da

In [4]:
import numpy as np

def word_series_to_matrix(word_series, word_list, sample_length=5, word_step=1):
    """
    Transaltes series of words/punctations into metrix of 0's or 1's.
    
    In matrix of samples (X):
    - First dimension represents number of samples
    - Second dimension represents length of each sample
    - Third dimension respresents vector of all possible words (according to word_list vector)
      The word at given position is marked as 1.
    
    :sample_length: number of words located in each sample of X
    
    :word_step: step between each sample (take sample_length words in the first sample, 
                then move by word_step words, and take the second sample of length sample_length)
                
    :return: Matrix of boolean values. Each column contains exacly one '1' value on index 
             equivalent to given word index in word_list.
    """
    samples_count = int((len(word_series) - sample_length)/word_step)
    
    # contains sequence of words in each sample
    X = np.zeros((samples_count, sample_length, len(word_list)))
    
    # contains next single word after the sample
    y = np.zeros((samples_count, len(word_list)))
    
    for sample_start, idx in zip(range(0, len(word_series) - sample_length, word_step), 
                                 range(0, samples_count)):
        for x_word_idx in range(sample_start, sample_start + sample_length):
            word = word_series[x_word_idx]
            assert word in word_list, 'Word "{}" is not located in given word_list'.format(word)
            X[idx, x_word_idx - sample_start, word_list.index(word)] = 1
        y[idx, word_list.index(word_series[sample_start + sample_length])] = 1
    return X, y

word_series_to_matrix(['not', 'a', 'good', 'solution', ',', 'not', 'exactly'], ['a', 'not', 'good', 'exactly', 'solution', ','])

(array([[[0., 1., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0.],
         [0., 0., 1., 0., 0., 0.],
         [0., 0., 0., 0., 1., 0.],
         [0., 0., 0., 0., 0., 1.]],
 
        [[1., 0., 0., 0., 0., 0.],
         [0., 0., 1., 0., 0., 0.],
         [0., 0., 0., 0., 1., 0.],
         [0., 0., 0., 0., 0., 1.],
         [0., 1., 0., 0., 0., 0.]]]), array([[0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0.]]))

In [11]:
X, y = word_series_to_matrix(word_series, word_list, sample_length=20, word_step=2)

X.shape, y.shape

((16556, 20, 2517), (16556, 2517))

# RNN network

In [12]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop

samples_count = X.shape[0]
sample_length = X.shape[1]
word_set_size = X.shape[2]
    
model = Sequential()
model.add(LSTM(128, input_shape=(sample_length, word_set_size)))
model.add(Dense(word_set_size))
model.add(Activation('softmax'))

optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [13]:
def pretty_print(word_series):
    """
    Takes a list of words/punctations and pretty prints it.
    
    Examples:
    ['however', ',', 'all', 'is', 'possible', '.', 'huh', '!'] -> 'However, all is possible. Huh!'
    """
    start = '__START__'
    end = '__END__'
    word_series = [start] + word_series + [end]
    printed = ''
    sentence_stops = ['!', '.', '?', start, '_EOP']
    whitespace_after = ['!', ')', ',', '.', ':', ';', '?', '_']
    others = ['', '!','(',')',',', '.', ':', ';', '?', '[', '_', '_EOP']
    
    for idx, current in enumerate(word_series[1:len(word_series)-1]):
        idx += 1
        before = word_series[idx-1]
        after = word_series[idx+1]
        
        if current in sentence_stops:
            quotes_started = False
        
        if before in sentence_stops and current not in sentence_stops:
            current = current.capitalize()
            
        if current == '_EOP':
            current = '\n\n'
            
        if after in whitespace_after or current == '\n\n':
            printed += current
        else:
            printed += current + ' '
        
        
    return printed

print(pretty_print(['however', ',', 'all', 'is', 'possible', '.', 'huh', '!', '_EOP', 'next', 'paragraph', '.']))

However, all is possible. Huh! 

Next paragraph. 


In [None]:
from keras.callbacks import LambdaCallback
from keras.callbacks import ReduceLROnPlateau
from keras.callbacks import ModelCheckpoint

import random
import sys


def sample(preds, temperature=1.0):
    """
    Takes a vector of probabilities and returns most probable solution depending on temperature
    """
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)


def on_epoch_end(epoch, logs):
    # Function invoked at end of each epoch. Prints generated text.
    
    if epoch % 10 != 0:
        return
    
    WORDS_TO_GENERATE = 200
    
    print()
    print('\n----- Generating text after Epoch: %d' % epoch)

    sentence = word_series[:sample_length]
    seed = sentence

    print('----- Generating with seed: "' + pretty_print(seed) + '"')

    for i in range(WORDS_TO_GENERATE):
        x_pred = np.zeros((1, sample_length, word_set_size))
        
        for idx, word in enumerate(seed):
            x_pred[0, idx, word_list.index(word)] = 1.

        preds = model.predict(x_pred, verbose=0)[0]
        next_word = word_list[sample(preds, 0.1)]

        sentence += [next_word]
        seed = sentence[-sample_length:]
        
    print(pretty_print(sentence))
    return
        
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.2,
                              patience=1, min_lr=0.001)

callbacks = [print_callback, reduce_lr]

model.fit(X, y, batch_size=128, epochs=1000, callbacks=callbacks)

Epoch 1/1000


----- Generating text after Epoch: 0
----- Generating with seed: "Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing "
Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing. 

‘ ‘ you ’ ’ ’ the hatter. 

‘ ‘ ’ ’ ’ the hatter. 

‘ ‘ ’ ’ said the hatter. 

‘ ‘ ’ ’ ’ said the gryphon. 

‘ ‘ you ’ ’ ’ said the gryphon. 

‘ ‘ you ’ ’ ’ said the gryphon. 

‘ ‘ you you ’ ’ ’ said the hatter. 

‘ ‘, ’ ’ said the gryphon. 

‘ ‘ ’ ’ ’ the gryphon. 

‘ ‘ ’ ’ ’ said the gryphon. 

‘ ‘ you ’ ’ ’ said the gryphon. 

‘ ‘ ’ ’ said the gryphon. 

‘ ‘ ’ ’ ’ the hatter. 

‘ ‘, ’ ’ the hatter. 

‘ ‘ ’ ’ said the gryphon. 

‘ ‘ ’ ’ ’ said the gryphon. 

‘ ‘ you ’ ’ ’ the hatter. 

‘ ‘ ’ ’ said the hatter. 

‘ ‘, ’ ’ said the gryphon. 

‘ ‘ you ’ ’ ’ said the hatter. 


Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000


-

Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000


----- Generating text after Epoch: 50
----- Generating with seed: "Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing "
Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to hear: or or she had been anxiously into no, and all over at the queen, who was only them to do. 

They were a large ’ said alice, ’ 

Youd ‘ ’ lobster a great or drink, ’ the hatter replied, and. 

To alice, perhaps ‘, ’ thought alice,) ‘ ’ thought to herself. I ‘ havent, ’ alice thought out, as ‘ if the queen had to an much through the long was so, that thought alice, i ‘ might might what i might ” i cant ’ remember you, ’ know he. 



‘ they can, ’ said the gryphon, and ‘ it would have no to go at all all the change! ’ 

First ‘, it was ’ at the hatter at all the queen did not hear of sight. 

T

Epoch 89/1000
Epoch 90/1000
Epoch 91/1000


----- Generating text after Epoch: 90
----- Generating with seed: "Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing "
Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: or or she she had to it was how to be, ” as she know. Of ‘ one of they would you? ’ 

To alice a now, and i might back to alice how to come, who looked up and it out that; its ‘ could remember. ’ 

Why ‘ that a knocking? ’ said the queen, you know what he was a story. 



This time she had to to be a long, and said was, in time took the little, and he at ( pronounced, miss, as in a moment; ’ alice began: she thought, a little her look for made her went out the the queen answered very to say it: it would be quite about a shouted;., when ‘ they were ’ the mock turtle hatter. 

No how seemed! ’ said alice, so very much, in a large way reason out, and the pool ’ she ( had to her 

  


Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: in or she were trying to open how ‘ ’ said what i to say, as they lay little try to ask the queen. Who ‘ are so getting them ’ getting the dormouse. 

You ‘ on, ’ said the gryphon, and ‘ form dont ’ know its ’ all she happens! But said this time. 

It ‘ ’ no first, ’ she was on. 

 ‘ ’ read little scream the queen, half how he seemed to be with in a oh, at the cook yet, you came the different, and off for off of their fur. 

Are ‘ they should?? ’ said the a lobster something out that its answer, to to alice severely to the gryphon. 

. ‘, i ‘ wasnt ’ very you might not, ’ said alice. 

I ‘ the mock turtle mine, ’ said the queen, and it thats ’ last officers at few time, and was a little before she gave her answer, so she went of first, 
Epoch 112/1000
Epoch 113/1000
Epoch 114/1000
Epoch 115/1000
Epoch 116/1000
Epoch 117/1000
Epoch 118/1000
Epoch 119/1000
Epoch 120/1000
Epoch 121/100

Epoch 153/1000
Epoch 154/1000
Epoch 155/1000
Epoch 156/1000
Epoch 157/1000
Epoch 158/1000
Epoch 159/1000
Epoch 160/1000
Epoch 161/1000


----- Generating text after Epoch: 160
----- Generating with seed: "Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing "
Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: or or she she had to it was quite much, and this she had never been and she was just it to is over over and went on, for the two without his table, but shy you might a he or. ’ 

The ‘ turned ’ 

And the hatter was my first, but was came came ‘ in out, that ‘ dont ’ a they lived suddenly im ’ said, ’ the they were in an foot the once, as said and as she was too such to herself, but she had never frightened out of the ah was soup, and she had never would have out up and: poor alice again in as as well, and not! ’ 

I ‘ couldnt it know, ’ said the caterpillar. She ‘ that: you if it was

Epoch 196/1000
Epoch 197/1000
Epoch 198/1000
Epoch 199/1000
Epoch 200/1000
Epoch 201/1000


----- Generating text after Epoch: 200
----- Generating with seed: "Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing "
Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: or or she she had to it was quite in; and was not different. 

The king looked anxiously alice, and the king golden little heavy. Did the fall went on. 

I ‘ dont ’ know what youre that a mean ’ too began to! ’ 

The mock turtle looked up and alice off on, with the other of the table, upsetting; the seven opened into, and to the white in rabbit in in another knocking,, and was the cat. 

Hear ‘, ’ said the queen, i ‘ dont ’ see what he. ’ said with a an: they got on off to work time they could all a face with quite him out, and they got looked at the queen to look her: it ‘ the mock turtle went, on, three you had to me if he ha

Epoch 239/1000
Epoch 240/1000
Epoch 241/1000


----- Generating text after Epoch: 240
----- Generating with seed: "Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing "
Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: or or she she had to it was how to say, who ‘ i over, she shant  ’ when she dont ’ look of that it, but did not quite like you like; she found that it had would, not quite of sleep ” ” like the other my only left, or another thought it this, and could, 

The king turned and the baby, ’ said the a great one of them, and said to explain that she, my to such in a a he was, ’ alice the mock turtle. 

Very ‘ much much, ’ said the gryphon, and ‘ alice went on, half ‘ dont ’ be a child, ’ she put on their leaves, and she a large she round her the over she was close to them that well ‘ means then the queen, ’ said the mock turtle, and ‘ then i beg fall she had never to them out o



----- Generating text after Epoch: 280
----- Generating with seed: "Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing "
Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: or or she she had to it was quite much, and this she had never been and them was put them, and as she could back to the fan, a shriek respect. 

No ‘ wasnt ’ a little curious thing! ’ said the mock turtle with a very getting; and all about, and the mock party she hope dull, just dear ‘ wish such, ’ know well was, as she went never come to little them as as could as if was only ‘ one of you, and know. 

Alice replied of the number to say, who was now i ‘ might so done,? ’ and she was in the same that was the look of to hear, and the baby to see it was now. 

Was the cat half in a hands, she was a well.. 

I ‘ ‘ was ’ understand, ’ the hatter said to a little: but it began talking in a hurry to play at the a time she 

Epoch 323/1000
Epoch 324/1000
Epoch 325/1000
Epoch 326/1000
Epoch 327/1000
Epoch 328/1000
Epoch 329/1000
Epoch 330/1000
Epoch 331/1000


----- Generating text after Epoch: 330
----- Generating with seed: "Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing "
Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: or or she she had to it was quite in; and was not different. 

The king looked anxiously alice, and the king golden little heavy. Did the white went on, and he got up and he a added,) suddenly alice got to the door, she went on: but ‘, ’ the queen was to make out: 

Then they all round, half on?, ’ alice replied. 

Well ‘, i know! ’ said alice, i ‘ must be shutting use. But the hatter ’ if he thought. 

It ‘ party yet very to no child tone, what ’ a little say! Turning alice head: she the queen, and her angrily to work; and suddenly she got on the queen, and had the doing out of her 

Epoch 367/1000
Epoch 368/1000
Epoch 369/1000
Epoch 370/1000
Epoch 371/1000


----- Generating text after Epoch: 370
----- Generating with seed: "Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing "
Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: or or she she had to it was quite in; and was not this time. 

It ‘ is a little finish, ’ said the mock turtle, ive ‘ ’ talk for moral in a if. 

So they would be only much that that. So no quite a my opportunity this she was gone out she that out of the ill ‘ ’ 

On ‘ not a answer, ’ the mock turtle: just ‘ now like a a cat of a sort. You! Dont ’ a great question, ’ said alice. 

Well ‘, i know ’ oh the table, ’ said the queen: and the set to a very getting out the dormouse, and said to the knave concert sleep of cats that went in the other. 

The long queen jumped how? ’ oh could ‘ see again, ’ said to the gryphon, and ‘ if i hadnt ’ better

Epoch 410/1000
Epoch 411/1000


----- Generating text after Epoch: 410
----- Generating with seed: "Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing "
Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: or or she she had to it was quite in; and was not this time. 

It ‘ is a little finish, ’ said the mock turtle, ive ‘ ’ talk for moral in a if. 

So they would ’ only the gryphon. 

Hush! The the queen said to take a mouse, who was began to herself with all at the mock turtle with but, looking by them round the but. 

The mock turtle with the queen, and then this the cat, and was sitting: hearth the other she had never to dormouse depends. 

There seemed as better to be hatter ’ one of the duchess, and the cat seemed was a large listening wait on this,, a little said to alice, in a very hurry; 

All ‘ im ’ turtle for some of great never never them, i know it was as theres. ’ 

Somebody 

Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: or or she she had to it was quite in; and was not different. 

The king looked anxiously alice, and the king golden; and then the queen,, and to herself what ‘ am to get? ’ 

One ‘ only your only the queen said to know. 

There was a little bit of her on her on to see it would you more might come to be some of the right again, so you you see what i never ” ” ’ the mock turtle. You never never even with a cheshire  ’ said the mock turtle. 

Why ‘, ’ thought alice, they ‘ must have the ’ not repeated ’ 

And ‘ the next twinkling of you, ’ little regular the gryphon heard here, but the queen was to them, and all stand, too knew that was quite easy as had do. 

If ‘ i could be the about ever may that ’ said the mock turtle. 

Yes ‘,, ’ said the queen. 

‘ never one 
Epoch 452/1000
Epoch 453/1000
Epoch 454/1000
Epoch 455/1000
Epoch 456/1000
Epoch 457/1000
Epoch 458/1000
Epoch 459/1000
Ep

Epoch 493/1000
Epoch 494/1000
Epoch 495/1000
Epoch 496/1000
Epoch 497/1000
Epoch 498/1000
Epoch 499/1000
Epoch 500/1000
Epoch 501/1000


----- Generating text after Epoch: 500
----- Generating with seed: "Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing "
Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: or or she she had to it was how to say, who ‘ move over the other side of which up you know the till was however, so helped; about,; and was for some of them, and thought by that that was were trying to be a very earnestly, of this alice, not think, and she had been changed much them out up, all the hatter she was just as she had this over she found it was was to her:) youre, the come on with her at them began with them its was. 

The ‘ couldnt on: ’ said the first, that it was be more there was there was. So much alice again, and this way so much them in waiting that the gryphon the

In [None]:
# random 50 words
random_result = []
for idx in np.random.random((50)) * 2500:
    random_result += [word_list[int(idx)]]
print(pretty_print(random_result))