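Example script to generate text from Nietzsche's writings. (Note: in this notebook `get_text` actually loads `pan_tadeusz.txt` from Project Gutenberg as the training corpus.)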

At least 20 epochs are required before the generated text
starts sounding coherent.

It is recommended to run this script on GPU, as recurrent
networks are quite computationally intensive.

If you try this script on new data, make sure your corpus
has at least ~100k characters. ~1M is better.

In [244]:
from __future__ import print_function
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import codecs

In [271]:
def get_text(fname='pan_tadeusz.txt', origin='http://www.gutenberg.org/files/31536/31536-0.txt'):
    ''' Fetch a text corpus via keras' get_file and return it lower-cased.

    fname  -- file name passed to get_file
    origin -- URL passed to get_file as the download source

    Prints the corpus length (in characters) and returns the corpus as a
    single lower-cased unicode string decoded as UTF-8.
    '''
    # Only one corpus is fetched; the previous version also downloaded
    # nietzsche.txt and then immediately discarded the resulting path.
    path = get_file(fname, origin=origin)

    with codecs.open(path, encoding='utf-8') as f:
        text = f.read().lower()
    print('corpus length:', len(text))
    return text

def build_dict(text):
    ''' Build the character vocabulary of text.

    Returns a tuple (char_indices, indices_char, size): a char -> index
    mapping, the inverse index -> char mapping, and the number of distinct
    characters. Indices follow the sorted order of the characters.
    '''
    alphabet = sorted(set(text))
    print('total chars:', len(alphabet))
    char_indices = {}
    indices_char = {}
    for position, symbol in enumerate(alphabet):
        char_indices[symbol] = position
        indices_char[position] = symbol
    return char_indices, indices_char, len(alphabet)

def vectorize(text, char_indices, indices_char, dict_size, maxlen = 40, step = 3):
    ''' Cut the text into semi-redundant sequences of maxlen characters and one-hot encode them.

    text         -- the corpus string
    char_indices -- mapping char -> column index in the one-hot encoding
    indices_char -- unused here; kept so existing callers keep working
    dict_size    -- width of the one-hot encoding
    maxlen       -- number of characters per input sequence
    step         -- distance between the starts of consecutive sequences

    Returns (x, y) as boolean arrays: x has shape
    (n_sequences, maxlen, dict_size) and holds the input windows, y has shape
    (n_sequences, dict_size) and holds the character following each window.
    '''
    starts = range(0, len(text) - maxlen, step)
    sentences = [text[i: i + maxlen] for i in starts]
    next_chars = [text[i + maxlen] for i in starts]
    print('nb sequences:', len(sentences))

    print('Vectorization...')
    # Use the builtin bool: the np.bool alias was deprecated in NumPy 1.20
    # and removed in 1.24, where it raises AttributeError.
    x = np.zeros((len(sentences), maxlen, dict_size), dtype=bool)
    y = np.zeros((len(sentences), dict_size), dtype=bool)
    for i, (sentence, next_char) in enumerate(zip(sentences, next_chars)):
        for t, char in enumerate(sentence):
            x[i, t, char_indices[char]] = True
        y[i, char_indices[next_char]] = True
    return x, y

In [169]:
def build_model(units, maxlen, dict_size):
    ''' Assemble and compile the network: one LSTM layer, then a Dense layer with softmax.

    units     -- number of LSTM units
    maxlen    -- length of each input sequence (first input dimension)
    dict_size -- one-hot width of the input and size of the output layer

    Returns the compiled Sequential model (categorical cross-entropy loss,
    RMSprop optimizer with lr=0.01).
    '''
    print('Build model...')
    layers = [
        LSTM(units=units, input_shape=(maxlen, dict_size)),
        Dense(dict_size),
        Activation('softmax'),
    ]
    network = Sequential(layers)
    network.compile(optimizer=RMSprop(lr=0.01), loss='categorical_crossentropy')
    return network

In [179]:
def sample(preds, temperature=1.0):
    ''' Draw one index at random from a probability array.

    The probabilities are re-weighted by temperature (log, divide, exp,
    renormalize) before a single multinomial draw; the index of the drawn
    category is returned.
    '''
    scaled_logs = np.log(np.asarray(preds).astype('float64')) / temperature
    weights = np.exp(scaled_logs)
    distribution = weights / weights.sum()
    draw = np.random.multinomial(1, distribution, 1)
    return draw.argmax()

In [226]:
def generate_samples(model, text, maxlen, char_indices, indices_char, dict_size, sample_size = 40):
    ''' Print text generated by the model at several diversity settings.

    A seed of maxlen characters is taken from a random position in text.
    For each diversity in 0.2, 0.5, 1.0 and 1.2 the seed is echoed and then
    sample_size characters are generated one at a time: the current window
    is one-hot encoded, passed to model.predict, and the next character is
    chosen with sample(). Output goes to stdout; nothing is returned.
    '''
    seed_start = random.randint(0, len(text) - maxlen - 1)
    seed = text[seed_start: seed_start + maxlen]

    for diversity in (0.2, 0.5, 1.0, 1.2):
        print('----- diversity:', diversity)

        window = seed
        generated = window
        print('----- Generating with seed: "' + window + '"')
        sys.stdout.write(generated)

        for _ in range(sample_size):
            # One-hot encode the current window as a batch of one.
            x_pred = np.zeros((1, maxlen, dict_size))
            for pos, symbol in enumerate(window):
                x_pred[0, pos, char_indices[symbol]] = 1.

            probabilities = model.predict(x_pred, verbose=0)[0]
            next_char = indices_char[sample(probabilities, diversity)]

            generated += next_char
            # Slide the window: drop the oldest character, append the new one.
            window = window[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()

        print()

In [191]:
def train_and_sample(model, x, y, iterations, epochs, maxlen, char_indices, indices_char, dict_size, sample_size, text=None):
    ''' Train the model in rounds and periodically print generated text.

    model        -- object exposing fit(); also handed to generate_samples
    x, y         -- training inputs and targets passed to model.fit
    iterations   -- number of training rounds (each round calls model.fit once)
    epochs       -- epochs per round, forwarded to model.fit
    maxlen, char_indices, indices_char, dict_size
                 -- forwarded unchanged to generate_samples
    sample_size  -- number of characters to generate per diversity setting
    text         -- corpus used to pick the generation seed; when omitted the
                    module-level `text` is used, as earlier versions did

    Samples are printed about ten times over the run (every
    iterations // 10 rounds, and every round when iterations < 10).
    Raises ValueError when no corpus is available.
    '''
    if text is None:
        # Backward compatibility: the function used to read the notebook-level
        # `text` implicitly; prefer passing it explicitly.
        text = globals().get('text')
        if text is None:
            raise ValueError('no corpus available: pass `text` or define a global `text`')

    # Clamp to 1 so that iterations < 10 no longer divides by zero.
    sample_every = max(1, iterations // 10)

    # Run exactly `iterations` rounds (the old range(1, iterations) ran one fewer).
    for iteration in range(1, iterations + 1):
        print('-' * 50)
        print('Iteration', iteration)
        model.fit(x, y, batch_size=128, epochs=epochs, verbose = 1)

        if iteration % sample_every == 0:
            # Honour the caller's sample_size instead of a hard-coded 50.
            generate_samples(model, text, maxlen, char_indices, indices_char, dict_size, sample_size)

In [195]:
# Hyper-parameters for cutting the corpus into training windows.
maxlen = 2  # characters per input sequence
step = 1    # offset between the starts of consecutive sequences

In [279]:
# Load the full, lower-cased corpus (get_text also prints its length).
text = get_text()

corpus length: 240531


In [280]:
# Keep only characters 1520..4999 of the corpus (presumably to keep the experiment small).
# NOTE(review): this rebinds `text` in place, so the cell is not idempotent --
# re-running it slices the already-sliced text. Re-run the get_text() cell first.
text = text[1520:5000]

In [281]:
# Build the char <-> index lookup tables and the vocabulary size for the sliced corpus.
char_indices, indices_char, dict_size = build_dict(text)

total chars: 44


In [282]:
# One-hot encode the corpus into input windows (x) and next-character targets (y).
x, y = vectorize(text, char_indices, indices_char, dict_size, maxlen, step)

nb sequences: 3478
Vectorization...


In [283]:
# Sanity check: x is (n_sequences, maxlen, dict_size), y is (n_sequences, dict_size).
x.shape, y.shape

((3478, 2, 44), (3478, 44))

In [284]:
# Build a 32-unit LSTM model sized for the current window length and vocabulary,
# then print its layer/parameter summary.
model = build_model(32, maxlen, dict_size)
model.summary()

Build model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_15 (LSTM)               (None, 32)                9856      
_________________________________________________________________
dense_15 (Dense)             (None, 44)                1452      
_________________________________________________________________
activation_15 (Activation)   (None, 44)                0         
Total params: 11,308
Trainable params: 11,308
Non-trainable params: 0
_________________________________________________________________


In [300]:
# Train for 50 epochs. batch_size=4096 is larger than the number of sequences
# reported by vectorize above, so each epoch processes all data in one batch.
# The trailing ';' suppresses the returned History object's repr.
model.fit(x, y, batch_size=4096, epochs=50, verbose = 1);

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [302]:
# Print 50 generated characters per diversity setting from the trained model.
# (Fixed the misspelled function name, which raised NameError on a fresh kernel.)
generate_samples(model, text, maxlen, char_indices, indices_char, dict_size, 50)

----- diversity: 0.2
----- Generating with seed: "o "
o do w pod bo trzepki stojący stopię zarze do do tyc
----- diversity: 0.5
----- Generating with seed: "o "
o ga na s powyzności eurowy
         pod pie
się z
----- diversity: 1.0
----- Generating with seed: "o "
o kołki w pody przą
nałacieńsknich;
i podzi stwą i
----- diversity: 1.2
----- Generating with seed: "o "
o jaszącéj twortwą przym ludemkny zny jak mo, dresny


In [286]:
# Continue training the same model in rounds of 5 epochs (iterations=10),
# printing 50-character samples along the way.
train_and_sample(model, x, y, 10, 5, maxlen, char_indices, indices_char, dict_size, 50)

--------------------------------------------------
Iteration 1
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
----- diversity: 0.2
----- Generating with seed: "by"
by dodzaszarzedzie podzanie podzarzarzedzie podo dod
----- diversity: 0.5
----- Generating with seed: "by"
by drie porzeczył podośdiegozie jakiepzym szorzy odz
----- diversity: 1.0
----- Generating with seed: "by"
byęzane ja.
j ztrzewkó drarraéi fodu grawarotnych d
----- diversity: 1.2
----- Generating with seed: "by"
by  bozni lnałwdianochodugagzyed,  udtyk nalegoragłą
--------------------------------------------------
Iteration 2
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
----- diversity: 0.2
----- Generating with seed: "ki"
ki pod pod wie wie na nie na nie sto wie pod mie nie
----- diversity: 0.5
----- Generating with seed: "ki"
ki powiech pony prze pie na się pod w do świ
i wie 
----- diversity: 1.0
----- Generating with seed: "ki"
ki rech ie.
tówwe w rawie co pore i wiecha twyczech
----- diversity: 1.2
----- G