In [1]:
import numpy as np
import keras

Using TensorFlow backend.


In [2]:
path = keras.utils.get_file("nietzsche.txt", origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")

In [3]:
# with open(path, 'r') as f:
#     text = f.read().lower()
# print("len(text) = {}".format(len(text)))

In [4]:
import codecs

with codecs.open(path, 'r', encoding='utf-8') as f:
    text = f.read().lower()
print("len(text) = {}".format(len(text)))

len(text) = 600893


In [5]:
charset = sorted(list(set(text)))
print("len(charset) = {}".format(len(charset)))
print("Unique chars = {}".format(charset))

len(charset) = 57
Unique chars = ['\n', ' ', '!', '"', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ä', 'æ', 'é', 'ë']


In [6]:
MAX_LEN = 60
STEP = 3

In [7]:
sentences = []
next_chars = []

for i in range(0, len(text) - MAX_LEN, STEP):
    sentences.append(text[i: i + MAX_LEN])
    next_chars.append(text[i + MAX_LEN])
    
print("len(sentences) = {}".format(len(sentences)))

len(sentences) = 200278


In [8]:
print(repr(text[:MAX_LEN+20]))
print(repr(sentences[0]))
print(repr(next_chars[0]))

'preface\n\n\nsupposing that truth is a woman--what then? is there not ground\nfor su'
'preface\n\n\nsupposing that truth is a woman--what then? is the'
'r'


In [9]:
char_idx_dict = dict((char, charset.index(char)) for char in charset)
print("char_idx_dict = {}".format(char_idx_dict))

def encode_sentence(sentences):
    if not isinstance(sentences, list):
        sentences = [sentences]
    
    encoded = np.zeros((len(sentences), MAX_LEN, len(charset)))
    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence):
            encoded[i, t, char_idx_dict[char]] = 1
    return encoded

def encode_char(chars):
    if not isinstance(chars, list):
        chars = [chars]
    
    encoded = np.zeros((len(chars), len(charset)))
    for i, char in enumerate(chars):
        encoded[i, char_idx_dict[char]] = 1
    return encoded

def decode_sentence(one_hot_sentence):
    chars = [charset[i] for i in one_hot_sentence.argmax(axis=1)]
    return ''.join(chars)

char_idx_dict = {'\n': 0, ' ': 1, '!': 2, '"': 3, "'": 4, '(': 5, ')': 6, ',': 7, '-': 8, '.': 9, '0': 10, '1': 11, '2': 12, '3': 13, '4': 14, '5': 15, '6': 16, '7': 17, '8': 18, '9': 19, ':': 20, ';': 21, '=': 22, '?': 23, '[': 24, ']': 25, '_': 26, 'a': 27, 'b': 28, 'c': 29, 'd': 30, 'e': 31, 'f': 32, 'g': 33, 'h': 34, 'i': 35, 'j': 36, 'k': 37, 'l': 38, 'm': 39, 'n': 40, 'o': 41, 'p': 42, 'q': 43, 'r': 44, 's': 45, 't': 46, 'u': 47, 'v': 48, 'w': 49, 'x': 50, 'y': 51, 'z': 52, 'ä': 53, 'æ': 54, 'é': 55, 'ë': 56}


In [10]:
x = encode_sentence(sentences)
y = encode_char(next_chars)

print("x.shape = {}".format(x.shape))
print("y.shape = {}".format(y.shape))

x.shape = (200278, 60, 57)
y.shape = (200278, 57)


In [11]:
decode_sentence(x[0])

'preface\n\n\nsupposing that truth is a woman--what then? is the'

In [12]:
from keras.models import Sequential
from keras.layers import LSTM, Dense

model = Sequential()
model.add(LSTM(128, input_shape=(MAX_LEN, len(charset))))
model.add(Dense(len(charset), activation='softmax'))

Instructions for updating:
Colocations handled automatically by placer.


In [13]:
LEARNING_RATE = 0.01
BATCH_SIZE = 128

In [14]:
from keras.optimizers import RMSprop

optimizer = RMSprop(lr=LEARNING_RATE)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [15]:
model.fit(x, y, batch_size=BATCH_SIZE, epochs=1)

Instructions for updating:
Use tf.cast instead.
Epoch 1/1


<keras.callbacks.History at 0x16071f9e358>

In [16]:
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [17]:
def encode_one_hot(text):
    encoded = np.zeros((1, MAX_LEN, len(text)))
    for idx, c in enumerate(text):
        encoded[0, idx, char_idx_dict[c]] = 1
    return encoded

In [18]:
import random
import sys

start_index = random.randint(0, len(text) - MAX_LEN - 1)
generated_text = text[start_index:(start_index + MAX_LEN)]
print("Starting with: {}".format(repr(generated_text)))

for temperature in [0.2, 0.5, 1.0, 1.2]:
    print("======= temperature: {} =======".format(temperature))
    sys.stdout.write(generated_text)
    for i in range(400):
        sampled = encode_sentence(generated_text)
        preds = model.predict(sampled, verbose=0)[0]
        next_index = sample(preds, temperature)
        next_char = charset[next_index]
        sys.stdout.write(next_char)
        sys.stdout.flush()
        generated_text += next_char
        generated_text = generated_text[1:]
    print()

Starting with: 'will do.--what is lacking in\nengland, and has always been la'
will do.--what is lacking in
england, and has always been lat such as a pertical superination of the from the sense to been they will they they they they they in the sense to the most be not to in they not they in the superinations and to they to they and the great such they will they they they himself they they they such a most present they they to his some they they they in the superination of the deners and they to be the inselfise to the forment of the
he deners and they to be the inselfise to the forment of theyself supernantions they they they in nothing more how the don they he oun all their our with must they explaness.



1chulled to the present to meath is new man and more they insompations to in ording and consections, they we
supponing there and all to distoul and a mistances in lature and some all they wimh the father to they haging to a manter and we mand to contly himself in the god anding and