In [1]:
from __future__ import print_function
from utils import Token2IDTransformer, split_data_into_correct_batches_for_stateful_rnn, deep_sample_seq, predict_f_for_stateful_rnn
from functools import partial
import numpy as np
import random
import sys

Using TensorFlow backend.


In [2]:
# Load the whole corpus into memory as a single lower-cased string.
path = "data/merged_sent_split.txt"
# Read through a context manager so the file handle is closed as soon as the
# read finishes, instead of being left open until garbage collection.
# NOTE(review): no encoding is passed, so the platform default is used;
# consider pinning it once the corpus encoding is confirmed.
with open(path) as corpus_file:
    text = corpus_file.read().lower()
# Remove every \x01 control character from the corpus.
text = text.replace('\x01', '')
corp_length = len(text)
print('corpus length:', corp_length)

corpus length: 184154079


In [3]:
# Fit the token <-> id transformer on the full corpus string. Later cells read
# its `.vocab` and use `transform` / `inverse_transform` to encode and decode text.
# NOTE(review): presumably tokens are single characters -- confirm in utils.
t2i = Token2IDTransformer().fit(text)

In [4]:
# Vocabulary exposed by the fitted transformer, and the number of entries in it.
chars = t2i.vocab
char_cats = len(chars)
print('total chars:', char_cats)

total chars: 40


In [5]:
# Mini-batch geometry: number of sequences per batch and characters per sequence.
batch_size, max_len = 32, 40
batch_shape = (batch_size, max_len)

In [6]:
# prepare data for stateful rnn
# Trim the tail so the corpus length is an exact multiple of batch_size.
# The end index is computed explicitly: a negative-remainder slice such as
# text[:-(corp_length % batch_size)] turns into text[:-0] == '' when the
# remainder is 0, which would wipe the corpus (e.g. when this cell is re-run,
# because corp_length is already a multiple of batch_size by then).
text = text[:corp_length - corp_length % batch_size]
corp_length = len(text)

# transform text into sequence of indices
enc_text = t2i.transform(text)

In [7]:
# Cut the encoded corpus into model inputs and targets using the helper from utils.
X, y = split_data_into_correct_batches_for_stateful_rnn(
    enc_text,
    batch_size,
    max_len,
)
# Append a trailing axis of length 1 to the targets.
y = y[:, :, np.newaxis]

In [8]:
from keras.models import Model
from keras.layers import Dense, Activation, Input, Embedding
from keras.layers import LSTM
from keras.layers.wrappers import TimeDistributed

from keras.optimizers import RMSprop
from keras.losses import sparse_categorical_crossentropy

def create_char_rnn():
    """Build and compile the character-level recurrent model.

    The network takes int32 batches of fixed shape (batch_size, max_len),
    embeds each id into 32 dimensions, runs a stateful, unrolled 128-unit
    LSTM that returns the full sequence, and applies a softmax Dense layer
    of width char_cats at every time step.

    Reads the module-level names batch_size, max_len and char_cats.

    Returns:
        The Keras Model named "char_rnn", compiled with a default RMSprop
        optimizer and sparse categorical cross-entropy loss.
    """
    token_ids = Input(batch_shape=(batch_size, max_len), dtype="int32")
    embedded = Embedding(char_cats, 32)(token_ids)

    recurrent = LSTM(128, stateful=True, return_sequences=True, unroll=True)
    hidden_states = recurrent(embedded)

    # Same softmax classifier applied independently at each time step.
    per_step_classifier = TimeDistributed(Dense(char_cats, activation='softmax'))
    char_probs = per_step_classifier(hidden_states)

    network = Model(token_ids, char_probs, name="char_rnn")
    network.compile(optimizer=RMSprop(), loss=sparse_categorical_crossentropy)
    return network

In [9]:
# Instantiate the compiled character-level model returned by create_char_rnn().
rnn = create_char_rnn()

In [10]:
# Train for a single epoch with the same batch size the model input was built with.
# shuffle=False keeps the samples in the order produced by the batching helper;
# the LSTM is stateful, so presumably state carries over between consecutive
# batches and the ordering matters -- confirm against the helper in utils.
# NOTE(review): the recorded output shows this run ending in a KeyboardInterrupt
# partway through the epoch.
rnn.fit(X, y, batch_size=batch_size, shuffle=False, epochs=1)

Epoch 1/1
 167968/4603840 [>.............................] - ETA: 9908s - loss: 2.0988

KeyboardInterrupt: 

In [11]:
# Pre-bind the model and the batch shape to the prediction helper from utils;
# the resulting callable is what the generation cell hands to deep_sample_seq.
predict_func = partial(predict_f_for_stateful_rnn, rnn, batch_shape)

In [14]:
# generate text: seed the model with a corpus excerpt and stream the sampled continuation

start_index = 1234
pred_depth = 4  # characters requested from deep_sample_seq per call (seq_len)
top_k = 5       # forwarded unchanged to deep_sample_seq
n_steps = 400 // pred_depth

# NOTE(review): within this cell `diversity` is only printed; it is not passed
# to the sampling call.
for diversity in [1.0]:
    print()
    print('----- diversity:', diversity)

    # Seed window: max_len characters of the corpus starting at start_index.
    sentence = text[start_index: start_index + max_len]
    generated = sentence
    print('----- Generating with seed: "' + sentence + '"')
    sys.stdout.write(generated)

    for step in range(n_steps):
        window_ids = t2i.transform(sentence)
        sampled_ids = deep_sample_seq(predict_func, window_ids, top_k, seq_len=pred_depth)
        next_seq = t2i.inverse_transform(sampled_ids)

        generated = generated + next_seq
        # Slide the window: drop pred_depth leading characters, append the new ones.
        sentence = sentence[pred_depth:] + next_seq

        # Emit the new characters immediately so the text appears as it is produced.
        sys.stdout.write(next_seq)
        sys.stdout.flush()


----- diversity: 1.0
----- Generating with seed: " знаю ничего: я сама привыкла за людьми "
 знаю ничего: я сама привыкла за людьми и стал и присколодый, с пострал и но петр никогда и странныму петрого, положил с представление польбать.
котория с пермоньком половами всегда первами все пологу известно в портими.
на караские и непальсе нетельная и при селека прежда, и началось вышкинаями передонов.
к ними, и покарить начитала...
петрогдательный пармого платьяна сторой, сказывает, когда.
она во своему, что ты, кто ни по свою молч

In [None]:
# NOTE(review): leftover IPython post-mortem debugger magic from an interactive
# session; it is not part of the analysis and should be dropped before sharing.
%debug