In [1]:
from __future__ import print_function
from keras.models import Model
from keras.layers import Dense, Activation, Input, Embedding
from keras.layers import LSTM, GRU
from keras.layers.wrappers import TimeDistributed
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
from keras.utils import np_utils
import numpy as np
import random
import sys

Using TensorFlow backend.


In [2]:
path = "data/merged_sent_split.txt"
# Read the whole corpus and lower-case it. The context manager closes the
# file handle deterministically instead of leaving it to garbage collection.
# NOTE(review): no explicit encoding is passed, so decoding relies on the
# platform default -- confirm the corpus encoding and pin it if needed.
with open(path) as corpus_file:
    text = corpus_file.read().lower()
corp_length = len(text)
print('corpus length:', corp_length)

# Vocabulary: every distinct character of the corpus, in sorted order so the
# character <-> index mapping is stable between runs on the same corpus.
chars = sorted(set(text))
print('total chars:', len(chars))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}


def c2i_map_f(c):
    """Return the vocabulary index of the single character `c`."""
    return char_indices[c]


def i2c_map_f(i):
    """Return the character stored at vocabulary index `i`."""
    return indices_char[i]

corpus length: 184189032
total chars: 41


In [3]:
def i2c(i):
    """Decode an iterable of vocabulary indices into a string.

    Each index is translated with `i2c_map_f` and the resulting characters
    are concatenated in order.
    """
    decoded = (i2c_map_f(index) for index in i)
    return ''.join(decoded)

def c2i(c):
    """Encode a string (or any iterable of characters) as an integer array.

    Each character is translated with `c2i_map_f`.

    Returns:
        1-D numpy array of vocabulary indices, one per input character.
    """
    # The builtin `int` is used as dtype: `np.int` was only an alias for it
    # and has been removed from recent NumPy releases.
    return np.array([c2i_map_f(ch) for ch in c], dtype=int)

In [4]:
# Mini-batch geometry: number of parallel text streams per batch and the
# number of characters in each training window.
batch_size, max_len = 16, 40
batch_shape = (batch_size, max_len)
# Vocabulary size (number of distinct characters in the corpus).
char_cats = len(chars)

In [5]:
# prepare data for stateful rnn
# Trim the tail so the corpus splits evenly into `batch_size` parallel streams.
# The end index is computed explicitly: slicing with `[:-(n % batch_size)]`
# collapses to `[:0]` (an empty corpus) whenever the remainder is 0, which is
# also what happens if this cell is executed a second time.
corp_length = len(text) - len(text) % batch_size
text = text[:corp_length]

# transform text into sequence of indices
# (builtin `int` as dtype; the `np.int` alias no longer exists in recent NumPy)
enc_text = np.array(list(map(c2i_map_f, text)), dtype=int)

# `step` is the length of each of the `batch_size` streams; `seqs_num` is the
# number of full `max_len` windows that fit into one stream.
step = corp_length // batch_size
seqs_num = (step - 1) // max_len # make sure there is always an y ahead

def split_data_into_correct_batches(array):
    '''Pack a flat encoded text array into inputs/targets for a stateful RNN.

    The array is cut into `batch_size` contiguous streams, and each stream
    into `seqs_num` windows of `max_len` elements.  Rows of the result are
    interleaved so that row `s * batch_size + b` holds window `s` of stream
    `b`: every consecutive group of `batch_size` rows is one mini-batch, and
    row `b` of a mini-batch continues row `b` of the previous mini-batch.

    Relies on the module-level `batch_size`, `seqs_num` and `max_len`.

    Returns:
        (X, y): X has shape (batch_size * seqs_num, max_len); y is the same
        data shifted one element ahead, with a trailing axis of length 1.
    '''
    streams = array.reshape(batch_size, len(array) // batch_size)
    span = seqs_num * max_len
    n_rows = batch_size * seqs_num

    def interleave(offset):
        # Take `span` elements of every stream starting at `offset`, cut them
        # into windows, then flatten stream/window axes in Fortran order so
        # the stream index varies fastest along the row axis.
        windows = streams[:, offset:offset + span].reshape((batch_size, seqs_num, max_len))
        return windows.reshape((n_rows, max_len), order="F")

    inputs = interleave(0)
    targets = interleave(1)
    return inputs, targets[:, :, None]

# Pack the encoded corpus into stateful-RNN-ordered inputs X and the
# one-character-ahead targets y.
X, y = split_data_into_correct_batches(enc_text)

In [6]:
from keras.losses import sparse_categorical_crossentropy

In [7]:
def create_char_rnn():
    """Build and compile the character-level language model.

    Architecture: integer input of fixed shape (batch_size, max_len) ->
    32-dimensional embedding -> two stacked stateful GRU layers of 256 units
    returning full sequences -> per-timestep softmax over `char_cats`
    characters.  Compiled with RMSprop and sparse categorical cross-entropy.

    Returns:
        The compiled Keras `Model` named "char_rnn".
    """
    gru_options = dict(stateful=True, return_sequences=True, unroll=True, implementation=2)

    tokens = Input(batch_shape=(batch_size, max_len), dtype="int32")
    hidden = Embedding(char_cats, 32)(tokens)
    for _ in range(2):
        hidden = GRU(256, **gru_options)(hidden)
    char_probs = TimeDistributed(Dense(char_cats, activation='softmax'))(hidden)

    net = Model(tokens, char_probs, name="char_rnn")
    net.compile(optimizer=RMSprop(), loss=sparse_categorical_crossentropy)
    return net

In [8]:
# Instantiate the compiled character-level model used by the cells below.
rnn = create_char_rnn()

In [9]:
# shuffle=False keeps the mini-batches in the order produced by
# split_data_into_correct_batches; the GRU layers are stateful, so each
# batch is meant to continue the text of the previous one.
rnn.fit(X, y, batch_size=batch_size, shuffle=False, epochs=1)

Epoch 1/1

KeyboardInterrupt: 

In [10]:
def predict_func(seed, temperature=1.0):
    """Return the temperature-scaled next-character distribution for `seed`.

    Args:
        seed: sequence of vocabulary indices; it is written into the first
            row of a `batch_shape` input, so it must fit a row of that shape.
        temperature: positive float; values below 1 sharpen the
            distribution, values above 1 flatten it.

    Returns:
        1-D float64 array of probabilities over the vocabulary, summing to 1.
    """
    # The model has a fixed batch shape, so the seed occupies row 0 and the
    # remaining rows are zero padding whose predictions are discarded.
    history = np.zeros(batch_shape)
    history[0] = seed
    # NOTE(review): the GRU layers are created with stateful=True, so hidden
    # state presumably carries over between successive predict() calls --
    # confirm whether rnn.reset_states() should be called before predicting.
    preds = rnn.predict(history, verbose=0)[0, -1, :]
    # Work in float64 and subtract the maximum before exponentiating: with a
    # small temperature every scaled log-probability can underflow in exp(),
    # which would turn the normalisation into 0 / 0.
    scaled = np.log(preds.astype(np.float64) + 1e-10) / temperature
    exp_preds = np.exp(scaled - np.max(scaled))
    return exp_preds / np.sum(exp_preds)

In [11]:
def test_predict(seed, temperature=1.0):
    """Stand-in predictor that ignores its arguments.

    Returns a random probability vector of length `char_cats`: absolute
    values of standard-normal draws, normalised to sum to 1.
    """
    weights = np.abs(np.random.randn(char_cats))
    return weights / weights.sum()

In [181]:
def sample_seq(predict_f, seed, top_k, seq_len=1, temp_sym=1.0, temp_seq=1.0, return_full_tree=False):
    """Expand a top-k tree of continuations of `seed` and sample one leaf.

    At every depth each partial continuation is extended with the `top_k`
    most probable next symbols according to `predict_f`; the probability of
    a continuation is the product of the symbol probabilities along its path.

    Args:
        predict_f: callable `(window, temperature) -> probabilities` giving a
            1-D distribution over the vocabulary.
        seed: 1-D sequence of symbol indices used as context.  At depth `d`
            the first `d` seed symbols are dropped and the `d` symbols
            generated so far are appended, so the window keeps the seed length.
        top_k: number of children per node; clamped to the vocabulary size.
        seq_len: number of symbols to generate (depth of the tree).
        temp_sym: temperature forwarded to `predict_f`.
        temp_seq: temperature applied to the leaf probabilities before the
            final draw.
        return_full_tree: if True, return the whole tree instead of sampling.

    Returns:
        If `return_full_tree` is True, a tuple `(sequences, probabilities)`
        of per-depth lists; otherwise one sampled 1-D integer array of length
        `seq_len`.

    Raises:
        ValueError: if `top_k` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError("top_k must be a positive integer")

    seed = np.array(seed)
    sequences     = [[] for _ in range(seq_len + 1)]
    probabilities = [[] for _ in range(seq_len + 1)]
    # Root of the tree: the empty continuation, with probability 1.
    sequences[0].append(np.zeros(0, dtype=int))
    probabilities[0].append(1.)

    for depth in range(seq_len):
        for prefix, prefix_prob in zip(sequences[depth], probabilities[depth]):
            window = np.concatenate((seed[depth:], prefix))
            # float64 keeps the accumulated products and the final
            # normalisation accurate even if predict_f returns float32.
            preds = np.array(predict_f(window, temp_sym), dtype=np.float64)
            # Never ask for more children than there are symbols.
            k = min(top_k, len(preds))
            top_symbols = np.argsort(preds)[len(preds) - k:]
            for symbol in top_symbols:
                sequences[depth + 1].append(np.concatenate((prefix, [symbol])))
                probabilities[depth + 1].append(preds[symbol] * prefix_prob)

    if return_full_tree:
        return sequences, probabilities

    # Temperature-scaled softmax over the leaves; subtracting the maximum
    # keeps exp() from underflowing to all zeros at small temperatures.
    leaf_scores = np.log(np.array(probabilities[-1], dtype=np.float64) + 1e-10) / temp_seq
    leaf_weights = np.exp(leaf_scores - leaf_scores.max())
    draw = np.random.multinomial(1, leaf_weights / np.sum(leaf_weights), 1)
    return sequences[-1][np.argmax(draw)]

In [None]:
# Generate text: starting from a max_len-character seed taken from the corpus,
# repeatedly sample `pred_depth` characters at a time with sample_seq and
# slide the context window forward by the same amount.

# Offset of the seed in the corpus, number of characters sampled per step,
# and branching factor of the sampling tree.
start_index = 1236
pred_depth = 2
top_k = 20

for diversity in [1.0]:
    print()
    print('----- diversity:', diversity)
    print('----- depth:', pred_depth)
    print('----- top k:', top_k)
    print('')
    
    generated = ''
    sentence = text[start_index: start_index + max_len]
    generated += sentence
    print('----- Generating with seed: "' + sentence + '"')

    sys.stdout.write(generated)

    # 400 // pred_depth steps of pred_depth characters each.
    for i in range(400 // pred_depth):
        t = c2i(sentence)
        # `diversity` is used as both the per-symbol and the per-sequence temperature.
        next_seq = i2c(sample_seq(predict_func, t, top_k, seq_len=pred_depth, temp_seq=diversity, temp_sym=diversity))

        generated += next_seq
        # Drop the oldest characters so the context stays max_len long.
        sentence = sentence[pred_depth:] + next_seq

        # Stream the output as it is produced.
        sys.stdout.write(next_seq)
        sys.stdout.flush()


----- diversity: 1.0
----- depth: 2
----- top k: 20

----- Generating with seed: "наю ничего: я сама привыкла за людьми жи"
наю ничего: я сама привыкла за людьми живет и лет...
у ней по пристальное дело, дети вышлося, а рюмки писали в постриченные людя и страна...
но, зрительный, в стало быть молчание.
старый голод заплатил таким делом тихо: и ее видишься, видите!
я вовсе он прямо мне уж...
а ведете, помишлое, и все живут, а ребять везде морозы.
это в картам, помора перед примеру.
позкликая совсем это в корнете и страсть лег

In [36]:
# Scratch check of the normalise-then-draw step used at the end of sample_seq.
# NOTE(review): `p` is not defined in any visible cell -- presumably the
# probability tree returned by sample_seq(..., return_full_tree=True); confirm.
log_probs = np.log(np.array(p[-1]) + 1e-10)
probs = np.exp(log_probs)
probs = probs / np.sum(probs)
# One multinomial trial: a one-hot row marking the drawn index.
final_preds = np.random.multinomial(1, probs, 1)

In [142]:
# Draw again from the same `probs` (defined in an earlier scratch cell) and
# display the resulting one-hot row.
final_preds = np.random.multinomial(1, probs, 1)
final_preds

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0]])