In [1]:
import tensorflow as tf
from keras import backend as K
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)
K.set_session(sess)
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, LSTM,TimeDistributed, Embedding
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping, ReduceLROnPlateau
from keras.optimizers import RMSprop, Adam, SGD
import sys
import numpy as np

Using TensorFlow backend.


In [2]:
fname = 'poem_texts.txt'
text = open(fname, 'r', encoding='utf-8').read()
print('corpus length:', len(text))

corpus length: 2028773


In [3]:
chars = sorted(list(set(text)))
print('unique chars:', len(chars))
char_idx = dict((c, i) for i, c in enumerate(chars))
idx_char = dict((i, c) for i, c in enumerate(chars))

bigrams = []
for i in range(0, len(text)-1, 1):
    bigrams.append(text[i] + text[i+1])
bigrams = sorted(list(set(bigrams)))
print('unique bigrams:', len(bigrams))
bigram_idx = dict((c, i) for i, c in enumerate(bigrams))
idx_bigram = dict((i, c) for i, c in enumerate(bigrams))

trigrams = []
for i in range(0, len(text)-2, 1):
    trigrams.append(text[i] + text[i+1] + text[i+2])
trigrams = sorted(list(set(trigrams)))
print('unique trigrams:', len(trigrams))
trigram_idx = dict((c, i) for i, c in enumerate(trigrams))
idx_trigram = dict((i, c) for i, c in enumerate(trigrams))

words = text.split()
print('word count:', len(words))
words = sorted(list(set(words)))
print('unique words:', len(words))

unique chars: 179
unique bigrams: 3915
unique trigrams: 29380
word count: 323745
unique words: 98049


In [45]:
textl = text.lower()

In [59]:
allowed = '[^\n !,-.:;?абвгдежзийклмнопрстуфхцчшщъыьэюяё]'

In [60]:
import re
textr = re.sub(allowed,' ', textl)
textr = re.sub(' +',' ', textr)

In [61]:
textr[:300]

' - ?\n .\nтак и мне узнать случилось,\nчто за птица купидон;\nсердце страстное пленилось;\nпризнаюсь и я влюблен!\nпролетело счастья время,\nкак, любви не зная бремя,\nя живал да попевал,\nкак в театре и на балах,\nна гуляньях иль в воксалах\nлегким зефиром летал;\nкак, смеясь во зло амуру,\nя писал карикатуру\nн'

In [81]:
vocab = set(textr)
stoi = {c:i for i,c in enumerate(sorted(list(vocab)))}
itos = {i:c for c,i in stoi.items()}
vocab_size = len(stoi)
vocab_size

42

In [97]:
seq_len = 400
batch_size = 32 # decrease if you have "Failed to allocate memory" error when start training
track_len = len(textr) // batch_size
tracks = []
for i in range(0, len(textr) - track_len, track_len):
    t = textr[i:i+track_len]
    parts = t.split('\n')
    t = '\n'.join(parts[1:-1])
    t = [stoi[c] for c in t]
    tracks.append(t)

min_len = (min([len(t) for t in tracks]) // seq_len) * seq_len +1
tracks = [t[:min_len] for t in tracks]
tracks = np.array(tracks)

# Let's see what we've got
print(tracks.shape)

x = tracks[:,:-1]
y_labels = tracks[:,1:]

from keras.utils.np_utils import to_categorical

y = to_categorical(y_labels, num_classes=None)
y = y.reshape(32, -1, 42)

#batchify
x = x.reshape(32, -1, seq_len)
x = np.transpose(x, (1, 0, 2))
y = y.reshape(32, -1, seq_len, 42)
y = np.transpose(y, (1, 0, 2, 3))
print(x.shape, y.shape)

(32, 60801)
(152, 32, 400) (152, 32, 400, 42)


In [98]:
cells = 512
drop = 0.2
embed = 100 # size of character embedding
layers = 2
lr = 0.01
clip = 1.0 # gradient clipping to prevent exploding gradients
stateful=False # maintain layer state between batches

print(vocab_size, embed, batch_size, seq_len)

model = Sequential()
model.add(Embedding(vocab_size, embed, batch_input_shape=(batch_size, seq_len)))
for l in range(layers):
    model.add(LSTM(cells, return_sequences=True, stateful=stateful, dropout=drop))
model.add(Dense(vocab_size))
model.add(Activation('softmax'))
optimizer = Adam(lr, clipnorm=clip)
model.compile(loss="categorical_crossentropy", optimizer=optimizer)
model.summary()

42 100 32 400
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (32, 400, 100)            4200      
_________________________________________________________________
lstm_27 (LSTM)               (32, 400, 512)            1255424   
_________________________________________________________________
lstm_28 (LSTM)               (32, 400, 512)            2099200   
_________________________________________________________________
dense_14 (Dense)             (32, 400, 42)             21546     
_________________________________________________________________
activation_14 (Activation)   (32, 400, 42)             0         
Total params: 3,380,370
Trainable params: 3,380,370
Non-trainable params: 0
_________________________________________________________________


In [99]:
max_len = 8
class ResetStatesCallback(Callback):
    def __init__(self):
        self.counter = 0

    def on_batch_begin(self, batch, logs={}):
        if self.counter % max_len == 0:
            print('reset')
            self.model.reset_states()
        self.counter += 1

In [100]:
def get_callbacks(filepath, patience=5):
    learning_rate_reduction = ReduceLROnPlateau(monitor='loss', 
                                            patience=patience, 
                                            verbose=1, 
                                            factor=0.5, 
                                            min_lr=0.00001)
    es = EarlyStopping('loss', verbose=1, min_delta=0.02, patience=patience, mode="min")
    return [learning_rate_reduction, es]

In [101]:
def sample(preds, temperature=0.5):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [102]:
def predictive_model(main_model): # change batch size to 1 for work with one sequence
    model = Sequential()
    model.add(Embedding(vocab_size, embed, batch_input_shape=(1, seq_len)))
    for l in range(layers):
        model.add(LSTM(cells, return_sequences=True, stateful=stateful, dropout=drop))
    model.add(Dense(vocab_size))
    model.add(Activation('softmax'))
    optimizer = Adam(lr, clipnorm=clip)
    model.compile(loss="categorical_crossentropy", optimizer=optimizer)
    old_weights = main_model.get_weights()
    model.set_weights(old_weights)
    return model

In [103]:
# generate new chars from model
def test(model, l=300, seed = None, t=0.5):
    start_from = np.random.randint(len(text)-seq_len)+seq_len
    seed_string = text[start_from:start_from + seq_len] if seed is None else seed
    print('\n\nSeed:  ', seed_string)
    print('----')
    sys.stdout.write(seed_string)
    prmodel = predictive_model(model)
    for i in range(l):
        prmodel.reset_states()
        padlen = (len(seed_string) // seq_len +1) * seq_len
        seed_string = seed_string.rjust(padlen)[-seq_len*3:]
        test_tracks = [seed_string]
        tidx = grams(test_tracks)
        xt, _ = vectorize(tidx)
        preds = prmodel.predict(np.array(xt), batch_size=1, verbose=0)
        preds = preds[-1][-1] # last symbol of last sequence
        next_item = idx_char[sample(preds, t)]
        seed_string = seed_string + next_item
        sys.stdout.write(next_item)
        sys.stdout.flush()    

In [107]:
for iteration in range(1, 51):
    print('\nIteration', iteration)
    model_name = 'char_%s_%d_%d_%.1f_%d.h5' % (fname, layers, cells, drop, iteration)
    history=model.fit(
        x, y, 
        batch_size=batch_size, 
        epochs=1, 
        verbose=1, 
        shuffle=False,
        callbacks=get_callbacks(filepath=model_name)
    )
    model.save_weights(model_name, overwrite=True)
    model.reset_states()
    if iteration%3 == 0:
        test(model)


Iteration 1


ValueError: Error when checking input: expected embedding_14_input to have 2 dimensions, but got array with shape (152, 32, 400)

In [16]:
seed = "У нас в Малом зале до сих пор проходят"
test(model, 300, seed, 0.5)



Seed:   У нас в Малом зале до сих пор проходят
----
У нас в Малом зале до сих пор проходять * порра воло.
Вому,
И стол телны,
И частил статой на верь перь,
Ла сре дечай п

KeyboardInterrupt: 