In [2]:
from __future__ import print_function
from keras.models import Model
from keras.layers import Dense, Activation, Input, Embedding, Reshape
from keras.layers import LSTM, GRU, Conv1D
from keras.layers.wrappers import TimeDistributed
from keras.activations import sigmoid
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
from keras.utils import np_utils
from keras.callbacks import TensorBoard
import numpy as np
import random
import sys
import os

In [None]:
from libs.utils import text_preprocess

In [None]:
def _read_lowercase(path):
    """Return the UTF-8 text stored at ``path``, converted to lower case."""
    with open(path, encoding='utf-8') as f:
        return f.read().lower()


# Read both corpora first, then run each through the shared preprocessing step.
dostoewskij_text = _read_lowercase('data/dostoewskij.txt')
non_dostoewskij_text = _read_lowercase('data/non_dostoewskij_texts.txt')

dostoewskij_text = text_preprocess(dostoewskij_text)
non_dostoewskij_text = text_preprocess(non_dostoewskij_text)

# An alternative cleanup via clear_text_from_rare_chars(..., delete_enters=True)
# was tried at this point and is currently disabled.

# Report the size of each corpus after preprocessing.
for label, corpus in (('dostoewskij_length:\t', dostoewskij_text),
                      ('non_dostoewskij_length:\t', non_dostoewskij_text)):
    print(label, len(corpus))

In [None]:
from libs.utils import load_transformer

# Load the transformer object stored under 'models/shm_c1'.
# NOTE(review): presumably a previously fitted character-to-index encoder — confirm in libs.utils.
transformer = load_transformer('models/shm_c1')

# The transformer's token list; its length is used below as the embedding vocabulary size.
chars = transformer.tokens
char_cats = len(chars)
print('total chars:', char_cats)

In [None]:
# Eyeball one random excerpt from each corpus, separated by a dashed rule.
print_len = 2000
for corpus_no, corpus in enumerate((dostoewskij_text, non_dostoewskij_text)):
    if corpus_no > 0:
        print('-' * 100)
    # The start may fall near the end of the text, in which case the excerpt is shorter.
    pos = np.random.randint(len(corpus))
    print(corpus[pos:pos+print_len])

In [None]:
# Number of training windows to cut: one per 3000 characters of the Dostoevsky text.
# NOTE(review): despite the name, this is used as the sample count (rows of X),
# not as a number of mini-batches.
n_batches = len(dostoewskij_text) // 3000
# Mini-batch size passed to rnn.fit.
batch_size = 16
# Disabled: would trim n_batches down to a multiple of batch_size.
#n_batches -= n_batches % batch_size
# Length of each window (number of indices per sample) fed to the model.
max_len = 200

In [None]:
# Encode each corpus as one flat sequence of token indices.
# The windowing code further down slices these sequences directly
# (``indexes[start:start + max_len]``), so each text is transformed as a whole.
# The previous version iterated over names that are never defined in this notebook
# (``original_sentences``, and ``non_dostoewskij_indexes`` inside its own definition)
# and never assigned ``dostoewskij_indexes``, so it failed with NameError.
# NOTE(review): assumes ``transformer.transform`` accepts a string and returns a 1-D,
# sliceable sequence of integer indices — confirm against libs.utils.
dostoewskij_indexes     = transformer.transform(dostoewskij_text)
non_dostoewskij_indexes = transformer.transform(non_dostoewskij_text)

In [None]:
def split_data_into_correct_batches(text1_indexes, text2_indexes, make_equal_folding = True,
                                    n_samples=None, sample_len=None):
    """Cut fixed-length windows out of two index sequences and label them by source.

    Window start positions are not random: a running offset is advanced by a fixed
    large stride (2147483647) modulo the number of valid start positions, so the
    result is fully determined by the arguments.

    Args:
        text1_indexes: Sliceable sequence of integer indices; its windows get label 0.
        text2_indexes: Sliceable sequence of integer indices; its windows get label 1.
        make_equal_folding: If True, windows are taken alternately from the first and
            the second sequence (starting with the first). If False, start positions
            are drawn from the combined range of valid starts of both sequences, so
            the class mix follows wherever the stride lands; a window never spans
            both sequences.
        n_samples: Number of windows to produce. Defaults to the module-level
            ``n_batches``.
        sample_len: Length of each window. Defaults to the module-level ``max_len``.

    Returns:
        Tuple ``(X, Y)``: ``X`` is an int64 array of shape ``(n_samples, sample_len)``
        holding the windows, ``Y`` is an int64 array of shape ``(n_samples,)`` holding
        the labels.

    Raises:
        ValueError: If a window has to be taken but the sequence(s) to draw from are
            shorter than ``sample_len``.
    """
    # Fall back to the notebook-level settings when no explicit sizes are given.
    if n_samples is None:
        n_samples = n_batches
    if sample_len is None:
        sample_len = max_len

    prime_number = 2147483647

    # Number of valid window start positions in each sequence.
    starts1 = len(text1_indexes) - sample_len + 1
    starts2 = len(text2_indexes) - sample_len + 1

    def _require_windows(n_starts, what):
        # A non-positive count would otherwise surface as ZeroDivisionError in the
        # modulo below, or as a shape mismatch when a too-short slice is stored.
        if n_starts <= 0:
            raise ValueError(
                '%s too short to cut a window of length %d' % (what, sample_len))

    X = np.zeros((n_samples, sample_len), dtype=np.int64)
    Y = np.zeros((n_samples,), dtype=np.int64)

    choose_from_first = True
    index1 = 0
    index2 = 0
    for i in range(n_samples):
        if make_equal_folding:
            if choose_from_first:
                _require_windows(starts1, 'text1_indexes is')
                index1 = (index1 + prime_number) % starts1
                X[i, :] = text1_indexes[index1:index1 + sample_len]
                Y[i] = 0
            else:
                _require_windows(starts2, 'text2_indexes is')
                index2 = (index2 + prime_number) % starts2
                X[i, :] = text2_indexes[index2:index2 + sample_len]
                Y[i] = 1

            choose_from_first = not choose_from_first
        else:
            # Here index1 walks over the concatenated range of valid starts of both texts.
            _require_windows(starts1 + starts2, 'text1_indexes and text2_indexes together are')
            index1 = (index1 + prime_number) % (starts1 + starts2)
            if index1 < starts1:
                X[i, :] = text1_indexes[index1:index1 + sample_len]
                Y[i] = 0
            else:
                # Positions past the first text's range map onto starts in the second text.
                index2 = index1 - starts1
                X[i, :] = text2_indexes[index2:index2 + sample_len]
                Y[i] = 1
    return X, Y

# Build the training set: windows alternate between the two corpora, with label 0
# for windows from the first argument and label 1 for windows from the second.
X, y = split_data_into_correct_batches(dostoewskij_indexes, non_dostoewskij_indexes, make_equal_folding=True)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Histogram of the labels, as a quick check of the class balance.
# Assigning the return value keeps the cell from echoing it under the plot.
a = plt.hist(y)

In [None]:
from keras.losses import sparse_categorical_crossentropy

In [None]:
def create_char_rnn():
    """Build and compile the two-class character-level GRU classifier.

    The network takes ``max_len`` int32 token ids, embeds them (vocabulary size
    ``char_cats``, embedding width ``int(char_cats / 1.5)``), passes them through
    two stacked 256-unit GRU layers — the first emits the full sequence, the second
    only its final output — and ends in a 2-way softmax.

    Returns:
        The compiled Keras ``Model`` named "char_rnn" (RMSprop optimizer, sparse
        categorical cross-entropy loss, accuracy metric).
    """
    token_ids = Input(shape=(max_len,), dtype="int32")

    embedding_width = int(char_cats / 1.5)
    hidden = Embedding(char_cats, embedding_width)(token_ids)

    # Two identical GRU layers; only the first one hands a full sequence onwards.
    for keep_sequence in (True, False):
        hidden = GRU(256, stateful=False, return_sequences=keep_sequence,
                     unroll=True, implementation=0)(hidden)

    class_probs = Dense(2, activation='softmax')(hidden)

    classifier = Model(token_ids, class_probs, name="char_rnn")
    classifier.compile(
        optimizer=RMSprop(),
        loss=sparse_categorical_crossentropy,
        metrics=['accuracy'],
    )
    return classifier

In [None]:
# Instantiate the compiled classifier defined by create_char_rnn.
rnn = create_char_rnn()

In [None]:
# Model.summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
rnn.summary()

In [None]:
# Train for a single epoch on the prepared windows, shuffling the samples.
history = rnn.fit(X, y, batch_size=batch_size, shuffle=True, epochs=1)  # disabled option: callbacks=[tb]

In [None]:
n_epochs = 8
histories = []

# split_data_into_correct_batches uses no randomness and always starts from offset 0,
# so calling it inside the loop rebuilt identical arrays on every epoch. Build them once.
# TODO(review): if fresh windows per epoch were intended, the splitter needs a
# start offset that varies between calls.
X, y = split_data_into_correct_batches(dostoewskij_indexes, non_dostoewskij_indexes, make_equal_folding=True)

for epoch in range(n_epochs):
    # One fit call per epoch so that each epoch's return value is kept separately.
    histories.append(rnn.fit(X, y, batch_size=batch_size, shuffle=True, epochs=1))

In [None]:
# Save the trained model to an .h5 file under models/.
rnn.save('models/discriminator_style_rnn_model.h5')