In [None]:
from __future__ import print_function
from keras.models import Model
from keras.layers import Dense, Activation, Input, Embedding, Reshape
from keras.layers import LSTM, GRU, Conv1D
from keras.layers.wrappers import TimeDistributed
from keras.activations import sigmoid
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
from keras.utils import np_utils
from keras.callbacks import TensorBoard
import numpy as np
import random
import sys
import os
import json
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the sampled sentence data. The file is decoded as UTF-8 explicitly
# instead of relying on the platform's default encoding.
with open('data/sampled.json', encoding='utf-8') as json_file:
    json_data = json.loads(json_file.read())

In [None]:
# Number of top-level entries in the loaded JSON; its keys are used below as the original sentences.
len(json_data)

In [None]:
# The keys of the loaded mapping are taken as the original sentences; the items
# of every value are flattened into a single list of fake sentences.
original_sentences = list(json_data)
fake_sentences = [sentence for group in json_data.values() for sentence in group]

print('original_sentences:\t', len(original_sentences))
print('fake_sentences:\t', len(fake_sentences))

# Average sentence length, measured with len() (i.e. in characters for str sentences).
original_lengths = [len(sentence) for sentence in original_sentences]
print('mean len of original sentences:\t', np.mean(original_lengths), 'chars')
fake_lengths = [len(sentence) for sentence in fake_sentences]
print('mean len of fake sentences:\t', np.mean(fake_lengths), 'chars')

In [None]:
from libs.utils import load_transformer

# Load the text transformer stored under 'models/shm_c1'.
# NOTE(review): load_transformer lives in libs.utils and is not visible here —
# presumably it restores a fitted character-level tokenizer; confirm.
transformer = load_transformer('models/shm_c1')

# `tokens` is presumably the transformer's vocabulary (verify in libs.utils).
# Its size, char_cats, is reused below as the padding index and to size the
# embedding layer of the model.
chars = transformer.tokens
char_cats = len(chars)
print('total chars:', char_cats)

In [None]:
# Histogram (30 bins) of the lengths of the original sentences.
# Binding the result to `a` keeps the cell from echoing plt.hist's return value.
a = plt.hist([len(x) for x in original_sentences], bins = 30)

In [None]:
# Histogram (30 bins) of the lengths of the fake sentences.
# Binding the result to `a` keeps the cell from echoing plt.hist's return value.
a = plt.hist([len(x) for x in fake_sentences], bins = 30)

In [None]:
# Fixed sequence length: passed to pad() for every sentence and used as the
# input length of the model.
# NOTE(review): 200 was presumably chosen from the length histograms; whether
# pad() truncates longer sentences is not visible in this file — confirm.
max_len = 200

In [None]:
from libs.utils import pad

# transform text into sequence of indices
# The padding index is one past the last vocabulary index (char_cats == len(chars)).
pad_idx = char_cats


def _encode_sentences(sentences):
    """Transform each sentence with `transformer` and pad it via pad() using max_len and pad_idx."""
    return [pad(transformer.transform(sentence), max_len, pad_idx) for sentence in sentences]


original_indexes = _encode_sentences(original_sentences)
fake_indexes = _encode_sentences(fake_sentences)

In [None]:
# NOTE: despite its name, n_batches is the number of samples (rows of X) that
# split_data_into_correct_batches generates, not a count of mini-batches.
# Here it is 1/300 (integer division) of the total number of encoded sentences.
n_batches = (len(original_indexes) + len(fake_indexes)) // 300
# Mini-batch size passed to rnn.fit.
batch_size = 16

In [None]:
def split_data_into_correct_batches(text1_indexes, text2_indexes, make_equal_folding = True,
                                    num_samples=None, seq_len=None):
    """Build a labelled sample matrix from two collections of index sequences.

    Rows taken from ``text1_indexes`` get label 0 and rows taken from
    ``text2_indexes`` get label 1. Rows are picked deterministically by
    stepping through the collections with a large prime stride (modulo the
    collection size), so repeated calls with the same inputs return the same
    arrays.

    Parameters
    ----------
    text1_indexes, text2_indexes : sequence of sequences of int
        Each element must hold exactly ``seq_len`` indices.
    make_equal_folding : bool
        If True, rows alternate strictly between the two collections
        (starting with the first), each collection being walked with its own
        cursor. If False, a single cursor walks the concatenation of both
        collections, so the classes appear in proportion to their sizes.
    num_samples : int, optional
        Number of rows to generate. Defaults to the module-level ``n_batches``.
    seq_len : int, optional
        Number of columns of ``X``. Defaults to the module-level ``max_len``.

    Returns
    -------
    X : numpy.ndarray of int64, shape (num_samples, seq_len)
    Y : numpy.ndarray of int64, shape (num_samples,)

    Raises
    ------
    ZeroDivisionError
        If a collection that has to be sampled from is empty.
    """
    # Fall back to the notebook-level settings so existing calls keep working.
    if num_samples is None:
        num_samples = n_batches
    if seq_len is None:
        seq_len = max_len

    # 2**31 - 1 is prime, hence coprime with any collection size that is not a
    # multiple of it: stepping by it modulo the size visits every index before
    # any index repeats.
    prime_number = 2147483647

    n_first = len(text1_indexes)
    n_second = len(text2_indexes)

    X = np.zeros((num_samples, seq_len), dtype=np.int64)
    Y = np.zeros((num_samples,), dtype=np.int64)

    choose_from_first = True
    index1 = 0
    index2 = 0
    combined_index = 0
    for i in range(num_samples):
        if make_equal_folding:
            if choose_from_first:
                index1 = (index1 + prime_number) % n_first
                X[i, :] = text1_indexes[index1]
                Y[i] = 0
            else:
                index2 = (index2 + prime_number) % n_second
                X[i, :] = text2_indexes[index2]
                Y[i] = 1

            choose_from_first = not choose_from_first
        else:
            combined_index = (combined_index + prime_number) % (n_first + n_second)
            # Positions [0, n_first) belong to the first collection, the rest to
            # the second. The boundary must be exactly n_first: a smaller bound
            # sends the tail of the first collection into the branch below with
            # a negative offset, which silently wraps around to the end of the
            # second collection and mislabels the row.
            if combined_index < n_first:
                X[i, :] = text1_indexes[combined_index]
                Y[i] = 0
            else:
                X[i, :] = text2_indexes[combined_index - n_first]
                Y[i] = 1
    return X, Y

# Build the sample matrix X and labels y, alternating between original
# sentences (label 0) and fake sentences (label 1); the row count is n_batches.
X, y = split_data_into_correct_batches(original_indexes, fake_indexes, make_equal_folding=True)

In [None]:
# Sanity check: X is allocated as (n_batches, max_len).
X.shape

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Label distribution of y (0 = original sentence, 1 = fake sentence).
a = plt.hist(y)

In [None]:
from keras.losses import sparse_categorical_crossentropy

def create_char_rnn():
    """Build and compile the character-level GRU classifier.

    The network reads a sequence of ``max_len`` integer indices, embeds them
    (``char_cats + 1`` embedding rows, ``int(char_cats / 1.5)`` dimensions),
    runs two stacked 256-unit GRU layers and ends in a 2-way softmax.
    It is compiled with RMSprop, sparse categorical cross-entropy and an
    accuracy metric. ``max_len`` and ``char_cats`` are read from module scope.

    Returns
    -------
    The compiled Keras ``Model`` named "char_rnn".
    """
    # Options shared by both recurrent layers.
    recurrent_options = dict(stateful=False, unroll=True, implementation=0)

    char_ids = Input(shape=(max_len,), dtype="int32")
    embedded = Embedding(char_cats+1, int(char_cats / 1.5))(char_ids)

    # The first GRU emits its full output sequence for the second GRU, which
    # only returns its final output.
    sequence_features = GRU(256, return_sequences=True, **recurrent_options)(embedded)
    sentence_features = GRU(256, return_sequences=False, **recurrent_options)(sequence_features)

    class_probs = Dense(2, activation='softmax')(sentence_features)

    classifier = Model(char_ids, class_probs, name="char_rnn")
    classifier.compile(optimizer=RMSprop(), loss=sparse_categorical_crossentropy, metrics=['accuracy'])
    return classifier

In [None]:
# Instantiate the compiled classifier (weights are freshly initialised on every run of this cell).
rnn = create_char_rnn()

In [None]:
# Model.summary() prints the layer table itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
rnn.summary()

In [None]:
# First pass: one epoch over the X, y currently in scope; fit's return value is kept in `history`.
history = rnn.fit(X, y, batch_size=batch_size, shuffle=True, epochs=1)

In [None]:
n_epochs = 6
# Enlarge the generated sample count to a third of the available sentences.
# NOTE: n_batches is a number of samples, and split_data_into_correct_batches
# reads it from module scope, so it has to be reassigned before the call below.
n_batches = (len(original_indexes) + len(fake_indexes)) // 3

# split_data_into_correct_batches is deterministic and its inputs do not change
# between epochs, so the data set is built once instead of being regenerated
# (identically) inside the loop.
X, y = split_data_into_correct_batches(original_indexes, fake_indexes, make_equal_folding=True)

# One fit() call per epoch; each call's return value is collected in `histories`.
histories = []
for epoch in range(n_epochs):
    histories.append(rnn.fit(X, y, batch_size=batch_size, shuffle=True, epochs=1))

In [None]:
# Persist the trained model to an HDF5 file.
# NOTE(review): presumably the models/ directory must already exist — confirm.
rnn.save('models/discriminator_believability_rnn_model.h5')