In [None]:
from __future__ import print_function
from keras.models import Model
from keras.layers import Dense, Activation, Input, Embedding, Reshape, MaxPooling1D, Conv1D
from keras.layers import LSTM, GRU, Conv1D
from keras.layers import Dropout, BatchNormalization, Flatten
from keras.layers.wrappers import TimeDistributed
from keras.activations import sigmoid
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
from keras.utils import np_utils
from keras.callbacks import TensorBoard
import numpy as np
import random
import sys
import os
import json
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Parse the UTF-8 encoded JSON corpus file into `json_data`.
with open('data/merged_sampled.json', encoding='utf-8') as f:
    json_data = json.loads(f.read())

In [None]:
# Number of top-level entries in the parsed JSON object.
len(json_data)

In [None]:
# Keys of the JSON object are the original sentences; the value stored under
# each key is an iterable of sentences, all flattened into one list.
original_sentences = [*json_data.keys()]
fake_sentences = [fake for group in json_data.values() for fake in group]

print('original_sentences:\t', len(original_sentences))
print('fake_sentences:\t', len(fake_sentences))

# Average length, in characters, of each collection.
print('mean len of original sentences:\t', np.mean(list(map(len, original_sentences))), 'chars')
print('mean len of fake sentences:\t', np.mean(list(map(len, fake_sentences))), 'chars')

In [None]:
from libs.utils import load_transformer

# Load the text transformer stored under 'models/shm_c3'.
transformer = load_transformer('models/shm_c3')

# `chars` is the transformer's token inventory; its size `char_cats` is reused
# below as the padding index and to size the embedding layer.
chars = transformer.tokens
char_cats = len(chars)
print('total chars:', char_cats)

In [None]:
# Distribution of original sentence lengths (characters), 30 bins.
# Assigning the result keeps the raw return value out of the cell output.
a = plt.hist(list(map(len, original_sentences)), bins=30)

In [None]:
# Distribution of fake sentence lengths (characters), 30 bins.
# Assigning the result keeps the raw return value out of the cell output.
a = plt.hist(list(map(len, fake_sentences)), bins=30)

In [None]:
# Fixed sequence length: passed to pad() when encoding sentences and used as
# the input length of the network built in create_cnn().
max_len = 200

In [None]:
from libs.utils import pad

def _to_padded_indexes(sentences):
    """Run every sentence through the transformer, then through pad().

    pad() receives the transformed sequence, ``max_len`` and ``pad_idx``;
    presumably it brings each sequence to exactly ``max_len`` entries using
    ``pad_idx`` as filler -- TODO confirm against libs.utils.pad.
    """
    padded_rows = []
    for sentence in sentences:
        padded_rows.append(pad(transformer.transform(sentence), max_len, pad_idx))
    return np.array(padded_rows)


# The padding index is the vocabulary size, i.e. the extra row that the
# embedding layer in create_cnn() allocates (char_cats + 1 rows).
pad_idx = char_cats
original_indexes = _to_padded_indexes(original_sentences)
fake_indexes = _to_padded_indexes(fake_sentences)

In [None]:
# Number of batches for the initial split: total encoded sentences // 300.
# Note that n_batches is reassigned (with divisor 15) in the training cell.
n_batches = (len(original_indexes) + len(fake_indexes)) // 300
# Mini-batch size handed to nn.fit() in the training loop.
batch_size = 16

In [None]:
from libs.utils import split_data_into_correct_batches
# Build the model inputs X and targets y from the encoded original and fake
# sentences (both are later passed to nn.fit()).
# NOTE(review): exact batching/balancing semantics of make_equal_folding live
# in libs.utils.split_data_into_correct_batches -- confirm there.
X, y = split_data_into_correct_batches(original_indexes, fake_indexes, n_batches, max_len, make_equal_folding=True)

In [None]:
# Sanity check: display the shape of the assembled input array.
X.shape

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Histogram of the target values in y; assigning the result keeps the raw
# return value of plt.hist out of the cell output.
a = plt.hist(y)

In [None]:
from keras.losses import sparse_categorical_crossentropy

def create_cnn():
    """Build the character-level CNN classifier.

    Architecture: Embedding -> [Conv1D -> BatchNormalization -> Dropout ->
    MaxPooling1D] x 2 -> Flatten -> 2-way softmax.

    Reads the notebook globals ``max_len`` (input sequence length) and
    ``char_cats`` (vocabulary size).

    Returns:
        tuple: ``(model, model_to_save)``. Both are ``Model`` objects built on
        the same input/output tensors, so they share layers and weights. Only
        ``model`` is compiled (RMSprop, sparse categorical cross-entropy,
        accuracy metric); ``model_to_save`` is left uncompiled.
    """
    inp = Input(shape=(max_len,), dtype="int32")
    # char_cats + 1 embedding rows: one extra row for the padding index, which
    # the notebook sets to char_cats.
    v = Embedding(char_cats + 1, int(char_cats / 1.5))(inp)

    x = Conv1D(128, kernel_size=8, activation='relu', padding='same')(v)
    x = Dropout(0.3)(BatchNormalization()(x))
    x = MaxPooling1D(4, padding='same')(x)  # length: ceil(max_len / 4)

    x = Conv1D(256, kernel_size=4, activation='relu', padding='same')(x)
    x = Dropout(0.3)(BatchNormalization()(x))
    x = MaxPooling1D(5, padding='same')(x)  # length: ceil(ceil(max_len / 4) / 5)

    # Flattened width is (remaining time steps) * 256,
    # i.e. (None, 10 * 256) for max_len = 200.
    h = Flatten()(x)
    y = Dense(2, activation='softmax')(h)

    model = Model(inp, y, name="char_cnn")
    # Pass the name by keyword: newer Keras versions recognise the functional
    # constructor only from exactly two positional arguments (or the
    # inputs=/outputs= keywords), so a positional name can be misinterpreted.
    model_to_save = Model(inp, y, name="char_cnn")
    model.compile(optimizer=RMSprop(), loss=sparse_categorical_crossentropy, metrics=['accuracy'])
    return model, model_to_save

In [None]:
# nn is the compiled model used for training and prediction; nn_to_save is the
# second (uncompiled) Model returned by create_cnn(), saved at the end.
nn, nn_to_save = create_cnn()

In [None]:
# Model.summary() prints the layer table itself and returns None, so wrapping
# it in print() only appends a stray "None" line to the output.
nn.summary()

In [None]:
# time.clock() was removed in Python 3.8; perf_counter() is the documented
# replacement for measuring elapsed time.
from time import perf_counter

n_epochs = 4
n_batches = (len(original_indexes) + len(fake_indexes)) // 15


def _mean_unpadded_len(index_rows):
    """Return max_len minus the mean number of padding entries per row.

    Padding entries are those equal to ``pad_idx`` (the value handed to pad()
    when the sentences were encoded), rather than a hard-coded index, so the
    statistic stays correct if the vocabulary size changes.
    """
    pad_counts = [np.count_nonzero(np.asarray(row) == pad_idx) for row in index_rows]
    return max_len - np.mean(pad_counts)


# Track, per (re)sampling step, the mean unpadded length and the number of
# fake sequences used for training.
lens = [_mean_unpadded_len(fake_indexes)]
sizes = [len(fake_indexes)]

# Positions (into fake_indexes / fake_sentences) of the fakes kept by the most
# recent resampling step; stays None if no resampling has happened yet.
indexes = None
for epoch in range(n_epochs):
    if epoch == 0:
        # First pass: train on all fake sequences.
        X, y = split_data_into_correct_batches(original_indexes, fake_indexes, n_batches, max_len, make_equal_folding=True)
    elif epoch % 3 == 0:
        # Every third epoch: keep each fake sequence with probability equal to
        # the model's predicted probability for class 0, then rebuild X, y.
        t = perf_counter()
        probs = nn.predict(fake_indexes)[:, 0]
        bool_ind = np.random.uniform(0., 1., probs.shape) < probs
        indexes = np.flatnonzero(bool_ind)

        print('epoch', epoch, '- deleting took', perf_counter() - t, 'sec')
        kept_fake_indexes = fake_indexes[indexes]
        X, y = split_data_into_correct_batches(original_indexes, kept_fake_indexes, n_batches, max_len, make_equal_folding=True)
        lens.append(_mean_unpadded_len(kept_fake_indexes))
        sizes.append(len(indexes))
    # One pass over the current X, y per loop iteration (fit's default epochs).
    nn.fit(X, y, batch_size=batch_size, shuffle=True, validation_split=0.2)

In [None]:
# Length distribution (characters) of the original sentences, 30 bins, shown
# again for comparison with the selected fakes plotted next.
a = plt.hist(list(map(len, original_sentences)), bins=30)

In [None]:
# fake_sentences is a plain Python list, so it cannot be indexed with the
# NumPy index array `indexes` directly (that raises TypeError); pick the
# selected sentences one by one instead. If no resampling step has run,
# `indexes` is still None and all fake sentences are plotted.
if indexes is None:
    selected_fake_sentences = fake_sentences
else:
    selected_fake_sentences = [fake_sentences[i] for i in indexes]

a = plt.hist([len(x) for x in selected_fake_sentences], bins=30)

In [None]:
# Display the recorded statistics: max_len minus the mean per-row padding
# count, one entry for the full fake set plus one per resampling step.
lens

In [None]:
# Persist the second model returned by create_cnn(), which is constructed from
# the same input/output tensors as the trained `nn`.
# NOTE(review): the file name says "rnn" although create_cnn() builds a
# convolutional network -- confirm the intended name before renaming, since
# other code may load this exact path.
nn_to_save.save('models/discriminator_believability_rnn_model.h5')