In [None]:
from __future__ import print_function
from keras.models import Model
from keras.layers import Dense, Activation, Input, Embedding, Reshape, MaxPooling1D, Conv1D
from keras.layers import LSTM, GRU, Conv1D
from keras.layers import Dropout, BatchNormalization, Flatten
from keras.layers.wrappers import TimeDistributed
from keras.activations import sigmoid
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
from keras.utils import np_utils
from keras.callbacks import TensorBoard
from keras.losses import sparse_categorical_crossentropy
import numpy as np
import random
import sys
import os
import json
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
def split_data_into_correct_batches_stratified_by_len(df, shuffle=True):
    """Build a class-balanced (X, y) set whose fake rows mirror the real length distribution.

    Every real row (``fake == 0``) is kept. For each distinct ``len`` value among
    the real rows, the same number of fake rows (``fake == 1``) is sampled from
    the fake rows of that length. When no fake row has that length, the nearest
    available fake length is used instead (the longer one wins a tie). Sampling
    is done with replacement only when the pool is smaller than the number needed.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``fake`` (0 = real, 1 = fake), ``len`` (numeric
        sentence length) and ``padded`` (1-D arrays of equal length). The index
        may be arbitrary.
    shuffle : bool, default True
        Apply one random permutation to the rows of ``X`` and ``y`` together.

    Returns
    -------
    X : numpy.ndarray
        Stacked ``padded`` rows: all real rows first, then the sampled fake rows
        (before shuffling).
    y : numpy.ndarray of int32
        0 for real rows, 1 for fake rows; always ``len(y) == X.shape[0]``.

    Raises
    ------
    ValueError
        If ``df`` has no real rows, or no fake rows to sample from.
    """
    real = df[df["fake"] == 0]
    fake = df[df["fake"] == 1]
    if real.shape[0] == 0:
        raise ValueError("df contains no real rows (fake == 0)")
    if fake.shape[0] == 0:
        # Without this guard the nearest-length search could never succeed.
        raise ValueError("df contains no fake rows (fake == 1) to sample from")

    parts = [np.vstack(list(real["padded"]))]

    # Pre-split the fake rows by length so each stratum is a dictionary lookup.
    pools = {length: group for length, group in fake.groupby("len")["padded"]}
    fake_lens = np.array(sorted(pools))

    n_fake = 0
    # Walk the lengths that actually occur among real rows, so every real row
    # gets a fake counterpart regardless of how wide the padded arrays are.
    for length, count in real["len"].value_counts().sort_index().items():
        count = int(count)
        gaps = np.abs(fake_lens - length)
        # Nearest fake length; on a tie prefer the longer one (length + d before length - d).
        nearest = fake_lens[gaps == gaps.min()].max()
        pool = pools[nearest]
        picked = pool.sample(n=count, replace=pool.shape[0] < count)
        parts.append(np.vstack(list(picked)))
        n_fake += count

    X = np.vstack(parts)
    # Labels are derived from what was actually stacked, so X and y cannot drift apart.
    y = np.concatenate([
        np.zeros(real.shape[0], dtype=np.int32),
        np.ones(n_fake, dtype=np.int32),
    ])

    if shuffle:
        perm = np.random.permutation(X.shape[0])
        X = X[perm]
        y = y[perm]
    return X, y

In [None]:
# Read the corpus: the JSON keys become the original sentences, and every
# value is an iterable whose items are collected as fake sentences.
with open('data/merged_sampled.json', encoding='utf-8') as f:
    json_data = json.load(f)

original_sentences = list(json_data.keys())
fake_sentences = [fake for group in json_data.values() for fake in group]

# Corpus sizes.
print('original_sentences:\t', len(original_sentences))
print('fake_sentences:\t', len(fake_sentences))

# Average sentence length in characters for each class.
print('mean len of original sentences:\t', np.mean(list(map(len, original_sentences))), 'chars')
print('mean len of fake sentences:\t', np.mean(list(map(len, fake_sentences))), 'chars')

# Char-based model

In [None]:
# Load the character transformer through the project helper.
# NOTE(review): presumably an already-fitted transformer saved under models/shm_c3 — confirm in libs.utils.
from libs.utils import load_transformer

transformer = load_transformer('models/shm_c3')

# The transformer's token list; its size is reused below as the padding id
# and to size the embedding of the CNN.
chars = transformer.tokens
char_cats = len(chars)
print('total chars:', char_cats)

In [None]:
# Character-length histogram of the original sentences; binding the result
# keeps the cell from echoing plt.hist's return value.
a = plt.hist(list(map(len, original_sentences)), bins=30)

In [None]:
# Character-length histogram of the fake sentences; binding the result
# keeps the cell from echoing plt.hist's return value.
a = plt.hist(list(map(len, fake_sentences)), bins=30)

In [None]:
import pandas as pd

In [None]:
# One row per sentence: originals first (fake=0), fakes after (fake=1),
# plus the sentence length in characters.
sentences = original_sentences + fake_sentences
classes = [0 for _ in original_sentences] + [1 for _ in fake_sentences]
df = pd.DataFrame({"sentence": sentences, "fake": classes}).assign(
    len=lambda frame: frame["sentence"].map(len)
)

In [None]:
# Longest sentence in characters; compare with the max_len picked in the next cell.
df.len.max()

In [None]:
# Fixed sequence length handed to pad() for every character-encoded sentence.
max_len = 200
# Padding id. create_cnn sizes its embedding with char_cats+1 rows so this id is a valid index.
# NOTE(review): assumes the transformer emits ids in 0..char_cats-1 — TODO confirm.
pad_idx = char_cats

In [None]:
from libs.utils import pad

# Encode each sentence with the transformer and pass the result to pad() together with max_len and pad_idx.
# NOTE(review): presumably pad() returns a fixed-length array (the batching helper np.vstack's
# this column) — confirm in libs.utils, including how sentences longer than max_len are handled.
df["padded"] = df.sentence.map(lambda x:pad(transformer.transform(x), max_len, pad_idx))

In [None]:
# Preview the first rows to check the sentence / fake / len / padded columns.
df.head()

In [None]:
def create_cnn():
    """Build the character-level CNN classifier.

    Reads the module-level ``max_len`` (input length) and ``char_cats``
    (used to size the embedding). Layout: embedding, then two
    Conv1D -> BatchNormalization -> Dropout -> MaxPooling1D stages, then
    flatten and a 2-way softmax.

    Returns
    -------
    (model, model_to_save)
        Two ``Model`` objects named "char_cnn" built on the same input and
        output tensors, so they share layers. Only ``model`` is compiled
        (adam, sparse categorical cross-entropy, accuracy metric);
        ``model_to_save`` is returned uncompiled.
    """
    def conv_stage(tensor, filters, kernel_size):
        # One stage: same-padded ReLU convolution, batch norm, 50% dropout, halving max-pool.
        tensor = Conv1D(filters, kernel_size=kernel_size, activation='relu', padding='same')(tensor)
        tensor = BatchNormalization()(tensor)
        tensor = Dropout(0.5)(tensor)
        return MaxPooling1D(2, padding='same')(tensor)

    char_ids = Input(shape=(max_len,), dtype="int32")
    # One extra embedding row beyond char_cats.
    features = Embedding(char_cats+1, int(char_cats / 1.5))(char_ids)
    features = conv_stage(features, 32, 8)
    features = conv_stage(features, 8, 3)

    probs = Dense(2, activation='softmax')(Flatten()(features))

    model = Model(char_ids, probs, name="char_cnn")
    model_to_save = Model(char_ids, probs, name='char_cnn')
    model.compile(optimizer='adam', loss=sparse_categorical_crossentropy, metrics=['accuracy'])
    return model, model_to_save

In [None]:
# nn is the compiled model used for training; nn_to_save is built on the same tensors but left uncompiled.
nn, nn_to_save = create_cnn()

In [None]:
# summary() prints the layer table itself and returns None, so wrapping it
# in print() would only append a stray "None" line to the output.
nn.summary()

In [None]:
# Number of passes over the training split.
n_epochs = 10
# Real sentences plus length-matched fake samples, shuffled by the helper (shuffle defaults to True).
X, y = split_data_into_correct_batches_stratified_by_len(df)
from sklearn.model_selection import train_test_split
# Default split sizes and no random_state, so the partition differs between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y)


from keras.callbacks import ModelCheckpoint

# Only filepath is set, so every other checkpoint option keeps its Keras default.
mc = ModelCheckpoint(filepath='models/discriminator_believability_cnn_model_2.h5')
# The held-out split is passed as validation data; there is no separate test set in this cell.
nn.fit(X_train, y_train, epochs=n_epochs, batch_size=128, shuffle=True, validation_data=(X_test, y_test), callbacks=[mc])

In [None]:
# Save the uncompiled twin returned by create_cnn(); it is built on the same layers as the trained nn.
nn_to_save.save('models/discriminator_believability_cnn_model.h5')

# Word-based model

In [None]:
from nltk.tokenize import word_tokenize

In [None]:
from libs.utils import Token2IDTransformer

# The dictionary file is read whole and split on '_' to obtain the word list.
# NOTE(review): '_' is an unusual separator — confirm it matches how the file was written.
with open('data/words_dictionary.txt', encoding='utf-8') as f:
    words = f.read().split('_')

# Passed as default_token; presumably the id source for out-of-vocabulary words — TODO confirm in libs.utils.
default_token = '<unk>'
# Rebinds `transformer`: from here on it is the word-level transformer, not the character one loaded earlier.
transformer = Token2IDTransformer(default_token=default_token)
transformer = transformer.fit(words)

In [None]:
import pandas as pd

# Rebuild df at word level: "sentence" now holds the word_tokenize output and
# "len" is the number of tokens. Originals come first (fake=0), then fakes (fake=1).
sentences = [word_tokenize(text) for text in original_sentences + fake_sentences]
classes = [0 for _ in original_sentences] + [1 for _ in fake_sentences]
df = pd.DataFrame({"sentence": sentences, "fake": classes}).assign(
    len=lambda frame: frame["sentence"].map(len)
)

In [None]:
from libs.utils import pad

# Rebinds max_len to the longest tokenised sentence, replacing the character-level value.
max_len = max(df.len)
# Rebinds pad_idx to the transformer's vocab_size; create_rnn sizes its embedding as vocab_size+1.
pad_idx = transformer.vocab_size

# Encode each token list and pass it to pad() with the word-level max_len and pad_idx.
df["padded"] = df.sentence.map(lambda x:pad(transformer.transform(x), max_len, pad_idx))

In [None]:
# Preview the word-level frame (token lists, class, token count, padded ids).
df.head()

In [None]:
# Token-count histogram of the real sentences, using as many bins as the longest one has tokens.
a = df[df.fake == 0].len
# `a` is reused for the return value so the cell does not echo it.
a = plt.hist(a, bins=max(a))

In [None]:
# Token-count histogram of the fake sentences, using as many bins as the longest one has tokens.
a = df[df.fake == 1].len
# `a` is reused for the return value so the cell does not echo it.
a = plt.hist(a, bins=max(a))

In [None]:
def create_rnn():
    """Build the two-layer GRU classifier over transformer token ids.

    Reads the module-level ``max_len`` (input length) and
    ``transformer.vocab_size`` (used to size the embedding). Layout:
    256-d embedding, a GRU returning the full sequence, a GRU returning only
    its last output, then a 2-way softmax.

    Returns
    -------
    (model, model_to_save)
        Two ``Model`` objects named "char_rnn" built on the same input and
        output tensors, so they share layers. Only ``model`` is compiled
        (adam, sparse categorical cross-entropy, accuracy metric);
        ``model_to_save`` is returned uncompiled.
    """
    # Options common to both recurrent layers.
    gru_options = dict(stateful=False, unroll=True, implementation=0)

    token_ids = Input(shape=(max_len,), dtype="int32")
    # One extra embedding row beyond vocab_size.
    embedded = Embedding(transformer.vocab_size+1, 256)(token_ids)
    sequence = GRU(256, return_sequences=True, **gru_options)(embedded)
    summary_vec = GRU(256, return_sequences=False, **gru_options)(sequence)
    probs = Dense(2, activation='softmax')(summary_vec)

    model = Model(token_ids, probs, name="char_rnn")
    model_to_save = Model(token_ids, probs, name="char_rnn")
    model.compile(optimizer='adam', loss=sparse_categorical_crossentropy, metrics=['accuracy'])
    return model, model_to_save

In [None]:
# Rebinds nn / nn_to_save to the RNN pair; the CNN objects are no longer reachable through these names.
nn, nn_to_save = create_rnn()

In [None]:
# summary() prints the layer table itself and returns None, so wrapping it
# in print() would only append a stray "None" line to the output.
nn.summary()

In [None]:
# Shape of the frame that feeds training. `part_df` is only assigned in
# commented-out code, so referencing it raises NameError on a fresh kernel.
df.shape

In [None]:
# Number of passes over the training split.
n_epochs = 4

#part_df = df.sample(int(0.1 * df.shape[0]))
#part_df.index = range(int(0.1 * df.shape[0]))

# Real sentences plus length-matched fake samples, shuffled by the helper.
X, y = split_data_into_correct_batches_stratified_by_len(df)
# sklearn.cross_validation was removed in scikit-learn 0.20; model_selection is
# the supported location of train_test_split and matches the CNN training cell.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

print('started learning')

from keras.callbacks import ModelCheckpoint

# Checkpoint to an RNN-specific file. Reusing the CNN checkpoint path here would
# overwrite the trained CNN weights with this word-level model.
mc = ModelCheckpoint(filepath='models/discriminator_believability_word_rnn_model_2.h5')
nn.fit(X_train, y_train, epochs=n_epochs, batch_size=128, shuffle=True, validation_data=(X_test, y_test), callbacks=[mc])

In [None]:
# Save the uncompiled twin returned by create_rnn(); it is built on the same layers as the trained nn.
nn_to_save.save('models/discriminator_believability_word_rnn_model.h5')

# Ensembling word- and char-based

In [None]:
from keras.models import load_model
from sklearn.metrics import accuracy_score

# Reload both discriminators from disk. The char model comes from the '_2' file
# (the path given to ModelCheckpoint during CNN training), the word model from
# the file written by nn_to_save.save().
char_nn = load_model('models/discriminator_believability_cnn_model_2.h5')
word_nn = load_model('models/discriminator_believability_word_rnn_model.h5')