In [None]:
from __future__ import print_function
from keras.models import Model
from keras.layers import Dense, Activation, Input, Embedding, Reshape, MaxPooling1D, Conv1D
from keras.layers import LSTM, GRU, Conv1D
from keras.layers import Dropout, BatchNormalization, Flatten
from keras.layers.wrappers import TimeDistributed
from keras.activations import sigmoid
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
from keras.utils import np_utils
from keras.callbacks import TensorBoard
import numpy as np
import random
import sys
import os

In [None]:
import re

from libs.utils import text_preprocess
from libs.text_utils import split_raw_into_sentences

In [None]:
# Corpus stored under the *dostoewskij* names: read the file and preprocess it.
# NOTE(review): the data directory is called 'checkov' while the variables say
# 'dostoewskij' — confirm which author this corpus actually belongs to.
with open('data/checkov/input.txt', encoding='utf-8') as f:
    sentenced_dostoewskij = text_preprocess(f.read())
dostoewskij_sentences = sentenced_dostoewskij.split('\n')

# Contrast corpus: the raw text is first passed through split_raw_into_sentences
# and the result is then run through the same text_preprocess helper.
with open('data/checkov/non_checkov_texts.txt', encoding='utf-8') as f:
    non_dostoewskij_text = f.read()
sentenced_non_dostoewskij = text_preprocess(split_raw_into_sentences(non_dostoewskij_text))
non_dostoewskij_sentences = sentenced_non_dostoewskij.split('\n')

# These are len() of the preprocessed corpora themselves, not of the
# '\n'-split sentence lists built above.
print('dostoewskij_length:\t', len(sentenced_dostoewskij))
print('non_dostoewskij_length:\t', len(sentenced_non_dostoewskij))

In [None]:
# set() is applied to the preprocessed corpora directly. Since those objects are
# split on '\n' elsewhere they are presumably plain strings, in which case each
# set holds the distinct characters of a corpus (not distinct sentences).
s, s2 = set(sentenced_dostoewskij), set(sentenced_non_dostoewskij)
print(len(s), len(s2), sep='\n')

In [None]:
from libs.utils import load_transformer

# Load the previously saved transformer object from 'models/shm_c1'.
transformer = load_transformer('models/shm_c1')

# `tokens` is used as the vocabulary (presumably one entry per character):
# its length sizes the embedding table and doubles as the padding index
# in later cells.
chars = transformer.tokens
char_cats = len(chars)
print('total chars:', char_cats)

In [None]:
# Number of sentences in the dostoewskij corpus; passed on to
# split_data_into_correct_batches as its n_batches argument.
n_batches = len(dostoewskij_sentences)
# Fixed sequence length: every sentence is padded with this target length and
# the network's Input layer is declared with the same size.
max_len = 200

In [None]:
from libs.utils import pad

# Index used to fill sentences up to max_len: one past the last vocabulary slot.
pad_idx = char_cats


def _encode_sentences(sentences):
    """Transform each sentence into indices, pad it, and stack the rows into an array."""
    return np.array([pad(transformer.transform(sent), max_len, pad_idx) for sent in sentences])


dostoewskij_indexes = _encode_sentences(dostoewskij_sentences)
non_dostoewskij_indexes = _encode_sentences(non_dostoewskij_sentences)

In [None]:
from libs.utils import split_data_into_correct_batches

# Combine both index arrays into the model inputs X and the targets y that are
# later handed to train_test_split / fit.
# NOTE(review): what make_equal_folding does is defined in libs.utils —
# presumably it balances the two sources; confirm there.
X, y = split_data_into_correct_batches(
    dostoewskij_indexes,
    non_dostoewskij_indexes,
    n_batches,
    max_len,
    make_equal_folding=True,
)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

a = plt.hist(y)

In [None]:
from keras.losses import sparse_categorical_crossentropy

In [None]:
def create_cnn(seq_len=None, vocab_size=None):
    """Build the character-level CNN classifier.

    Architecture: Embedding -> two stages of
    (Conv1D + ReLU -> BatchNormalization -> Dropout(0.5) -> MaxPooling1D(2))
    -> Flatten -> Dense(2, softmax).

    Args:
        seq_len: Length of every input index sequence. Defaults to the
            notebook-level ``max_len``.
        vocab_size: Number of real tokens. Defaults to the notebook-level
            ``char_cats``. The embedding table gets ``vocab_size + 1`` rows so
            that index ``vocab_size`` (used as the padding index elsewhere in
            this notebook) is a valid input.

    Returns:
        A ``(model, model_to_save)`` tuple. ``model`` is compiled with Adam,
        sparse categorical cross-entropy and an accuracy metric.
        ``model_to_save`` is a second, uncompiled ``Model`` built from the very
        same input/output tensors, so it uses the same layers as ``model``.
    """
    if seq_len is None:
        seq_len = max_len
    if vocab_size is None:
        vocab_size = char_cats

    def conv_stage(tensor, filters, kernel_size):
        # One convolutional stage; 'same' padding everywhere, pooling halves the length.
        tensor = Conv1D(filters, kernel_size=kernel_size, activation='relu', padding='same')(tensor)
        tensor = BatchNormalization()(tensor)
        tensor = Dropout(0.5)(tensor)
        return MaxPooling1D(2, padding='same')(tensor)

    inp = Input(shape=(seq_len,), dtype="int32")
    embedded = Embedding(vocab_size + 1, int(vocab_size / 1.5))(inp)

    features = conv_stage(embedded, 64, 8)
    features = conv_stage(features, 32, 3)

    # Named `probs` rather than `y` so the notebook's label array is not shadowed.
    probs = Dense(2, activation='softmax')(Flatten()(features))

    # The name is passed by keyword in both constructors: newer Keras releases
    # only treat Model(...) as a functional model when exactly two positional
    # arguments (inputs, outputs) are given.
    model = Model(inp, probs, name="char_cnn")
    model_to_save = Model(inp, probs, name="char_cnn")
    model.compile(optimizer='adam', loss=sparse_categorical_crossentropy, metrics=['accuracy'])
    return model, model_to_save

In [None]:
# Instantiate the network: `nn` is the compiled model that gets trained,
# `nn_to_save` is the second (uncompiled) model returned by create_cnn().
nn, nn_to_save = create_cnn()

In [None]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() would only append a stray "None" line to the output.
nn.summary()

In [None]:
from sklearn.model_selection import train_test_split
from keras.callbacks import ModelCheckpoint

# Hold out part of the data (train_test_split's default proportion) for
# validation. The seed is fixed so that re-running the notebook reproduces the
# same split instead of a new random one each time.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
n_epochs = 10

# Checkpoint callback writing the training model to a single file path during fit.
mc = ModelCheckpoint(filepath='models/discriminator_style_cnn_model_2.h5')

# The held-out split is only used as validation data here; there is no separate
# test evaluation in this cell.
nn.fit(
    X_train,
    y_train,
    epochs=n_epochs,
    batch_size=128,
    shuffle=True,
    validation_data=(X_test, y_test),
    callbacks=[mc],
)

In [None]:
# Persist the uncompiled model returned by create_cnn(); it is built from the
# same tensors as the trained `nn`.
# NOTE(review): the first letter of 'сnn' in this filename appears to be a
# Cyrillic character rather than a Latin 'c' (unlike the checkpoint path used
# during training) — confirm before loading this file by name elsewhere.
nn_to_save.save('models/discriminator_style_сnn_model.h5')