In [1]:
import os
import numpy as np
import tensorflow as tf

from tensorflow import keras

In [2]:
def get_data(proper_name, improper_name):
    with open(os.path.join('datasets', proper_name), 'r') as f:
        proper_reber_lines = [line.rstrip() for line in f]

    with open(os.path.join('datasets', improper_name), 'r') as f:
        improper_reber_lines = [line.rstrip() for line in f]

    max_len = len(max([max(proper_reber_lines, key=len), max(improper_reber_lines, key=len)], key=len))
    proper_reber_lines = keras.preprocessing.sequence.pad_sequences(
        [[ord(char) for char in line] for line in proper_reber_lines],
        maxlen=max_len)
    improper_reber_lines = keras.preprocessing.sequence.pad_sequences(
        [[ord(char) for char in line] for line in improper_reber_lines],
        maxlen=max_len)

    return proper_reber_lines, improper_reber_lines, max_len


proper_reber_lines, improper_reber_lines, max_len = get_data('proper_reber.txt', 'improper_reber.txt')

In [3]:
def get_arrays(proper_reber_lines, improper_reber_lines):
    truths = np.ones((proper_reber_lines.shape[0], 1))
    truth_data = np.hstack([proper_reber_lines, truths])
    falses = np.zeros((improper_reber_lines.shape[0], 1))
    false_data = np.hstack([improper_reber_lines, falses])
    all_data = np.vstack([truth_data, false_data])

    np.random.seed(123)
    np.random.shuffle(all_data)

    eighty_percent_count = int(.8 * all_data.shape[0])
    train = all_data[:eighty_percent_count, :]
    twenty_percent_count = int(.1 * train.shape[0])
    valid = train[:twenty_percent_count, :]
    train = train[twenty_percent_count:, :]
    test = all_data[eighty_percent_count:, :]

    return train, valid, test


train, valid, test = get_arrays(proper_reber_lines, improper_reber_lines)

Looking for a classic-ML baseline model

In [4]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import LinearSVC, SVC
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

In [5]:
pipeline = make_pipeline(StandardScaler(), LogisticRegressionCV(cv=5, max_iter=1000))

pipeline.fit(train[:, :-1], train[:, -1])

pipeline.score(train[:, :-1], train[:, -1])

0.8710352039037992

In [6]:
lin_svc_grid = make_pipeline(StandardScaler(), LinearSVC(C=.10, max_iter=10000))

lin_svc_grid.fit(train[:, :-1], train[:, -1])

lin_svc_grid.score(train[:, :-1], train[:, -1])

0.8609271523178808

In [7]:
params = {
    'classifier__C': [.01, 1, 2, 4],
    'classifier__degree': [2, 3, 4],
    'classifier__kernel': ['poly', 'rbf', 'sigmoid'],
    'classifier__gamma': ['scale', 'auto']
}
svc_pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('classifier', SVC())
    ])
svc_search = GridSearchCV(svc_pipeline, params)

svc_search.fit(train[:, :-1], train[:, -1])

svc_search.score(train[:, :-1], train[:, -1])

0.8957825026141513

In [8]:
model_path = os.path.join('saved_models', 'xgb_basic_reber.json')


def find_best_tree_model(save_path: str, train_data: np.array) -> Pipeline:
    if os.path.exists(save_path):
        xgb_model = xgb.XGBRFClassifier()
        scaler = StandardScaler()

        scaler.fit(train_data[:, :-1], train_data[:, -1])
        xgb_model.load_model(save_path)

        return Pipeline(
            steps=[
                ('scaler', scaler),
                ('classifier', xgb_model)
            ])
    else:
        params = {
            'classifier__n_estimators': list(range(10, 200, 10)),
            'classifier__max_depth': [2, 3, 4],
            'classifier__max_leaves': list(range(10, 200, 10)),
            'classifier__grow_policy': ['depthwise', 'lossguide'],
            'classifier__objective': ['binary:logistic', 'binary:hinge']
        }
        xgb_search_pipeline = Pipeline(
            steps=[
                ('scaler', StandardScaler()),
                ('classifier', xgb.XGBRFClassifier())
            ])
        xgb_search = GridSearchCV(xgb_search_pipeline, params)

        xgb_search.fit(train_data[:, :-1], train_data[:, -1])
        xgb_search.best_estimator_.named_steps['classifier'].save_model(save_path)

        return Pipeline(
            steps=[
                ('scaler', xgb_search.best_estimator_.named_steps['scaler']),
                ('classifier', xgb_search.best_estimator_.named_steps['classifier'])
            ])

In [9]:
xgb_pipeline = find_best_tree_model(model_path, train)

print(f'train evaluation: {xgb_pipeline.score(train[:, :-1], train[:, -1])}')
print(f'test evaluation: {xgb_pipeline.score(test[:, :-1], test[:, -1])}')

train evaluation: 0.9226211223422796
test evaluation: 0.9146800501882058


Using datasets in this case is just nonsensical, but I decided to go with them anyway - for practice.

In [29]:
def create_target(batch):
    X = batch[:, :-1]
    Y = batch[:, -1:]

    return X, Y


def get_ds(array: np.ndarray) -> tf.data.Dataset:
    return tf.data.Dataset\
        .from_tensor_slices(array)\
        .shuffle(1000)\
        .batch(128)\
        .map(create_target)\
        .prefetch(1)


train_ds = get_ds(train)
valid_ds = get_ds(valid)
test_ds = get_ds(test)

In [30]:
model_dir = os.path.join(os.curdir, 'saved_models')


def run_nn_model(
        train_dataset: tf.data.Dataset,
        valid_dataset: tf.data.Dataset,
        max_length: int,
        in_model: keras.Model = None,
        dir_name='runs_dense',
        model_name='runs_dense',
        patience=20) -> (keras.Sequential, int):

    run_logdir_root = os.path.join(os.curdir, 'tensor_logs')
    dirs_count = len([
        name
        for name in os.listdir(run_logdir_root)
        if os.path.isdir(os.path.join(run_logdir_root, name)) and name.startswith(dir_name)
    ])
    run_logdir = os.path.join(run_logdir_root, f'{dir_name}_{dirs_count}')
    early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience)
    model_checkpoint = keras.callbacks.ModelCheckpoint(os.path.join(model_dir, f'{model_name}_{dirs_count}.h5'), save_best_only=True)
    tensorboard = keras.callbacks.TensorBoard(run_logdir, histogram_freq=1, profile_batch=10)

    if in_model is None:
        in_model = keras.Sequential([
            keras.layers.Input(shape=(max_length,)),
            keras.layers.BatchNormalization(),
            keras.layers.Dense(4 * max_length, activation='relu'),
            keras.layers.BatchNormalization(),
            keras.layers.Dropout(.4),
            keras.layers.Dense(4 * max_length, activation='relu'),
            keras.layers.BatchNormalization(),
            keras.layers.Dropout(.4),
            keras.layers.Dense(4 * max_length, activation='relu'),
            keras.layers.BatchNormalization(),
            keras.layers.Dropout(.3),
            keras.layers.Dense(1, activation='sigmoid'),
        ])

    in_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    in_model.fit(train_dataset, validation_data=valid_dataset, epochs=200, callbacks=[early_stopping, model_checkpoint, tensorboard])

    return in_model, dirs_count

In [31]:
_, dirs_count = run_nn_model(train_ds, valid_ds, max_len)
model = keras.models.load_model(os.path.join(model_dir, f'runs_dense_{dirs_count}.h5'))

model.evaluate(test_ds)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

[0.16935929656028748, 0.942283570766449]

In [32]:
model_numbers = list(range(22, 32, 1))
best_result = 0
best_dense_model = None

for number in model_numbers:
    try:
        name = f'runs_dense_{number}.h5'
        tmp_model = keras.models.load_model(os.path.join(model_dir, name))
        tmp_result = tmp_model.evaluate(test_ds)[1]

        if tmp_result > best_result:
            best_result = tmp_result
            best_dense_model = tmp_model
    except Exception as exc:
        print(exc)
        print('=========================================================')

in user code:

    File "C:\ProgramData\Anaconda3\envs\deep_learning_with_python\lib\site-packages\keras\engine\training.py", line 1727, in test_function  *
        return step_function(self, iterator)
    File "C:\ProgramData\Anaconda3\envs\deep_learning_with_python\lib\site-packages\keras\engine\training.py", line 1713, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\ProgramData\Anaconda3\envs\deep_learning_with_python\lib\site-packages\keras\engine\training.py", line 1701, in run_step  **
        outputs = model.test_step(data)
    File "C:\ProgramData\Anaconda3\envs\deep_learning_with_python\lib\site-packages\keras\engine\training.py", line 1665, in test_step
        y_pred = self(x, training=False)
    File "C:\ProgramData\Anaconda3\envs\deep_learning_with_python\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\ProgramData\Anacond

Now the fun part - embedded reber grammars

In [33]:
proper_embedded_reber_lines, improper_embedded_reber_lines, max_len_embed = get_data('proper_embedded_reber.txt', 'improper_embedded_reber.txt')
train_embed, valid_embed, test_embed = get_arrays(proper_embedded_reber_lines, improper_embedded_reber_lines)

In [34]:
embed_model_path = os.path.join('saved_models', 'xgb_embed_reber.json')
embed_xbg_pipeline = find_best_tree_model(embed_model_path, train_embed)

In [35]:
print(f'train evaluation: {embed_xbg_pipeline.score(train_embed[:, :-1], train_embed[:, -1])}')
print(f'test evaluation: {embed_xbg_pipeline.score(test_embed[:, :-1], test_embed[:, -1])}')

train evaluation: 0.9555555555555556
test evaluation: 0.9525


In [36]:
train_ds_embed = get_ds(train_embed)
valid_ds_embed = get_ds(valid_embed)
test_ds_embed = get_ds(test_embed)

In [37]:
model_embed, dirs_count = run_nn_model(
    train_ds_embed,
    valid_ds_embed,
    None,
    keras.Sequential([
        keras.layers.Input(shape=(max_len_embed,)),
        keras.layers.BatchNormalization(),
        keras.layers.Dense(4 * max_len_embed, activation='relu'),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(.4),
        keras.layers.Dense(4 * max_len_embed, activation='relu'),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(.4),
        keras.layers.Dense(4 * max_len_embed, activation='relu'),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(.3),
        keras.layers.Dense(1, activation='sigmoid'),
    ]))

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [38]:
model_embed.evaluate(test_ds_embed)



[0.3060115575790405, 0.9737499952316284]

Time to test it with LSTMs

In [45]:
train_embed_4reshape, valid_embed_4reshape, test_embed_4reshape = get_arrays(proper_embedded_reber_lines, improper_embedded_reber_lines)
train_embed_4reshape = train_embed_4reshape.reshape((train_embed_4reshape.shape[0], max_len_embed + 1, 1))
valid_embed_4reshape = valid_embed_4reshape.reshape((valid_embed_4reshape.shape[0], max_len_embed + 1, 1))
test_embed_4reshape = test_embed_4reshape.reshape((test_embed_4reshape.shape[0], max_len_embed + 1, 1))
train_ds_embed_reshaped = get_ds(train_embed_4reshape)
valid_ds_embed_reshaped = get_ds(valid_embed_4reshape)
test_ds_embed_reshaped = get_ds(test_embed_4reshape)
lstm_model, dirs_count = run_nn_model(
    train_ds_embed_reshaped,
    valid_ds_embed_reshaped,
    None,
    keras.Sequential([
        keras.layers.LSTM(4 * max_len_embed, input_shape=(max_len_embed, 1), return_sequences=True),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(.1),
        keras.layers.LSTM(4 * max_len_embed),
        keras.layers.Dropout(.3),
        keras.layers.Dense(1, activation='sigmoid'),
    ]),
    patience=40,
    dir_name='runs_lstm',
    model_name='runs_lstm')

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [46]:
lstm_model.evaluate(test_ds_embed_reshaped)



[0.022035539150238037, 0.9950000047683716]

In [47]:
gru_model, dirs_count = run_nn_model(
    train_ds_embed_reshaped,
    valid_ds_embed_reshaped,
    None,
    keras.Sequential([
        keras.layers.GRU(4 * max_len_embed, input_shape=(max_len_embed, 1), return_sequences=True),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(.1),
        keras.layers.GRU(4 * max_len_embed),
        keras.layers.Dropout(.3),
        keras.layers.Dense(1, activation='sigmoid'),
    ]),
    patience=40,
    dir_name='runs_gru',
    model_name='runs_gru')

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [48]:
gru_model.evaluate(test_ds_embed_reshaped)



[0.02237865887582302, 0.9950000047683716]