In [1]:
import os

import numpy as np
import pandas as pd

from hyperopt import fmin, hp, tpe, STATUS_OK
from tensorflow import keras

In [2]:
# Load the training table without its 'id' column as a plain NumPy array.
train_raw = (
    pd.read_csv('data/train.csv')
    .drop('id', axis=1)
    .to_numpy()
)
# The first 60% of the rows are used for training.
split = int(train_raw.shape[0] * .6)

# Columns 0-13 are the model inputs; everything from column 14 on is the target.
features, targets = train_raw[:, :14], train_raw[:, 14:]

X_train, y_train = features[:split], targets[:split]
# The remaining rows are cut into two (near-)equal halves: validation, then test.
X_valid, X_test = np.array_split(features[split:], 2)
y_valid, y_test = np.array_split(targets[split:], 2)

In [3]:
# Baseline regressor: two 150-unit ELU hidden layers followed by a single ELU output unit.
hidden_layers = [keras.layers.Dense(150, activation='elu') for _ in range(2)]
output_layer = keras.layers.Dense(1, activation='elu')
baseline_model = keras.models.Sequential(hidden_layers + [output_layer])

baseline_model.compile(loss='mse', optimizer='adam')
# The validation split is only monitored during training; no callbacks act on it.
baseline_model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    batch_size=64,
    epochs=40,
)

Epoch 1/40
Epoch 2/40
Epoch 3/40

KeyboardInterrupt: 

Let's save the predictions and check the score on Kaggle.

In [14]:
# Predict on the Kaggle test set with the baseline model and write a submission file.
test = pd.read_csv('data/test.csv', dtype=float)
X_target = test.drop('id', axis=1).to_numpy()
y_target = baseline_model.predict(X_target)

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
submission_dir = 'submissions'
os.makedirs(submission_dir, exist_ok=True)

# 'id' was read as float together with the other columns; cast it back to int
# so the submission carries integer ids.
submission = pd.DataFrame(
    np.column_stack([test.loc[:, 'id'], y_target]),
    columns=['id', 'target'],
).astype({'id': int})
submission.to_csv(os.path.join(submission_dir, 'dnn_baseline.csv'), index=False)



The baseline DNN regressor trained on raw data reaches a loss close to 0.52, and its Kaggle score is worse than that of the XGBoost regressor. Can I beat it by denoising the data first?

In [3]:
# Candidate noise layers; the objective reads each layer's `stddev` attribute.
noise_layers = tuple(
    keras.layers.GaussianNoise(stddev)
    for stddev in (.01, .02, .03, .1, .2)
)

# Hyperopt search space: activation function, bottleneck width and noise level.
search_space = dict(
    activation=hp.choice('activation', ('relu', 'elu')),
    units=hp.choice('units', (4, 5, 6)),
    distorting_layer=hp.choice('distorting_layer', noise_layers),
)

# Output locations for model checkpoints and TensorBoard logs.
model_dir = os.path.join(os.curdir, 'saved_models')
log_dir = os.path.join(os.curdir, 'tensor_logs')


def nn_objective(space: dict) -> dict:
    """Train one denoising autoencoder and report its loss to hyperopt.

    Args:
        space: One sample of the search space with the keys ``'activation'``
            (activation name used by every Dense layer), ``'units'`` (width of
            the bottleneck layer) and ``'distorting_layer'`` (a GaussianNoise
            layer whose ``stddev`` sets the noise level).

    Returns:
        A dict with ``'loss'`` (the best validation reconstruction MSE reached
        during training) and ``'status'`` set to ``STATUS_OK``.

    Side effects:
        Creates ``model_dir`` if needed, saves the best checkpoint of the trial
        there as ``dae_<n>.h5`` and writes TensorBoard logs below ``log_dir``.
    """
    activation = space['activation']
    bottleneck_units = space['units']
    # Only the noise level is taken from the sampled layer; a new layer is
    # created per trial so that models do not share layer objects.
    noise_stddev = space['distorting_layer'].stddev

    encoder = keras.models.Sequential([
        keras.layers.Dense(14, activation=activation),
        keras.layers.Dense(28, activation=activation),
        keras.layers.GaussianNoise(noise_stddev),
        keras.layers.Dense(bottleneck_units, activation=activation)
    ])
    decoder = keras.models.Sequential([
        keras.layers.Dense(bottleneck_units * 2, activation=activation),
        keras.layers.Dense(28, activation=activation),
        keras.layers.Dense(14, activation=activation)
    ])
    denoising_ae = keras.models.Sequential([encoder, decoder])

    early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=4,
                                                   min_delta=1e-4)

    # os.listdir raises FileNotFoundError on a missing directory, so create it
    # first; the running file count gives every trial its own index.
    os.makedirs(model_dir, exist_ok=True)
    models_count = len(os.listdir(model_dir)) + 1
    model_checkpoint = keras.callbacks.ModelCheckpoint(
        os.path.join(model_dir, f'dae_{models_count}.h5'),
        save_best_only=True)

    # The same parameter combination can be sampled more than once; the trial
    # index keeps the TensorBoard runs from being written into one directory.
    param_combination = f'{activation}_{bottleneck_units}_gaussian{noise_stddev}'
    current_log_dir = os.path.join(log_dir, f'{param_combination}_run{models_count}')
    tensorboard = keras.callbacks.TensorBoard(current_log_dir, histogram_freq=1, profile_batch=10)

    denoising_ae.compile(optimizer='adam', loss='mse')
    history = denoising_ae.fit(X_train, X_train, epochs=20, batch_size=64,
                               verbose=0,
                               callbacks=[early_stopping, model_checkpoint, tensorboard],
                               validation_data=(X_valid, X_valid))

    # Score the trial on the validation split, not on X_test: tuning on the
    # test data would leak it into model selection. The minimum over epochs is
    # the loss of the checkpoint kept by ModelCheckpoint(save_best_only=True).
    best_val_loss = min(history.history['val_loss'])

    return {'loss': best_val_loss, 'status': STATUS_OK}

In [None]:
# Run 50 TPE-guided trials of nn_objective over search_space.
# NOTE: for hp.choice dimensions fmin reports the index of the selected option,
# not the option itself; hyperopt.space_eval maps the result back to values.
best_params = fmin(nn_objective, search_space, algo=tpe.suggest, max_evals=50)

In [5]:
from Tabular_012021.constrained_priority_list import ConstrainedPriorityList


# Collect every saved autoencoder together with its reconstruction loss on the
# test split. NOTE(review): 50 is presumably the list's capacity — confirm
# against ConstrainedPriorityList.
prio_list = ConstrainedPriorityList(50)

for path in os.listdir(model_dir):
    model_path = os.path.join(model_dir, path)
    model = keras.models.load_model(model_path)
    loss = model.evaluate(X_test, X_test)
    prio_list.add(loss, model)

The best denoising autoencoder could be taken from the priority list, but since I had cleaned up the saved model files in the meantime, I had to build one from scratch.

In [5]:
# Encoder: 14 -> 28 -> Gaussian noise (stddev 0.02) -> 6-unit bottleneck.
encoder_layers = [
    keras.layers.Dense(14, activation='elu'),
    keras.layers.Dense(28, activation='elu'),
    keras.layers.GaussianNoise(.02),
    keras.layers.Dense(6, activation='elu'),
]
# Decoder: 12 -> 28 -> 14, so the output has the same width as the input.
decoder_layers = [
    keras.layers.Dense(units, activation='elu')
    for units in (12, 28, 14)
]

encoder = keras.models.Sequential(encoder_layers)
decoder = keras.models.Sequential(decoder_layers)
denoising_ae = keras.models.Sequential([encoder, decoder])

denoising_ae.compile(loss='mse', optimizer='adam')
# The autoencoder is trained to reproduce its own input.
denoising_ae.fit(
    X_train, X_train,
    validation_data=(X_valid, X_valid),
    batch_size=64,
    epochs=20,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x229bde00970>

In [7]:
# Reconstruction loss on the held-out test split (input and target are both X_test).
denoising_ae.evaluate(X_test, X_test)



0.005591393448412418

As it turns out, a standard DNN combined with a DNN-based denoising autoencoder does worse than the DNN trained on raw data (not shown here).