# Monk benchmark: Keras NN

Python modules:
- Numpy v1.26.2
- Pandas v2.1.4
- Matplotlib v3.8.2
- Keras v2.15.0:            `pip install --upgrade keras`
- Tensorflow v2.15.0.post1: `pip install --upgrade tensorflow`
- SciKeras v0.12.0:         `pip install scikeras`
- SciKit-Learn v1.3.2:      `pip install --upgrade scikit-learn`

In [None]:
# Suppress debugging information about GPU
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

import os.path
import sys
import keras
import numpy as np
import pandas as pd
import tensorflow as tf
import logging
from matplotlib import pyplot as plt
from numpy import mean, std
from keras.initializers import RandomUniform, GlorotUniform, Zeros
from keras.layers import Dense, InputLayer
from keras.losses import MeanSquaredError
from keras.metrics import BinaryAccuracy
from keras.models import Sequential
from keras.optimizers import *
from keras.regularizers import L2, L1
from keras.callbacks import Callback
from scikeras.wrappers import KerasClassifier
from sklearn.metrics import accuracy_score, mean_squared_error, make_scorer
from sklearn.model_selection import GridSearchCV
from pathlib import Path
from monk_helpers import *

# set random seed for reproducible experiments
# NOTE(review): per the Keras documentation keras.utils.set_random_seed seeds
# Python's `random`, NumPy and TensorFlow together, so the explicit
# tf.random.set_seed call looks redundant -- confirm before removing it.
tf.random.set_seed(42)
keras.utils.set_random_seed(42)

### Utilities

In [None]:
# get_NN: build and compile a NN with a single hidden layer
def get_NN(X_len, initializer="random", seed=42, neurons=4, lr=0.01, alpha=0.5,
          hidden_activation="tanh", lambda_reg=None, penalty=None, nesterov=False):
  """Return a compiled Sequential model: input -> hidden Dense -> 1 sigmoid unit.

  Parameters:
    X_len: number of input features.
    initializer: "glorot" selects GlorotUniform kernels with zero biases; any
      other value selects RandomUniform for both kernels and biases.
    seed: seed handed to the (seeded) initializers.
    neurons: number of units of the hidden layer.
    lr: SGD learning rate.
    alpha: SGD momentum.
    hidden_activation: activation of the hidden layer.
    lambda_reg: regularization coefficient, used only when penalty is set.
    penalty: "L1" or "L2" to regularize the kernels; anything else disables
      regularization.
    nesterov: whether SGD uses Nesterov momentum.

  The model is compiled with SGD, mean squared error loss and the binary
  accuracy metric.
  """

  # weight initialization
  if initializer == "glorot":
    kernel_init, bias_init = GlorotUniform(seed=seed), Zeros()
  else:
    kernel_init, bias_init = RandomUniform(seed=seed), RandomUniform(seed=seed)

  # regularization
  if penalty == "L1":
    kernel_penalty = L1(l1=lambda_reg)
  elif penalty == "L2":
    kernel_penalty = L2(l2=lambda_reg)
  else:
    kernel_penalty = None

  # both Dense layers share the same initializers and the same regularizer
  dense_options = dict(
    kernel_initializer=kernel_init,
    bias_initializer=bias_init,
    kernel_regularizer=kernel_penalty,
  )

  # 1 hidden layer followed by the sigmoid output unit
  network = Sequential()
  network.add(InputLayer(input_shape=(X_len,)))
  network.add(Dense(units=neurons, activation=hidden_activation, **dense_options))
  network.add(Dense(units=1, activation="sigmoid", **dense_options))

  sgd = SGD(learning_rate=lr, momentum=alpha, nesterov=nesterov)
  network.compile(optimizer=sgd, loss=MeanSquaredError(), metrics=BinaryAccuracy())

  return network

In [None]:
# utilities to get the mean of K histories

def add_padding(ls, n):
  """Append n copies of the last element of ls (in place) and return ls."""
  filler = ls[-1]
  for _ in range(n):
    ls.append(filler)
  return ls

def longest(ls):
  """Return the length of the longest 'loss' series among the histories in ls."""
  return max(len(history['loss']) for history in ls)

def mean_epochs(l):
  """Return the mean length of the 'loss' series in l, truncated to an int."""
  epochs_per_history = []
  for item in l:
    epochs_per_history.append(len(item['loss']))
  return int(mean(epochs_per_history))

def mean_history(_histories):
  """Return the epoch-wise mean of several training histories.

  Every history is a dict mapping a field name (e.g. 'loss') to a list with
  one value per epoch. Before averaging, all series are brought to a common
  length m = int(mean number of 'loss' epochs) + 1: a history whose 'loss'
  has at least m epochs is cut to its first m values, a shorter one is
  padded by repeating its last value.

  The input histories are NOT modified: alignment is done on copies, so code
  that inspects the original histories afterwards (for instance to count the
  epochs actually run in each fold) still sees the real series.

  Parameters:
    _histories: non-empty list of history dicts; the fields of the first
      history define the fields of the result.

  Returns:
    dict mapping each field to the list of per-epoch means.

  Raises:
    ValueError: if _histories is empty.
  """
  if not _histories:
    raise ValueError("mean_history needs at least one history")

  # common length: truncated mean number of epochs, plus one
  m = int(mean([ len(history['loss']) for history in _histories ])) + 1

  def _aligned(history, field):
    # truncate/pad decision is driven by the length of the 'loss' series
    values = list(history[field])
    l = len(history['loss'])
    if l >= m:
      return values[:m]
    return values + [values[-1]] * (m - l)

  n_histories = len(_histories)
  return {
    field : [
      sum(step) / n_histories
      for step in zip(*[ _aligned(history, field) for history in _histories ])
    ]
    for field in _histories[0]
  }

In [None]:
# static fold counter: the value lives in the `count.count` function attribute
def count():
  """Advance the fold counter by one and return the new value."""
  count.count += 1
  return count.count

def reset_counter():
  """Rewind the counter to -1, so that the next count() call returns 0."""
  count.count =-1

def get_count():
  """Return the current counter value without advancing it."""
  return count.count

# initialise the counter on definition, so that count() and get_count() work
# even when reset_counter() has not been called yet
reset_counter()

In [None]:
# static history register: the list lives in the `histories.histories`
# function attribute
def histories():
  """(Re)create the register as an empty list."""
  histories.histories = []

def register(h):
  """Append the history h to the register."""
  histories.histories.append(h)

def get_histories():
  """Return the register itself (the live list, not a copy)."""
  return histories.histories

def clear_histories():
  """Drop every registered history."""
  histories()

# initialise the register on definition, so that register() and
# get_histories() work even when clear_histories() has not been called yet
clear_histories()

In [None]:
# plot utility
def do_NN_plot(history):
  """Show two figures from a history dict: accuracy first, then MSE.

  history must provide the series 'binary_accuracy', 'val_binary_accuracy',
  'loss' and 'val_loss'. In each figure the first series is drawn with the
  default style and the 'val_*' series as a dashed orange line.
  """
  # (training series, validation series, title, y label, legend position)
  figures = (
    ('binary_accuracy', 'val_binary_accuracy', 'model accuracy', 'accuracy', 'lower right'),
    ('loss', 'val_loss', 'model MSE', 'MSE', 'upper right'),
  )

  for train_key, val_key, title, y_label, legend_loc in figures:
    plt.plot(history[train_key])
    plt.plot(history[val_key], linestyle="--", color="orange")
    plt.title(title)
    plt.ylabel(y_label)
    plt.xlabel('epoch')
    plt.legend(['training', 'test'], loc=legend_loc)
    plt.show()

In [None]:
# KerasClassifier Wrapper for kfold
class KCWrapper(KerasClassifier):
  """KerasClassifier that records one training history per fold of a k-fold run.

  Every fit() call advances the module-level fold counter, uses the result as
  index into val_data to pick the validation set of the current fold, and
  registers the resulting history. After the k-th fit the mean learning
  curves over the k folds are plotted.

  NOTE(review): this relies on fit() being called sequentially, once per
  fold and in the order of val_data, with reset_counter()/clear_histories()
  called before each k-fold run -- confirm at the call sites.
  """

  def __init__(self, val_data, k, *args, **kwargs):
    """Store the per-fold validation sets and the number of folds.

    val_data: sequence with one validation set per fold.
    k: number of folds of the cross-validation.
    Remaining arguments are forwarded to KerasClassifier.
    """
    super().__init__(*args, **kwargs)
    self.val_data = val_data
    self.k = k
    
  def fit(self, X, y, **kwargs):
    """Fit on one fold, register its history and return the fitted estimator."""
    h = super().fit(X, y, validation_data=self.val_data[count()], **kwargs)
    register(h.history_)
    # to inspect a single fold, call do_NN_plot(h.history_) here
    if self.kfold_finished(): # plot mean of k folds curves
      do_NN_plot(mean_history(get_histories()))
    # scikit-learn estimators are expected to return themselves from fit
    return self
    
  def kfold_finished(self):
    """Return True once the fold counter has reached the last of the k folds."""
    return self.k == get_count()+1

### Monk 1

In [None]:
# Datasets Path (MONK-1 training and test files)
TR_PATH, TS_PATH = (
  "./monks/datasets/monks-1.train",
  "./monks/datasets/monks-1.test",
)

# load the training set first, then the test set, each as an (X, y) pair
(X_train, y_train), (X_test, y_test) = (read_ds(path) for path in (TR_PATH, TS_PATH))

#### Model Selection

In [None]:
# Define grids for gridsearchcv

# arguments of the KerasClassifier wrapper; X_len is consumed by get_NN
kerasClassifierParams = dict(
  model=get_NN,
  X_len=len(X_train.columns),
  loss="mse",
  optimizer="SGD",  # fixed into get_NN
  epochs=300,
  batch_size=4,
  shuffle=True,
  verbose=False,
)

NN = KerasClassifier(**kerasClassifierParams)

# scorers: the "mse" scorer is created with greater_is_better=False
custom_scores_monk = dict(
  accuracy="accuracy",
  mse=make_scorer(mean_squared_error,greater_is_better=False),
)

# hyperparameter grid (values tried earlier are kept in the trailing comments)
NN_MONK1_GRID_DICT = dict(
  # epochs=[100, 300, 600],
  # batch_size=[4, 10, 16, 32, 64],
  model__lr=[0.25],  # [0.01, 0.2, 0.22, 0.25, 0.3, 0.4, 0.5]
  model__alpha=[0.8],  # [0.5, 0.55, 0.6, 0.62, 0.7, 0.79, 0.8, 0.81, 0.9]
  model__hidden_activation=["tanh"],
  model__neurons=[4],  # [2, 3, 4]
  model__initializer=["glorot"],  # ["random", "glorot"]
  model__nesterov=[True],  # [True, False]
  model__penalty=[None],  # [None, "L1", "L2"]
  model__lambda_reg=[None],  # [0.01, 0.005]
  model__seed=[15],
)

# grid search over the folds in CV; the model is refit using the "mse" scorer
grid = GridSearchCV(
  estimator=NN,
  param_grid=NN_MONK1_GRID_DICT,
  scoring=custom_scores_monk,
  refit="mse",
  cv=CV,
  return_train_score=True,
  n_jobs=-1,
)

In [None]:
# run the grid search (the refit step fits the selected model) and report it
grid.fit(X_train, y_train)
print(f"Best parameters: {grid.best_params_!s} score: {grid.best_score_!s}")

In [None]:
# show the candidates ranked in the first three places by test accuracy
# (ties in rank are ordered by mean fit time)
columns = [
  'rank_test_accuracy',
  'param_model__nesterov',
  'param_model__alpha',
  'param_model__lr',
  'mean_test_mse',
  'std_test_mse',
  'mean_fit_time',
]
top_models = (
  pd.DataFrame(grid.cv_results_)
    .sort_values(by=['rank_test_accuracy','mean_fit_time'])
    .loc[lambda results: results['rank_test_accuracy'] <= 3]
)
top_models[columns]

In [None]:
# second kfold: validation curves, early stopping and mean error of top models

# validation folds: row positions of the held-out part of every CV split
# NOTE(review): assumes CV.split yields the same folds, in the same order, as
# the GridSearchCV runs that use NN_2 -- confirm that CV is deterministic.
val_split = [ test for (train, test) in CV.split(X_train, y_train) ]

# one (features, targets) pair per validation fold, as plain Python lists
val_data = [ 
  (
    [X_train.iloc[i].tolist() for i in indexes], 
    [y_train.iloc[i].tolist() for i in indexes]
  ) for indexes in val_split 
]

# wrapper that validates every fold on its own held-out data; training stops
# early when val_loss has not improved by min_delta for `patience` epochs
NN_2 = KCWrapper(
  val_data,
  len(val_data),  # number of folds, taken from the split instead of a hard-coded 5
  callbacks=[
    tf.keras.callbacks.EarlyStopping(
      monitor="val_loss", min_delta=0.0001, patience=20, restore_best_weights=True
    )
  ],
  **kerasClassifierParams
)

# GridSearchCV settings shared by the runs of the second kfold; n_jobs is 1
# because KCWrapper keeps its fold counter and histories in module-level state
grid_dict = { "scoring": custom_scores_monk,
              "refit" : False,
              "cv" : CV,
              "return_train_score" : True,
              "n_jobs" : 1,
}

In [None]:
# exec kfold and select the model:
# every top model is cross-validated with 5 different initialization seeds

seed = NN_MONK1_GRID_DICT['model__seed'][0]

# mean number of epochs run per fold, one entry per (model, seed) pair
n_epochs = []

for params in top_models['params']:
  print(params)

  tr_err, tr_acc, ts_err, ts_acc = [], [], [], []
  for i in range(seed, seed+5):
    print("Seed: " + str(i))

    # reset fold counter and histories
    reset_counter()   
    clear_histories() 

    # set params and seed: a grid with exactly one candidate
    grid_dict['param_grid'] = { field : [value] for (field, value) in params.items() }
    grid_dict['param_grid']['model__seed'] = [i]
    grid_2 = GridSearchCV(NN_2, **grid_dict)
    grid_2.fit(X_train, y_train)

    # save mse and accuracy of the single candidate (index 0).
    # The "mse" scorer is built with greater_is_better=False, so cv_results_
    # stores the negated MSE: flip the sign back to report a real MSE.
    tr_err.append( -grid_2.cv_results_['mean_train_mse'][0] ) 
    tr_acc.append( grid_2.cv_results_['mean_train_accuracy'][0] )
    ts_err.append( -grid_2.cv_results_['mean_test_mse'][0] ) 
    ts_acc.append( grid_2.cv_results_['mean_test_accuracy'][0] )

    # memorize number of epochs (mean over the histories registered by the folds)
    n_epochs.append(mean([ len(history['loss']) for history in get_histories() ]))

  print("Tr mean accuracy over 5 inits: {:.3f} +/- {:.3f} (std)\nTr mean mse over 5 inits {:.6f} +/- {:.6f} (std)\n" \
        .format(
          mean(tr_acc), std(tr_acc),
          mean(tr_err), std(tr_err)
        )
  )

  print("Vl mean accuracy over 5 inits: {:.3f} +/- {:.3f} (std)\nVl mean mse over 5 inits {:.6f} +/- {:.6f} (std)\n" \
        .format(
          mean(ts_acc), std(ts_acc),
          mean(ts_err), std(ts_err)
        )
  ) 

#### Model Assessment

In [None]:
# retrain on the whole ds, compute accuracy, mean test error and plot learning curve

seed = NN_MONK1_GRID_DICT['model__seed'][0]
tr_err, ts_err = [], []
batch_size = kerasClassifierParams['batch_size']

# get_NN keyword arguments: first value of every "model__*" grid entry with the
# routing prefix removed (built once; only the seed changes between runs)
NN_monk1_params = {
  field[len("model__"):] : value[0]
  for (field, value) in NN_MONK1_GRID_DICT.items() if field.startswith("model__")
}

# settings shared by every final fit: the number of epochs is the mean of the
# epoch counts collected in n_epochs. The test set is passed as validation
# data only to record its curves; no callback is used in these fits.
final_fit_args = dict(
  batch_size=batch_size, epochs=int(mean(n_epochs)),
  validation_data=(X_test, y_test), shuffle=True, verbose=0
)

# mean test error and std over 5 initializations
for i in range(seed, seed+5):

  NN_monk1_params['seed'] = i
  NN_monk1 = get_NN(len(X_train.columns), **NN_monk1_params)

  h = NN_monk1.fit(X_train, y_train, **final_fit_args)

  # save mse (first value returned by evaluate) on training and test set
  tr_err.append( NN_monk1.evaluate(X_train, y_train, verbose=0)[0] ) 
  ts_err.append( NN_monk1.evaluate(X_test, y_test, verbose=0)[0] )

# compute accuracy on test set with a model trained from the base seed
NN_monk1_params['seed'] = seed
NN_monk1 = get_NN(len(X_train.columns), **NN_monk1_params)
h = NN_monk1.fit(X_train, y_train, **final_fit_args)
tr_accuracy = NN_monk1.evaluate(X_train, y_train, verbose=0)[1]
ts_accuracy = NN_monk1.evaluate(X_test, y_test, verbose=0)[1]

# print results and plot
print("Tr. accuracy: {:.3f} \n".format(tr_accuracy))

print("Tr. mean mse over 5 inits {:.6f} +/- {:.6f} (std)\n".format( mean(tr_err), std(tr_err) ) )

print("Ts. accuracy: {:.3f} \n".format(ts_accuracy))

print("Ts. mean mse over 5 inits {:.6f} +/- {:.6f} (std)\n".format( mean(ts_err), std(ts_err) ) )

do_NN_plot(h.history)

### Monk 2

In [None]:
# Datasets Path (MONK-2 training and test files)
TR_PATH, TS_PATH = (
  "./monks/datasets/monks-2.train",
  "./monks/datasets/monks-2.test",
)

# load the training set first, then the test set, each as an (X, y) pair
(X_train, y_train), (X_test, y_test) = (read_ds(path) for path in (TR_PATH, TS_PATH))

#### Model Selection

In [None]:
# Define grids for gridsearchcv

# arguments of the KerasClassifier wrapper; X_len is consumed by get_NN
kerasClassifierParams = dict(
  model=get_NN,
  X_len=len(X_train.columns),
  loss="mse",
  optimizer="SGD",  # fixed into get_NN
  epochs=300,
  batch_size=4,
  shuffle=True,
  verbose=False,
)

NN = KerasClassifier(**kerasClassifierParams)

# scorers: the "mse" scorer is created with greater_is_better=False
custom_scores_monk = dict(
  accuracy="accuracy",
  mse=make_scorer(mean_squared_error,greater_is_better=False),
)

# hyperparameter grid (values tried earlier are kept in the trailing comments)
NN_MONK2_GRID_DICT = dict(
  # epochs=[100, 300, 600],
  # batch_size=[4, 10, 16, 32, 64],
  model__lr=[0.25],  # [0.01, 0.2, 0.22, 0.25, 0.3, 0.4, 0.5]
  model__alpha=[0.8],  # [0.5, 0.55, 0.6, 0.62, 0.7, 0.79, 0.8, 0.81, 0.9]
  model__hidden_activation=["tanh"],
  model__neurons=[4],  # [2, 3, 4]
  model__initializer=["glorot"],  # ["random", "glorot"]
  model__nesterov=[True],  # [True, False]
  model__penalty=[None],  # [None, "L1", "L2"]
  model__lambda_reg=[None],  # [0.01, 0.005]
  model__seed=[15],
)

# grid search over the folds in CV; the model is refit using the "mse" scorer
grid = GridSearchCV(
  estimator=NN,
  param_grid=NN_MONK2_GRID_DICT,
  scoring=custom_scores_monk,
  refit="mse",
  cv=CV,
  return_train_score=True,
  n_jobs=-1,
)

In [None]:
# run the grid search (the refit step fits the selected model) and report it
grid.fit(X_train, y_train)
print(f"Best parameters: {grid.best_params_!s} score: {grid.best_score_!s}")

In [None]:
# show the candidates ranked in the first three places by test accuracy
# (ties in rank are ordered by mean fit time)
columns = [
  'rank_test_accuracy',
  'param_model__nesterov',
  'param_model__alpha',
  'param_model__lr',
  'mean_test_mse',
  'std_test_mse',
  'mean_fit_time',
]
top_models = (
  pd.DataFrame(grid.cv_results_)
    .sort_values(by=['rank_test_accuracy','mean_fit_time'])
    .loc[lambda results: results['rank_test_accuracy'] <= 3]
)
top_models[columns]

In [None]:
# second kfold: validation curves, early stopping and mean error of top models

# validation folds: row positions of the held-out part of every CV split
# NOTE(review): assumes CV.split yields the same folds, in the same order, as
# the GridSearchCV runs that use NN_2 -- confirm that CV is deterministic.
val_split = [ test for (train, test) in CV.split(X_train, y_train) ]

# one (features, targets) pair per validation fold, as plain Python lists
val_data = [ 
  (
    [X_train.iloc[i].tolist() for i in indexes], 
    [y_train.iloc[i].tolist() for i in indexes]
  ) for indexes in val_split 
]

# wrapper that validates every fold on its own held-out data; training stops
# early when val_loss has not improved by min_delta for `patience` epochs
NN_2 = KCWrapper(
  val_data,
  len(val_data),  # number of folds, taken from the split instead of a hard-coded 5
  callbacks=[
    tf.keras.callbacks.EarlyStopping(
      monitor="val_loss", min_delta=0.0001, patience=20, restore_best_weights=True
    )
  ],
  **kerasClassifierParams
)

# GridSearchCV settings shared by the runs of the second kfold; n_jobs is 1
# because KCWrapper keeps its fold counter and histories in module-level state
grid_dict = { "scoring": custom_scores_monk,
              "refit" : False,
              "cv" : CV,
              "return_train_score" : True,
              "n_jobs" : 1,
}

In [None]:
# exec kfold and select the model:
# every top model is cross-validated with 5 different initialization seeds

seed = NN_MONK2_GRID_DICT['model__seed'][0]

# mean number of epochs run per fold, one entry per (model, seed) pair
n_epochs = []

for params in top_models['params']:
  print(params)

  tr_err, tr_acc, ts_err, ts_acc = [], [], [], []
  for i in range(seed, seed+5):
    print("Seed: " + str(i))

    # reset fold counter and histories
    reset_counter()   
    clear_histories() 

    # set params and seed: a grid with exactly one candidate
    grid_dict['param_grid'] = { field : [value] for (field, value) in params.items() }
    grid_dict['param_grid']['model__seed'] = [i]
    grid_2 = GridSearchCV(NN_2, **grid_dict)
    grid_2.fit(X_train, y_train)

    # save mse and accuracy of the single candidate (index 0).
    # The "mse" scorer is built with greater_is_better=False, so cv_results_
    # stores the negated MSE: flip the sign back to report a real MSE.
    tr_err.append( -grid_2.cv_results_['mean_train_mse'][0] ) 
    tr_acc.append( grid_2.cv_results_['mean_train_accuracy'][0] )
    ts_err.append( -grid_2.cv_results_['mean_test_mse'][0] ) 
    ts_acc.append( grid_2.cv_results_['mean_test_accuracy'][0] )

    # memorize number of epochs (mean over the histories registered by the folds)
    n_epochs.append(mean([ len(history['loss']) for history in get_histories() ]))

  print("Tr mean accuracy over 5 inits: {:.3f} +/- {:.3f} (std)\nTr mean mse over 5 inits {:.6f} +/- {:.6f} (std)\n" \
        .format(
          mean(tr_acc), std(tr_acc),
          mean(tr_err), std(tr_err)
        )
  )

  print("Vl mean accuracy over 5 inits: {:.3f} +/- {:.3f} (std)\nVl mean mse over 5 inits {:.6f} +/- {:.6f} (std)\n" \
        .format(
          mean(ts_acc), std(ts_acc),
          mean(ts_err), std(ts_err)
        )
  ) 

#### Model Assessment

In [None]:
# retrain on the whole ds, compute accuracy, mean test error and plot learning curve

seed = NN_MONK2_GRID_DICT['model__seed'][0]
tr_err, ts_err = [], []
batch_size = kerasClassifierParams['batch_size']

# get_NN keyword arguments: first value of every "model__*" grid entry with the
# routing prefix removed (built once; only the seed changes between runs)
NN_monk2_params = {
  field[len("model__"):] : value[0]
  for (field, value) in NN_MONK2_GRID_DICT.items() if field.startswith("model__")
}

# settings shared by every final fit: the number of epochs is the mean of the
# epoch counts collected in n_epochs. The test set is passed as validation
# data only to record its curves; no callback is used in these fits.
final_fit_args = dict(
  batch_size=batch_size, epochs=int(mean(n_epochs)),
  validation_data=(X_test, y_test), shuffle=True, verbose=0
)

# mean test error and std over 5 initializations
for i in range(seed, seed+5):

  NN_monk2_params['seed'] = i
  NN_monk2 = get_NN(len(X_train.columns), **NN_monk2_params)

  h = NN_monk2.fit(X_train, y_train, **final_fit_args)

  # save mse (first value returned by evaluate) on training and test set
  tr_err.append( NN_monk2.evaluate(X_train, y_train, verbose=0)[0] ) 
  ts_err.append( NN_monk2.evaluate(X_test, y_test, verbose=0)[0] )

# compute accuracy on test set with a model trained from the base seed
NN_monk2_params['seed'] = seed
NN_monk2 = get_NN(len(X_train.columns), **NN_monk2_params)
h = NN_monk2.fit(X_train, y_train, **final_fit_args)
tr_accuracy = NN_monk2.evaluate(X_train, y_train, verbose=0)[1]
ts_accuracy = NN_monk2.evaluate(X_test, y_test, verbose=0)[1]

# print results and plot
print("Tr. accuracy: {:.3f} \n".format(tr_accuracy))

print("Tr. mean mse over 5 inits {:.6f} +/- {:.6f} (std)\n".format( mean(tr_err), std(tr_err) ) )

print("Ts. accuracy: {:.3f} \n".format(ts_accuracy))

print("Ts. mean mse over 5 inits {:.6f} +/- {:.6f} (std)\n".format( mean(ts_err), std(ts_err) ) )

do_NN_plot(h.history)

### Monk 3

In [None]:
# Datasets Path (MONK-3 training and test files)
TR_PATH, TS_PATH = (
  "./monks/datasets/monks-3.train",
  "./monks/datasets/monks-3.test",
)

# load the training set first, then the test set, each as an (X, y) pair
(X_train, y_train), (X_test, y_test) = (read_ds(path) for path in (TR_PATH, TS_PATH))

#### Model Selection

In [None]:
# Define grids for gridsearchcv

# arguments of the KerasClassifier wrapper; X_len is consumed by get_NN
kerasClassifierParams = dict(
  model=get_NN,
  X_len=len(X_train.columns),
  loss="mse",
  optimizer="SGD",  # fixed into get_NN
  epochs=300,
  batch_size=32,
  shuffle=True,
  verbose=False,
)

NN = KerasClassifier(**kerasClassifierParams)

# scorers: the "mse" scorer is created with greater_is_better=False
custom_scores_monk = dict(
  accuracy="accuracy",
  mse=make_scorer(mean_squared_error,greater_is_better=False),
)

# hyperparameter grid (values tried earlier are kept in the trailing comments)
NN_MONK3_GRID_DICT = dict(
  # epochs=[100, 300, 600],
  # batch_size=[4, 8, 16, 32, 64],
  model__lr=[0.02],  # [0.01, 0.02, 0.05, 0.2]
  model__alpha=[0.69],  # [0.5, 0.55, 0.6, 0.65, 0.7, 0.8]
  model__hidden_activation=["tanh"],
  model__neurons=[4],  # [2, 3, 4]
  model__initializer=["glorot"],  # ["random", "glorot"]
  model__nesterov=[True],  # [True, False]
  model__penalty=["L1"],  # ["L1", "L2"]
  model__lambda_reg=[0.002],  # [0.01, 0.005]
  model__seed=[15],
)

# grid search over the folds in CV; the model is refit using the "mse" scorer
grid = GridSearchCV(
  estimator=NN,
  param_grid=NN_MONK3_GRID_DICT,
  scoring=custom_scores_monk,
  refit="mse",
  cv=CV,
  return_train_score=True,
  n_jobs=-1,
)

In [None]:
# run the grid search (the refit step fits the selected model) and report it
grid.fit(X_train, y_train)
print(f"Best parameters: {grid.best_params_!s} score: {grid.best_score_!s}")

In [None]:
# show the candidates ranked in the first three places by test accuracy
# (ties in rank are ordered by mean fit time)
columns = [
  'rank_test_accuracy',
  'param_model__penalty',
  'param_model__lambda_reg',
  'param_model__nesterov',
  'param_model__alpha',
  'param_model__lr',
  'mean_test_mse',
  'std_test_mse',
  'mean_fit_time',
]
top_models = (
  pd.DataFrame(grid.cv_results_)
    .sort_values(by=['rank_test_accuracy','mean_fit_time'])
    .loc[lambda results: results['rank_test_accuracy'] <= 3]
)
top_models[columns]

In [None]:
# second kfold: validation curves, early stopping and mean error of top models

# validation folds: row positions of the held-out part of every CV split
# NOTE(review): assumes CV.split yields the same folds, in the same order, as
# the GridSearchCV runs that use NN_2 -- confirm that CV is deterministic.
val_split = [ test for (train, test) in CV.split(X_train, y_train) ]

# one (features, targets) pair per validation fold, as plain Python lists
val_data = [ 
  (
    [X_train.iloc[i].tolist() for i in indexes], 
    [y_train.iloc[i].tolist() for i in indexes]
  ) for indexes in val_split 
]

# wrapper that validates every fold on its own held-out data; training stops
# early when val_loss has not improved by min_delta for `patience` epochs
NN_2 = KCWrapper(
  val_data,
  len(val_data),  # number of folds, taken from the split instead of a hard-coded 5
  callbacks=[
    tf.keras.callbacks.EarlyStopping(
      monitor="val_loss", min_delta=0.0001, patience=20, restore_best_weights=True
    )
  ],
  **kerasClassifierParams
)

# GridSearchCV settings shared by the runs of the second kfold; n_jobs is 1
# because KCWrapper keeps its fold counter and histories in module-level state
grid_dict = { "scoring": custom_scores_monk,
              "refit" : False,
              "cv" : CV,
              "return_train_score" : True,
              "n_jobs" : 1,
}

In [None]:
# exec second kfold for plotting validation learning curves, for early stopping
# and to compute mean error and std over 5 inits.
# On these criteria, choose the model

seed = NN_MONK3_GRID_DICT['model__seed'][0]

# mean number of epochs run per fold, one entry per (model, seed) pair
n_epochs = []

for params in top_models['params']:
  print(params)

  tr_err, tr_acc, ts_err, ts_acc = [], [], [], []
  for i in range(seed, seed+5):
    print("Seed: " + str(i))

    # reset fold counter and histories
    reset_counter()   
    clear_histories() 

    # set params and seed: a grid with exactly one candidate
    grid_dict['param_grid'] = { field : [value] for (field, value) in params.items() }
    grid_dict['param_grid']['model__seed'] = [i]
    grid_2 = GridSearchCV(NN_2, **grid_dict)
    grid_2.fit(X_train, y_train)

    # save mse and accuracy of the single candidate (index 0).
    # The "mse" scorer is built with greater_is_better=False, so cv_results_
    # stores the negated MSE: flip the sign back to report a real MSE.
    tr_err.append( -grid_2.cv_results_['mean_train_mse'][0] ) 
    tr_acc.append( grid_2.cv_results_['mean_train_accuracy'][0] )
    ts_err.append( -grid_2.cv_results_['mean_test_mse'][0] ) 
    ts_acc.append( grid_2.cv_results_['mean_test_accuracy'][0] )

    # memorize number of epochs (mean over the histories registered by the folds)
    n_epochs.append(mean([ len(history['loss']) for history in get_histories() ]))

  print("Tr mean accuracy over 5 inits: {:.3f} +/- {:.3f} (std)\nTr mean mse over 5 inits {:.6f} +/- {:.6f} (std)\n" \
        .format(
          mean(tr_acc), std(tr_acc),
          mean(tr_err), std(tr_err)
        )
  )

  print("Vl mean accuracy over 5 inits: {:.3f} +/- {:.3f} (std)\nVl mean mse over 5 inits {:.6f} +/- {:.6f} (std)\n" \
        .format(
          mean(ts_acc), std(ts_acc),
          mean(ts_err), std(ts_err)
        )
  ) 


#### Model Assessment

In [None]:
# retrain on the whole ds, compute accuracy, mean test error and plot learning curve

seed = NN_MONK3_GRID_DICT['model__seed'][0]
tr_err, ts_err = [], []
batch_size = kerasClassifierParams['batch_size']

# get_NN keyword arguments: first value of every "model__*" grid entry with the
# routing prefix removed (built once; only the seed changes between runs)
NN_monk3_params = {
  field[len("model__"):] : value[0]
  for (field, value) in NN_MONK3_GRID_DICT.items() if field.startswith("model__")
}

# settings shared by every final fit: the number of epochs is the mean of the
# epoch counts collected in n_epochs. The test set is passed as validation
# data only to record its curves; no callback is used in these fits.
final_fit_args = dict(
  batch_size=batch_size, epochs=int(mean(n_epochs)),
  validation_data=(X_test, y_test), shuffle=True, verbose=0
)

# mean test error and std over 5 initializations
for i in range(seed, seed+5):

  NN_monk3_params['seed'] = i
  NN_monk3 = get_NN(len(X_train.columns), **NN_monk3_params)

  h = NN_monk3.fit(X_train, y_train, **final_fit_args)

  # save mse (first value returned by evaluate) on training and test set
  tr_err.append( NN_monk3.evaluate(X_train, y_train, verbose=0)[0] ) 
  ts_err.append( NN_monk3.evaluate(X_test, y_test, verbose=0)[0] )

# compute accuracy on test set with a model trained from the base seed
NN_monk3_params['seed'] = seed
NN_monk3 = get_NN(len(X_train.columns), **NN_monk3_params)
h = NN_monk3.fit(X_train, y_train, **final_fit_args)
tr_accuracy = NN_monk3.evaluate(X_train, y_train, verbose=0)[1]
ts_accuracy = NN_monk3.evaluate(X_test, y_test, verbose=0)[1]

# print results and plot
print("Tr. accuracy: {:.3f} \n".format(tr_accuracy))

print("Tr. mean mse over 5 inits {:.6f} +/- {:.6f} (std)\n".format( mean(tr_err), std(tr_err) ) )

print("Ts. accuracy: {:.3f} \n".format(ts_accuracy))

print("Ts. mean mse over 5 inits {:.6f} +/- {:.6f} (std)\n".format( mean(ts_err), std(ts_err) ) )

do_NN_plot(h.history)