# Hyperparameter tuning for Keras model with Dense Layer

### Load data

In [None]:
import numpy as np
import tensorflow
import tensorflow.keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import tensorflow
from tensorflow.python.keras import backend as K
from tensorflow.keras.optimizers import Adam
import pandas as pd

In [None]:
!pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.9.0-py2.py3-none-any.whl (100 kB)
[?25l[K     |███▎                            | 10 kB 30.0 MB/s eta 0:00:01[K     |██████▌                         | 20 kB 22.0 MB/s eta 0:00:01[K     |█████████▉                      | 30 kB 16.8 MB/s eta 0:00:01[K     |█████████████                   | 40 kB 15.0 MB/s eta 0:00:01[K     |████████████████▍               | 51 kB 7.3 MB/s eta 0:00:01[K     |███████████████████▋            | 61 kB 8.6 MB/s eta 0:00:01[K     |██████████████████████▉         | 71 kB 8.9 MB/s eta 0:00:01[K     |██████████████████████████▏     | 81 kB 8.7 MB/s eta 0:00:01[K     |█████████████████████████████▍  | 92 kB 9.7 MB/s eta 0:00:01[K     |████████████████████████████████| 100 kB 6.2 MB/s 
Collecting pyaml>=16.9
  Downloading pyaml-21.10.1-py2.py3-none-any.whl (24 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-21.10.1 scikit-optimize-0.9.0


In [None]:
import skopt
from skopt import gbrt_minimize, gp_minimize
from skopt.utils import use_named_args
from skopt.space import Real, Categorical, Integer

In [None]:
# Load MNIST through tensorflow.keras so the dataset helper comes from the same
# Keras distribution as the Sequential/Dense/Adam classes imported at the top.
from tensorflow.keras.datasets import mnist
(X_train, y_train), (X_test, y_test) = mnist.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


Scale the data to between 0 & 1

In [None]:
# Rescale both image sets by the same constant, then print the resulting
# training-set range as a sanity check.
X_train, X_test = X_train / 255, X_test / 255
print(X_train.min(), X_train.max())

0.0 1.0


In [None]:
# Peek at the first ten test labels (integer class ids at this point, before one-hot encoding).
y_test[0:10]

array([7, 2, 1, 0, 4, 1, 4, 9, 5, 9], dtype=uint8)

In [None]:
# Inspect the image array's shape before flattening.
X_train.shape

(60000, 28, 28)

Need to Flatten the Arrays

In [None]:
# Flatten every image into one feature vector. The sample count is taken from
# the array itself and -1 lets NumPy infer the per-image length, so the cell
# works for any subset of the data instead of assuming exactly 60000/10000 rows.
X_train = X_train.reshape(len(X_train), -1)
X_test = X_test.reshape(len(X_test), -1)

One-hot encode the labels so they can be used with the softmax output layer

In [None]:
# One-hot encode the integer labels into 10 columns (one per digit).
# `to_categorical` is reached through the already-imported tensorflow.keras
# package; the standalone `keras.utils.np_utils` module is not available in
# newer Keras releases.
# The ndim guard makes the cell safe to re-run: labels that are already
# one-hot (2-D) are left untouched instead of being encoded a second time.
if y_train.ndim == 1:
    y_train = tensorflow.keras.utils.to_categorical(y_train, 10)
if y_test.ndim == 1:
    y_test = tensorflow.keras.utils.to_categorical(y_test, 10)

In [None]:
# Check the label array's shape after one-hot encoding.
y_train.shape

(60000, 10)

In [None]:
# Per-sample shape (everything after the leading sample axis); the model
# builders below read this global for their first layer.
input_shape = X_train.shape[1:]
print(input_shape)

(784,)


## Checking result against a baseline

A simple 2 layer neural network


In [None]:
# Baseline network: two 16-unit ReLU layers followed by a 10-way softmax,
# compiled with the default Adam settings.
model = Sequential([
    Dense(16, input_shape=input_shape, activation='relu', name='input_layer'),
    Dense(16, activation='relu', name='hidden_layer'),
    Dense(10, activation='softmax', name='output_layer'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_layer (Dense)         (None, 16)                12560     
                                                                 
 hidden_layer (Dense)        (None, 16)                272       
                                                                 
 output_layer (Dense)        (None, 10)                170       
                                                                 
Total params: 13,002
Trainable params: 13,002
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train the baseline; 15% of the training data is held out for validation.
blackbox = model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=128,
    validation_split=0.15,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# evaluate() returns [loss, accuracy] for this compiled model; keep the accuracy.
accuracy = model.evaluate(x=X_test, y=y_test)[1]
print(accuracy)

0.9552000164985657


## Using Skopt (scikit-optimize)

Creating our search parameters.

This code focuses on: 
* Number of Layers
* Number of Nodes per layer
* Learning Rate & Learning-Rate Decay for the Adam Optimizer
* activation functions
* batch size

In [None]:
# Search space for the Bayesian optimisation. Each dimension's `name` must match
# a keyword argument of the objective function decorated with @use_named_args.
dim_learning_rate = Real(
    name='learning_rate', low=1e-4, high=1e-1, prior='log-uniform'
)
dim_num_dense_layers = Integer(name='num_dense_layers', low=1, high=5)
dim_num_input_nodes = Integer(name='num_input_nodes', low=1, high=512)
dim_num_dense_nodes = Integer(name='num_dense_nodes', low=1, high=28)
dim_activation = Categorical(
    name='activation', categories=['relu', 'sigmoid']
)
dim_batch_size = Integer(name='batch_size', low=1, high=128)
# No prior is given here, so skopt's default prior applies to the decay range.
dim_adam_decay = Real(name="adam_decay", low=1e-6, high=1e-2)

# Order matters: results (gp_result.x) and default_parameters are positional
# lists aligned with this list.
dimensions = [
    dim_learning_rate,
    dim_num_dense_layers,
    dim_num_input_nodes,
    dim_num_dense_nodes,
    dim_activation,
    dim_batch_size,
    dim_adam_decay,
]

# Starting point handed to the optimiser as x0, in the same order as `dimensions`.
default_parameters = [1e-3, 1, 512, 13, 'relu', 64, 1e-3]

### Create Model

The Adam optimizer is used so that its learning rate and learning-rate decay can be tuned.


In [None]:
def create_model(learning_rate, num_dense_layers,num_input_nodes,
                 num_dense_nodes, activation, adam_decay):
    """Build and compile a dense classifier for one hyper-parameter setting.

    Args:
        learning_rate: Learning rate passed to the Adam optimizer.
        num_dense_layers: Number of hidden Dense layers added after the first layer.
        num_input_nodes: Units in the first Dense layer.
        num_dense_nodes: Units in each hidden Dense layer.
        activation: Activation used by the first and hidden layers.
        adam_decay: Value passed to Adam's `decay` argument.

    Returns:
        A compiled Sequential model ending in a 10-unit softmax layer.
        The input shape is read from the notebook-level `input_shape`.
    """
    # First layer carries the input shape.
    stack = [Dense(num_input_nodes, input_shape=input_shape, activation=activation)]

    # Hidden layers get explicit names (layer_dense_1, layer_dense_2, ...);
    # naming the layers helps avoid a tensorflow error deep in the stack trace.
    stack.extend(
        Dense(num_dense_nodes,
              activation=activation,
              name='layer_dense_{0}'.format(position))
        for position in range(1, num_dense_layers + 1)
    )

    # Classification layer.
    stack.append(Dense(10, activation='softmax'))

    network = Sequential(stack)
    network.compile(
        optimizer=Adam(learning_rate=learning_rate, decay=adam_decay),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

In [None]:
@use_named_args(dimensions=dimensions)
def fitness(learning_rate, num_dense_layers, num_input_nodes, 
            num_dense_nodes,activation, batch_size,adam_decay):
    """Objective for the skopt search: train one candidate and score it.

    Builds a model with `create_model`, trains it for 20 epochs on
    X_train/y_train with a 15% validation split, and prints the last-epoch
    validation accuracy.

    Args:
        learning_rate: Adam learning rate.
        num_dense_layers: Number of hidden Dense layers.
        num_input_nodes: Units in the first Dense layer.
        num_dense_nodes: Units in each hidden Dense layer.
        activation: Activation for the first and hidden layers.
        batch_size: Mini-batch size used by `model.fit`.
        adam_decay: Value for Adam's `decay` argument.

    Returns:
        The negated last-epoch validation accuracy (the optimiser minimises,
        so higher accuracy must map to a lower value).
    """
    model = create_model(learning_rate=learning_rate,
                         num_dense_layers=num_dense_layers,
                         num_input_nodes=num_input_nodes,
                         num_dense_nodes=num_dense_nodes,
                         activation=activation,
                         adam_decay=adam_decay
                        )

    # Named blackbox because it holds the History object returned by fit().
    blackbox = model.fit(x=X_train,
                        y=y_train,
                        epochs=20,
                        batch_size=batch_size,
                        validation_split=0.15,
                        )
    # Validation accuracy for the last epoch.
    accuracy = blackbox.history['val_accuracy'][-1]

    # Print the classification accuracy.
    print()
    print("Accuracy: {0:.2%}".format(accuracy))
    print()

    # Delete the Keras model with these hyper-parameters from memory.
    del model

    # Clear the Keras session, otherwise it will keep adding new
    # models to the same TensorFlow graph each time we create
    # a model with a different set of hyper-parameters.
    K.clear_session()
    # `ops` is not imported in this notebook's import cells, so calling
    # ops.reset_default_graph() raises NameError on a fresh kernel. The same
    # reset is reached through the already-imported `tensorflow` package.
    tensorflow.compat.v1.reset_default_graph()

    return -accuracy

### Hyper parameters for hyper parameter search

For the Gaussian Process search, a few extra parameters are set to try to improve the search.

In [None]:
# Run the Gaussian-process search: 100 evaluations of `fitness`, starting from
# `default_parameters`. `noise`, `kappa` and `n_jobs` are passed straight
# through to gp_minimize (see the scikit-optimize docs for their meaning).
gp_result = gp_minimize(
    func=fitness,
    dimensions=dimensions,
    x0=default_parameters,
    n_calls=100,
    noise=0.01,
    kappa=5,
    n_jobs=-1,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Accuracy: 98.01%

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Accuracy: 97.36%

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Accuracy: 96.78%

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Accuracy: 97.16%



## Find best accuracy

In [None]:
# gp_result.fun is the minimum of the negated accuracy, so flip the sign and
# scale to a percentage before printing.
print(f"best accuracy was {round(gp_result.fun * -100, 2)!s}%.")

best accuracy was 98.2%.


### The best hyperparameters

In [None]:
# Best hyper-parameter values found, listed in the same order as `dimensions`.
gp_result.x

[0.0014480753787065842, 1, 447, 13, 'relu', 74, 0.0015325746259423636]

In [None]:
# Objective value per evaluation; values are negated validation accuracies
# because `fitness` returns -accuracy.
gp_result.func_vals

array([-0.98011112, -0.97355556, -0.96777779, -0.97155553, -0.972     ,
       -0.97677779, -0.96244442, -0.38766667, -0.91533333, -0.94711113,
       -0.96888888, -0.10644444, -0.84244442, -0.92211109, -0.10633333,
       -0.30700001, -0.95033336, -0.10633333, -0.10633333, -0.9682222 ,
       -0.97522223, -0.10633333, -0.94466668, -0.97344446, -0.93088889,
       -0.97222221, -0.98199999, -0.97388887, -0.8241111 , -0.93711114,
       -0.90455556, -0.97777778, -0.71533334, -0.97799999, -0.97055554,
       -0.10633333, -0.96466666, -0.98133332, -0.97355556, -0.97233331,
       -0.97266668, -0.97611111, -0.96388888, -0.96611112, -0.98066664,
       -0.97277778, -0.96866667, -0.86322224, -0.97833335, -0.95844442,
       -0.97799999, -0.10633333, -0.9683333 , -0.97733331, -0.97033334,
       -0.9668889 , -0.96666664, -0.95044446, -0.10688889, -0.96700001,
       -0.97766668, -0.97366667, -0.97677779, -0.10633333, -0.98011112,
       -0.97244442, -0.97288889, -0.96622223, -0.97755557, -0.98

### All the models and parameters

In [None]:
# One row per evaluated configuration, with its validation accuracy in percent
# (func_vals holds negated accuracies, hence the multiplication by -100).
trial_params = pd.DataFrame(
    gp_result.x_iters,
    columns=[
        "learning rate",
        "hidden layers",
        "input layer nodes",
        "hidden layer nodes",
        "activation function",
        "batch size",
        "adam learning rate decay",
    ],
)
trial_accuracy = pd.Series(gp_result.func_vals * -100, name="accuracy")
df_temp = pd.concat([trial_params, trial_accuracy], axis=1)

In [None]:
# Preview the first evaluated configurations (in evaluation order).
df_temp.head()

Unnamed: 0,learning rate,hidden layers,input layer nodes,hidden layer nodes,activation function,batch size,adam learning rate decay,accuracy
0,0.001,1,512,13,relu,64,0.001,98.011112
1,0.005828,2,254,16,sigmoid,9,0.002643,97.355556
2,0.013022,3,285,16,sigmoid,90,0.00988,96.777779
3,0.004085,3,473,15,sigmoid,67,0.001773,97.155553
4,0.002228,4,219,20,sigmoid,53,0.00072,97.2


In [None]:
# Rank the configurations from best to worst validation accuracy.
df_temp = df_temp.sort_values("accuracy", ascending=False)

In [None]:
# Display the ranked table (best configurations first).
df_temp

Unnamed: 0,learning rate,hidden layers,input layer nodes,hidden layer nodes,activation function,batch size,adam learning rate decay,accuracy
26,0.001448,1,447,13,relu,74,0.001533,98.199999
69,0.001088,1,512,28,relu,123,0.001037,98.144442
79,0.001267,1,512,12,relu,128,0.000001,98.133332
37,0.001200,1,512,11,relu,128,0.001242,98.133332
44,0.002269,2,193,27,relu,128,0.000215,98.066664
...,...,...,...,...,...,...,...,...
51,0.000100,2,181,1,relu,124,0.001292,10.633333
14,0.072794,2,472,23,sigmoid,6,0.000608,10.633333
35,0.003725,1,512,1,relu,1,0.000001,10.633333
21,0.000100,5,512,17,sigmoid,128,0.002175,10.633333


In [None]:
# Persist the ranked Bayesian-search results next to the notebook.
df_temp.to_csv("DF_TEMP.csv")

In [None]:
# Best hyper-parameter values again, for reference before rebuilding the model.
gp_result.x

[0.0014480753787065842, 1, 447, 13, 'relu', 74, 0.0015325746259423636]

In [None]:
# Rebuild the best configuration. gp_result.x follows the order of `dimensions`:
# the first five entries are the architecture/learning-rate arguments, index 5
# is the batch size (used at fit time, not here) and index 6 is the Adam decay.
gp_model = create_model(*gp_result.x[:5], gp_result.x[6])
gp_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 447)               350895    
                                                                 
 layer_dense_1 (Dense)       (None, 13)                5824      
                                                                 
 dense_1 (Dense)             (None, 10)                140       
                                                                 
Total params: 356,859
Trainable params: 356,859
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Summary of the baseline `model` defined earlier, for comparison with gp_model.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_layer (Dense)         (None, 16)                12560     
                                                                 
 hidden_layer (Dense)        (None, 16)                272       
                                                                 
 output_layer (Dense)        (None, 10)                170       
                                                                 
Total params: 13,002
Trainable params: 13,002
Non-trainable params: 0
_________________________________________________________________


### Retrain the best model architecture

In [None]:
# Retrain the best configuration with its tuned batch size (index 5 of the
# result), then report loss/accuracy on the training data.
gp_model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=gp_result.x[5],
    validation_split=0.15,
)
gp_model.evaluate(X_train, y_train)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


[0.01735677570104599, 0.996566653251648]

In [None]:
# Loss and accuracy of the retrained model on the test set.
gp_model.evaluate(X_test,y_test)



[0.07222875207662582, 0.979200005531311]

## Random search

In [None]:
import random

In [None]:
# These skopt dimensions repeat the definitions from the Bayesian-search section.
# The random-search cells shown below sample from `param_grid` only.
dim_learning_rate = Real(low=1e-4, high=1e-1, prior='log-uniform',
                         name='learning_rate')
dim_num_dense_layers = Integer(low=1, high=5, name='num_dense_layers')
dim_num_input_nodes = Integer(low=1, high=512, name='num_input_nodes')
dim_num_dense_nodes = Integer(low=1, high=28, name='num_dense_nodes')
dim_activation = Categorical(categories=['relu', 'sigmoid'],
                             name='activation')
dim_batch_size = Integer(low=1, high=128, name='batch_size')
dim_adam_decay = Real(low=1e-6,high=1e-2,name="adam_decay")

dimensions = [dim_learning_rate,
              dim_num_dense_layers,
              dim_num_input_nodes,
              dim_num_dense_nodes,
              dim_activation,
              dim_batch_size,
              dim_adam_decay
             ]

# Candidate values for the random search. Key insertion order matters: the
# sampling cells draw one value per key in dict order and pass the resulting
# list positionally to fitness().
# NOTE(review): range() excludes its stop value, so these lists end at 4 layers,
# 510 input nodes, 27 hidden nodes and batch size 127, whereas the skopt
# dimensions above go up to 5 / 512 / 28 / 128. The learning-rate range
# (0.005 to 0.2) also differs from dim_learning_rate (1e-4 to 1e-1), and there
# is no Adam-decay entry. Confirm these differences are intended before
# comparing the two searches.
param_grid = {
              # 1000 log-spaced learning rates between 0.005 and 0.2.
              'dim_learning_rate' : list(np.logspace(np.log(0.005), np.log(0.2), base = np.exp(1), num = 1000)),
              'dim_num_dense_layers' : list(range(1, 5)),
              # Even node counts only (step of 2).
              'dim_num_input_nodes' : list(range(2, 512, 2)),
              'dim_num_dense_nodes' : list(range(1, 28)),
              'dim_activation' : ['relu', 'sigmoid'],
              'dim_batch_size' : list(range(1, 128))
}

In [None]:
random.seed(50)

# Preview five random draws: one value per param_grid entry, in dict order.
for preview_idx in range(5):
    params = [random.sample(candidates, 1)[0]
              for candidates in param_grid.values()]
    print(params)
    

[0.03275177220475209, 3, 188, 21, 'relu', 89]
[0.029863672437724486, 3, 44, 18, 'sigmoid', 29]
[0.06437322298735856, 1, 478, 5, 'sigmoid', 106]
[0.007233298202346897, 3, 164, 8, 'relu', 9]
[0.19706765150537875, 3, 506, 20, 'sigmoid', 82]


In [None]:
def create_model(learning_rate, num_dense_layers,num_input_nodes,
                 num_dense_nodes, activation, adam_decay=None):
    """Build and compile a dense classifier for one hyper-parameter setting.

    This definition replaces the earlier `create_model` once the cell runs.
    `adam_decay` is optional so that code written for the earlier six-argument
    version (the skopt objective and the "best model" rebuild) keeps working
    after this cell has been executed, instead of raising TypeError.

    Args:
        learning_rate: Learning rate passed to the Adam optimizer.
        num_dense_layers: Number of hidden Dense layers added after the first layer.
        num_input_nodes: Units in the first Dense layer.
        num_dense_nodes: Units in each hidden Dense layer.
        activation: Activation used by the first and hidden layers.
        adam_decay: Optional value for Adam's `decay` argument. When omitted
            (None) Adam is created with the learning rate only, exactly as the
            five-argument form did.

    Returns:
        A compiled Sequential model ending in a 10-unit softmax layer.
        The input shape is read from the notebook-level `input_shape`.
    """
    #start the model making process and create our first layer
    model = Sequential()
    model.add(Dense(num_input_nodes, input_shape= input_shape, activation=activation
                   ))
    #create a loop making a new dense layer for the amount passed to this model.
    #naming the layers helps avoid tensorflow error deep in the stack trace.
    for i in range(num_dense_layers):
        name = 'layer_dense_{0}'.format(i+1)
        model.add(Dense(num_dense_nodes,
                 activation=activation,
                        name=name
                 ))
    #add our classification layer.
    model.add(Dense(10,activation='softmax'))

    #setup our optimizer and compile; `decay` is only forwarded when the
    #caller supplied it, so the default call path is unchanged.
    optimizer_kwargs = {'learning_rate': learning_rate}
    if adam_decay is not None:
        optimizer_kwargs['decay'] = adam_decay
    adam = Adam(**optimizer_kwargs)
    model.compile(optimizer=adam, loss='categorical_crossentropy',
                 metrics=['accuracy'])
    return model

In [None]:

def fitness(learning_rate, num_dense_layers, num_input_nodes, 
            num_dense_nodes,activation, batch_size):
    """Train one randomly sampled configuration and report its accuracy.

    Builds a model with `create_model`, trains it for 20 epochs on
    X_train/y_train with a 15% validation split, and prints the last-epoch
    validation accuracy.

    Args:
        learning_rate: Adam learning rate.
        num_dense_layers: Number of hidden Dense layers.
        num_input_nodes: Units in the first Dense layer.
        num_dense_nodes: Units in each hidden Dense layer.
        activation: Activation for the first and hidden layers.
        batch_size: Mini-batch size used by `model.fit`.

    Returns:
        A 7-element list: the six inputs in argument order followed by the
        last-epoch validation accuracy (not negated).
    """
    model = create_model(learning_rate=learning_rate,
                         num_dense_layers=num_dense_layers,
                         num_input_nodes=num_input_nodes,
                         num_dense_nodes=num_dense_nodes,
                         activation=activation
                        )

    # Named blackbox because it holds the History object returned by fit().
    blackbox = model.fit(x=X_train,
                        y=y_train,
                        epochs=20,
                        batch_size=batch_size,
                        validation_split=0.15,
                        )
    # Validation accuracy for the last epoch.
    accuracy = blackbox.history['val_accuracy'][-1]

    # Print the classification accuracy.
    print()
    print("Accuracy: {0:.2%}".format(accuracy))
    print()

    # Delete the Keras model with these hyper-parameters from memory.
    del model

    # Clear the Keras session, otherwise it will keep adding new
    # models to the same TensorFlow graph each time we create
    # a model with a different set of hyper-parameters.
    K.clear_session()
    # `ops` is not imported in this notebook's import cells, so calling
    # ops.reset_default_graph() raises NameError on a fresh kernel. The same
    # reset is reached through the already-imported `tensorflow` package.
    tensorflow.compat.v1.reset_default_graph()

    return [learning_rate, num_dense_layers, num_input_nodes, 
            num_dense_nodes,activation, batch_size, accuracy]

In [None]:
random.seed(50)

results = []
# Placeholder "best" row; index 6 is the accuracy slot and starts at 0.
best_results = [0] * 7

# Run 100 random-search trials.
for trial in range(1, 101):

    print("Iteration: ", trial)

    # Draw one value per param_grid entry, in dict order; that order matches
    # fitness()'s positional parameters.
    params = [random.sample(candidates, 1)[0]
              for candidates in param_grid.values()]

    result = fitness(*params)
    results.append(result)

    # Keep the row with the strictly highest validation accuracy seen so far.
    if result[6] > best_results[6]:
        best_results = result

Iteration:  1
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Accuracy: 72.78%

Iteration:  2
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Accuracy: 93.96%

Iteration:  3
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Accuracy: 9.59%

Iteration:  4
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20


In [None]:
# Best random-search row: the six sampled values followed by validation accuracy.
print(best_results)

[0.008384610951889282, 3, 194, 26, 'relu', 44, 0.9786666631698608]


In [None]:
# Every random-search row, in evaluation order.
results

[[0.03275177220475209, 3, 188, 21, 'relu', 89, 0.7277777791023254],
 [0.029863672437724486, 3, 44, 18, 'sigmoid', 29, 0.9395555257797241],
 [0.06437322298735856, 1, 478, 5, 'sigmoid', 106, 0.0958888903260231],
 [0.007233298202346897, 3, 164, 8, 'relu', 9, 0.9636666774749756],
 [0.19706765150537875, 3, 506, 20, 'sigmoid', 82, 0.0958888903260231],
 [0.006996863427233354, 1, 356, 20, 'sigmoid', 102, 0.9758889079093933],
 [0.06803935820807058, 4, 4, 8, 'sigmoid', 35, 0.4238888919353485],
 [0.10213232309588141, 4, 360, 4, 'relu', 117, 0.0989999994635582],
 [0.026633585716561717, 3, 464, 23, 'sigmoid', 17, 0.10633333027362823],
 [0.013854191076355483, 2, 274, 19, 'relu', 53, 0.9660000205039978],
 [0.06581535611293016, 1, 438, 21, 'sigmoid', 55, 0.0958888903260231],
 [0.14292118324159045, 2, 256, 19, 'sigmoid', 12, 0.109333336353302],
 [0.12793421050863038, 3, 390, 17, 'relu', 1, 0.10022222250699997],
 [0.019315720035733164, 1, 464, 19, 'sigmoid', 61, 0.9446666836738586],
 [0.0991594025047352

In [None]:
# Each row mixes floats, ints and a string. Without an explicit dtype NumPy
# coerces the whole array to a string dtype, turning the numeric values into
# text; dtype=object keeps every cell as its original Python value.
random_res = np.array(results, dtype=object)

In [None]:
# Build the frame straight from the list of result rows so pandas infers a
# proper dtype per column (float learning rate/accuracy, integer counts, string
# activation). Going through np.array(results) first coerces every value to a
# string, which makes numeric sorting and arithmetic on the columns wrong.
random_res_df = pd.DataFrame(
    results,
    columns=[
        "learning rate",
        "hidden layers",
        "input layer nodes",
        "hidden layer nodes",
        "activation function",
        "batch size",
        "accuracy",
    ],
)

In [None]:
# Preview the first random-search rows.
random_res_df.head()

Unnamed: 0,learning rate,hidden layers,input layer nodes,hidden layer nodes,activation function,batch size,accuracy
0,0.032751772204752,3,188,21,relu,89,0.7277777791023254
1,0.0298636724377244,3,44,18,sigmoid,29,0.939555525779724
2,0.0643732229873585,1,478,5,sigmoid,106,0.0958888903260231
3,0.0072332982023468,3,164,8,relu,9,0.9636666774749756
4,0.1970676515053787,3,506,20,sigmoid,82,0.0958888903260231


In [None]:
# Persist the random-search results next to the notebook.
random_res_df.to_csv("random_df.csv")

In [None]:
# Rebuild the best random-search configuration. The first five entries of
# best_results are create_model's arguments; index 5 is the batch size and
# index 6 the accuracy. Note that the variable is still called gp_model even
# though this model comes from the random search.
gp_model = create_model(*best_results[:5])
gp_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 194)               152290    
                                                                 
 layer_dense_1 (Dense)       (None, 26)                5070      
                                                                 
 layer_dense_2 (Dense)       (None, 26)                702       
                                                                 
 layer_dense_3 (Dense)       (None, 26)                702       
                                                                 
 dense_1 (Dense)             (None, 10)                270       
                                                                 
Total params: 159,034
Trainable params: 159,034
Non-trainable params: 0
_________________________________________________________________


#### Retrain the best model architecture with random search

In [None]:
# Retrain the best random-search configuration with its sampled batch size
# (index 5 of best_results), then report loss/accuracy on the training data.
gp_model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=best_results[5],
    validation_split=0.15,
)
gp_model.evaluate(X_train, y_train)

In [None]:
# Loss and accuracy of the retrained random-search model on the test set.
gp_model.evaluate(X_test,y_test)



[0.13056840002536774, 0.9778000116348267]