# Chapter 11: Training Deep Neural Networks

In [1]:
from functools import partial

import tensorflow as tf
from tensorflow import keras
from keras import (
    datasets,
    callbacks,
    optimizers,
    regularizers,
    layers,
    Sequential,
    activations,
    metrics,
    losses,
)


### 8. Practice training a deep neural network on the CIFAR10 image dataset:

> #### Build a DNN with 20 hidden layers of 100 neurons each (that’s too many, but it’s the point of this exercise). Use He initialization and the ELU activation function.

In [2]:
dense_layer = partial(
    layers.Dense,
    activation=activations.elu,
    kernel_initializer='he_normal',
    kernel_regularizer=regularizers.l2(0.01),
)

In [3]:
model = Sequential([
    layers.Flatten(input_shape=[32, 32, 3]),
])

Metal device set to: Apple M1 Pro

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 3072)              0         
                                                                 
 dense (Dense)               (None, 100)               307300    
                                                                 
 dense_1 (Dense)             (None, 100)               10100     
                                                                 
 dense_2 (Dense)             (None, 100)               10100     
                                                                 
 dense_3 (Dense)             (None, 100)               10100     
                                                                 
 dense_4 (Dense)             (None, 100)               10100     
                                        

> #### Using Nadam optimization and early stopping, train the network on the CIFAR10 dataset. You can load it with keras.datasets.cifar10.load_data(). The dataset is composed of 60,000 32 × 32–pixel color images (50,000 for training, 10,000 for testing) with 10 classes, so you’ll need a softmax output layer with 10 neurons. Remember to search for the right learning rate each time you change the model’s architecture or hyperparameters.

In [4]:
(x_train, y_train), (x_test, y_test) = datasets.cifar10.load_data()
x_train, x_valid = x_train[:40000] / 255.0, x_train[40000:] / 255.0
x_test = x_test / 255.0
y_train, y_valid = y_train[:40000], y_train[40000:]

In [5]:
y_train[:30]

array([[6],
       [9],
       [9],
       [4],
       [1],
       [1],
       [2],
       [7],
       [8],
       [3],
       [4],
       [7],
       [7],
       [2],
       [9],
       [9],
       [9],
       [3],
       [2],
       [6],
       [4],
       [3],
       [6],
       [6],
       [2],
       [6],
       [3],
       [5],
       [4],
       [0]], dtype=uint8)

In [6]:
optimizer = optimizers.Nadam()

# Callbacks
logdir = '../logs/chapter_11/'
early_stop_cb = callbacks.EarlyStopping(patience=5, monitor='val_loss')
tensorboard_cb = callbacks.TensorBoard(log_dir=logdir)

In [7]:
model.compile(
    optimizer=optimizer,
    metrics=[metrics.accuracy],
    loss=losses.sparse_categorical_crossentropy,
)

In [8]:
model.fit(
    x_train,
    y_train,
    validation_data=[x_valid, y_valid],
    callbacks=[early_stop_cb, tensorboard_cb],
    batch_size=512,
    epochs=100,
)


Epoch 1/100


ValueError: in user code:

    File "/Users/mmenendezg/Developer/Books/.venv/lib/python3.10/site-packages/keras/engine/training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "/Users/mmenendezg/Developer/Books/.venv/lib/python3.10/site-packages/keras/engine/training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/mmenendezg/Developer/Books/.venv/lib/python3.10/site-packages/keras/engine/training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "/Users/mmenendezg/Developer/Books/.venv/lib/python3.10/site-packages/keras/engine/training.py", line 998, in train_step
        return self.compute_metrics(x, y, y_pred, sample_weight)
    File "/Users/mmenendezg/Developer/Books/.venv/lib/python3.10/site-packages/keras/engine/training.py", line 1092, in compute_metrics
        self.compiled_metrics.update_state(y, y_pred, sample_weight)
    File "/Users/mmenendezg/Developer/Books/.venv/lib/python3.10/site-packages/keras/engine/compile_utils.py", line 605, in update_state
        metric_obj.update_state(y_t, y_p, sample_weight=mask)
    File "/Users/mmenendezg/Developer/Books/.venv/lib/python3.10/site-packages/keras/utils/metrics_utils.py", line 77, in decorated
        update_op = update_state_fn(*args, **kwargs)
    File "/Users/mmenendezg/Developer/Books/.venv/lib/python3.10/site-packages/keras/metrics/base_metric.py", line 143, in update_state_fn
        return ag_update_state(*args, **kwargs)
    File "/Users/mmenendezg/Developer/Books/.venv/lib/python3.10/site-packages/keras/metrics/base_metric.py", line 700, in update_state  **
        matches = ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/Users/mmenendezg/Developer/Books/.venv/lib/python3.10/site-packages/keras/metrics/metrics.py", line 3571, in accuracy  **
        y_true.shape.assert_is_compatible_with(y_pred.shape)

    ValueError: Shapes (None, 1) and (None, 10) are incompatible


> #### Now try adding Batch Normalization and compare the learning curves: Is it converging faster than before? Does it produce a better model? How does it affect training speed?

> #### Try replacing Batch Normalization with SELU, and make the necessary adjustements to ensure the network self-normalizes (i.e., standardize the input features, use LeCun normal initialization, make sure the DNN contains only a sequence of dense layers, etc.).

> #### Try regularizing the model with alpha dropout. Then, without retraining your model, see if you can achieve better accuracy using MC Dropout.

> #### Retrain your model using 1cycle scheduling and see if it improves training speed and model accuracy.