# Chapter 11: Training Deep Neural Networks

In [1]:
import os
from functools import partial
import numpy as np

import tensorflow as tf
from tensorflow import keras
from keras import (
    datasets,
    callbacks,
    optimizers,
    regularizers,
    layers,
    Sequential,
    activations,
    metrics,
    losses,
    backend,
)


In [2]:
# Step size handed to every Nadam optimizer created in this notebook.
LEARNING_RATE = 5e-4

# Directory passed to the TensorBoard callback (one level above the working dir).
LOGS_PATH = os.path.join(os.pardir, "logs", "chapter_11")
# File the ModelCheckpoint callback writes to and the evaluation cells load from.
MODEL_PATH = os.path.join("models", "my_cifar_model.h5")

### 8. Practice training a deep neural network on the CIFAR10 image dataset:

*Build a DNN with 20 hidden layers of 100 neurons each (that’s too many, but it’s the point of this exercise). Use He initialization and the ELU activation function.*

In [3]:
# Reset Keras' global state before a new model is built in the next cell.
backend.clear_session()

# Seed both NumPy and TensorFlow with the same value so the run is repeatable.
np.random.seed(1992)
tf.random.set_seed(1992)

In [4]:
# Baseline network: flattened 32x32x3 input, 20 hidden ELU layers of 100 units
# with He-normal initialization, and a 10-way softmax output.
hidden_stack = [
    layers.Dense(100, activation="elu", kernel_initializer="he_normal")
    for _ in range(20)
]

model = Sequential(
    [
        layers.Flatten(input_shape=[32, 32, 3]),
        *hidden_stack,
        layers.Dense(10, activation="softmax"),
    ]
)


Metal device set to: Apple M1 Pro

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB



*Using Nadam optimization and early stopping, train the network on the CIFAR10 dataset. You can load it with `keras.datasets.cifar10.load_data()`. The dataset is composed of 60,000 32 × 32–pixel color images (50,000 for training, 10,000 for testing) with 10 classes, so you’ll need a softmax output layer with 10 neurons. Remember to search for the right learning rate each time you change the model’s architecture or hyperparameters.*

In [5]:
# Index at which the downloaded training set is cut: everything before it is
# used for training, everything from it onward for validation.
n_train = 40000

(x_full, y_full), (x_test, y_test) = datasets.cifar10.load_data()

x_train, y_train = x_full[:n_train], y_full[:n_train]
x_valid, y_valid = x_full[n_train:], y_full[n_train:]

In [6]:
# Nadam with the notebook-wide learning rate.
optimizer = optimizers.Nadam(learning_rate=LEARNING_RATE)

model.compile(
    optimizer=optimizer,
    metrics=["accuracy"],
    loss="sparse_categorical_crossentropy",
)

# Callbacks
# Stop training once val_loss has gone 5 epochs without improving.
early_stop_cb = callbacks.EarlyStopping(patience=5, monitor="val_loss")
# Log this experiment into its own sub-directory so TensorBoard shows it as a
# separate run; writing every experiment into the LOGS_PATH root mixes their
# event files and makes the learning curves impossible to compare.
tensorboard_cb = callbacks.TensorBoard(
    log_dir=os.path.join(LOGS_PATH, "baseline_elu")
)
# Only overwrite the checkpoint when the callback's monitored metric improves;
# the evaluation cell reloads the model from this file.
model_checkpoint_cb = callbacks.ModelCheckpoint(MODEL_PATH, save_best_only=True)

callbacks_ = [early_stop_cb, tensorboard_cb, model_checkpoint_cb]

In [7]:

# Train for at most 100 epochs; the EarlyStopping callback in `callbacks_`
# decides when to stop.
model.fit(
    x_train,
    y_train,
    # Keras documents validation_data as a tuple (x_val, y_val), not a list.
    validation_data=(x_valid, y_valid),
    callbacks=callbacks_,
    epochs=100,
    batch_size=512,
)

# Swap the in-memory model for the checkpoint written by ModelCheckpoint
# (save_best_only=True), then report [loss, accuracy] on the test set.
model = keras.models.load_model(MODEL_PATH)
model.evaluate(x_test, y_test)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100


[1.7520089149475098, 0.37040001153945923]

> #### Now try adding Batch Normalization and compare the learning curves: Is it converging faster than before? Does it produce a better model? How does it affect training speed?

In [8]:
# Reset Keras' global state before the batch-norm variant is built.
backend.clear_session()

# Same seed as the baseline run, applied to NumPy and TensorFlow.
np.random.seed(1992)
tf.random.set_seed(1992)

In [9]:
# Batch-norm variant: a BatchNormalization layer right after Flatten, then 20
# repetitions of Dense -> BatchNormalization -> ELU, then the softmax output.
bn_stack = [layers.Flatten(input_shape=[32, 32, 3]), layers.BatchNormalization()]

for _ in range(20):
    bn_stack += [
        layers.Dense(100, kernel_initializer="he_normal"),
        layers.BatchNormalization(),
        # Activation is a separate layer so it is applied after normalization.
        layers.Activation("elu"),
    ]

bn_stack.append(layers.Dense(10, activation="softmax"))

model = Sequential(bn_stack)

In [10]:
# Nadam with the notebook-wide learning rate.
optimizer = optimizers.Nadam(learning_rate=LEARNING_RATE)

model.compile(
    optimizer=optimizer,
    metrics=["accuracy"],
    loss="sparse_categorical_crossentropy",
)

# Callbacks
# Stop training once val_loss has gone 5 epochs without improving.
early_stop_cb = callbacks.EarlyStopping(patience=5, monitor="val_loss")
# Log the batch-norm experiment into its own sub-directory so TensorBoard shows
# it as a separate run; sharing one log directory across experiments mixes
# their event files and makes the learning curves impossible to compare.
tensorboard_cb = callbacks.TensorBoard(
    log_dir=os.path.join(LOGS_PATH, "batch_norm")
)
# Only overwrite the checkpoint when the callback's monitored metric improves;
# the evaluation cell reloads the model from this file.
model_checkpoint_cb = callbacks.ModelCheckpoint(MODEL_PATH, save_best_only=True)

callbacks_ = [early_stop_cb, tensorboard_cb, model_checkpoint_cb]

In [11]:
# Train for at most 100 epochs; the EarlyStopping callback in `callbacks_`
# decides when to stop.
model.fit(
    x_train,
    y_train,
    # Keras documents validation_data as a tuple (x_val, y_val), not a list.
    validation_data=(x_valid, y_valid),
    epochs=100,
    callbacks=callbacks_,
    batch_size=512,
)

# Swap the in-memory model for the checkpoint written by ModelCheckpoint
# (save_best_only=True), then report [loss, accuracy] on the test set.
model = keras.models.load_model(MODEL_PATH)
model.evaluate(x_test, y_test)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100


[1.6154439449310303, 0.4564000070095062]

> #### Try replacing Batch Normalization with SELU, and make the necessary adjustments to ensure the network self-normalizes (i.e., standardize the input features, use LeCun normal initialization, make sure the DNN contains only a sequence of dense layers, etc.).

In [12]:
# Reset Keras' global state before the SELU variant is built.
backend.clear_session()

# Same seed as the earlier runs, applied to NumPy and TensorFlow.
np.random.seed(1992)
tf.random.set_seed(1992)

In [14]:
# SELU variant: 20 Dense layers with SELU activation and LeCun-normal
# initialization, followed by the softmax output.
# NOTE(review): a BatchNormalization layer still sits right after Flatten. It
# presumably stands in for input standardization, but the exercise asks for a
# network made only of dense layers -- confirm whether the inputs should be
# standardized up front instead.
model = Sequential()
model.add(layers.Flatten(input_shape=[32, 32, 3]))
model.add(layers.BatchNormalization())

for _ in range(20):
    selu_layer = layers.Dense(
        100, activation="selu", kernel_initializer="lecun_normal"
    )
    model.add(selu_layer)

model.add(layers.Dense(10, activation="softmax"))


> #### Try regularizing the model with alpha dropout. Then, without retraining your model, see if you can achieve better accuracy using MC Dropout.

> #### Retrain your model using 1cycle scheduling and see if it improves training speed and model accuracy.