# Chapter 11: Training Deep Neural Networks

In [52]:
import os
from functools import partial
import numpy as np
from datetime import datetime
from matplotlib import pyplot as plt
import math

import tensorflow as tf
from tensorflow import keras
from keras import (
    datasets,
    callbacks,
    optimizers,
    regularizers,
    layers,
    Sequential,
    activations,
    metrics,
    losses,
    backend,
)


In [6]:
LEARNING_RATE = 5e-4

LOGS_PATH = os.path.join("..", "logs", "chapter_11")
MODEL_PATH = os.path.join("models", "my_cifar_model.h5")

In [15]:
def get_logdir():
    now = datetime.now()
    id_logdir = "run_" + now.strftime("%d_%m_%H%M%S")
    return os.path.join(LOGS_PATH, id_logdir)

### 8. Practice training a deep neural network on the CIFAR10 image dataset:

*Build a DNN with 20 hidden layers of 100 neurons each (that’s too many, but it’s the point of this exercise). Use He initialization and the ELU activation function.*

In [17]:
backend.clear_session()
tf.random.set_seed(1992)
np.random.seed(1992)

In [18]:
model = Sequential()
model.add(layers.Flatten(input_shape=[32, 32, 3]))
for _ in range(20):
    model.add(layers.Dense(100, activation="elu", kernel_initializer="he_normal"))

model.add(layers.Dense(10, activation="softmax"))


Metal device set to: Apple M1 Pro

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB



*Using Nadam optimization and early stopping, train the network on the CIFAR10 dataset. You can load it with keras.datasets.cifar10.load_data(). The dataset is composed of 60,000 32 × 32–pixel color images (50,000 for training, 10,000 for testing) with 10 classes, so you’ll need a softmax output layer with 10 neurons. Remember to search for the right learning rate each time you change the model’s architecture or hyperparameters.*

In [19]:
(x_train, y_train), (x_test, y_test) = datasets.cifar10.load_data()
x_train, x_valid = x_train[:40000], x_train[40000:]
y_train, y_valid = y_train[:40000], y_train[40000:]

In [20]:
optimizer = optimizers.Nadam(learning_rate=LEARNING_RATE)

model.compile(
    optimizer=optimizer,
    metrics=["accuracy"],
    loss="sparse_categorical_crossentropy",
)

# Callbacks
early_stop_cb = callbacks.EarlyStopping(patience=5, monitor='val_loss')
tensorboard_cb = callbacks.TensorBoard(log_dir=get_logdir())
model_checkpoint_cb = callbacks.ModelCheckpoint(MODEL_PATH, save_best_only=True)

callbacks_ = [early_stop_cb, tensorboard_cb, model_checkpoint_cb]

In [21]:

model.fit(
    x_train, 
    y_train,
    validation_data=[x_valid, y_valid],
    callbacks=callbacks_,
    epochs=100,
    batch_size=512,
)

model = keras.models.load_model(MODEL_PATH)
model.evaluate(x_test, y_test)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100


[1.7451152801513672, 0.380700021982193]

> #### Now try adding Batch Normalization and compare the learning curves: Is it converging faster than before? Does it produce a better model? How does it affect training speed?

In [22]:
backend.clear_session()
tf.random.set_seed(1992)
np.random.seed(1992)

In [23]:
model = Sequential(
    [layers.Flatten(input_shape=[32, 32, 3]), layers.BatchNormalization()]
)

for _ in range(20):
    model.add(layers.Dense(100, kernel_initializer="he_normal"))
    model.add(layers.BatchNormalization())
    model.add(layers.Activation("elu"))

model.add(layers.Dense(10, activation="softmax"))

In [24]:
optimizer = optimizers.Nadam(learning_rate=LEARNING_RATE)

model.compile(
    optimizer=optimizer,
    metrics=["accuracy"],
    loss="sparse_categorical_crossentropy",
)

# Callbacks
early_stop_cb = callbacks.EarlyStopping(patience=5, monitor='val_loss')
tensorboard_cb = callbacks.TensorBoard(log_dir=get_logdir())
model_checkpoint_cb = callbacks.ModelCheckpoint(MODEL_PATH, save_best_only=True)

callbacks_ = [early_stop_cb, tensorboard_cb, model_checkpoint_cb]

In [None]:
model.fit(
    x_train, 
    y_train,
    validation_data=[x_valid, y_valid],
    epochs=100,
    callbacks=callbacks_,
    batch_size=512,
)

model = keras.models.load_model(MODEL_PATH)
model.evaluate(x_test, y_test)

> #### Try replacing Batch Normalization with SELU, and make the necessary adjustements to ensure the network self-normalizes (i.e., standardize the input features, use LeCun normal initialization, make sure the DNN contains only a sequence of dense layers, etc.).

In [35]:
backend.clear_session()
tf.random.set_seed(1992)
np.random.seed(1992)

In [36]:
model = Sequential(
    [layers.Flatten(input_shape=[32, 32, 3]), layers.BatchNormalization()]
)

for _ in range(20):
    model.add(layers.Dense(100, activation="selu", kernel_initializer="lecun_normal"))

model.add(layers.AlphaDropout(rate=0.1))
model.add(layers.Dense(10, activation="softmax"))


In [37]:
optimizer = optimizers.Nadam(learning_rate=LEARNING_RATE)

model.compile(
    optimizer=optimizer,
    metrics=["accuracy"],
    loss="sparse_categorical_crossentropy",
)

# Callbacks
early_stop_cb = callbacks.EarlyStopping(patience=5, monitor='val_loss')
tensorboard_cb = callbacks.TensorBoard(log_dir=get_logdir())
model_checkpoint_cb = callbacks.ModelCheckpoint(MODEL_PATH, save_best_only=True)

callbacks_ = [early_stop_cb, tensorboard_cb, model_checkpoint_cb]

In [38]:
normalizer = layers.Normalization()
normalizer.adapt(x_train)
x_train_norm = normalizer(x_train)
x_valid_norm = normalizer(x_valid)
x_test_norm = normalizer(x_test)

In [39]:
model.fit(
    x_train_norm,
    y_train,
    validation_data=[x_valid_norm, y_valid],
    epochs=100,
    callbacks=callbacks_,
    batch_size=512,
)

model = keras.models.load_model(MODEL_PATH)
model.evaluate(x_test_norm, y_test)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100


[1.7656570672988892, 0.44520002603530884]

*Try regularizing the model with alpha dropout. Then, without retraining your model, see if you can achieve better accuracy using MC Dropout.*

In [40]:
class MCAlphaDropout(layers.AlphaDropout):
    def call(self, inputs):
        return super().call(inputs, training=True)

In [43]:
mc_model = Sequential(
    [
        MCAlphaDropout(layer.rate) if isinstance(layer, layers.AlphaDropout) else layer
        for layer in model.layers
    ]
)


In [44]:
def mc_dropout_predict_probas(mc_model, x, n_samples=10):
    y_probas = [mc_model.predict(x) for _ in range(n_samples)]
    return np.mean(y_probas, axis=0)

def mc_dropout_predic_classes(mc_model, x, n_samples=10):
    y_probas = mc_dropout_predict_probas(mc_model, x, n_samples)
    return np.argmax(y_probas, axis=1)

In [45]:
backend.clear_session()
tf.random.set_seed(1992)
np.random.seed(1992)

y_pred = mc_dropout_predic_classes(mc_model, x_valid_norm)
accuracy = np.mean(y_pred == y_valid[:, 0])
accuracy



0.45

> #### Retrain your model using 1cycle scheduling and see if it improves training speed and model accuracy.

In [53]:
backend.clear_session()
tf.random.set_seed(1992)
np.random.seed(1992)

In [54]:
model = Sequential(
    [layers.Flatten(input_shape=[32, 32, 3]), layers.BatchNormalization()]
)

for _ in range(20):
    model.add(layers.Dense(100, activation="selu", kernel_initializer="lecun_normal"))

model.add(layers.AlphaDropout(rate=0.1))
model.add(layers.Dense(10, activation="softmax"))

In [55]:
K = keras.backend

class ExponentialLearningRate(keras.callbacks.Callback):
    def __init__(self, factor):
        self.factor = factor
        self.rates = []
        self.losses = []
    def on_batch_end(self, batch, logs):
        self.rates.append(K.get_value(self.model.optimizer.learning_rate))
        self.losses.append(logs["loss"])
        K.set_value(self.model.optimizer.learning_rate, self.model.optimizer.learning_rate * self.factor)

def find_learning_rate(model, X, y, epochs=1, batch_size=32, min_rate=10**-5, max_rate=10):
    init_weights = model.get_weights()
    iterations = math.ceil(len(X) / batch_size) * epochs
    factor = np.exp(np.log(max_rate / min_rate) / iterations)
    init_lr = K.get_value(model.optimizer.learning_rate)
    K.set_value(model.optimizer.learning_rate, min_rate)
    exp_lr = ExponentialLearningRate(factor)
    history = model.fit(X, y, epochs=epochs, batch_size=batch_size,
                        callbacks=[exp_lr])
    K.set_value(model.optimizer.learning_rate, init_lr)
    model.set_weights(init_weights)
    return exp_lr.rates, exp_lr.losses

def plot_lr_vs_loss(rates, losses):
    plt.plot(rates, losses)
    plt.gca().set_xscale('log')
    plt.hlines(min(losses), min(rates), max(rates))
    plt.axis([min(rates), max(rates), min(losses), (losses[0] + min(losses)) / 2])
    plt.xlabel("Learning rate")
    plt.ylabel("Loss")