In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf
from tensorflow import keras
import numpy as np
# Seed TensorFlow's and NumPy's global random generators so runs are repeatable.
tf.random.set_seed(42)
np.random.seed(42)

In [2]:
# Weight initialization

In [3]:
from tensorflow.keras.layers import Dense
from tensorflow.keras.initializers import VarianceScaling

# Option 1: pick He-normal initialization by its string name.
Dense(10, activation='relu', kernel_initializer='he_normal')
# or
# Option 2: build the initializer explicitly. mode='fan_avg' scales the variance
# by the average of the layer's fan-in and fan-out (number of inputs and outputs).
he_avg_init = VarianceScaling(scale=2., mode='fan_avg',
                             distribution='uniform')
Dense(10, activation='sigmoid', kernel_initializer=he_avg_init)

<tensorflow.python.keras.layers.core.Dense at 0x2b47ef6d358>

In [4]:
# Manual implementations of activation functions
import numpy as np

In [5]:
# Leaky ReLU
# (Keras equivalent: add a LeakyReLU() layer after the Dense layer)
def leaky_relu(z, alpha=0.03):
    """Element-wise maximum of ``z`` and ``alpha * z``.

    Args:
        z: scalar or NumPy array of pre-activations.
        alpha: slope applied to the input on the leaky branch.
    """
    leaky_branch = alpha * z
    return np.maximum(leaky_branch, z)

In [6]:
# ELU
# (Keras equivalent: set activation='elu')
def elu(z, alpha=1):
    """Exponential Linear Unit, applied element-wise.

    Returns ``z`` where ``z >= 0`` and ``alpha * (exp(z) - 1)`` where ``z < 0``.

    Args:
        z: scalar or NumPy array of pre-activations.
        alpha: scale of the negative branch (the output tends to ``-alpha``
            as ``z`` goes to minus infinity).
    """
    # np.where evaluates both branches for every element, so clamp the exponent
    # at 0: otherwise large positive inputs overflow exp() and emit a
    # RuntimeWarning even though that branch is discarded for them.
    # expm1 also keeps precision for tiny negative z, where exp(z) - 1 rounds to 0.
    negative_branch = alpha * np.expm1(np.minimum(z, 0))
    return np.where(z < 0, negative_branch, z)

In [7]:
# SELU
#(might not work well with l1, l2 regularization, dropout, max-norm, non-sequential architecture)
#(works well with sequential CNN)
#(for keras specify activation='selu' and kernel_initializer='lecun_normal')
from scipy.special import erfc

# Closed-form SELU constants for inputs with mean 0 and standard deviation 1
# (hence the `_0_1` suffix). They evaluate to roughly alpha ~= 1.6733 and
# scale ~= 1.0507, and are used as the default arguments of selu() below.
alpha_0_1 = -np.sqrt(2/np.pi) / (erfc(1/np.sqrt(2)) * np.exp(1/2) - 1)
scale_0_1 = (1 - erfc(1/np.sqrt(2)) * np.sqrt(np.e)) * np.sqrt(2*np.pi) * (2*erfc(np.sqrt(2))*np.e**2 + 
            np.pi*erfc(1/np.sqrt(2))**2*np.e - 2*(2+np.pi)*erfc(1/np.sqrt(2))*np.sqrt(np.e) + 
            np.pi+2)**(-1/2)

def selu(z, scale=scale_0_1, alpha=alpha_0_1):
    """Scaled ELU: compute ``elu(z, alpha)`` and multiply the result by ``scale``."""
    activated = elu(z, alpha)
    return scale * activated

In [8]:
# Batch Normalization

In [None]:
# learn to use the optimal scaling and shifting for each layer

In [None]:
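# for each batch in training
# 1.compute the mean of the batch's inputs
# 2.compute the std of the batch's inputs
# 3.zero-center and normalize the inputs with that mean and std
# 4.rescale and shift the result with the layer's learned scale (gamma) and offset (beta)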

In [9]:
# for prediction, either
# use the mean and std of the whole training set, or
# (as actual implementations do) use moving averages of the mean and std computed during training

In [10]:
# + also provides some regularization
# - adds complexity
# - adds computation time in prediction
#(mitigated by folding the batch norm layer into the previous layer's weights and biases after training)

In [12]:
# Batch Norm in keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, BatchNormalization, Dense

In [15]:
# Batch norm applied to the flattened input and after each hidden layer's
# activation (the ELU activation is part of the Dense layer here).
model = Sequential([
    Flatten(input_shape=[28, 28]),
    BatchNormalization(),
    Dense(300, activation='elu', kernel_initializer='he_normal'),
    BatchNormalization(),
    Dense(100, activation='elu', kernel_initializer='he_normal'),
    BatchNormalization(),
    Dense(10, activation='softmax')
])

In [17]:
# Print each layer's output shape and parameter count.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_2 (Flatten)          (None, 784)               0         
_________________________________________________________________
batch_normalization_3 (Batch (None, 784)               3136      
_________________________________________________________________
dense_4 (Dense)              (None, 300)               235500    
_________________________________________________________________
batch_normalization_4 (Batch (None, 300)               1200      
_________________________________________________________________
dense_5 (Dense)              (None, 100)               30100     
_________________________________________________________________
batch_normalization_5 (Batch (None, 100)               400       
_________________________________________________________________
dense_6 (Dense)              (None, 10)                1

In [22]:
# Index 1 is the first BatchNormalization layer (index 0 is Flatten).
model.layers[1]

<tensorflow.python.keras.layers.normalization_v2.BatchNormalization at 0x2b47f2d9f98>

In [21]:
# List every variable of the first batch norm layer with its `trainable` flag,
# to see which ones backprop updates and which ones are only tracked.
[(var.name, var.trainable) for var in model.layers[1].variables]

[('batch_normalization_3/gamma:0', True),
 ('batch_normalization_3/beta:0', True),
 ('batch_normalization_3/moving_mean:0', False),
 ('batch_normalization_3/moving_variance:0', False)]

In [23]:
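# Alternative: put the batch norm layers before the activation functions
#(remove the activation functions from the Dense layers and add them as separate Activation layers)
#(remove the Dense biases: the batch norm layer has its own offset, so they are redundant)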
from tensorflow.keras.layers import Activation

In [24]:
# Same architecture, but each hidden Dense layer is linear and bias-free
# (use_bias=False) and is followed by BatchNormalization, then a separate
# Activation('elu') layer, so normalization happens before the activation.
model = Sequential([
    Flatten(input_shape=[28, 28]),
    BatchNormalization(),
    Dense(300, kernel_initializer='he_normal', use_bias=False),
    BatchNormalization(),
    Activation('elu'),
    Dense(100, kernel_initializer='he_normal', use_bias=False),
    BatchNormalization(),
    Activation('elu'),
    Dense(10, activation='softmax')
])

In [25]:
# Gradient Clipping
#(clip gradient values that fall outside some threshold)
#(frequently used in RNNs)

In [27]:
# Gradient clipping in keras
# clipvalue=1 is passed to the optimizer as the clipping threshold.
optimizer = keras.optimizers.SGD(clipvalue=1)   #alternatively use clipnorm
# NOTE(review): 'mse' on a 10-way softmax output looks like a placeholder loss
# used only to demonstrate compile() — confirm before training with it.
model.compile(loss='mse', optimizer=optimizer)

In [28]:
# Transfer Learning

In [30]:
# Dataset set up
# Load Fashion MNIST and divide the pixel values by 255.
(X_train_full, y_train_full), (X_test, y_test) = keras.datasets.fashion_mnist.load_data()
X_train_full = X_train_full / 255
X_test = X_test / 255
# Hold out the first 5000 training samples as the validation set.
X_valid, X_train = X_train_full[:5000], X_train_full[5000:]
y_valid, y_train = y_train_full[:5000], y_train_full[5000:]

#(Set A: every image that is not a sandal or a shirt)
#(Set B: only the sandal and shirt images, as a binary task)
def split_dataset(X, y):
    """Split ``(X, y)`` into task A and task B.

    Task A keeps every sample whose label is not 5 or 6 and shifts labels
    above 6 down by 2 so they stay contiguous. Task B keeps the samples
    labelled 5 or 6, with a float32 target that is 1.0 for label 6 and 0.0
    for label 5.

    Returns:
        ``((X_A, y_A), (X_B, y_B))``
    """
    is_task_b = np.logical_or(y == 5, y == 6)   #(5, 6: labels for sandals and shirts)
    is_task_a = np.logical_not(is_task_b)

    labels_a = y[is_task_a]   # boolean indexing copies, so y itself is not modified
    above_gap = labels_a > 6
    labels_a[above_gap] = labels_a[above_gap] - 2   # label index adjustment
    labels_b = (y[is_task_b] == 6).astype(np.float32)

    task_a = (X[is_task_a], labels_a)
    task_b = (X[is_task_b], labels_b)
    return task_a, task_b

# Apply the A/B split to the train, validation and test sets.
(X_train_A, y_train_A), (X_train_B, y_train_B) = split_dataset(X_train, y_train)
(X_valid_A, y_valid_A), (X_valid_B, y_valid_B) = split_dataset(X_valid, y_valid)
(X_test_A, y_test_A), (X_test_B, y_test_B) = split_dataset(X_test, y_test)
# Keep only the first 200 task-B training samples.
X_train_B = X_train_B[:200]
y_train_B = y_train_B[:200]

In [32]:
# Models set up
# Model A: 8-way softmax classifier for task A (five SELU hidden layers).
model_A = Sequential()
model_A.add(Flatten(input_shape=[28, 28]))
for n_hidden in (300, 100, 50, 50, 50):
    # SELU's self-normalizing property assumes LeCun-normal weights; the Dense
    # default initializer (Glorot uniform) does not provide them.
    model_A.add(Dense(n_hidden, activation='selu',
                      kernel_initializer='lecun_normal'))
model_A.add(Dense(8, activation='softmax'))

# Integer class labels -> sparse categorical cross-entropy.
model_A.compile(loss='sparse_categorical_crossentropy',
                optimizer=keras.optimizers.SGD(learning_rate=1e-3),
                metrics=['accuracy'])

In [33]:
# Train model A on task A for 20 epochs, validating on the task-A validation set.
history = model_A.fit(X_train_A, y_train_A, epochs=20,
                     validation_data=(X_valid_A, y_valid_A))

Train on 43986 samples, validate on 4014 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [34]:
# Model B: same hidden stack as model A, trained from scratch on task B
# with a single sigmoid output for the binary target.
model_B = Sequential()
model_B.add(Flatten(input_shape=[28, 28]))
for n_hidden in (300, 100, 50, 50, 50):
    # SELU's self-normalizing property assumes LeCun-normal weights; the Dense
    # default initializer (Glorot uniform) does not provide them.
    model_B.add(Dense(n_hidden, activation='selu',
                      kernel_initializer='lecun_normal'))
model_B.add(Dense(1, activation='sigmoid'))

model_B.compile(loss='binary_crossentropy',
                optimizer=keras.optimizers.SGD(learning_rate=1e-3),
                metrics=['accuracy'])

In [35]:
# Train model B on the 200-sample task-B training set.
# Note: this rebinds `history`, replacing the object returned by the previous fit.
history = model_B.fit(X_train_B, y_train_B, epochs=20, 
                     validation_data=(X_valid_B, y_valid_B))

Train on 200 samples, validate on 986 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [36]:
# Re-use model A for task B
# Remove output layer
# (the new Sequential is built from model_A's own layer objects, so the two
# models share those layers and their weights)
model_B_on_A = Sequential(model_A.layers[:-1])
# Add a fresh single-unit sigmoid output for the binary task.
model_B_on_A.add(Dense(1, activation='sigmoid'))
# Save a clone of model A
#(since training B_on_A will affect A)
model_A_clone = keras.models.clone_model(model_A)   #clone architecture
model_A_clone.set_weights(model_A.get_weights())   #clone weights
# Freezing layers
# (every layer except the new output layer)
for layer in model_B_on_A.layers[:-1]:
    layer.trainable = False

# Compile after changing `trainable` so the setting is used for training.
model_B_on_A.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])

In [37]:
# Train for a few epochs
#(to let the new output layer learn some weights while the reused layers are frozen)
history = model_B_on_A.fit(X_train_B, y_train_B, epochs=4,
                          validation_data=(X_valid_B, y_valid_B))

Train on 200 samples, validate on 986 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [39]:
# Unfreeze and train
for layer in model_B_on_A.layers[:-1]:
    layer.trainable = True
# Reduced learning rate, to avoid the transferred weights being changed drastically.
optimizer = keras.optimizers.SGD(learning_rate=1e-4)   #reduced learning rate to avoid the transfered weights being changed drastically
# Compile again after changing `trainable` so the setting is used for training.
model_B_on_A.compile(loss='binary_crossentropy', optimizer=optimizer,
                    metrics=['accuracy'])

In [40]:
# Fine-tune the whole network (all layers trainable) for 16 more epochs.
history = model_B_on_A.fit(X_train_B, y_train_B, epochs=16,
                          validation_data=(X_valid_B, y_valid_B))

Train on 200 samples, validate on 986 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


In [41]:
# Optimizers

In [42]:
# Momentum Optimization
#1.uses the gradient to update a momentum vector (the gradient acts as an acceleration, not as a speed)
#2.updates weights with the momentum vector
#(has a momentum hyperparam that acts as 'friction': 0 = high friction, 1 = no friction, typically 0.9)
#(almost always faster than gradient descent)

In [43]:
# Momentum optimization in keras
# Use `learning_rate`, as the SGD optimizers elsewhere in this notebook do;
# `lr` is only a deprecated alias.
optimizer = keras.optimizers.SGD(learning_rate=0.001, momentum=0.9)

In [44]:
# Nesterov Accelerated Gradient
#(momentum optimization, but the gradient is measured slightly ahead: at the current position plus the momentum step)
#(this works because the momentum vector generally points in the right direction, so the gradient measured there is more accurate than the one at the original position)
#((the weights are about to be moved in the direction of the momentum anyway, before the local gradient is taken into account))
#(almost always faster than momentum optimization)

In [45]:
# NAG in keras
# Use `learning_rate`, as the SGD optimizers elsewhere in this notebook do;
# `lr` is only a deprecated alias.
optimizer = keras.optimizers.SGD(learning_rate=0.001, momentum=0.9, nesterov=True)

In [46]:
# AdaGrad
#(scales the weight updates to the steepness of the dimensions)
#(equivalently decays the learning rate faster for dimensions with higher steepness)
#1.accumulates a scaler: a vector holding the running sum of squared gradients for each param/dimension
#(through iterations the scaler keeps growing with the new gradients)
#2.updates the weights with the gradient vector scaled down (divided element-wise by the sqrt of the scaler plus a smoothing term)
# - might stop training before convergence because of the ever-increasing decay

In [2]:
# RMSProp
#(adagrad but with a decay element to decay the earlier gradients)
#(only uses gradients from recent iterations to update the scaler)
#(almost always better than AdaGrad)

In [3]:
# RMSProp in keras
# rho is the decay rate of the running average of squared gradients.
# Use `learning_rate`; `lr` is only a deprecated alias.
optimizer = keras.optimizers.RMSprop(learning_rate=0.001, rho=0.9)

In [4]:
# Adam and Nadam
#(momentum and rmsprop combined, like momentum but scaled)
# 1.Calculate a momentum vector (exponentially decaying average of past gradients rather than a sum)
# 2.Calculate a scaling vector (exponentially decaying average of past squared gradients)
# 3.Bias-correct the momentum vector (it starts at 0, so it is boosted early in training)
# 4.Bias-correct the scaling vector in the same way
# 5.Weight update with the scaled momentum
#(learning rate doesnt need much tuning)

In [5]:
# Adam in keras
# Use `learning_rate`; `lr` is only a deprecated alias.
optimizer = keras.optimizers.Adam(learning_rate=0.001,
                                  beta_1=0.9,     #decay rate of the momentum average
                                  beta_2=0.999)   #decay rate of the scaling average

In [6]:
# Adam alternatives
# AdaMax
#(use l-infinity (the max) rather than l2 norm to scale)
#(might be more stable than Adam, might not)
# Nadam
#(Like Adam but with Nesterov rather than vanilla momentum)
#(Generally better than Adam, but can be worse than RMSProp?)

In [7]:
# Sidenotes
#(adaptive method might generalize poorly, in which case use nesterov)
#(there are optimizers that use second order partial derivatives, but they are slow)
#(apply strong l1 regularization during training to get sparse model (which is faster))
#(avoid SGD and adagrad)

In [8]:
# Learning rate scheduling

In [9]:
# Power scheduling
#(learning rate drops at each step)
#(first drops quickly and then more slowly)

In [10]:
# Power scheduling in keras
# lr(step) = 0.01 / (1 + 1e-4 * step). This is the same curve the legacy
# `decay=1e-4` optimizer argument produced, expressed as an explicit schedule
# so it does not depend on the deprecated `lr` / `decay` arguments.
lr_schedule = keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=0.01, decay_steps=1, decay_rate=1e-4)
optimizer = keras.optimizers.SGD(learning_rate=lr_schedule)

In [None]:
# Exponential scheduling
#(learning rate drops by a factor of 10 every s steps)

In [None]:
# Exponential scheduling with a hand-written function (no built-in Keras callback for it)
#(takes the initial learning rate and the number of epochs per 10x drop,
#returns a function that maps an epoch index to its learning rate)
def expo_decay(lr0, s):
    """Build an exponential-decay schedule.

    Args:
        lr0: learning rate at epoch 0.
        s: number of epochs over which the rate is divided by 10.

    Returns:
        A function ``f(epoch) -> lr0 * 0.1 ** (epoch / s)``.
    """
    def expo_decay_fn(epoch):
        return lr0 * 0.1**(epoch/s)
    # Return the closure defined just above (the previous code returned the
    # misspelled, undefined name `exp_decay_fn`, which raised NameError).
    return expo_decay_fn

# Schedule starting at 0.01 and dividing the rate by 10 every 20 epochs.
expo_decay_fn = expo_decay(lr0=0.01, s=20)

# Use the function in a callback
#(learning rate is updated at the beginning of every epoch)
lr_scheduler = keras.callbacks.LearningRateScheduler(expo_decay_fn)
# NOTE(review): `n_epochs` is not defined anywhere in the visible notebook —
# define it before running this cell.
history = model.fit(X_train, y_train, epochs=n_epochs, callbacks=[lr_scheduler])

In [None]:
# Exponential scheduling in tf.keras
# The schedule counts optimizer steps (batches), not epochs or samples, so
# "divide by 10 every 20 epochs" means decay_steps = 20 * steps per epoch.
steps_per_epoch = len(X_train) // 32   # 32 is fit()'s default batch_size
learning_rate = keras.optimizers.schedules.ExponentialDecay(
    0.01, 20 * steps_per_epoch, 0.1)
# The module is `keras.optimizers` (plural); `keras.optimizer` raises AttributeError.
optimizer = keras.optimizers.SGD(learning_rate)

In [None]:
# Piecewise constant scheduling
#(use a fixed learning rate for a fixed number of epochs, then a smaller one for the next stretch, and so on)

In [None]:
# Piecewise constant scheduling, also hand-written (not built into keras)
def piecewise_constant_fn(epoch):
    """Learning rate for ``epoch``: 0.01 below epoch 5, 0.005 below epoch 15, 0.001 otherwise."""
    for upper_bound, rate in ((5, 0.01), (15, 0.005)):
        if epoch < upper_bound:
            return rate
    return 0.001
# Alternatively
#(takes n ascending epoch boundaries and n+1 learning rates)
def piecewise_constant(boundaries, values):
    """Build an epoch -> learning-rate step function.

    Args:
        boundaries: ascending epoch indices at which the rate changes
            (length n; any sequence, e.g. list or tuple).
        values: learning rates (length n + 1). ``values[0]`` applies before
            ``boundaries[0]``; ``values[i]`` applies from ``boundaries[i-1]``
            (inclusive) up to ``boundaries[i]`` (exclusive).

    Returns:
        A function taking the epoch index and returning its learning rate.

    Raises:
        ValueError: if ``len(values) != len(boundaries) + 1``.
    """
    boundaries = np.asarray(boundaries)
    values = np.asarray(values)
    if len(values) != len(boundaries) + 1:
        raise ValueError(
            "values must have exactly one more element than boundaries "
            "(got %d values for %d boundaries)" % (len(values), len(boundaries)))
    def piecewise_constant_fn(epoch):
        # The number of boundaries <= epoch is exactly the index of the active rate.
        return values[np.searchsorted(boundaries, epoch, side='right')]
    return piecewise_constant_fn

# Rebinds `piecewise_constant_fn`: from here on the name refers to the function
# returned by the factory, not to the hand-written `def` earlier in this cell.
piecewise_constant_fn = piecewise_constant([5, 15], [0.01, 0.005, 0.001])

# Make callback
lr_scheduler = keras.callbacks.LearningRateScheduler(piecewise_constant_fn)
# NOTE(review): `n_epochs` is not defined anywhere in the visible notebook —
# define it before running this cell.
history = model.fit(X_train, y_train, epochs=n_epochs, callbacks=[lr_scheduler])

In [None]:
# Performance scheduling
#(measure error every n steps, reduce learning rate by a factor if not dropping)

In [None]:
# Performance scheduling in keras
# factor=0.5 and patience=5 are passed to ReduceLROnPlateau: the factor by
# which the rate is reduced, and how many epochs to wait before reducing it.
lr_scheduler = keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=5)
# NOTE(review): `n_epochs` is not defined anywhere in the visible notebook —
# define it before running this cell.
history = model.fit(X_train, y_train, epochs=n_epochs, callbacks=[lr_scheduler])

In [None]:
# 1 cycle scheduling
#(increases the learning rate linearly from an initial rate to a maximum over the first half of training)
#(decreases it back over the second half, then drops it drastically over the last few epochs)
#(initial learning rate is typically about 10 times lower than max)
#(when using momentum, do the reverse: start high, lower it, then raise it again)

In [19]:
# 1cycle scheduling is not built into keras either, so implement it as a callback
K = keras.backend

class OneCycleScheduler(keras.callbacks.Callback):
    """Set the optimizer's learning rate before every batch, following a 1cycle policy.

    The rate is interpolated linearly in three phases, indexed by the number
    of batches seen so far:
      1. ``start_rate`` -> ``max_rate`` over the first ``half_iteration`` batches,
      2. ``max_rate`` -> ``start_rate`` over the next ``half_iteration`` batches,
      3. ``start_rate`` -> ``last_rate`` over the remaining batches (never
         going below ``last_rate``).

    Args:
        iterations: total number of training batches the schedule spans.
        max_rate: peak learning rate.
        start_rate: initial learning rate; defaults to ``max_rate / 10``.
        last_iterations: length of the final phase in batches; defaults to
            ``iterations // 10 + 1`` (a value of 0 also selects the default).
        last_rate: final learning rate; defaults to ``start_rate / 1000``.
    """
    def __init__(self, iterations, max_rate, start_rate=None,
                 last_iterations=None, last_rate=None):
        # Let the base Callback initialise its own attributes.
        super().__init__()
        self.iterations = iterations
        self.max_rate = max_rate
        # Test against None explicitly so an explicit rate of 0 is honoured
        # instead of being silently replaced by the default.
        self.start_rate = max_rate / 10 if start_rate is None else start_rate
        self.last_iterations = last_iterations or iterations // 10 + 1
        self.half_iteration = (iterations - self.last_iterations) // 2
        self.last_rate = self.start_rate / 1000 if last_rate is None else last_rate
        self.iteration = 0
    def _interpolate(self, iter1, iter2, rate1, rate2):
        """Linear interpolation between (iter1, rate1) and (iter2, rate2) at self.iteration."""
        return ((rate2-rate1) * (self.iteration-iter1) / (iter2-iter1) + rate1)
    def on_batch_begin(self, batch, logs=None):
        """Compute the rate for the current batch and write it to the optimizer."""
        if self.iteration < self.half_iteration:
            rate = self._interpolate(0, self.half_iteration, self.start_rate, self.max_rate)
        elif self.iteration < 2 * self.half_iteration:
            rate = self._interpolate(self.half_iteration, 2 * self.half_iteration,
                                     self.max_rate, self.start_rate)
        else:
            rate = self._interpolate(2 * self.half_iteration, self.iterations,
                                     self.start_rate, self.last_rate)
            # Clamp so training past `iterations` batches never drops below last_rate.
            rate = max(rate, self.last_rate)
        self.iteration += 1
        # `learning_rate` is the canonical attribute name; `lr` is only an alias.
        K.set_value(self.model.optimizer.learning_rate, rate)

In [None]:
# Total number of batches = batches per epoch * number of epochs.
# NOTE(review): `batch_size` and `n_epochs` are not defined anywhere in the
# visible notebook — define them before running this cell.
onecycle = OneCycleScheduler(len(X_train) // batch_size * n_epochs, max_rate=0.05)
history = model.fit(X_train, y_train, epochs=n_epochs, batch_size=batch_size, callbacks=[onecycle])

In [20]:
# Regularization

In [None]:
# l1 and l2 regularization
# l2 in keras
# (for l1 use regularizers.l1(); use l1_l2() for both)
layers = Dense(100, activation='elu', kernel_initializer='he_normal',
              kernel_regularizer=keras.regularizers.l2(0.01))

In [23]:
# partial() to create architecture with the same layer type, activation, etc.
from functools import partial
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense

# Dense layer factory with the shared defaults baked in; keyword arguments
# given at call time override them (see the output layer below).
RegularizedDense = partial(keras.layers.Dense,
                          activation='elu',
                          kernel_initializer='he_normal',
                          kernel_regularizer=keras.regularizers.l2(0.01))

model = Sequential([
    Flatten(input_shape=[28, 28]),
    RegularizedDense(300),
    RegularizedDense(100),
    # Output layer: overrides the activation and the initializer, keeps the l2 regularizer.
    RegularizedDense(10, activation='softmax', kernel_initializer='glorot_uniform')
])

In [24]:
# Dropout
#(dropout rate 0.2-0.3 for RNN, 0.4-0.5 for CNN)
#(in practice usually apply dropout to top 1 to 3 layers excluding output(?))
#(to compensate for the dropped units: multiply the input weights by the keep proba after training, or divide the neuron outputs by the keep proba during training)
#(re-evaluate training loss if needed after training (when dropout is not applied anymore))
# - slows down convergence, but worth it
#(use alpha dropout for selu activation function)

In [26]:
# Dropout in keras
from tensorflow.keras.layers import Dropout

# Dropout (rate 0.2) applied to the inputs of every Dense layer, including the output layer.
model = Sequential([
    Flatten(input_shape=[28, 28]),
    Dropout(rate=0.2),
    Dense(300, activation='elu', kernel_initializer='he_normal'),
    Dropout(rate=0.2),
    Dense(100, activation='elu', kernel_initializer='he_normal'),
    Dropout(rate=0.2),
    Dense(10, activation='softmax')
])

In [None]:
# Monte Carlo Dropout
#(ensemble of dropout neural nets)
# Call the model 100 times with training=True (dropout stays active) and stack
# the results along a new leading axis, one entry per stochastic pass.
y_probas = np.stack([model(X_test, training=True) for sample in range(100)])   #generate 100 predictions
# Average over the 100 passes.
y_proba = y_probas.mean(axis=0)

In [None]:
# NOTE: a notebook cell only displays its last expression, so the first two
# np.round(...) results below are computed but not shown unless wrapped in
# display()/print() or moved to their own cells.
np.round(y_probas[:, :1], 2)   #a prediction probas
np.round(y_proba[:1], 2)   #mean proba of the same prediction

y_std = y_probas.std(axis=0)   #std of predictions
np.round(y_std[:1], 2)   #std of the above prediction

In [30]:
# MCDropout class
#(for when there are input manipulation layers other than Dropout, like batchnorm)
from tensorflow.keras.layers import Dropout

class MCDropout(Dropout):
    """Dropout layer that always runs in training mode (for Monte Carlo Dropout)."""
    def call(self, inputs):
        # Always forward training=True to Dropout.call, whatever mode the
        # surrounding model is running in.
        return super().call(inputs, training=True)

In [32]:
# Copy model over, replace original dropout layers
# Each Dropout layer becomes a new MCDropout with the same rate; every other
# layer is reused as-is (the same layer objects as in `model`).
mc_model = Sequential([
    MCDropout(layer.rate) if isinstance(layer, keras.layers.Dropout) else layer for layer in model.layers
])
mc_model.set_weights(model.get_weights())
mc_model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_2 (Flatten)          (None, 784)               0         
_________________________________________________________________
mc_dropout_3 (MCDropout)     (None, 784)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 300)               235500    
_________________________________________________________________
mc_dropout_4 (MCDropout)     (None, 300)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 100)               30100     
_________________________________________________________________
mc_dropout_5 (MCDropout)     (None, 100)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 10)               

In [None]:
# Monte Carlo dropout with possible batchnorm
#(not forcing training=True)
# predict() is called without a training flag; the MCDropout layers force
# training=True themselves. Average the 100 predictions.
y_proba = np.mean([mc_model.predict(X_test) for sample in range(100)], axis=0)

In [33]:
# Max-Norm Regularization
#(constrain the weights so that the L2 norm of each neuron's incoming weight vector stays at or below a max-norm r)

In [34]:
# Max-Norm in keras
# max_norm(1, axis=0) is passed as the kernel constraint of this Dense layer.
keras.layers.Dense(100, activation='elu', kernel_initializer='he_normal',
                  kernel_constraint=keras.constraints.max_norm(1, axis=0))
#(when applying max-norm to convolutional layers, set the axis hyperparam to axis=[0, 1, 2])

<tensorflow.python.keras.layers.core.Dense at 0x15b7affa6a0>

In [35]:
# Recommended Settings
#(simple dense network settings in brackets)
# Kernel initializer: he init  (lecun init)
# Activation function: elu   (selu)
# Regularization: early stopping and l2   (alpha dropout)
# Optimizer: momentum, rmsprop, or nadam   (same)
# Learning rate schedule: 1cycle   (same)

In [36]:
# More stuff
#(normalize input features)
#(find a pretrained model)
#(unsupervised pretraining for unlabeled data)
#(pretrain on an auxiliary, similar task)
#(use l1 regularization for sparse model)
#(use fewer layers, fold batchnorm into previous layer, use leaky relu or relu, sparse model, 
#reduce float precision from 32 to 16 or 8 
#to speed up prediction)