In [None]:
# 1. No: they would all return the same gradients, which makes learning impossible

In [None]:
# 2. Yes

In [None]:
# 3. No dead (zero-output) neurons; an average output close to 0 mitigates the vanishing gradients problem; self-normalizing effect for networks made only of dense layers

In [None]:
# 4.
# SELU: networks with all dense layers
# leaky ReLU(s): everything else
# ReLU: simplicity
# tanh: for output layers with result -1 to 1
# logistic: for output layers with probability-like output
# softmax: for output layers with mutually exclusive results

In [None]:
# 5. High momentum can cause it to roll past the optimum

In [None]:
# 6. L1 regularization, manually forcing weights to 0, and TensorFlow's optimization tooling can all produce a sparse model

In [None]:
# 7. Yes, it is equivalent to training many networks. No.
# Yes, it is equivalent to predicting with many networks

In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np

In [None]:
# 8. Deep neural net with CIFAR 10

In [2]:
# Import dataset
from tensorflow.keras.datasets import cifar10

# load_data() hands back a (train, test) pair, each an (images, labels) tuple.
train_split, test_split = cifar10.load_data()
X_train_full, y_train_full = train_split
X_test, y_test = test_split
X_train_full.shape

(50000, 32, 32, 3)

In [3]:
# Hold out the first 10,000 training samples for validation; train on the rest.
valid_part, train_part = slice(None, 10000), slice(10000, None)
X_valid, y_valid = X_train_full[valid_part], y_train_full[valid_part]
X_train, y_train = X_train_full[train_part], y_train_full[train_part]

In [4]:
# Scaling for non SELU activations
def _scale_pixels(images):
    """Return `images` as float32 divided by 255, converting at most once.

    The arrays are overwritten under the same names, so re-running this cell
    used to divide by 255 a second time. Only integer-typed input is scaled;
    anything else is taken to be already converted and is returned unchanged.
    NOTE(review): this assumes the freshly loaded images are integer-typed
    (e.g. uint8) -- confirm against the dataset loader.
    """
    if np.issubdtype(images.dtype, np.integer):
        return images.astype('float32') / 255
    return images


X_valid = _scale_pixels(X_valid)
X_train = _scale_pixels(X_train)
X_test = _scale_pixels(X_test)

In [5]:
# Model architecture
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense, Dropout, BatchNormalization

# Flattened 32x32x3 input, then 20 hidden blocks of BatchNormalization followed
# by a 100-unit ELU Dense layer. Blocks 16, 17 and 18 are additionally preceded
# by Dropout(0.25). A 10-unit softmax layer closes the stack.
model = Sequential()
model.add(Flatten(input_shape=[32, 32, 3]))
for block_idx in range(20):
    if 16 <= block_idx <= 18:
        model.add(Dropout(rate=0.25))
    model.add(BatchNormalization())
    model.add(Dense(units=100, activation='elu', kernel_initializer='lecun_normal'))
model.add(Dense(units=10, activation='softmax'))

In [7]:
# RMSprop with rho=0.9. The step size is passed as `learning_rate`: the `lr`
# spelling is a deprecated alias that triggers a warning or an error on newer
# Keras releases.
optimizer = keras.optimizers.RMSprop(learning_rate=0.001, rho=0.9)

In [8]:
# Sparse categorical cross-entropy as the loss; accuracy is tracked as a metric.
model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [9]:
# Print the layer-by-layer summary (layer type, output shape, parameter count).
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 3072)              0         
_________________________________________________________________
batch_normalization (BatchNo (None, 3072)              12288     
_________________________________________________________________
dense (Dense)                (None, 100)               307300    
_________________________________________________________________
batch_normalization_1 (Batch (None, 100)               400       
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
batch_normalization_2 (Batch (None, 100)               400       
_________________________________________________________________
dense_2 (Dense)              (None, 100)               1

In [10]:
# define callback
from tensorflow.keras.callbacks import EarlyStopping

# Stop after 10 epochs without improvement in validation loss, then restore the
# weights from the best epoch seen.
early_cb = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

In [None]:
# 1 cycle scheduling learning rate
K = keras.backend


class OneCycleScheduler(keras.callbacks.Callback):
    """Callback applying a piecewise-linear "1cycle" learning-rate schedule.

    The rate is updated at the start of every batch in three linear phases:

    1. from `start_rate` up to `max_rate` over the first `half_iteration` batches;
    2. from `max_rate` back down to `start_rate` over the next `half_iteration`;
    3. from `start_rate` down to `last_rate` over the remaining batches, never
       going below `last_rate` even if training runs past `iterations`.

    Args:
        iterations: total number of batches the schedule is spread over.
        max_rate: peak learning rate, reached at the end of phase 1.
        start_rate: initial learning rate; defaults to `max_rate / 10`.
        last_iter: number of batches reserved for phase 3; a falsy value
            (None or 0) selects the default `iterations // 10 + 1`, which keeps
            phase 3 non-empty and its interpolation well defined.
        last_rate: final learning rate; defaults to `start_rate / 1000`.
    """

    def __init__(self, iterations, max_rate, start_rate=None,
                 last_iter=None, last_rate=None):
        # Let the base Callback set up its own attributes.
        super().__init__()
        self.iterations = iterations
        self.max_rate = max_rate
        # `is None` (rather than `or`) so that an explicit rate of 0 is honoured.
        self.start_rate = max_rate / 10 if start_rate is None else start_rate
        self.last_iterations = last_iter or iterations // 10 + 1
        self.half_iteration = (iterations - self.last_iterations) // 2
        self.last_rate = self.start_rate / 1000 if last_rate is None else last_rate
        # Number of batches seen so far (position inside the schedule).
        self.iteration = 0

    def _interpolate(self, iter1, iter2, rate1, rate2):
        """Linearly interpolate the rate at `self.iteration` between two knots.

        Returns `rate1` when `self.iteration == iter1` and `rate2` when
        `self.iteration == iter2`.
        """
        return (rate2 - rate1) * (self.iteration - iter1) / (iter2 - iter1) + rate1

    def on_batch_begin(self, batch, logs=None):
        """Compute the rate for the current batch and write it to the optimizer."""
        if self.iteration < self.half_iteration:
            rate = self._interpolate(0, self.half_iteration,
                                     self.start_rate, self.max_rate)
        elif self.iteration < 2 * self.half_iteration:
            rate = self._interpolate(self.half_iteration, 2 * self.half_iteration,
                                     self.max_rate, self.start_rate)
        else:
            rate = self._interpolate(2 * self.half_iteration, self.iterations,
                                     self.start_rate, self.last_rate)
            # Past `iterations` the line keeps descending; clamp at the floor.
            rate = max(rate, self.last_rate)
        self.iteration += 1
        # `learning_rate` is the canonical attribute; `lr` is a deprecated alias.
        K.set_value(self.model.optimizer.learning_rate, rate)

In [11]:
# Training budget: mini-batch size and the upper bound on the number of epochs.
batch_size = 32
n_epochs = 200

In [None]:
# The scheduler advances once per training batch. A trailing partial batch
# still counts as a batch, so round the per-epoch count up instead of flooring.
steps_per_epoch = int(np.ceil(len(X_train) / batch_size))
onecycle = OneCycleScheduler(steps_per_epoch * n_epochs, max_rate=0.005)

In [12]:
# Train with early stopping on the validation split.
# NOTE(review): the `onecycle` scheduler built earlier is not in `callbacks`,
# so only early stopping is active here -- confirm this is intended.
history = model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    batch_size=batch_size,
    epochs=n_epochs,
    callbacks=[early_cb],
)

Train on 40000 samples, validate on 10000 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200


In [13]:
# MC Dropout prediction
y_probas = np.stack([model(X_test, training=True) for sample in range(100)])
y_proba = y_probas.mean(axis=0)
y_pred = np.argmax(y_proba, axis=1)
y_pred

array([3, 9, 0, ..., 5, 4, 7], dtype=int64)

In [14]:
from sklearn.metrics import accuracy_score

# scikit-learn's signature is accuracy_score(y_true, y_pred): ground truth goes
# first. ravel() makes the label array 1-D, matching the 1-D `y_pred`.
accuracy_score(y_test.ravel(), y_pred)

0.534