In [None]:
# 1. No, they would return the same gradients, makes learning impossible

In [None]:
# 2. Yes

In [None]:
# 3. No 0/dead neurons, average output at 0 negates the vanishing gradients problem, self normalizing effect for networks with all dense layers

In [None]:
# 4.
# SELU: networks with all dense layers
# leaky ReLU(s): everything else
# ReLU: simplicity
# tanh: for output layers with result -1 to 1
# logistic: for output layers with probability-like output
# softmax: for output layers with mutually exclusive results

In [None]:
# 5. High momentum can cause it to roll pass optimum

In [None]:
# 6. l1 regularization, manual forcing weights to 0, and tensorflow optimization all can produce sparse model

In [None]:
# 7. Yes, it equals to training many networks. No
# Yes, it equals to predicting with many networks

In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np

In [2]:
# 8. Deep neural net with CIFAR 10

In [3]:
# Import dataset
from tensorflow.keras.datasets import cifar10

(X_train_full, y_train_full), (X_test, y_test) = cifar10.load_data()
X_train_full.shape

(50000, 32, 32, 3)

In [4]:
X_valid, X_train = X_train_full[:10000], X_train_full[10000:]
y_valid, y_train = y_train_full[:10000], y_train_full[10000:]

In [5]:
# Scaling for non SELU activations
X_valid = X_valid.astype('float32') / 255
X_train = X_train.astype('float32') / 255
X_test = X_test.astype('float32') / 255

In [26]:
# Model architecture
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense, Dropout, BatchNormalization

model = Sequential()
model.add(Flatten(input_shape=[32, 32, 3]))
for i in range(0, 20):
    if i > 15 and i <19:
        model.add(Dropout(rate=0.25))
    model.add(BatchNormalization())
    model.add(Dense(100, activation='elu', kernel_initializer='lecun_normal'))
model.add(Dense(10, activation='softmax'))

In [27]:
optimizer = keras.optimizers.RMSprop(lr=0.001, rho=0.9)

In [28]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [29]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_2 (Flatten)          (None, 3072)              0         
_________________________________________________________________
batch_normalization_40 (Batc (None, 3072)              12288     
_________________________________________________________________
dense_42 (Dense)             (None, 100)               307300    
_________________________________________________________________
batch_normalization_41 (Batc (None, 100)               400       
_________________________________________________________________
dense_43 (Dense)             (None, 100)               10100     
_________________________________________________________________
batch_normalization_42 (Batc (None, 100)               400       
_________________________________________________________________
dense_44 (Dense)             (None, 100)              

In [10]:
# define callback
from tensorflow.keras.callbacks import EarlyStopping

early_cb = EarlyStopping(patience=10, restore_best_weights=True)

In [30]:
# 1 cycle scheduling learning rate
K = keras.backend

class OneCycleScheduler(keras.callbacks.Callback):
    def __init__(self, iterations, max_rate, start_rate=None,
                last_iter=None, last_rate=None):
        self.iterations = iterations
        self.max_rate = max_rate
        self.start_rate = start_rate or max_rate/10
        self.last_iterations = last_iter or iterations//10 + 1
        self.half_iteration = (iterations - self.last_iterations) // 2
        self.last_rate = last_rate or self.start_rate/1000
        self.iteration = 0
    
    def _interpolate(self, iter1, iter2, rate1, rate2):
        return((rate2-rate1) * (self.iteration-iter1) / (iter2-iter1) + rate1)
    
    def on_batch_begin(self, batch, logs):
        if self.iteration < self.half_iteration:
            rate = self._interpolate(0, self.half_iteration, 
                                     self.start_rate, self.max_rate)
        elif self.iteration < 2 * self.half_iteration:
            rate = self._interpolate(self.half_iteration, 2*self.half_iteration,
                                    self.max_rate, self.start_rate)
        else:
            rate = self._interpolate(2*self.half_iteration, self.iterations,
                                    self.start_rate, self.last_rate)
            rate = max(rate, self.last_rate)
        self.iteration += 1
        K.set_value(self.model.optimizer.lr, rate)   

In [31]:
n_epochs = 100
batch_size = 32

In [32]:
onecycle = OneCycleScheduler(len(X_train) // batch_size*n_epochs, max_rate=0.01)

In [33]:
history = model.fit(X_train, y_train,
                   epochs=n_epochs,
                   batch_size=batch_size,
                   validation_data=(X_valid, y_valid),
                   callbacks=[onecycle])

Train on 40000 samples, validate on 10000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100


Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100




In [34]:
# MC Dropout prediction
y_probas = np.stack([model(X_test, training=True) for sample in range(100)])
y_proba = y_probas.mean(axis=0)
y_pred = np.argmax(y_proba, axis=1)
y_pred

array([3, 8, 0, ..., 3, 4, 7], dtype=int64)

In [35]:
from sklearn.metrics import accuracy_score
accuracy_score(y_pred,y_test)

0.5395