In [1]:
import numpy as np
import matplotlib.pyplot as plt
import keras
import keras.backend as K

Using TensorFlow backend.


In [2]:
# Load the MNIST train/test splits shipped with Keras.
(X_train, Y_train), (X_test, Y_test) = keras.datasets.mnist.load_data()
# Append a trailing axis to the image arrays (the Conv2D layers used later take a
# channel axis) and divide every value by 255.
X_train = X_train[..., np.newaxis] / 255
X_test = X_test[..., np.newaxis] / 255

In [3]:
# Build the classifier: input -> batch norm -> four (Conv2D + MaxPooling2D) stages
# -> flatten -> softmax. Layers are created in the same order as a hand-unrolled
# version, so Keras' automatic layer numbering is unchanged.
X_input = keras.layers.Input(X_train.shape[1:])
X = keras.layers.BatchNormalization()(X_input)

# (filters, kernel_size, padding) for each convolution stage, in order.
conv_stages = [
    (8, (5, 5), 'same'),
    (16, (5, 5), 'same'),
    (32, (2, 2), 'valid'),
    (64, (2, 2), 'valid'),
]
for filters, kernel_size, padding in conv_stages:
    X = keras.layers.Conv2D(filters, kernel_size, padding=padding, activation='sigmoid')(X)
    X = keras.layers.MaxPooling2D()(X)

X = keras.layers.Flatten()(X)
# One output unit per class; the class count is derived from the largest training label.
n_classes = np.max(Y_train) + 1
X = keras.layers.Dense(n_classes, activation='softmax')(X)

M = keras.Model(X_input, X)
M.compile(optimizer='nadam', loss='sparse_categorical_crossentropy', metrics=['acc'])
M.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 28, 28, 1)         0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 28, 28, 1)         4         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 28, 28, 8)         208       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 14, 14, 8)         0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 14, 14, 16)        3216      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 7, 7, 16)          0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 6, 6, 32)          2080      
__________

### Search for the best maximum learning rate for cosine annealing

In [4]:
# Learning-rate sweep: the rate for each epoch is 0.1*((epoch+1)/50), i.e. it grows
# linearly with the epoch index and would reach 0.1 at the 50th epoch.
lr_ramp = keras.callbacks.LearningRateScheduler(
    lambda epoch, lr: 0.1*((epoch+1)/50),
    verbose=1,
)
# Stop once the training loss has not improved for 2 consecutive epochs.
stop_when_loss_stalls = keras.callbacks.EarlyStopping(monitor='loss', patience=2, verbose=1)

hist = M.fit(
    X_train, Y_train,
    validation_data=(X_test, Y_test),
    batch_size=64,
    epochs=50,
    callbacks=[lr_ramp, stop_when_loss_stalls],
)

Train on 60000 samples, validate on 10000 samples
Epoch 1/50

Epoch 00001: LearningRateScheduler setting learning rate to 0.002.
Epoch 2/50

Epoch 00002: LearningRateScheduler setting learning rate to 0.004.
Epoch 3/50

Epoch 00003: LearningRateScheduler setting learning rate to 0.006.
Epoch 4/50

Epoch 00004: LearningRateScheduler setting learning rate to 0.008.
Epoch 5/50

Epoch 00005: LearningRateScheduler setting learning rate to 0.010000000000000002.
Epoch 6/50

Epoch 00006: LearningRateScheduler setting learning rate to 0.012.
Epoch 7/50

Epoch 00007: LearningRateScheduler setting learning rate to 0.014000000000000002.
Epoch 8/50

Epoch 00008: LearningRateScheduler setting learning rate to 0.016.
Epoch 00008: early stopping


### Cosine annealing with restarts

In [5]:
def cosine_annealing(i, lr_min, lr_max):
    """Return the learning rate for step ``i`` of a restarting cosine schedule.

    The step index is shifted by 4 and the shifted axis is split into cycles
    that each start at a power of two (4, 8, 16, ...) and end one step before
    the next power of two, so every cycle is twice as long as the previous one.
    Within a cycle the rate follows a quarter cosine wave: it equals ``lr_max``
    on the first step of the cycle and reaches (approximately, up to floating
    point) ``lr_min`` on the last step.

    Args:
        i: Zero-based step index (passed the epoch number by the caller).
        lr_min: Learning rate at the end of each cycle.
        lr_max: Learning rate at the start of each cycle.

    Returns:
        The learning rate for step ``i``.
    """
    # Shift so that step 0 lands on the start of the first cycle (2**2 == 4).
    shifted = i + np.power(2, 2)
    cycle_exponent = np.floor(np.log2(shifted))
    cycle_start = np.power(2, cycle_exponent)
    cycle_end = np.power(2, cycle_exponent + 1) - 1
    # Fraction of the current cycle completed: 0 on its first step, 1 on its last.
    progress = (shifted - cycle_start) / (cycle_end - cycle_start)
    # cos goes from 1 (progress 0) to ~0 (progress 1) over a quarter period.
    return lr_min + np.cos(progress * (np.pi / 2)) * (lr_max - lr_min)
# Train for 60 epochs, setting each epoch's learning rate from cosine_annealing
# with bounds 0.012*0.25 (minimum) and 0.012 (maximum).
annealing_schedule = keras.callbacks.LearningRateScheduler(
    lambda epoch, lr: cosine_annealing(epoch, 0.012*0.25, 0.012),
    verbose=1,
)
M.fit(
    X_train, Y_train,
    validation_data=(X_test, Y_test),
    batch_size=64,
    epochs=60,
    callbacks=[annealing_schedule],
)

Train on 60000 samples, validate on 10000 samples
Epoch 1/60

Epoch 00001: LearningRateScheduler setting learning rate to 0.012.
Epoch 2/60

Epoch 00002: LearningRateScheduler setting learning rate to 0.01079422863405995.
Epoch 3/60

Epoch 00003: LearningRateScheduler setting learning rate to 0.0075000000000000015.
Epoch 4/60

Epoch 00004: LearningRateScheduler setting learning rate to 0.0030000000000000005.
Epoch 5/60

Epoch 00005: LearningRateScheduler setting learning rate to 0.012.
Epoch 6/60

Epoch 00006: LearningRateScheduler setting learning rate to 0.011774351209636415.
Epoch 7/60

Epoch 00007: LearningRateScheduler setting learning rate to 0.011108719811121773.
Epoch 8/60

Epoch 00008: LearningRateScheduler setting learning rate to 0.01003648334221227.
Epoch 9/60

Epoch 00009: LearningRateScheduler setting learning rate to 0.008611408216728603.
Epoch 10/60

Epoch 00010: LearningRateScheduler setting learning rate to 0.006904953652058024.
Epoch 11/60

Epoch 00011: LearningRateS

Epoch 38/60

Epoch 00038: LearningRateScheduler setting learning rate to 0.011080240856136677.
Epoch 39/60

Epoch 00039: LearningRateScheduler setting learning rate to 0.010869119545301238.
Epoch 40/60

Epoch 00040: LearningRateScheduler setting learning rate to 0.010637798317452759.
Epoch 41/60

Epoch 00041: LearningRateScheduler setting learning rate to 0.010386870970865488.
Epoch 42/60

Epoch 00042: LearningRateScheduler setting learning rate to 0.010116981632439289.
Epoch 43/60

Epoch 00043: LearningRateScheduler setting learning rate to 0.009828823104235117.
Epoch 44/60

Epoch 00044: LearningRateScheduler setting learning rate to 0.009523135085062081.
Epoch 45/60

Epoch 00045: LearningRateScheduler setting learning rate to 0.00920070227168118.
Epoch 46/60

Epoch 00046: LearningRateScheduler setting learning rate to 0.008862352344500002.
Epoch 47/60

Epoch 00047: LearningRateScheduler setting learning rate to 0.008508953842928966.
Epoch 48/60

Epoch 00048: LearningRateScheduler set

<keras.callbacks.History at 0x7f1e16ed3668>