<img src='img/logo.png'>
<img src='img/title.png'>

In [None]:
# Requires Keras with a TensorFlow or Theano backend installed.
# NOTE(review): this note originally asked for Keras 1.2.1, but the cells below
# pass `padding=` to Conv2D and `epochs=` to fit(), which are Keras 2 keyword
# names -- confirm the exact version this notebook is meant to run on.
# Render inline figures in the 'retina' figure format.
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

# Based on the Keras example at
# https://github.com/fchollet/keras/blob/master/examples/mnist_cnn.py

In [None]:
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'  # No GPU support on OSX for 'tensorflow' :-(
                                            # Getting theano to see GPU also broken!

import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = 10, 5

from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D, Conv2D
from keras.utils import np_utils
from keras import backend as K
from keras.layers.advanced_activations import LeakyReLU

In [None]:
# Cap this process at 75% of the GPU memory and hand Keras a TensorFlow
# session that was created with that cap.
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session

config = tf.ConfigProto(
    gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.75)
)
set_session(tf.Session(config=config))

In [None]:
# --- Training settings -------------------------------------------------
batch_size = 128   # samples per batch passed to fit()
nb_classes = 10    # number of target classes (one per digit)
epochs = 20        # training epochs per fit() call

# --- Input and layer geometry ------------------------------------------
img_rows = 28          # input image height
img_cols = 28          # input image width
nb_filters = 32        # number of convolutional filters to use
kernel_size = (3, 3)   # convolution kernel size
pool_size = (2, 2)     # size of pooling area for max pooling

# Load the MNIST digits: image arrays (X_*) and their integer labels (y_*),
# shuffled and already split between train and test sets.
(X_train, y_train), (X_test, y_test) = mnist.load_data()

In [None]:
# Preview the first training images in a 3 x 7 grid, titled with their labels.
n_rows, n_cols = 3, 7
print(X_train.shape)
for idx in range(n_rows * n_cols):
    plt.subplot(n_rows, n_cols, idx + 1)
    plt.imshow(X_train[idx], cmap='gray_r')
    plt.title('Digit: %d' % y_train[idx])
    # Axis ticks carry no information for an image thumbnail
    plt.xticks([])
    plt.yticks([])

In [None]:
# Add the single grey-scale channel axis where the active backend expects it.
# K.image_data_format() is the Keras 2 replacement for the deprecated
# K.image_dim_ordering(); 'channels_first' corresponds to the old 'th'.
if K.image_data_format() == 'channels_first':
    input_shape = (1, img_rows, img_cols)
else:
    input_shape = (img_rows, img_cols, 1)

# Reshaping to (n_samples,) + input_shape is harmless if this cell is re-run.
X_train = X_train.reshape((X_train.shape[0],) + input_shape)
X_test = X_test.reshape((X_test.shape[0],) + input_shape)

# Scale pixel values by 1/255 and store them as float32.  The integer check
# keeps the cell idempotent: data that has already been converted is not
# divided a second time when the cell is executed again.
# (Assumes mnist.load_data() yields integer pixel arrays -- verify.)
if np.issubdtype(X_train.dtype, np.integer):
    X_train = X_train.astype('float32') / 255
if np.issubdtype(X_test.dtype, np.integer):
    X_test = X_test.astype('float32') / 255

print('X_train shape:', X_train.shape)
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')

# convert class vectors to binary class matrices
Y_train = np_utils.to_categorical(y_train, nb_classes)
Y_test = np_utils.to_categorical(y_test, nb_classes)

In [None]:
# Network: one convolution + ReLU, max-pooling and dropout, followed by two
# dense ReLU layers and a softmax classifier.  The activations are separate,
# explicitly named layers so they can be looked up by name later.
model = Sequential([
    Conv2D(nb_filters, kernel_size, padding='valid', input_shape=input_shape),
    Activation('relu', name='activation_1'),
    # Experiment note: a second Conv2D(nb_filters, kernel_size) at this point
    # makes training start out better, but after 20 epochs it gets to the
    # same place (much faster), so it is left out.
    MaxPooling2D(pool_size=pool_size),
    Dropout(0.25),

    Flatten(),
    Dense(128),
    Activation('relu', name='activation_2'),
    Dense(64),
    Activation('relu', name='activation_3'),
    Dropout(0.5),
    Dense(nb_classes),
    Activation('softmax', name='softmax'),
])

model.compile(
    optimizer='adadelta',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
%%time
history = model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs,
          verbose=1, validation_data=(X_test, Y_test))
score = model.evaluate(X_test, Y_test, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

In [None]:
# Training accuracy (in percent) per epoch for the first run.
with plt.style.context('seaborn-darkgrid'):
    h = history
    # h.epoch holds zero-based epoch indices; display them starting at 1
    epoch_numbers = [epoch + 1 for epoch in h.epoch]
    accuracy_pct = [100 * acc for acc in h.history['acc']]
    plt.plot(epoch_numbers, accuracy_pct)
    plt.title('Accuracy')
    plt.xlabel('Epoch')
    # Ticks follow the epochs actually recorded rather than a hard-coded
    # 1..20, so the axis stays correct when `epochs` is changed.
    plt.xticks(epoch_numbers)
    plt.ylabel('%');

In [None]:
# Let's try different (LeakyReLU) activations in place of the three ReLUs.
#
# Assigning a new callable to `.activation` on the layers of a model that has
# already been built and compiled does not change the computation that fit()
# runs, so a subsequent fit() would silently keep training the ReLU network.
# The network is therefore rebuilt with real LeakyReLU layers and recompiled.
# The layer names are kept so lookups by name keep working.
#
# Note: this starts from freshly initialised weights, so the next training
# run is a like-for-like comparison with the ReLU run above.
model = Sequential([
    Conv2D(nb_filters, kernel_size, padding='valid', input_shape=input_shape),
    LeakyReLU(alpha=0.05, name='activation_1'),
    MaxPooling2D(pool_size=pool_size),
    Dropout(0.25),

    Flatten(),
    Dense(128),
    LeakyReLU(alpha=0.2, name='activation_2'),
    Dense(64),
    LeakyReLU(alpha=0.1, name='activation_3'),
    Dropout(0.5),
    Dense(nb_classes),
    Activation('softmax', name='softmax'),
])

# Same loss, optimizer and metric as the ReLU model
model.compile(loss='categorical_crossentropy',
              optimizer='adadelta',
              metrics=['accuracy'])

In [None]:
%%time
history2 = model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs,
          verbose=1, validation_data=(X_test, Y_test))
score2 = model.evaluate(X_test, Y_test, verbose=0)
print('Test score:', score2[0])
print('Test accuracy:', score2[1])

Using more "sensitive" activation functions obtains notably better accuracy on the training set, but only nominally better fit on the test set.  This demonstrates a degree of overfitting.

In contrast, using the simpler ReLU activations, the training accuracy almost exactly matches the test accuracy.

In [None]:
# Training accuracy (in percent) per epoch for the second run.
with plt.style.context('seaborn-darkgrid'):
    h = history2
    # h.epoch holds zero-based epoch indices; display them starting at 1
    epoch_numbers = [epoch + 1 for epoch in h.epoch]
    accuracy_pct = [100 * acc for acc in h.history['acc']]
    plt.plot(epoch_numbers, accuracy_pct)
    plt.title('Accuracy')
    plt.xlabel('Epoch')
    # Ticks follow the epochs actually recorded rather than a hard-coded
    # 1..20, so the axis stays correct when `epochs` is changed.
    plt.xticks(epoch_numbers)
    plt.ylabel('%');

In [None]:
# Print Keras' summary of the current model.
model.summary()

In [None]:
# Predicted digit for every test image (index of the largest softmax output),
# plus a boolean mask marking the images the model got wrong.
class_probabilities = model.predict(X_test)
y_predict = np.argmax(class_probabilities, axis=-1)
fails = y_test != y_predict

In [None]:
# Keep only the misclassified test samples: their images, true labels and
# the (wrong) predictions, all selected with the same boolean mask.
X_test_fails, y_test_fails, y_predict_fails = (
    values[fails] for values in (X_test, y_test, y_predict)
)

In [None]:
# Show up to 12 misclassified test digits in a 3 x 4 grid, titled with the
# predicted and the actual label.  The count is capped at the number of
# failures so the loop cannot raise IndexError when there are fewer than 12.
n_shown = min(12, len(X_test_fails))
for i in range(n_shown):
    plt.subplot(3, 4, i+1)
    # Drop the singleton channel axis so imshow receives a 2-D image
    plt.imshow(np.squeeze(X_test_fails[i]), cmap='gray_r')
    plt.title('Predict: %d, Actual: %d' % (y_predict_fails[i], y_test_fails[i]))
    plt.xticks([])
    plt.yticks([])
plt.tight_layout()

<img src='img/copyright.png'>