In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.utils import to_categorical, plot_model
from keras.datasets import mnist

Using TensorFlow backend.


In deep learning, data is stored in tensors. The term tensor applies to a scalar (0D
tensor), vector (1D tensor), matrix (2D tensor), and a multi-dimensional tensor.
From this point, the term tensor is used unless scalar, vector, or matrix makes the
explanation clearer.

The proposed model is based on MLP layers. Therefore, the input is expected to
be a 1D tensor. As such, x_train and x_test are reshaped to [60000, 28 * 28] and
[10000, 28 * 28], respectively.

In [2]:
# load mnist dataset
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# compute the number of labels
num_labels = len(np.unique(y_train))

# convert to one-hot vector
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

# image dimensions (assumed square)
image_size = x_train.shape[1]
input_size = image_size * image_size

# resize and normalize
x_train = np.reshape(x_train, [-1, input_size])
x_train = x_train.astype('float32') / 255
x_test = np.reshape(x_test, [-1, input_size])
x_test = x_test.astype('float32') / 255

In Keras, an MLP layer is referred to as Dense, which stands for
the densely connected layer. Both the first and second MLP layers are identical in
nature with 256 units each, followed by relu activation and dropout. 256 units are
chosen since 128, 512 and 1,024 units have lower performance metrics. At 128 units,
the network converges quickly, but has a lower test accuracy. The added number
units for 512 or 1,024 does not increase the test accuracy significantly.


Important:
Since a Dense layer is a linear operation, a sequence of Dense layers can only
approximate a linear function. The problem is that the MNIST digit classification is
inherently a non-linear process. Inserting a relu activation between Dense layers will
enable MLPs to model non-linear mappings. relu or Rectified Linear Unit (ReLU)
is a simple non-linear function.

Dropout is not used in the output layer and it is only active during
training. Moreover, dropout is not present during prediction.

In [3]:
# network parameters
batch_size = 128
hidden_units = 256
dropout = 0.45

# model is a 3-layer MLP with ReLU and dropout after each layer
model = Sequential()
model.add(Dense(hidden_units, input_dim=input_size))
model.add(Activation('relu'))
model.add(Dropout(dropout))
model.add(Dense(hidden_units))
model.add(Activation('relu'))
model.add(Dropout(dropout))
model.add(Dense(num_labels))
# this is the output for one-hot vector
model.add(Activation('softmax'))
model.summary()
plot_model(model, to_file='mlp-mnist.png', show_shapes=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 256)               200960    
_________________________________________________________________
activation_1 (Activation)    (None, 256)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               65792     
_________________________________________________________________
activation_2 (Activation)    (None, 256)               0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 10)                2570      
__________

ImportError: Failed to import pydot. You must install pydot and graphviz for `pydotprint` to work.

In [4]:
# loss function for one-hot vector
# use of adam optimizer
# accuracy is good metric for classification tasks
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# train the network
model.fit(x_train, y_train, epochs=20, batch_size=batch_size)

# validate the model on test dataset to determine generalization
loss, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print("\nTest accuracy: %.1f%%" % (100.0 * acc))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Test accuracy: 98.3%
