# MNIST Dataset Keras MLP Baseline

In [1]:
# Baseline MLP for MNIST dataset
import numpy
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.utils import np_utils

Using TensorFlow backend.


In [2]:
# Fix the random seed for reproducibility: this seeds NumPy's global RNG.
# NOTE(review): only NumPy is seeded here. Whether the Keras/TensorFlow backend
# derives its own randomness from NumPy depends on the installed version --
# confirm before relying on bit-identical reruns.
seed = 7
numpy.random.seed(seed)

## Keras has a helper function which will download the MNIST data

Let's grab the data and load it into variables.

In [None]:
# Load MNIST through the Keras helper, which downloads the dataset.
# The call returns two (images, labels) pairs: the first is unpacked as the
# training split, the second as the test split.
(X_train, y_train), (X_test, y_test) = mnist.load_data()

## First important point!

We take the 28x28 image and flatten it to a 1x784 "image". This really doesn't make sense, does it? It's an image. So pixels that are close together should be kept close together. That is, there is some natural spatial covariance in pixels-- if pixel A is close in space to pixel B, then pixel A is probably correlated with pixel B. That spatial covariance is lost when we flatten.

In [3]:
# flatten 28*28 images to a 784 vector for each image and
# normalize inputs from 0-255 to 0-1
def _flatten_and_scale(images):
    """Return `images` as float32 rows of flattened pixels scaled to 0-1.

    Arrays with fewer than three dimensions are treated as already processed
    and returned unchanged. This keeps the cell safe to re-run: without the
    guard, a second execution would fail when reading the (now missing) third
    axis, or would divide the pixel values by 255 a second time.
    """
    if images.ndim < 3:
        return images
    # -1 lets NumPy work out the per-image pixel count from the remaining axes.
    flat = images.reshape(images.shape[0], -1).astype('float32')
    return flat / 255


X_train = _flatten_and_scale(X_train)
X_test = _flatten_and_scale(X_test)

# Width of one flattened image; equal to height * width of the raw images.
num_pixels = X_train.shape[1]

## One-hot encoding

We just want to "one-hot" encode the output labels. So instead of 0 through 9, we have labels [1,0,0,0,0,0,0,0,0,0] through [0,0,0,0,0,0,0,0,0,1]. The position of the 1 is the digit, counting from zero, so label 5 would be [0,0,0,0,0,1,0,0,0,0].

In [4]:
# one hot encode outputs
if y_train.ndim == 1:
    # Use one explicit class count for both splits. Letting to_categorical
    # infer it separately from each array would give train and test different
    # widths whenever one split happens to lack the highest label.
    # int() is applied before adding 1 so small integer label dtypes cannot
    # overflow.
    num_classes = int(max(y_train.max(), y_test.max())) + 1
    y_train = np_utils.to_categorical(y_train, num_classes=num_classes)
    y_test = np_utils.to_categorical(y_test, num_classes=num_classes)
else:
    # Labels are already 2-D, i.e. this cell has been run before: keep them
    # as they are instead of encoding the one-hot matrix a second time.
    num_classes = y_test.shape[1]

## Second important point

We just have an input layer that feeds a single hidden layer (ReLU activation). The output layer is then a 10-class softmax.

This is as simple as a neural network can get, but it does a pretty good job at classifying the MNIST images.

In [5]:
# define baseline model
def baseline_model():
    """Build and compile the baseline multi-layer perceptron.

    The network has one hidden ReLU layer with `num_pixels` units, fed by
    `num_pixels` inputs, followed by a softmax layer with `num_classes`
    outputs. Both sizes are read from the notebook's global variables.

    Returns:
        The compiled Sequential model (categorical cross-entropy loss,
        Adam optimizer, accuracy metric).
    """
    hidden_layer = Dense(
        num_pixels,
        input_dim=num_pixels,
        kernel_initializer='normal',
        activation='relu',
    )
    output_layer = Dense(
        num_classes,
        kernel_initializer='normal',
        activation='softmax',
    )

    # Passing the layers to the constructor adds them in the given order.
    model = Sequential([hidden_layer, output_layer])
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model

In [6]:
# Build a fresh baseline network.
model = baseline_model()

# Train for 10 epochs in mini-batches of 200, printing one line per epoch.
# Note that the test split doubles as the validation data here.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=200,
    verbose=2,
)

# Score the trained model on the test split; scores[1] is the accuracy metric
# requested at compile time, so the error rate is its complement in percent.
scores = model.evaluate(X_test, y_test, verbose=0)
error_percent = 100 - scores[1] * 100
print("Baseline Error: %.2f%%" % error_percent)

Train on 60000 samples, validate on 10000 samples
Epoch 1/10
5s - loss: 0.2793 - acc: 0.9209 - val_loss: 0.1413 - val_acc: 0.9568
Epoch 2/10
5s - loss: 0.1116 - acc: 0.9676 - val_loss: 0.0919 - val_acc: 0.9714
Epoch 3/10
5s - loss: 0.0716 - acc: 0.9797 - val_loss: 0.0780 - val_acc: 0.9774
Epoch 4/10
5s - loss: 0.0503 - acc: 0.9857 - val_loss: 0.0743 - val_acc: 0.9763
Epoch 5/10
5s - loss: 0.0371 - acc: 0.9894 - val_loss: 0.0686 - val_acc: 0.9791
Epoch 6/10
5s - loss: 0.0267 - acc: 0.9928 - val_loss: 0.0634 - val_acc: 0.9796
Epoch 7/10
5s - loss: 0.0207 - acc: 0.9946 - val_loss: 0.0629 - val_acc: 0.9808
Epoch 8/10
5s - loss: 0.0139 - acc: 0.9968 - val_loss: 0.0639 - val_acc: 0.9804
Epoch 9/10
5s - loss: 0.0111 - acc: 0.9977 - val_loss: 0.0593 - val_acc: 0.9808
Epoch 10/10
5s - loss: 0.0077 - acc: 0.9987 - val_loss: 0.0589 - val_acc: 0.9807
Baseline Error: 1.93%
