In [1]:
from __future__ import division, absolute_import
from __future__ import print_function, unicode_literals

import time
import numpy as np
import sklearn.datasets
import sklearn.cross_validation
import sklearn.metrics
import theano
import theano.tensor as T
import lasagne

# ############################### prepare data ###############################

mnist = sklearn.datasets.fetch_mldata('MNIST original')

X = mnist['data'].astype(np.float32) / 255.0
y = mnist['target'].astype("int32")
X_train, X_valid, y_train, y_valid = sklearn.cross_validation.train_test_split(
    X, y, random_state=42, train_size=50000, test_size=10000)
X_train = X_train.reshape(-1, 1, 28, 28)
X_valid = X_valid.reshape(-1, 1, 28, 28)

In [2]:
# plot a bunch of examples

In [3]:
# ############################## prepare model ##############################
# architecture:
# - 5x5 conv, 32 filters
# - ReLU
# - 2x2 maxpool
# - 5x5 conv, 32 filters
# - ReLU
# - 2x2 maxpool
# - fully connected layer - 256 units
# - ReLU
# - 50% dropout
# - fully connected layer- 10 units
# - softmax

# - conv layers take in 4-tensors with the following dimensions:
#   (batch size, number of channels, image dim 1, image dim 2)
# - the batch size can be provided as `None` to make the network
#   work for multiple different batch sizes
l_in = lasagne.layers.InputLayer(
    shape=(None, 1, 28, 28),
)

# - GlorotUniform is an intelligent initialization for conv layers
#   that people like to use (: named after Xavier Glorot
# - by default, a "valid" convolution
# - note that ReLUs are specified in the nonlinearity
l_conv1 = lasagne.layers.Conv2DLayer(
    l_in,
    num_filters=32,
    filter_size=(5, 5),
    nonlinearity=lasagne.nonlinearities.rectify,
    W=lasagne.init.GlorotUniform(),
)
# - by default, the stride of the max pool is the same as it's
#   receptive area
l_pool1 = lasagne.layers.MaxPool2DLayer(l_conv1, pool_size=(2, 2))

l_conv2 = lasagne.layers.Conv2DLayer(
    l_pool1,
    num_filters=32,
    filter_size=(5, 5),
    nonlinearity=lasagne.nonlinearities.rectify,
    W=lasagne.init.GlorotUniform(),
)
l_pool2 = lasagne.layers.MaxPool2DLayer(l_conv2, pool_size=(2, 2))

l_hidden1 = lasagne.layers.DenseLayer(
    l_pool2,
    num_units=256,
    nonlinearity=lasagne.nonlinearities.rectify,
    W=lasagne.init.GlorotUniform(),
)

# - applies the softmax after computing the final layer units
# - note that there is no ReLU
l_out = lasagne.layers.DenseLayer(
    l_hidden1,
    num_units=10,
    nonlinearity=lasagne.nonlinearities.softmax,
    W=lasagne.init.GlorotUniform(),
)

In [4]:
# ############################### network loss ###############################
# int32 vector
target_vector = T.ivector('y')


def loss_fn(output):
    return T.mean(lasagne.objectives.categorical_crossentropy(output,
                                                              target_vector))

output = lasagne.layers.get_output(l_out)
loss = loss_fn(output)

In [5]:
# ######################## compiling theano functions ########################

print("Compiling theano functions")

# - takes out all weight tensors from the network, in order to compute
#   how the weights should be updated
all_params = lasagne.layers.get_all_params(l_out)

# - calculate how the parameters should be updated
# - theano keeps a graph of operations, so that gradients w.r.t.
#   the loss can be calculated
updates = lasagne.updates.sgd(
    loss_or_grads=loss,
    params=all_params,
    learning_rate=0.001)

# - create a function that also updates the weights
# - this function takes in 2 arguments: the input batch of images and a
#   target vector (the y's) and returns a list with a single scalar
#   element (the loss)
train_fn = theano.function(inputs=[l_in.input_var, target_vector],
                           outputs=[loss],
                           updates=updates)

# - same interface as previous the previous function, but now the
#   output is a list where the first element is the loss, and the
#   second element is the actual predicted probabilities for the
#   input data
valid_fn = theano.function(inputs=[l_in.input_var, target_vector],
                           outputs=[loss, output])

Compiling theano functions


In [6]:
# ################################# training #################################

print("Starting training...")

num_epochs = 25
batch_size = 600
for epoch_num in range(num_epochs):
    start_time = time.time()
    # iterate over training minibatches and update the weights
    num_batches_train = int(np.ceil(len(X_train) / batch_size))
    train_losses = []
    for batch_num in range(num_batches_train):
        batch_slice = slice(batch_size * batch_num,
                            batch_size * (batch_num + 1))
        X_batch = X_train[batch_slice]
        y_batch = y_train[batch_slice]

        loss, = train_fn(X_batch, y_batch)
        train_losses.append(loss)
    # aggregate training losses for each minibatch into scalar
    train_loss = np.mean(train_losses)

    # calculate validation loss
    num_batches_valid = int(np.ceil(len(X_valid) / batch_size))
    valid_losses = []
    list_of_probabilities_batch = []
    for batch_num in range(num_batches_valid):
        batch_slice = slice(batch_size * batch_num,
                            batch_size * (batch_num + 1))
        X_batch = X_valid[batch_slice]
        y_batch = y_valid[batch_slice]

        loss, probabilities_batch = valid_fn(X_batch, y_batch)
        valid_losses.append(loss)
        list_of_probabilities_batch.append(probabilities_batch)
    valid_loss = np.mean(valid_losses)
    # concatenate probabilities for each batch into a matrix
    probabilities = np.concatenate(list_of_probabilities_batch)
    # calculate classes from the probabilities
    predicted_classes = np.argmax(probabilities, axis=1)
    # calculate accuracy for this epoch
    accuracy = sklearn.metrics.accuracy_score(y_valid, predicted_classes)

    total_time = time.time() - start_time
    print("Epoch: %d, train_loss=%f, valid_loss=%f, valid_accuracy=%f, time=%fs"
          % (epoch_num + 1, train_loss, valid_loss, accuracy, total_time))

Starting training...
Epoch: 1, train_loss=2.308017, valid_loss=2.300523, valid_accuracy=0.132400, time=68.693907s
Epoch: 2, train_loss=2.291289, valid_loss=2.284946, valid_accuracy=0.161500, time=67.171433s
Epoch: 3, train_loss=2.276676, valid_loss=2.270492, valid_accuracy=0.176400, time=64.029539s
Epoch: 4, train_loss=2.262301, valid_loss=2.255666, valid_accuracy=0.185800, time=63.871153s
Epoch: 5, train_loss=2.246940, valid_loss=2.239383, valid_accuracy=0.196100, time=67.484218s
Epoch: 6, train_loss=2.229594, valid_loss=2.220664, valid_accuracy=0.219200, time=70.405469s
Epoch: 7, train_loss=2.209297, valid_loss=2.198494, valid_accuracy=0.266500, time=64.971325s
Epoch: 8, train_loss=2.184988, valid_loss=2.171744, valid_accuracy=0.330200, time=67.851746s
Epoch: 9, train_loss=2.155298, valid_loss=2.138783, valid_accuracy=0.416400, time=61.432628s
Epoch: 10, train_loss=2.118418, valid_loss=2.097501, valid_accuracy=0.503700, time=61.723150s
Epoch: 11, train_loss=2.072133, valid_loss=2.045