In [1]:
import numpy as np
import theano
import theano.tensor as T
import lasagne as L
import matplotlib.pyplot as plt
%matplotlib inline

Load the data:

In [2]:
import sys
import os
import time

def load_dataset():
    # We first define a download function, supporting both Python 2 and 3.
    if sys.version_info[0] == 2:
        from urllib import urlretrieve
    else:
        from urllib.request import urlretrieve

    def download(filename, source='http://yann.lecun.com/exdb/mnist/'):
        print("Downloading %s" % filename)
        urlretrieve(source + filename, filename)

    # We then define functions for loading MNIST images and labels.
    # For convenience, they also download the requested files if needed.
    import gzip

    def load_mnist_images(filename):
        if not os.path.exists(filename):
            download(filename)
        # Read the inputs in Yann LeCun's binary format.
        with gzip.open(filename, 'rb') as f:
            data = np.frombuffer(f.read(), np.uint8, offset=16)
        # The inputs are vectors now, we reshape them to monochrome 2D images,
        # following the shape convention: (examples, channels, rows, columns)
        data = data.reshape(-1, 1, 28, 28)
        # The inputs come as bytes, we convert them to float32 in range [0,1].
        # (Actually to range [0, 255/256], for compatibility to the version
        # provided at http://deeplearning.net/data/mnist/mnist.pkl.gz.)
        return (data / np.float32(256)).squeeze()

    def load_mnist_labels(filename):
        if not os.path.exists(filename):
            download(filename)
        # Read the labels in Yann LeCun's binary format.
        with gzip.open(filename, 'rb') as f:
            data = np.frombuffer(f.read(), np.uint8, offset=8)
        # The labels are vectors of integers now, that's exactly what we want.
        return data

    # We can now download and read the training and test set images and labels.
    X_train = load_mnist_images('train-images-idx3-ubyte.gz')
    y_train = load_mnist_labels('train-labels-idx1-ubyte.gz')
    X_test = load_mnist_images('t10k-images-idx3-ubyte.gz')
    y_test = load_mnist_labels('t10k-labels-idx1-ubyte.gz')

    # We reserve the last 10000 training examples for validation.
    #X_train, X_val = X_train[:-10000], X_train[-10000:]
    #y_train, y_val = y_train[:-10000], y_train[-10000:]

    # We just return all the arrays in order, as expected in main().
    # (It doesn't matter how we do this as long as we can read them again.)
    return X_train, y_train, X_test, y_test

In [3]:
X_train, y_train, X_test, y_test = load_dataset()

Downloading train-images-idx3-ubyte.gz
Downloading train-labels-idx1-ubyte.gz
Downloading t10k-images-idx3-ubyte.gz
Downloading t10k-labels-idx1-ubyte.gz


For convinience, reshape dataset as images consisting of one channel:

In [4]:
len_train, h, w = X_train.shape
len_test = len(X_test)

In [5]:
X_train = X_train.reshape((len_train, 1, h, w))
X_test = X_test.reshape((len_test, 1, h, w))

Build and train the baseline network:

In [6]:
input_X = T.tensor4("X")

input_shape = [None, 1, h, w]
target_y = T.vector("target Y integer",dtype='int32')

In [7]:
input_layer = L.layers.InputLayer(shape=input_shape, input_var=input_X)
conv1_1     = L.layers.Conv2DLayer(input_layer, num_filters=8, filter_size=(3, 3), nonlinearity=L.nonlinearities.LeakyRectify(leakiness=0.01))
conv1_2     = L.layers.Conv2DLayer(conv1_1, num_filters=8, filter_size=(3, 3), nonlinearity=L.nonlinearities.LeakyRectify(leakiness=0.01))
pool1       = L.layers.MaxPool2DLayer(conv1_2, pool_size=(2, 2))
conv2_1     = L.layers.Conv2DLayer(pool1, num_filters=12, filter_size=(3, 3), nonlinearity=L.nonlinearities.LeakyRectify(leakiness=0.01))
conv2_2     = L.layers.Conv2DLayer(conv2_1, num_filters=12, filter_size=(3, 3), nonlinearity=L.nonlinearities.LeakyRectify(leakiness=0.01))
pool2       = L.layers.MaxPool2DLayer(conv2_2, pool_size=(2, 2))
conv3       = L.layers.Conv2DLayer(pool2, num_filters=16, filter_size=(3, 3), nonlinearity=L.nonlinearities.LeakyRectify(leakiness=0.01))
pool3       = L.layers.MaxPool2DLayer(conv3, pool_size=(2, 2))
dense       = L.layers.DenseLayer(pool3, num_units=10, nonlinearity=L.nonlinearities.softmax)

predicted_y = L.layers.get_output(dense)

loss = L.objectives.categorical_crossentropy(predicted_y, target_y).mean()
accuracy = L.objectives.categorical_accuracy(predicted_y, target_y).mean()
updates = L.updates.adamax(loss, L.layers.get_all_params(dense, trainable=True))

train_fun = theano.function([input_X, target_y], [loss, accuracy], updates=updates)
accuracy_fun = theano.function([input_X, target_y], accuracy)

Helper function for iterating batches:

In [8]:
def iterate_minibatches(X, y, batch_size):
    indices = np.arange(len(X))
    np.random.shuffle(indices)

    for start_index in range(0, len(X) - batch_size + 1, batch_size):
        excerpt = indices[start_index:(start_index + batch_size)]

        yield X[excerpt], y[excerpt]

Helper function for resetting all network weights:

In [9]:
def reset_weights():
    params = L.layers.get_all_params(dense, trainable=True)
    
    for v in params:
        val = v.get_value()
        if(len(val.shape) < 2):
            v.set_value(L.init.Constant(0.0)(val.shape))
        else:
            v.set_value(L.init.GlorotUniform()(val.shape))

Main training loop:

In [12]:
def run_train(num_epochs=10, batch_size=500):
    reset_weights()
    
    for epoch in range(num_epochs):
        train_err = 0
        train_acc = 0
        train_batches = 0
        start_time = time.time()
        for batch in iterate_minibatches(X_train, y_train, batch_size):
            inputs, targets = batch
            train_err_batch, train_acc_batch = train_fun(inputs, targets)
            train_err += train_err_batch
            train_acc += train_acc_batch
            train_batches += 1

        val_acc = 0
        val_batches = 0
        for batch in iterate_minibatches(X_test, y_test, batch_size):
            inputs, targets = batch
            val_acc += accuracy_fun(inputs, targets)
            val_batches += 1

        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - start_time))

        print("  training loss (in-iteration):\t\t{:.6f}".format(train_err / train_batches))
        print("  train accuracy:\t\t{:.2f} %".format(
            train_acc / train_batches * 100))
        print("  validation accuracy:\t\t{:.2f} %".format(
            val_acc / val_batches * 100))

In [13]:
run_train()

Epoch 1 of 10 took 58.183s
  training loss (in-iteration):		2.196295
  train accuracy:		27.74 %
  validation accuracy:		45.31 %
Epoch 2 of 10 took 57.583s
  training loss (in-iteration):		0.989165
  train accuracy:		69.23 %
  validation accuracy:		84.00 %
Epoch 3 of 10 took 57.324s
  training loss (in-iteration):		0.444850
  train accuracy:		86.65 %
  validation accuracy:		90.19 %
Epoch 4 of 10 took 57.094s
  training loss (in-iteration):		0.312953
  train accuracy:		90.69 %
  validation accuracy:		91.89 %
Epoch 5 of 10 took 57.700s
  training loss (in-iteration):		0.254560
  train accuracy:		92.42 %
  validation accuracy:		93.07 %
Epoch 6 of 10 took 56.133s
  training loss (in-iteration):		0.218833
  train accuracy:		93.45 %
  validation accuracy:		93.92 %
Epoch 7 of 10 took 57.269s
  training loss (in-iteration):		0.190671
  train accuracy:		94.22 %
  validation accuracy:		94.39 %
Epoch 8 of 10 took 57.476s
  training loss (in-iteration):		0.171113
  train accuracy:		94.84 %
  valida

Baseline model accuracy after 10 epochs is about 95.6%. It can be trained further up to 98%, after 10-20 more epochs. But for brief comparison of different architectures, this should be enough.

### (0.125) change non-linearity of convolutional layers to sigmoid;

In [14]:
input_layer = L.layers.InputLayer(shape=input_shape, input_var=input_X)
conv1_1     = L.layers.Conv2DLayer(input_layer, num_filters=8, filter_size=(3, 3), nonlinearity=L.nonlinearities.sigmoid)
conv1_2     = L.layers.Conv2DLayer(conv1_1, num_filters=8, filter_size=(3, 3), nonlinearity=L.nonlinearities.sigmoid)
pool1       = L.layers.MaxPool2DLayer(conv1_2, pool_size=(2, 2))
conv2_1     = L.layers.Conv2DLayer(pool1, num_filters=12, filter_size=(3, 3), nonlinearity=L.nonlinearities.sigmoid)
conv2_2     = L.layers.Conv2DLayer(conv2_1, num_filters=12, filter_size=(3, 3), nonlinearity=L.nonlinearities.sigmoid)
pool2       = L.layers.MaxPool2DLayer(conv2_2, pool_size=(2, 2))
conv3       = L.layers.Conv2DLayer(pool2, num_filters=16, filter_size=(3, 3), nonlinearity=L.nonlinearities.sigmoid)
pool3       = L.layers.MaxPool2DLayer(conv3, pool_size=(2, 2))
dense       = L.layers.DenseLayer(pool3, num_units=10, nonlinearity=L.nonlinearities.sigmoid)

predicted_y = L.layers.get_output(dense)

loss = L.objectives.categorical_crossentropy(predicted_y, target_y).mean()
accuracy = L.objectives.categorical_accuracy(predicted_y, target_y).mean()
updates = L.updates.adamax(loss, L.layers.get_all_params(dense, trainable=True))

train_fun = theano.function([input_X, target_y], [loss, accuracy], updates=updates)
accuracy_fun = theano.function([input_X, target_y], accuracy)

In [15]:
run_train()

Epoch 1 of 10 took 76.429s
  training loss (in-iteration):		0.218000
  train accuracy:		10.44 %
  validation accuracy:		10.28 %
Epoch 2 of 10 took 75.975s
  training loss (in-iteration):		0.068098
  train accuracy:		10.44 %
  validation accuracy:		10.28 %
Epoch 3 of 10 took 77.436s
  training loss (in-iteration):		0.038532
  train accuracy:		10.44 %
  validation accuracy:		10.28 %
Epoch 4 of 10 took 76.923s
  training loss (in-iteration):		0.025923
  train accuracy:		10.44 %
  validation accuracy:		10.28 %
Epoch 5 of 10 took 76.091s
  training loss (in-iteration):		0.018941
  train accuracy:		10.44 %
  validation accuracy:		10.28 %
Epoch 6 of 10 took 77.550s
  training loss (in-iteration):		0.014529
  train accuracy:		10.44 %
  validation accuracy:		10.28 %
Epoch 7 of 10 took 77.134s
  training loss (in-iteration):		0.011505
  train accuracy:		10.44 %
  validation accuracy:		10.28 %
Epoch 8 of 10 took 76.313s
  training loss (in-iteration):		0.009318
  train accuracy:		10.44 %
  valida

Here we can see that LeakyReLU is better than sigmoid! With latter, are stuck in some local optima and can't go further than 10% accuracy. Probably that can be fixed by changing learning rate or other training parameters.

### (0.125) change non-linearity of convolutional layers to ELU;

In [16]:
input_layer = L.layers.InputLayer(shape=input_shape, input_var=input_X)
conv1_1     = L.layers.Conv2DLayer(input_layer, num_filters=8, filter_size=(3, 3), nonlinearity=L.nonlinearities.elu)
conv1_2     = L.layers.Conv2DLayer(conv1_1, num_filters=8, filter_size=(3, 3), nonlinearity=L.nonlinearities.elu)
pool1       = L.layers.MaxPool2DLayer(conv1_2, pool_size=(2, 2))
conv2_1     = L.layers.Conv2DLayer(pool1, num_filters=12, filter_size=(3, 3), nonlinearity=L.nonlinearities.elu)
conv2_2     = L.layers.Conv2DLayer(conv2_1, num_filters=12, filter_size=(3, 3), nonlinearity=L.nonlinearities.elu)
pool2       = L.layers.MaxPool2DLayer(conv2_2, pool_size=(2, 2))
conv3       = L.layers.Conv2DLayer(pool2, num_filters=16, filter_size=(3, 3), nonlinearity=L.nonlinearities.elu)
pool3       = L.layers.MaxPool2DLayer(conv3, pool_size=(2, 2))
dense       = L.layers.DenseLayer(pool3, num_units=10, nonlinearity=L.nonlinearities.elu)

predicted_y = L.layers.get_output(dense)

loss = L.objectives.categorical_crossentropy(predicted_y, target_y).mean()
accuracy = L.objectives.categorical_accuracy(predicted_y, target_y).mean()
updates = L.updates.adamax(loss, L.layers.get_all_params(dense, trainable=True))

train_fun = theano.function([input_X, target_y], [loss, accuracy], updates=updates)
accuracy_fun = theano.function([input_X, target_y], accuracy)

In [17]:
run_train()

Epoch 1 of 10 took 87.807s
  training loss (in-iteration):		nan
  train accuracy:		9.74 %
  validation accuracy:		9.74 %
Epoch 2 of 10 took 87.801s
  training loss (in-iteration):		nan
  train accuracy:		9.75 %
  validation accuracy:		9.74 %
Epoch 3 of 10 took 87.989s
  training loss (in-iteration):		nan
  train accuracy:		9.75 %
  validation accuracy:		9.74 %
Epoch 4 of 10 took 89.569s
  training loss (in-iteration):		nan
  train accuracy:		9.75 %
  validation accuracy:		9.74 %
Epoch 5 of 10 took 92.911s
  training loss (in-iteration):		nan
  train accuracy:		9.75 %
  validation accuracy:		9.74 %
Epoch 6 of 10 took 100.801s
  training loss (in-iteration):		nan
  train accuracy:		9.75 %
  validation accuracy:		9.74 %
Epoch 7 of 10 took 90.738s
  training loss (in-iteration):		nan
  train accuracy:		9.75 %
  validation accuracy:		9.74 %
Epoch 8 of 10 took 92.707s
  training loss (in-iteration):		nan
  train accuracy:		9.75 %
  validation accuracy:		9.74 %
Epoch 9 of 10 took 97.285s
  tr

Oh no, that's even worse, we get the gradient explosion with ELU and can't train.

### (0.25) residual connection (connection may bypass 1 conv layer, or you may stack 2 pairs of convs like in the original paper).

Base residual block definition:

In [18]:
def resBlock(incoming, num_filters):
    conv1 = L.layers.Conv2DLayer(incoming, num_filters=num_filters, filter_size=(3, 3), nonlinearity=L.nonlinearities.LeakyRectify(leakiness=0.01), pad='same')
    conv2 = L.layers.Conv2DLayer(conv1, num_filters=num_filters, filter_size=(3, 3), nonlinearity=L.nonlinearities.LeakyRectify(leakiness=0.01), pad='same')
    shortcut = L.layers.Conv2DLayer(incoming, num_filters=num_filters, filter_size=1, nonlinearity=None, b=None)
    add = L.layers.ElemwiseSumLayer([conv2, shortcut])
    
    return add   

Now we replace convolutional layer pairs with resBlocks:

In [19]:
input_layer = L.layers.InputLayer(shape=input_shape, input_var=input_X)
res1        = resBlock(input_layer, 8)
pool1       = L.layers.MaxPool2DLayer(res1, pool_size=(2, 2))
res2        = resBlock(input_layer, 12)
pool2       = L.layers.MaxPool2DLayer(res2, pool_size=(2, 2))
conv3       = L.layers.Conv2DLayer(pool2, num_filters=16, filter_size=(3, 3), nonlinearity=L.nonlinearities.LeakyRectify(leakiness=0.01))
pool3       = L.layers.GlobalPoolLayer(conv3, pool_function=theano.tensor.max)
dense       = L.layers.DenseLayer(pool3, num_units=10, nonlinearity=L.nonlinearities.softmax)

predicted_y = L.layers.get_output(dense)

loss = L.objectives.categorical_crossentropy(predicted_y, target_y).mean()
accuracy = L.objectives.categorical_accuracy(predicted_y, target_y).mean()
updates = L.updates.adamax(loss, L.layers.get_all_params(dense, trainable=True))

train_fun = theano.function([input_X, target_y], [loss, accuracy], updates=updates)
accuracy_fun = theano.function([input_X, target_y], accuracy)

In [20]:
run_train()

Epoch 1 of 10 took 95.949s
  training loss (in-iteration):		2.103647
  train accuracy:		32.63 %
  validation accuracy:		52.35 %
Epoch 2 of 10 took 94.768s
  training loss (in-iteration):		1.212154
  train accuracy:		62.31 %
  validation accuracy:		72.93 %
Epoch 3 of 10 took 95.628s
  training loss (in-iteration):		0.741283
  train accuracy:		76.67 %
  validation accuracy:		81.11 %
Epoch 4 of 10 took 95.568s
  training loss (in-iteration):		0.571315
  train accuracy:		82.07 %
  validation accuracy:		83.92 %
Epoch 5 of 10 took 94.914s
  training loss (in-iteration):		0.483713
  train accuracy:		84.94 %
  validation accuracy:		86.41 %
Epoch 6 of 10 took 94.766s
  training loss (in-iteration):		0.424209
  train accuracy:		86.78 %
  validation accuracy:		87.97 %
Epoch 7 of 10 took 93.931s
  training loss (in-iteration):		0.380580
  train accuracy:		88.19 %
  validation accuracy:		88.99 %
Epoch 8 of 10 took 95.944s
  training loss (in-iteration):		0.347996
  train accuracy:		89.21 %
  valida

So our mini-ResNet starts pretty fast, but after 10 epochs accuracy is lower than for baseline net. It's possible that this architecture behaves better for more complicated datasets, like CIFAR.

### (0.25) conv maxout network (4 units within one maxout unit).

In [21]:
input_layer = L.layers.InputLayer(shape=input_shape, input_var=input_X)
conv1_1     = L.layers.Conv2DLayer(input_layer, num_filters=8, filter_size=(3, 3), nonlinearity=L.nonlinearities.LeakyRectify(leakiness=0.01))
conv1_2     = L.layers.Conv2DLayer(conv1_1, num_filters=8, filter_size=(3, 3), nonlinearity=L.nonlinearities.LeakyRectify(leakiness=0.01))
maxout1     = L.layers.FeaturePoolLayer(conv1_2, pool_size=4)
pool1       = L.layers.MaxPool2DLayer(maxout1, pool_size=(2, 2))
conv2_1     = L.layers.Conv2DLayer(pool1, num_filters=12, filter_size=(3, 3), nonlinearity=L.nonlinearities.LeakyRectify(leakiness=0.01))
conv2_2     = L.layers.Conv2DLayer(conv2_1, num_filters=12, filter_size=(3, 3), nonlinearity=L.nonlinearities.LeakyRectify(leakiness=0.01))
maxout2     = L.layers.FeaturePoolLayer(conv2_2, pool_size=4)
pool2       = L.layers.MaxPool2DLayer(maxout2, pool_size=(2, 2))
conv3       = L.layers.Conv2DLayer(pool2, num_filters=16, filter_size=(3, 3), nonlinearity=L.nonlinearities.LeakyRectify(leakiness=0.01))
pool3       = L.layers.MaxPool2DLayer(conv3, pool_size=(2, 2))
dense       = L.layers.DenseLayer(pool3, num_units=10, nonlinearity=L.nonlinearities.softmax)

predicted_y = L.layers.get_output(dense)

loss = L.objectives.categorical_crossentropy(predicted_y, target_y).mean()
accuracy = L.objectives.categorical_accuracy(predicted_y, target_y).mean()
updates = L.updates.adamax(loss, L.layers.get_all_params(dense, trainable=True))

train_fun = theano.function([input_X, target_y], [loss, accuracy], updates=updates)
accuracy_fun = theano.function([input_X, target_y], accuracy)

In [22]:
run_train()

Epoch 1 of 10 took 49.743s
  training loss (in-iteration):		1.797599
  train accuracy:		36.10 %
  validation accuracy:		69.49 %
Epoch 2 of 10 took 49.429s
  training loss (in-iteration):		0.741970
  train accuracy:		76.88 %
  validation accuracy:		82.34 %
Epoch 3 of 10 took 48.987s
  training loss (in-iteration):		0.516449
  train accuracy:		84.12 %
  validation accuracy:		86.41 %
Epoch 4 of 10 took 47.667s
  training loss (in-iteration):		0.419172
  train accuracy:		87.11 %
  validation accuracy:		89.25 %
Epoch 5 of 10 took 49.223s
  training loss (in-iteration):		0.353978
  train accuracy:		89.12 %
  validation accuracy:		90.81 %
Epoch 6 of 10 took 49.512s
  training loss (in-iteration):		0.309210
  train accuracy:		90.65 %
  validation accuracy:		91.60 %
Epoch 7 of 10 took 49.156s
  training loss (in-iteration):		0.278759
  train accuracy:		91.57 %
  validation accuracy:		92.71 %
Epoch 8 of 10 took 48.884s
  training loss (in-iteration):		0.254703
  train accuracy:		92.38 %
  valida

This is similar to the previous case: fast start, but lower accuracy after 10 epochs. But we can also notice that training time decreased, since each maxout layer halves the amount of channels.

### (0.5) replace convolution with fire module from SqueezeNet.

Here we define a fire module:

In [23]:
def fireBlock(incoming, num_s1, num_e1, num_e3):
    conv_s  = L.layers.Conv2DLayer(incoming, num_filters=num_s1, filter_size=1, nonlinearity=L.nonlinearities.LeakyRectify(leakiness=0.01), pad='same')
    conv_e1 = L.layers.Conv2DLayer(conv_s, num_filters=num_e1, filter_size=1, nonlinearity=L.nonlinearities.LeakyRectify(leakiness=0.01), pad='same')
    conv_e3 = L.layers.Conv2DLayer(conv_s, num_filters=num_e3, filter_size=3, nonlinearity=L.nonlinearities.LeakyRectify(leakiness=0.01), pad='same')
    concat  = L.layers.ConcatLayer([conv_e1, conv_e3], axis=1)
    
    return concat   

Baseline architecture with all convolutions replaced by fire modules:

In [24]:
input_layer = L.layers.InputLayer(shape=input_shape, input_var=input_X)
fire1_1     = fireBlock(input_layer, 8, 8, 8)
fire1_2     = fireBlock(fire1_1, 8, 8, 8)
pool1       = L.layers.MaxPool2DLayer(fire1_2, pool_size=(2, 2))
fire2_1     = fireBlock(pool1, 12, 12, 12)
fire2_2     = fireBlock(fire2_1, 12, 12, 12)
pool2       = L.layers.MaxPool2DLayer(fire2_2, pool_size=(2, 2))
fire3       = fireBlock(pool2, 16, 16, 16)
pool3       = L.layers.MaxPool2DLayer(fire3, pool_size=(2, 2))
dense       = L.layers.DenseLayer(pool3, num_units=10, nonlinearity=L.nonlinearities.softmax)

predicted_y = L.layers.get_output(dense)

loss = L.objectives.categorical_crossentropy(predicted_y, target_y).mean()
accuracy = L.objectives.categorical_accuracy(predicted_y, target_y).mean()
updates = L.updates.adamax(loss, L.layers.get_all_params(dense, trainable=True))

train_fun = theano.function([input_X, target_y], [loss, accuracy], updates=updates)
accuracy_fun = theano.function([input_X, target_y], accuracy)

In [25]:
run_train()

Epoch 1 of 10 took 216.430s
  training loss (in-iteration):		1.096469
  train accuracy:		65.08 %
  validation accuracy:		92.66 %
Epoch 2 of 10 took 216.319s
  training loss (in-iteration):		0.203112
  train accuracy:		93.74 %
  validation accuracy:		95.55 %
Epoch 3 of 10 took 216.477s
  training loss (in-iteration):		0.137267
  train accuracy:		95.75 %
  validation accuracy:		96.32 %


KeyboardInterrupt: 

I stopped training after just 3 epochs, since I'm short on time, but fire blocks seem to work very well (validation accuracy >90% after first epoch, beats the baseline after 3 epochs). They also take more time to train.

### (0.25) train GAN on MNIST;

Well, I also tried this, but didn't succeed. Maybe I just needed more iterations or different hyperparameters. Some intermediary results are below:

In [15]:
code_size = 100
noise = T.matrix('noise')

gen_input_layer = L.layers.InputLayer([None, code_size], input_var=noise)

gen_dense = L.layers.DenseLayer(gen_input_layer, 128 * 7 * 7, nonlinearity=T.nnet.elu)

gen_reshape = L.layers.ReshapeLayer(gen_dense, (-1, 128, 7, 7))
gen_deconv1 = L.layers.Deconv2DLayer(gen_reshape, 64, filter_size=5, stride=2, nonlinearity=T.nnet.elu)
gen_deconv2 = L.layers.Deconv2DLayer(gen_deconv1, 32, filter_size=5, stride=2, crop=3, nonlinearity=T.nnet.elu)
gen_output = L.layers.Conv2DLayer(gen_deconv2, 1, filter_size=4, nonlinearity=T.nnet.elu)
#print ("Generator output:", gen_deconv3.output_shape)



I used maxout-modification for discriminator network:

In [16]:
input_image = T.tensor4('inputs')

disc_input_layer = L.layers.InputLayer(shape=input_shape, input_var=input_image)
disc_conv1_1     = L.layers.Conv2DLayer(disc_input_layer, num_filters=8, filter_size=(3, 3), nonlinearity=L.nonlinearities.LeakyRectify(leakiness=0.01))
disc_maxout1     = L.layers.FeaturePoolLayer(disc_conv1_1, pool_size=4)
disc_pool1       = L.layers.MaxPool2DLayer(disc_maxout1, pool_size=(2, 2))
disc_conv2_1     = L.layers.Conv2DLayer(disc_pool1, num_filters=12, filter_size=(3, 3), nonlinearity=L.nonlinearities.LeakyRectify(leakiness=0.01))
disc_maxout2     = L.layers.FeaturePoolLayer(disc_conv2_1, pool_size=4)
disc_pool2       = L.layers.MaxPool2DLayer(disc_maxout2, pool_size=(2, 2))
disc_conv3       = L.layers.Conv2DLayer(disc_pool2, num_filters=16, filter_size=(3, 3), nonlinearity=L.nonlinearities.LeakyRectify(leakiness=0.01))
disc_pool3       = L.layers.MaxPool2DLayer(disc_conv3, pool_size=(2, 2))
disc_output      = L.layers.DenseLayer(disc_pool3, num_units=1, nonlinearity=L.nonlinearities.sigmoid)

In [20]:
real_out  = L.layers.get_output(disc_output)
gen_out   = L.layers.get_output(disc_output, L.layers.get_output(gen_output))

gen_loss  = L.objectives.binary_crossentropy(gen_out, 1).mean()
disc_loss = (L.objectives.binary_crossentropy(real_out, 1) + L.objectives.binary_crossentropy(gen_out, 0)).mean()
    
gen_params  = L.layers.get_all_params(gen_output, trainable=True)
disc_params = L.layers.get_all_params(disc_output, trainable=True)

updates = L.updates.adam(generator_loss, generator_params)
updates.update(L.updates.sgd(discriminator_loss, discriminator_params, 1.0))

train_fn = theano.function([noise, input_image], 
                           #[(real_out > .5).mean(), (gen_out < .5).mean()], 
                           [gen_loss, disc_loss]
                           updates=updates)
    
gen_fn = theano.function([noise], L.layers.get_output(gen_output, deterministic=True))

In [22]:
num_epochs = 100
batch_size = 100

for epoch in range(num_epochs):
        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()
        for batch in iterate_minibatches(X_train, y_train, batch_size):
            inputs, targets = batch
            noise = L.utils.floatX(np.random.rand(len(inputs), code_size))
            train_err += np.array(train_fn(noise, inputs))
            train_batches += 1
            
            if train_batches % 10 == 0:
                samples = gen_fn(L.utils.floatX(np.random.rand(42, code_size)))
                try:
                    import matplotlib.pyplot as plt
                except ImportError:
                    pass
                else:
                    plt.imsave('mnist_samples.png',
                               (samples.reshape(6, 7, 28, 28)
                                       .transpose(0, 2, 1, 3)
                                       .reshape(6*28, 7*28)),
                               cmap='gray')
                
            print("  training loss:\t\t{}".format(train_err / train_batches))

        # Then we print the results for this epoch:
        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - start_time))
        print("  training loss:\t\t{}".format(train_err / train_batches))

        # And finally, we plot some generated data

        # After half the epochs, we start decaying the learn rate towards zero
        #if epoch >= num_epochs // 2:
            #progress = float(epoch) / num_epochs
            #eta.set_value(lasagne.utils.floatX(initial_eta*2*(1 - progress)))

  training loss:		[ 0.  1.]
  training loss:		[ 0.  1.]
  training loss:		[ 0.  1.]
  training loss:		[ 0.  1.]
  training loss:		[ 0.  1.]
  training loss:		[ 0.  1.]
  training loss:		[ 0.  1.]
  training loss:		[ 0.  1.]
  training loss:		[ 0.          0.99913194]
  training loss:		[ 0.1       0.984375]
  training loss:		[ 0.18181818  0.98082386]
  training loss:		[ 0.25        0.98046875]
  training loss:		[ 0.30769231  0.98076923]
  training loss:		[ 0.35714286  0.98158482]
  training loss:		[ 0.4        0.9828125]
  training loss:		[ 0.4375      0.98388672]
  training loss:		[ 0.47058824  0.98483456]
  training loss:		[ 0.5         0.98567708]
  training loss:		[ 0.52631579  0.98643092]
  training loss:		[ 0.55        0.98710937]
  training loss:		[ 0.57142857  0.98772321]
  training loss:		[ 0.59090909  0.98828125]
  training loss:		[ 0.60869565  0.98879076]
  training loss:		[ 0.625       0.98925781]
  training loss:		[ 0.64       0.9896875]
  training loss:		[ 0.65384615  0.99

KeyboardInterrupt: 

Current result of generator is below:

<img src="mnist_samples.png">