Deep Learning
=============

Assignment 4
------------

Previously in `2_fullyconnected.ipynb` and `3_regularization.ipynb`, we trained fully connected networks to classify [notMNIST](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html) characters.

The goal of this assignment is to make the neural network convolutional.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
from six.moves import range

In [2]:
# File name of the pickled notMNIST data, opened relative to the working directory.
pickle_file = 'notMNIST.pickle'

# NOTE(review): unpickling can execute arbitrary code; only load a file you trust.
with open(pickle_file, 'rb') as f:
    save = pickle.load(f)

# Pull the three dataset/label pairs out of the unpickled dict.
train_dataset, train_labels = save['train_dataset'], save['train_labels']
valid_dataset, valid_labels = save['valid_dataset'], save['valid_labels']
test_dataset, test_labels = save['test_dataset'], save['test_labels']
del save  # drop the dict reference so the gc can free it

print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (18724, 28, 28) (18724,)


Reformat into a TensorFlow-friendly shape:
- convolutions need the image data formatted as a cube (width by height by #channels)
- labels as float 1-hot encodings.

In [3]:
image_size = 28    # images are reshaped to image_size x image_size
num_labels = 10    # width of the one-hot label encoding
num_channels = 1   # grayscale


def reformat(dataset, labels):
    """Reshape images to 4-D float32 and one-hot encode the labels.

    Args:
        dataset: array that can be reshaped to
            (-1, image_size, image_size, num_channels).
        labels: 1-D array of integer class ids.

    Returns:
        A tuple (images, one_hot): the float32 image array of shape
        (N, image_size, image_size, num_channels) and the float32 one-hot
        label matrix of shape (N, num_labels).
    """
    images = dataset.reshape(
        (-1, image_size, image_size, num_channels)).astype(np.float32)
    # Broadcasting a column of labels against the row of class ids gives a
    # boolean matrix with a single True per row.
    class_ids = np.arange(num_labels)
    one_hot = (labels[:, None] == class_ids).astype(np.float32)
    return images, one_hot

# Replace every split with its reshaped images and one-hot labels.
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)

# Report the resulting shapes, one line per split.
for split_name, split_data, split_labels in (
        ('Training set', train_dataset, train_labels),
        ('Validation set', valid_dataset, valid_labels),
        ('Test set', test_dataset, test_labels)):
    print(split_name, split_data.shape, split_labels.shape)

Training set (200000, 28, 28, 1) (200000, 10)
Validation set (10000, 28, 28, 1) (10000, 10)
Test set (18724, 28, 28, 1) (18724, 10)


In [4]:
def accuracy(predictions, labels):
    """Return the percentage of rows whose arg-max matches the label's.

    Args:
        predictions: 2-D array of per-class scores, one row per example.
        labels: 2-D array of the same shape; the arg-max of each row is
            taken as the true class.

    Returns:
        The share of matching rows, scaled to the range 0-100.
    """
    predicted_classes = np.argmax(predictions, 1)
    true_classes = np.argmax(labels, 1)
    hits = np.sum(predicted_classes == true_classes)
    return 100.0 * hits / predictions.shape[0]

Let's build a small network with two convolutional layers, followed by one fully connected layer. Convolutional networks are more expensive computationally, so we'll limit the network's depth and its number of fully connected nodes.

In [8]:
batch_size = 16   # examples per training minibatch
patch_size = 5    # side length of the square convolution kernels
depth = 16        # output channels of each convolutional layer
num_hidden = 64   # units in the fully connected hidden layer

# Each conv layer below uses stride 2 with 'SAME' padding, which maps a
# spatial extent n to ceil(n / 2); after both layers that is
# ceil(image_size / 4). (The unparenthesised form
# `image_size // 4 * image_size // 4` is evaluated left to right and only
# happens to agree when image_size is a multiple of 4.)
conv_out_size = (image_size + 3) // 4

graph = tf.Graph()

with graph.as_default():

    # Input data. Training minibatches are fed at run time; the validation
    # and test sets are embedded in the graph as constants.
    tf_train_dataset = tf.placeholder(
        tf.float32, shape=(batch_size, image_size, image_size, num_channels))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Variables.
    layer1_weights = tf.Variable(tf.truncated_normal(
        [patch_size, patch_size, num_channels, depth], stddev=0.1))
    layer1_biases = tf.Variable(tf.zeros([depth]))
    layer2_weights = tf.Variable(tf.truncated_normal(
        [patch_size, patch_size, depth, depth], stddev=0.1))
    layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth]))
    layer3_weights = tf.Variable(tf.truncated_normal(
        [conv_out_size * conv_out_size * depth, num_hidden], stddev=0.1))
    layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
    layer4_weights = tf.Variable(tf.truncated_normal(
        [num_hidden, num_labels], stddev=0.1))
    layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))

    # Model.
    def model(data):
        """Return the logits of the two-conv-layer network applied to `data`.

        Args:
            data: 4-D tensor with a statically known shape; its spatial
                dimensions are halved by each stride-2 convolution.

        Returns:
            A 2-D tensor with num_labels logits per input example.
        """
        conv = tf.nn.conv2d(data, layer1_weights, [1, 2, 2, 1], padding='SAME')
        hidden = tf.nn.relu(conv + layer1_biases)
        conv = tf.nn.conv2d(hidden, layer2_weights, [1, 2, 2, 1], padding='SAME')
        hidden = tf.nn.relu(conv + layer2_biases)
        # Flatten each example's feature maps for the fully connected layer.
        shape = hidden.get_shape().as_list()
        reshape = tf.reshape(hidden, [shape[0], shape[1] * shape[2] * shape[3]])
        hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)
        return tf.matmul(hidden, layer4_weights) + layer4_biases

    # Training computation. Keyword arguments keep the call unambiguous;
    # later TensorFlow releases reject the positional form.
    logits = model(tf_train_dataset)
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            logits=logits, labels=tf_train_labels))

    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
    test_prediction = tf.nn.softmax(model(tf_test_dataset))

In [11]:
num_steps = 1001

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print('Initialized')
    for step in range(num_steps):
        # Walk a batch_size-wide window through the training set, wrapping
        # around so that a full minibatch is always available.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        batch_end = offset + batch_size
        batch_data = train_dataset[offset:batch_end, :, :, :]
        batch_labels = train_labels[offset:batch_end, :]
        feed_dict = {tf_train_dataset: batch_data,
                     tf_train_labels: batch_labels}
        # One optimisation step; also fetch the loss and batch predictions.
        _, batch_loss, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if step % 50 != 0:
            continue
        # Progress report every 50 steps.
        minibatch_acc = accuracy(predictions, batch_labels)
        valid_acc = accuracy(valid_prediction.eval(), valid_labels)
        print('Step %d - Loss %f - Minibatch %.1f%% - Validation %.1f%%' %
              (step, batch_loss, minibatch_acc, valid_acc))
    print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))

Initialized
Step 0 - Loss 2.534234 - Minibatch 18.8% - Validation 11.1%
Step 50 - Loss 1.474175 - Minibatch 50.0% - Validation 50.1%
Step 100 - Loss 0.775141 - Minibatch 81.2% - Validation 73.2%
Step 150 - Loss 0.863082 - Minibatch 68.8% - Validation 74.9%
Step 200 - Loss 1.171659 - Minibatch 75.0% - Validation 78.3%
Step 250 - Loss 0.956151 - Minibatch 75.0% - Validation 76.8%
Step 300 - Loss 0.832626 - Minibatch 68.8% - Validation 78.8%
Step 350 - Loss 1.001116 - Minibatch 75.0% - Validation 79.2%
Step 400 - Loss 0.715934 - Minibatch 87.5% - Validation 81.1%
Step 450 - Loss 0.728667 - Minibatch 75.0% - Validation 76.1%
Step 500 - Loss 0.778035 - Minibatch 75.0% - Validation 81.1%
Step 550 - Loss 0.738171 - Minibatch 75.0% - Validation 81.5%
Step 600 - Loss 1.016576 - Minibatch 62.5% - Validation 80.7%
Step 650 - Loss 0.951612 - Minibatch 68.8% - Validation 82.3%
Step 700 - Loss 0.267146 - Minibatch 87.5% - Validation 81.4%
Step 750 - Loss 0.629344 - Minibatch 81.2% - Validation 82.0%

---
Problem 1
---------

The convolutional model above uses convolutions with stride 2 to reduce the dimensionality. Replace the strides by a max pooling operation (`nn.max_pool()`) of stride 2 and kernel size 2.

---

In [12]:
from tensorflow.python.framework import ops
ops.reset_default_graph()

batch_size = 16   # examples per training minibatch
patch_size = 5    # side length of the square convolution kernels
depth = 16        # output channels of each convolutional layer
num_hidden = 64   # units in the fully connected hidden layer

# Each 2x2 max pool below uses stride 2 with 'SAME' padding, which maps a
# spatial extent n to ceil(n / 2); after both pools that is
# ceil(image_size / 4). (The unparenthesised form
# `image_size // 4 * image_size // 4` is evaluated left to right and only
# happens to agree when image_size is a multiple of 4.)
conv_out_size = (image_size + 3) // 4

graph = tf.Graph()

with graph.as_default():

    # Input data. Training minibatches are fed at run time; the validation
    # and test sets are embedded in the graph as constants.
    tf_train_dataset = tf.placeholder(
        tf.float32, shape=(batch_size, image_size, image_size, num_channels))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Variables.
    layer1_weights = tf.Variable(tf.truncated_normal(
        [patch_size, patch_size, num_channels, depth], stddev=0.1))
    layer1_biases = tf.Variable(tf.zeros([depth]))
    layer2_weights = tf.Variable(tf.truncated_normal(
        [patch_size, patch_size, depth, depth], stddev=0.1))
    layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth]))
    layer3_weights = tf.Variable(tf.truncated_normal(
        [conv_out_size * conv_out_size * depth, num_hidden], stddev=0.1))
    layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
    layer4_weights = tf.Variable(tf.truncated_normal(
        [num_hidden, num_labels], stddev=0.1))
    layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))

    # Model.
    def model(data):
        """Return the logits of the conv + max-pool network applied to `data`.

        Both convolutions use stride 1; the spatial reduction comes from the
        2x2 max pools. The bias is added after pooling, before the ReLU.

        Args:
            data: 4-D tensor with a statically known shape.

        Returns:
            A 2-D tensor with num_labels logits per input example.
        """
        # layer 1 convo. max_pool 2x2
        conv = tf.nn.conv2d(data, layer1_weights, [1, 1, 1, 1], padding='SAME')
        pool = tf.nn.max_pool(conv, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
        hidden = tf.nn.relu(pool + layer1_biases)
        # layer 2 convo. max_pool 2x2
        conv = tf.nn.conv2d(hidden, layer2_weights, [1, 1, 1, 1], padding='SAME')
        pool = tf.nn.max_pool(conv, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
        hidden = tf.nn.relu(pool + layer2_biases)
        # layer 3 fully connected: flatten each example's feature maps first.
        shape = hidden.get_shape().as_list()
        reshape = tf.reshape(hidden, [shape[0], shape[1] * shape[2] * shape[3]])
        hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)
        return tf.matmul(hidden, layer4_weights) + layer4_biases

    # Training computation. Keyword arguments keep the call unambiguous;
    # later TensorFlow releases reject the positional form.
    logits = model(tf_train_dataset)
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            logits=logits, labels=tf_train_labels))

    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
    test_prediction = tf.nn.softmax(model(tf_test_dataset))

In [13]:
num_steps = 1001

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print('Initialized')
    for step in range(num_steps):
        # Walk a batch_size-wide window through the training set, wrapping
        # around so that a full minibatch is always available.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        batch_data = train_dataset[offset:(offset + batch_size), :, :, :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
        # One optimisation step; also fetch the loss and batch predictions.
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict=feed_dict)
        # Progress report every 50 steps.
        if (step % 50 == 0):
            print('Step %d - Loss %f - Minibatch %.1f%% - Validation %.1f%%' %
                  (step, l, accuracy(predictions, batch_labels),accuracy(
                    valid_prediction.eval(), valid_labels)))
    print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))

Initialized
Step 0 - Loss 3.287148 - Minibatch 6.2% - Validation 10.1%
Step 50 - Loss 1.270621 - Minibatch 62.5% - Validation 42.4%
Step 100 - Loss 0.914505 - Minibatch 81.2% - Validation 74.7%
Step 150 - Loss 0.958688 - Minibatch 68.8% - Validation 76.1%
Step 200 - Loss 1.037660 - Minibatch 75.0% - Validation 77.6%
Step 250 - Loss 1.005606 - Minibatch 75.0% - Validation 77.7%
Step 300 - Loss 0.861946 - Minibatch 68.8% - Validation 79.6%
Step 350 - Loss 1.015530 - Minibatch 68.8% - Validation 76.6%
Step 400 - Loss 0.662101 - Minibatch 75.0% - Validation 81.4%
Step 450 - Loss 0.697596 - Minibatch 87.5% - Validation 79.0%
Step 500 - Loss 0.629703 - Minibatch 81.2% - Validation 82.0%
Step 550 - Loss 0.636615 - Minibatch 75.0% - Validation 81.8%
Step 600 - Loss 0.932539 - Minibatch 81.2% - Validation 81.4%
Step 650 - Loss 0.902850 - Minibatch 75.0% - Validation 82.4%
Step 700 - Loss 0.134424 - Minibatch 100.0% - Validation 83.8%
Step 750 - Loss 0.446201 - Minibatch 81.2% - Validation 83.8%

---
Problem 2
---------

Try to get the best performance you can using a convolutional net. Look, for example, at the classic [LeNet5](http://yann.lecun.com/exdb/lenet/) architecture, and consider adding dropout and/or learning rate decay.

---

In [5]:
import datetime

def eta(start, n, total):
    """Estimate the time of day at which a run of `total` steps will finish.

    The estimate extrapolates linearly from the time elapsed since `start`.

    Args:
        start: `datetime.datetime` at which the run began.
        n: number of the step currently being processed.
        total: total number of steps in the run.

    Returns:
        The estimated finishing time formatted as "HH:MM:SS".
    """
    now = datetime.datetime.now()
    # total_seconds() covers the whole elapsed span; the `.seconds` attribute
    # drops whole days and the sub-second part of the difference.
    elapsed = (now - start).total_seconds()
    # n + 1 in the denominator avoids a zero division on the first step.
    remaining = (total - n) * 1.0 * elapsed / (n + 1)
    ends = now + datetime.timedelta(seconds=remaining)
    return ends.strftime("%H:%M:%S")


In [33]:
from tensorflow.python.framework import ops
# Reset the default graph before this cell's definitions are (re)built.
ops.reset_default_graph()

# Hyperparameters for the convolutional network with two dense hidden layers.
batch_size = 16  # examples per training minibatch
patch_size = 5  # side length of the square convolution kernels
depth = 16  # output channels of the first conv layer
num_hidden_full_1 = 512  # units in the first fully connected hidden layer
num_hidden_full_2 = 64  # units in the second fully connected hidden layer

def init_weights(shape, method='xavier'):
    """Create a `tf.Variable` of the given shape with the chosen initialiser.

    Args:
        shape: list of dimensions. 'xavier' and 'altxavier' read both
            shape[0] and shape[1]; the fallback branch reads shape[0].
        method: one of 'zeros', 'ones', 'uniform', 'altxavier' or 'xavier'.
            Any other value selects the fallback ('kaiming') branch.

    Returns:
        The newly created `tf.Variable`.
    """
    if method == 'zeros':
        initial = tf.zeros(shape, dtype=tf.float32)
    elif method == 'ones':
        initial = tf.ones(shape, dtype=tf.float32)
    elif method == 'uniform':
        # NOTE: despite its name this branch samples a normal distribution.
        initial = tf.random_normal(shape, stddev=0.01, dtype=tf.float32)
    elif method == 'altxavier':
        # Symmetric uniform range; the factor 4 is for {sigmoid:4, tanh:1}.
        limit = 4*np.sqrt(6.0/(shape[0] + shape[1]))
        initial = tf.random_uniform(shape, minval=-limit, maxval=limit, dtype=tf.float32)
    elif method == 'xavier':
        spread = np.sqrt(3.0/(shape[0] + shape[1]))
        initial = tf.truncated_normal(shape, stddev=spread)
    else:  # method == 'kaiming'
        spread = np.sqrt(2.0/(shape[0]))
        initial = tf.truncated_normal(shape, stddev=spread)
    return tf.Variable(initial)


# Each 2x2 max pool below uses stride 2 with 'SAME' padding, which maps a
# spatial extent n to ceil(n / 2); after both pools that is
# ceil(image_size / 4). (The unparenthesised form
# `image_size // 4 * image_size // 4` is evaluated left to right and only
# happens to agree when image_size is a multiple of 4.)
conv_out_size = (image_size + 3) // 4

graph = tf.Graph()

with graph.as_default():

    # Input data. Training minibatches are fed at run time; the validation
    # and test sets are embedded in the graph as constants.
    tf_train_dataset = tf.placeholder(
        tf.float32, shape=(batch_size, image_size, image_size, num_channels))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Variables. The second conv layer doubles the channel count.
    layer1_weights = tf.Variable(tf.truncated_normal(
        [patch_size, patch_size, num_channels, depth], stddev=0.1))
    layer1_biases = tf.Variable(tf.zeros([depth]))
    layer2_weights = tf.Variable(tf.truncated_normal(
        [patch_size, patch_size, depth, depth * 2], stddev=0.1))
    layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth * 2]))

    layer3_weights = init_weights(
        [conv_out_size * conv_out_size * depth * 2, num_hidden_full_1])
    layer3_biases = init_weights([num_hidden_full_1], method='ones')
    keep3 = tf.placeholder("float")  # dropout keep probability after layer 3
    layer4_weights = init_weights([num_hidden_full_1, num_hidden_full_2])
    layer4_biases = init_weights([num_hidden_full_2], method='ones')
    keep4 = tf.placeholder("float")  # dropout keep probability after layer 4
    layer5_weights = init_weights([num_hidden_full_2, num_labels])
    layer5_biases = init_weights([num_labels], method='ones')

    # Model. using elu not relu.
    def model(data):
        """Return the logits of the conv/pool/dense network applied to `data`.

        Dropout is part of the returned graph, so `keep3` and `keep4` must be
        fed whenever a tensor derived from this function is evaluated.

        Args:
            data: 4-D tensor with a statically known shape.

        Returns:
            A 2-D tensor with num_labels logits per input example.
        """
        # layer 1 convo. max_pool 2x2.
        conv = tf.nn.conv2d(data, layer1_weights, [1, 1, 1, 1], padding='SAME')
        conv = tf.nn.elu(conv + layer1_biases)
        hidden = tf.nn.max_pool(conv, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
        # layer 2 convo. max_pool 2x2
        conv = tf.nn.conv2d(hidden, layer2_weights, [1, 1, 1, 1], padding='SAME')
        conv = tf.nn.elu(conv + layer2_biases)
        hidden = tf.nn.max_pool(conv, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
        # layer 3 fully connected: flatten each example's feature maps first.
        shape = hidden.get_shape().as_list()
        reshape = tf.reshape(hidden, [shape[0], shape[1] * shape[2] * shape[3]])
        hidden = tf.nn.elu(tf.matmul(reshape, layer3_weights) + layer3_biases)
        hidden = tf.nn.dropout(hidden, keep3)
        # layer 4 fully connected
        hidden = tf.nn.elu(tf.matmul(hidden, layer4_weights) + layer4_biases)
        hidden = tf.nn.dropout(hidden, keep4)
        # layer 5 output
        output = tf.matmul(hidden, layer5_weights) + layer5_biases
        return output

    # Training computation. Keyword arguments keep the call unambiguous;
    # later TensorFlow releases reject the positional form.
    logits = model(tf_train_dataset)
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            logits=logits, labels=tf_train_labels))

    # Optimizer.
    # optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss)
    optimizer = tf.train.AdamOptimizer(1e-4).minimize(loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
    test_prediction = tf.nn.softmax(model(tf_test_dataset))

In [34]:
num_steps = 40001

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    start = datetime.datetime.now()
    print('Initialized')
    for step in range(num_steps):
        # Walk a batch_size-wide window through the training set, wrapping
        # around so that a full minibatch is always available.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        batch_end = offset + batch_size
        batch_data = train_dataset[offset:batch_end, :, :, :]
        batch_labels = train_labels[offset:batch_end, :]
        # Train with a keep probability of 0.9 on both dropout layers.
        feed_dict = {tf_train_dataset: batch_data,
                     tf_train_labels: batch_labels,
                     keep3: 0.9,
                     keep4: 0.9}
        _, batch_loss, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if step % 500 != 0:
            continue
        # Progress report every 500 steps; dropout is disabled (keep = 1.0)
        # for the validation pass.
        ends = eta(start, step, num_steps)
        valpred = valid_prediction.eval(feed_dict={keep3: 1.0, keep4: 1.0})
        minibatch_acc = accuracy(predictions, batch_labels)
        valid_acc = accuracy(valpred, valid_labels)
        print('Step %d - Loss %f - Minibatch %.1f%% - Validation %.1f%% - ETA %s' %
              (step, batch_loss, minibatch_acc, valid_acc, ends))
    # Final evaluation on the test set, again without dropout.
    test_pred = test_prediction.eval(feed_dict={keep3: 1.0, keep4: 1.0})
    print('Test accuracy: %.1f%%' %
          accuracy(test_pred, test_labels))

Initialized
Step 0 - Loss 4.741639 - Minibatch 12.5% - Validation 6.2% - ETA 13:32:02
Step 500 - Loss 1.037983 - Minibatch 68.8% - Validation 77.2% - ETA 14:38:35
Step 1000 - Loss 1.005712 - Minibatch 68.8% - Validation 78.1% - ETA 14:35:58
Step 1500 - Loss 0.407815 - Minibatch 87.5% - Validation 82.2% - ETA 14:32:53
Step 2000 - Loss 1.493692 - Minibatch 56.2% - Validation 82.1% - ETA 14:31:01
Step 2500 - Loss 0.976753 - Minibatch 75.0% - Validation 81.5% - ETA 14:29:53
Step 3000 - Loss 0.303868 - Minibatch 87.5% - Validation 83.8% - ETA 14:29:21
Step 3500 - Loss 0.618233 - Minibatch 81.2% - Validation 84.2% - ETA 14:30:18
Step 4000 - Loss 0.858066 - Minibatch 81.2% - Validation 84.5% - ETA 14:29:51
Step 4500 - Loss 0.574704 - Minibatch 93.8% - Validation 85.1% - ETA 14:29:39
Step 5000 - Loss 0.336020 - Minibatch 87.5% - Validation 84.6% - ETA 14:29:22
Step 5500 - Loss 0.457557 - Minibatch 81.2% - Validation 86.0% - ETA 14:29:07
Step 6000 - Loss 0.706136 - Minibatch 75.0% - Validation 

In [36]:
# DL with inception.

from tensorflow.python.framework import ops
ops.reset_default_graph()

batch_size = 16          # examples per training minibatch
patch_size = 5
depth = 16               # output channels of every inception branch
num_hidden_full_1 = 96   # units in the first fully connected hidden layer
num_hidden_full_2 = 96   # units in the second fully connected hidden layer

# The inception module concatenates four branches (1x1, 3x3, 5x5 and pooled)
# of `depth` channels each along the channel axis.
inception_channels = 4 * depth

graph = tf.Graph()

with graph.as_default():

    # Input data. Training minibatches are fed at run time; the validation
    # and test sets are embedded in the graph as constants.
    tf_train_dataset = tf.placeholder(
        tf.float32, shape=(batch_size, image_size, image_size, num_channels))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Variables.
    # The inception module uses stride 1 with 'SAME' padding throughout, so
    # its output keeps the input's spatial size:
    # [batch, image_size, image_size, inception_channels].
    layer3_weights = init_weights(
        [image_size * image_size * inception_channels, num_hidden_full_1])
    layer3_biases = init_weights([num_hidden_full_1], method="ones")
    keep3 = tf.placeholder("float")  # dropout keep probability after layer 3
    layer4_weights = init_weights([num_hidden_full_1, num_hidden_full_2])
    layer4_biases = init_weights([num_hidden_full_2], method="ones")
    keep4 = tf.placeholder("float")  # dropout keep probability after layer 4
    layer5_weights = init_weights([num_hidden_full_2, num_labels])
    layer5_biases = init_weights([num_labels], method="ones")
    # vars for inception
    inception_1x1_weights = tf.Variable(tf.truncated_normal(
        [1, 1, num_channels, depth], stddev=0.1))
    inception_1x1_biases = tf.Variable(tf.zeros([depth]))
    pre_inception_1x1_weights = tf.Variable(tf.truncated_normal(
        [1, 1, num_channels, depth], stddev=0.1))
    pre_inception_1x1_biases = tf.Variable(tf.zeros([depth]))
    inception_1x1_pool_weights = tf.Variable(tf.truncated_normal(
        [1, 1, num_channels, depth], stddev=0.1))
    inception_1x1_pool_biases = tf.Variable(tf.zeros([depth]))
    inception_3x3_weights = tf.Variable(tf.truncated_normal(
        [3, 3, depth, depth], stddev=0.1))
    inception_3x3_biases = tf.Variable(tf.zeros([depth]))
    inception_5x5_weights = tf.Variable(tf.truncated_normal(
        [5, 5, depth, depth], stddev=0.1))
    inception_5x5_biases = tf.Variable(tf.zeros([depth]))

    def inception_layer(data):
        """Apply one inception module to `data`.

        Four branches are computed and concatenated along axis 3: a 1x1
        convolution, a 1x1 followed by a 3x3, a 1x1 followed by a 5x5 (both
        sharing the same preliminary 1x1), and a 3x3 average pool followed by
        a 1x1. The shape of each branch is printed while the graph is built.

        Args:
            data: 4-D tensor whose last dimension is num_channels.

        Returns:
            The concatenated tensor with 4 * depth channels.
        """
        # Inception 1x1
        conv_1x1 = tf.nn.conv2d(data, inception_1x1_weights, [1, 1, 1, 1], padding='SAME')
        conv_1x1 = tf.nn.relu(conv_1x1 + inception_1x1_biases)
        print("1x1", conv_1x1.get_shape())
        ## 1x1 - before the bigger patches
        conv_pre = tf.nn.conv2d(data, pre_inception_1x1_weights, [1, 1, 1, 1], padding='SAME')
        conv_pre = tf.nn.relu(conv_pre + pre_inception_1x1_biases)
        # Pooling 3x3
        ## average pool followed by a 1x1
        conv_pool = tf.nn.avg_pool(data, [1, 3, 3, 1], [1, 1, 1, 1], padding='SAME')
        conv_pool = tf.nn.conv2d(conv_pool, inception_1x1_pool_weights, [1, 1, 1, 1], padding='SAME')
        conv_pool = tf.nn.relu(conv_pool + inception_1x1_pool_biases)
        print("pool", conv_pool.get_shape())
        # Inception 3x3
        ## 1x1 followed by a 3x3
        conv_3x3 = tf.nn.conv2d(conv_pre, inception_3x3_weights, [1, 1, 1, 1], padding='SAME')
        conv_3x3 = tf.nn.relu(conv_3x3 + inception_3x3_biases)
        print("3x3", conv_3x3.get_shape())
        # Inception 5x5
        ## 1x1 followed by a 5x5
        conv_5x5 = tf.nn.conv2d(conv_pre, inception_5x5_weights, [1, 1, 1, 1], padding='SAME')
        conv_5x5 = tf.nn.relu(conv_5x5 + inception_5x5_biases)
        print("5x5", conv_5x5.get_shape())
        # NOTE(review): (dimension, values) is the argument order this
        # notebook's TensorFlow version expects; newer releases swap it.
        inception_result = tf.concat(3, [conv_1x1, conv_3x3, conv_5x5, conv_pool])
        print(inception_result.get_shape())
        return inception_result

    # Model. The dense layers use elu; the inception branches use relu.
    def model(data):
        """Return the logits of the inception + dense network for `data`.

        Dropout is part of the returned graph, so `keep3` and `keep4` must be
        fed whenever a tensor derived from this function is evaluated.

        Args:
            data: 4-D tensor with a statically known shape.

        Returns:
            A 2-D tensor with num_labels logits per input example.
        """
        # A single inception module replaces the conv/max-pool stack.
        hidden = inception_layer(data)
        # layer 3 fully connected: flatten each example's feature maps first.
        shape = hidden.get_shape().as_list()
        reshape = tf.reshape(hidden, [shape[0], shape[1] * shape[2] * shape[3]])
        hidden = tf.nn.elu(tf.matmul(reshape, layer3_weights) + layer3_biases)
        hidden = tf.nn.dropout(hidden, keep3)
        # layer 4 fully connected
        hidden = tf.nn.elu(tf.matmul(hidden, layer4_weights) + layer4_biases)
        hidden = tf.nn.dropout(hidden, keep4)
        # layer 5 output
        output = tf.matmul(hidden, layer5_weights) + layer5_biases
        return output

    # Training computation. Keyword arguments keep the call unambiguous;
    # later TensorFlow releases reject the positional form.
    logits = model(tf_train_dataset)
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            logits=logits, labels=tf_train_labels))

    # Optimizer.
    # optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss)
    optimizer = tf.train.AdamOptimizer(1e-4).minimize(loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
    test_prediction = tf.nn.softmax(model(tf_test_dataset))

1x1 (16, 28, 28, 16)
pool (16, 28, 28, 16)
3x3 (16, 28, 28, 16)
5x5 (16, 28, 28, 16)
(16, 28, 28, 64)
1x1 (10000, 28, 28, 16)
pool (10000, 28, 28, 16)
3x3 (10000, 28, 28, 16)
5x5 (10000, 28, 28, 16)
(10000, 28, 28, 64)
1x1 (18724, 28, 28, 16)
pool (18724, 28, 28, 16)
3x3 (18724, 28, 28, 16)
5x5 (18724, 28, 28, 16)
(18724, 28, 28, 64)


In [10]:
num_steps = 10001  # 40001 steps is impossible on my notebook.

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    start = datetime.datetime.now()
    print('Initialized')
    for step in range(num_steps):
        # Walk a batch_size-wide window through the training set, wrapping
        # around so that a full minibatch is always available.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        batch_end = offset + batch_size
        batch_data = train_dataset[offset:batch_end, :, :, :]
        batch_labels = train_labels[offset:batch_end, :]
        # Train with a keep probability of 0.9 on both dropout layers.
        feed_dict = {tf_train_dataset: batch_data,
                     tf_train_labels: batch_labels,
                     keep3: 0.9,
                     keep4: 0.9}
        _, batch_loss, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if step % 500 != 0:
            continue
        # Progress report every 500 steps; dropout is disabled (keep = 1.0)
        # for the validation pass.
        ends = eta(start, step, num_steps)
        valpred = valid_prediction.eval(feed_dict={keep3: 1.0, keep4: 1.0})
        minibatch_acc = accuracy(predictions, batch_labels)
        valid_acc = accuracy(valpred, valid_labels)
        print('Step %d - Loss %f - Minibatch %.1f%% - Validation %.1f%% - ETA %s' %
              (step, batch_loss, minibatch_acc, valid_acc, ends))
    # Final evaluation on the test set, again without dropout.
    test_pred = test_prediction.eval(feed_dict={keep3: 1.0, keep4: 1.0})
    print('Test accuracy: %.1f%%' %
          accuracy(test_pred, test_labels))


Initialized
Step 0 - Loss 2.660672 - Minibatch 12.5% - Validation 13.9% - ETA 11:00:12
Step 500 - Loss 0.677274 - Minibatch 68.8% - Validation 80.6% - ETA 12:01:06


KeyboardInterrupt: 