Deep Learning
=============

Assignment 3
------------

Previously in `2_fullyconnected.ipynb`, you trained a logistic regression and a neural network model.

The goal of this assignment is to explore regularization techniques.

In [17]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function

import hashlib

import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle

First reload the data we generated in `1_notmnist.ipynb`.

In [18]:
pickle_file = 'notMNIST.pickle'

# NOTE(review): unpickling can execute arbitrary code -- only load a pickle
# file that you generated yourself.
with open(pickle_file, 'rb') as f:
    save = pickle.load(f)

# `save` is deliberately left in scope: the de-duplication cell further down
# reads the raw integer labels from it.
train_dataset, train_labels = save['train_dataset'], save['train_labels']
valid_dataset, valid_labels = save['valid_dataset'], save['valid_labels']
test_dataset, test_labels = save['test_dataset'], save['test_labels']

print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


Reformat into a shape that's more adapted to the models we're going to train:
- data as a flat matrix,
- labels as float 1-hot encodings.

In [19]:
image_size = 28   # images are image_size x image_size pixels
num_labels = 10   # number of classes in the 1-hot encoding


def reformat(dataset, labels):
    """Flatten the images and 1-hot encode the labels.

    Args:
        dataset: array reshaped to (-1, image_size * image_size).
        labels: 1-D array of integer class ids.

    Returns:
        A `(flat_images, one_hot_labels)` tuple, both float32; the label
        matrix has `num_labels` columns.
    """
    flat_images = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
    # Broadcasting a column of labels against 0..num_labels-1 yields one row
    # per sample, e.g. 2 -> [0.0, 0.0, 1.0, 0.0 ...].
    class_ids = np.arange(num_labels)
    one_hot_labels = (labels[:, None] == class_ids).astype(np.float32)
    return flat_images, one_hot_labels
# Flatten / 1-hot encode every split. The original names are rebound, so this
# cell must run exactly once after the data has been loaded.
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)

for split_name, split_data, split_labels in (
        ('Training set', train_dataset, train_labels),
        ('Validation set', valid_dataset, valid_labels),
        ('Test set', test_dataset, test_labels)):
    print(split_name, split_data.shape, split_labels.shape)

Training set (200000, 784) (200000, 10)
Validation set (10000, 784) (10000, 10)
Test set (10000, 784) (10000, 10)


In [20]:
def accuracy(predictions, labels):
    """Return the percentage of rows where predictions and labels agree.

    Both arguments are 2-D (one row per sample); a row counts as correct when
    the argmax along axis 1 is the same in `predictions` and `labels`.
    """
    predicted_class = np.argmax(predictions, 1)
    true_class = np.argmax(labels, 1)
    num_correct = np.sum(predicted_class == true_class)
    return 100.0 * num_correct / predictions.shape[0]

In [21]:
print(train_dataset.shape)
print(valid_dataset.shape)
print(test_dataset.shape)

# Freeze the arrays so no row can be modified between fingerprinting and
# filtering.
train_dataset.flags.writeable = False
valid_dataset.flags.writeable = False
test_dataset.flags.writeable = False

try:
    from itertools import izip  # Python 2: lazy zip
except ImportError:
    izip = zip  # Python 3: the built-in zip is already lazy


def _row_key(row):
    """Return a hashable fingerprint (SHA-1 digest of the raw bytes) of one image.

    Hashing `row.data` directly only works on Python 2; on Python 3 it is a
    memoryview, which cannot be hashed for float32 data. A digest works on
    both and keeps each key at 20 bytes.
    """
    return hashlib.sha1(row.tobytes()).digest()


# Maps image fingerprint -> set of integer labels that image appears with.
data_label = dict()


def add_dataset(dataset, labels):
    """Record in `data_label` every label each image of `dataset` occurs with.

    Args:
        dataset: 2-D array, one flattened image per row.
        labels: iterable of hashable labels, paired row by row with `dataset`.
    """
    for data, label in izip(dataset, labels):
        data_label.setdefault(_row_key(data), set()).add(label)


# The labels were 1-hot encoded above, so the raw integer labels are taken
# from the still-loaded pickle dictionary.
add_dataset(train_dataset, save["train_labels"])
add_dataset(valid_dataset, save["valid_labels"])
add_dataset(test_dataset, save["test_labels"])

# Fingerprints of images that carry more than one distinct label.
ambiguous_data = set(key for key, label_set in data_label.items()
                     if len(label_set) > 1)

# Fingerprints already seen by `cleaned`; shared across calls on purpose so
# that later splits also lose images present in earlier ones.
occurred = set()


def cleaned(dataset, labels):
    """Drop ambiguous images and every repeat of an already-seen image.

    A row is kept only if its fingerprint is not in `ambiguous_data` and has
    not been seen before, in this call or an earlier one (via `occurred`).

    Args:
        dataset: 2-D array, one flattened image per row.
        labels: 2-D array with one row per row of `dataset`.

    Returns:
        `(dataset[mask, :], labels[mask, :])` for the boolean keep-mask.
    """
    # A pre-allocated boolean mask stays boolean even for an empty dataset.
    mask = np.zeros(dataset.shape[0], dtype=bool)
    for row_index, data in enumerate(dataset):
        key = _row_key(data)
        mask[row_index] = key not in ambiguous_data and key not in occurred
        occurred.add(key)
    print(mask.shape, dataset.shape, labels.shape)
    return dataset[mask, :], labels[mask, :]


# Order matters: train is cleaned first, so validation/test rows that
# duplicate a training image are the ones removed.
train_dataset, train_labels = cleaned(train_dataset, train_labels)
valid_dataset, valid_labels = cleaned(valid_dataset, valid_labels)
test_dataset, test_labels = cleaned(test_dataset, test_labels)

(200000, 784)
(10000, 784)
(10000, 784)
(200000,) (200000, 784) (200000, 10)
(10000,) (10000, 784) (10000, 10)
(10000,) (10000, 784) (10000, 10)


---
Problem 1
---------

Introduce and tune L2 regularization for both logistic and neural network models. Remember that L2 amounts to adding a penalty on the norm of the weights to the loss. In TensorFlow, you can compute the L2 loss for a tensor `t` using `nn.l2_loss(t)`. The right amount of regularization should improve your validation / test accuracy.

---

In [22]:
# Logistic regression + L2 regularization
beta = 0.001  # weight of the L2 penalty in the loss
batch_size = 128

# Input data. For the training data, we use a placeholder that will be fed
# at run time with a training minibatch.
tf_train_dataset = tf.placeholder(
    tf.float32, shape=(batch_size, image_size * image_size))
tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
tf_valid_dataset = tf.constant(valid_dataset)
tf_test_dataset = tf.constant(test_dataset)

# Variables.
weights = tf.Variable(
    tf.truncated_normal([image_size * image_size, num_labels]))
biases = tf.Variable(tf.zeros([num_labels]))

# Training computation.
logits = tf.matmul(tf_train_dataset, weights) + biases
# labels/logits are passed by keyword: TensorFlow >= 1.0 rejects positional
# arguments here, and the keyword names are accepted by earlier releases too.
loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels,
                                            logits=logits))
# tf.nn.l2_loss already returns a scalar, so no further reduction is needed.
loss = loss + beta * tf.nn.l2_loss(weights)

# Optimizer.
optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

# Predictions for the training, validation, and test data.
train_prediction = tf.nn.softmax(logits)
valid_prediction = tf.nn.softmax(
    tf.matmul(tf_valid_dataset, weights) + biases)
test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases)

num_steps = 3001

with tf.Session() as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be
        # fed, and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset: batch_data,
                     tf_train_labels: batch_labels}
        _, batch_loss, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if step % 500 == 0:
            print("Minibatch loss at step %d: %f" % (step, batch_loss))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(
                valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 20.625202
Minibatch accuracy: 6.2%
Validation accuracy: 11.1%
Minibatch loss at step 500: 2.670782
Minibatch accuracy: 75.8%
Validation accuracy: 74.5%
Minibatch loss at step 1000: 1.733475
Minibatch accuracy: 75.8%
Validation accuracy: 77.3%
Minibatch loss at step 1500: 1.201692
Minibatch accuracy: 79.7%
Validation accuracy: 78.6%
Minibatch loss at step 2000: 1.070226
Minibatch accuracy: 77.3%
Validation accuracy: 79.4%
Minibatch loss at step 2500: 0.946858
Minibatch accuracy: 81.2%
Validation accuracy: 79.4%
Minibatch loss at step 3000: 0.841304
Minibatch accuracy: 81.2%
Validation accuracy: 80.2%
Test accuracy: 87.8%


In [23]:
# Fully connected NN (one hidden ReLU layer of 1024 units) + L2 regularization
batch_size = 128
num_steps = 30000 + 1
beta = 0.001  # weight of the L2 penalty in the loss

# Input data and labels; the batch dimension is left open so the same graph
# serves minibatches and the full validation / test sets.
x = tf.placeholder(tf.float32, shape=(None, image_size * image_size))
y_true = tf.placeholder(tf.float32, shape=(None, num_labels))

# Hidden layer
W_1 = tf.Variable(tf.truncated_normal([image_size * image_size, 1024]))
b_1 = tf.Variable(tf.zeros([1024]))
h_1 = tf.nn.relu(tf.matmul(x, W_1) + b_1)

# Output layer
W_2 = tf.Variable(tf.truncated_normal([1024, num_labels]))
b_2 = tf.Variable(tf.zeros([num_labels]))
logits = tf.matmul(h_1, W_2) + b_2
prob = tf.nn.softmax(logits)

# Loss function.
# labels/logits are passed by keyword: TensorFlow >= 1.0 rejects positional
# arguments here, and the keyword names are accepted by earlier releases too.
loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_true, logits=logits))
# tf.nn.l2_loss already returns a scalar, so no further reduction is needed.
l2 = beta * (tf.nn.l2_loss(W_1) + tf.nn.l2_loss(W_2))
loss = loss + l2

# Optimizer
optimizer = tf.train.GradientDescentOptimizer(0.1).minimize(loss)

# Accuracy (fraction in [0, 1]).
# NOTE(review): this rebinds `accuracy`, replacing the NumPy helper function of
# the same name defined earlier in the notebook; the logistic-regression cell
# cannot be re-run after this one without re-defining that helper.
correct_prediction = tf.equal(tf.argmax(prob, 1), tf.argmax(y_true, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

with tf.Session() as sess:
    tf.initialize_all_variables().run()

    for step in range(num_steps):
        # Slide a window over the training data, wrapping around at the end.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        _, batch_loss, batch_acc = sess.run(
            [optimizer, loss, accuracy],
            feed_dict={x: batch_data, y_true: batch_labels})
        if step % 2000 == 0:
            print("step %d" % step)
            print("minibatch loss %f" % batch_loss)
            print("minibatch accuracy %f" % batch_acc)
            valid_acc = sess.run(
                accuracy, feed_dict={x: valid_dataset, y_true: valid_labels})
            print("validation accuracy %f" % valid_acc)

    print(sess.run(accuracy, feed_dict={x: test_dataset, y_true: test_labels}))

step 0
minibatch loss 661.489014
minibatch accuracy 0.109375
validation accuracy 0.181380
step 2000
minibatch loss 213.647110
minibatch accuracy 0.820312
validation accuracy 0.809769
step 4000
minibatch loss 139.746170
minibatch accuracy 0.851562
validation accuracy 0.824670
step 6000
minibatch loss 92.858833
minibatch accuracy 0.929688
validation accuracy 0.823885
step 8000
minibatch loss 62.133446
minibatch accuracy 0.906250
validation accuracy 0.833968
step 10000
minibatch loss 41.858868
minibatch accuracy 0.867188
validation accuracy 0.851781
step 12000
minibatch loss 28.116917
minibatch accuracy 0.867188
validation accuracy 0.854246
step 14000
minibatch loss 18.785597
minibatch accuracy 0.929688
validation accuracy 0.863433
step 16000
minibatch loss 12.837034
minibatch accuracy 0.882812
validation accuracy 0.872731
step 18000
minibatch loss 8.763467
minibatch accuracy 0.898438
validation accuracy 0.874300
step 20000
minibatch loss 5.945143
minibatch accuracy 0.890625
validation ac

---
Problem 2
---------
Let's demonstrate an extreme case of overfitting. Restrict your training data to just a few batches. What happens?

---

In [24]:
# Extreme overfitting: train the network defined in the previous cell on one
# fixed minibatch only.
num_steps = 1000 + 1
batch_data = train_dataset[:batch_size, :]
batch_labels = train_labels[:batch_size, :]

# The feeds never change, so they are built once up front.
train_feed = {x: batch_data, y_true: batch_labels}
valid_feed = {x: valid_dataset, y_true: valid_labels}
test_feed = {x: test_dataset, y_true: test_labels}

with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())

    for step in range(num_steps):
        _, batch_loss, batch_acc = sess.run(
            [optimizer, loss, accuracy], feed_dict=train_feed)
        if step % 200 != 0:
            continue
        print("step %d" % step)
        print("minibatch loss %f" % batch_loss)
        print("minibatch accuracy %f" % batch_acc)
        valid_acc = sess.run(accuracy, feed_dict=valid_feed)
        print("validation accuracy %f" % valid_acc)

    print(sess.run(accuracy, feed_dict=test_feed))

step 0
minibatch loss 754.092773
minibatch accuracy 0.101562
validation accuracy 0.233923
step 200
minibatch loss 301.571777
minibatch accuracy 1.000000
validation accuracy 0.532377
step 400
minibatch loss 289.746490
minibatch accuracy 1.000000
validation accuracy 0.532489
step 600
minibatch loss 278.384827
minibatch accuracy 1.000000
validation accuracy 0.532489
step 800
minibatch loss 267.468628
minibatch accuracy 1.000000
validation accuracy 0.532489
step 1000
minibatch loss 256.980469
minibatch accuracy 1.000000
validation accuracy 0.532601
0.58929


---
Problem 3
---------
Introduce Dropout on the hidden layer of the neural network. Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides `nn.dropout()` for that, but you have to make sure it's only inserted during training.

What happens to our extreme overfitting case?

---

In [None]:
# Fully connected NN (one hidden ReLU layer of 1024 units) + L2 + dropout
batch_size = 128
num_steps = 100000 + 1
beta = 0.001  # weight of the L2 penalty in the loss

# Input data and labels; the batch dimension is left open so the same graph
# serves minibatches and the full validation / test sets.
x = tf.placeholder(tf.float32, shape=(None, image_size * image_size))
y_true = tf.placeholder(tf.float32, shape=(None, num_labels))

# Hidden layer
W_1 = tf.Variable(tf.truncated_normal([image_size * image_size, 1024]))
b_1 = tf.Variable(tf.zeros([1024]))
h_1 = tf.nn.relu(tf.matmul(x, W_1) + b_1)

# Dropout on the hidden activations. keep_prob is a placeholder so it can be
# fed 0.5 for training steps and 1.0 (no dropout) for evaluation.
keep_prob = tf.placeholder("float")
h_1_drop = tf.nn.dropout(h_1, keep_prob)

# Output layer
W_2 = tf.Variable(tf.truncated_normal([1024, num_labels]))
b_2 = tf.Variable(tf.zeros([num_labels]))
logits = tf.matmul(h_1_drop, W_2) + b_2
prob = tf.nn.softmax(logits)

# Loss function.
# labels/logits are passed by keyword: TensorFlow >= 1.0 rejects positional
# arguments here, and the keyword names are accepted by earlier releases too.
loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_true, logits=logits))
# tf.nn.l2_loss already returns a scalar, so no further reduction is needed.
l2 = beta * (tf.nn.l2_loss(W_1) + tf.nn.l2_loss(W_2))
loss = loss + l2

# Optimizer
optimizer = tf.train.GradientDescentOptimizer(0.1).minimize(loss)

# Accuracy (fraction in [0, 1])
correct_prediction = tf.equal(tf.argmax(prob, 1), tf.argmax(y_true, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

with tf.Session() as sess:
    tf.initialize_all_variables().run()

    for step in range(num_steps):
        # Slide a window over the training data, wrapping around at the end.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # The minibatch loss/accuracy come from the same run as the training
        # step, so they are measured with dropout active.
        _, batch_loss, batch_acc = sess.run(
            [optimizer, loss, accuracy],
            feed_dict={x: batch_data, y_true: batch_labels, keep_prob: 0.5})
        if step % 5000 == 0:
            print("step %d" % step)
            print("minibatch loss %f" % batch_loss)
            print("minibatch accuracy %f" % batch_acc)
            valid_acc = sess.run(
                accuracy,
                feed_dict={x: valid_dataset, y_true: valid_labels, keep_prob: 1.0})
            print("validation accuracy %f" % valid_acc)

    print(sess.run(
        accuracy,
        feed_dict={x: test_dataset, y_true: test_labels, keep_prob: 1.0}))

step 0
minibatch loss 846.167236
minibatch accuracy 0.140625
validation accuracy 0.213421
step 5000
minibatch loss 113.749832
minibatch accuracy 0.773438
validation accuracy 0.806408
step 10000
minibatch loss 42.011421
minibatch accuracy 0.804688
validation accuracy 0.836881
step 15000
minibatch loss 15.589698
minibatch accuracy 0.851562
validation accuracy 0.854246
step 20000
minibatch loss 5.999804
minibatch accuracy 0.867188
validation accuracy 0.866906
step 25000
minibatch loss 2.494871
minibatch accuracy 0.875000
validation accuracy 0.875196
step 30000
minibatch loss 1.219657
minibatch accuracy 0.906250
validation accuracy 0.881806
step 35000
minibatch loss 0.680283
minibatch accuracy 0.929688
validation accuracy 0.884943
step 40000
minibatch loss 0.474037
minibatch accuracy 0.937500
validation accuracy 0.884495
step 45000
minibatch loss 0.617348
minibatch accuracy 0.875000
validation accuracy 0.884831
step 50000
minibatch loss 0.489791
minibatch accuracy 0.898438
validation accur

In [None]:
# Extreme overfitting with dropout: train the dropout network defined in the
# previous cell on one fixed minibatch only.
batch_data = train_dataset[:batch_size, :]
batch_labels = train_labels[:batch_size, :]

# Dropout is active (keep_prob 0.5) for training steps and disabled
# (keep_prob 1.0) for evaluation. The feeds never change, so build them once.
train_feed = {x: batch_data, y_true: batch_labels, keep_prob: 0.5}
valid_feed = {x: valid_dataset, y_true: valid_labels, keep_prob: 1.0}
test_feed = {x: test_dataset, y_true: test_labels, keep_prob: 1.0}

with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())

    # num_steps is not assigned in this cell; it keeps the value set by the
    # cell that was run before it.
    for step in range(num_steps):
        _, batch_loss, batch_acc = sess.run(
            [optimizer, loss, accuracy], feed_dict=train_feed)
        if step % 5000 != 0:
            continue
        print("step %d" % step)
        print("minibatch loss %f" % batch_loss)
        print("minibatch accuracy %f" % batch_acc)
        valid_acc = sess.run(accuracy, feed_dict=valid_feed)
        print("validation accuracy %f" % valid_acc)

    print(sess.run(accuracy, feed_dict=test_feed))

step 0
minibatch loss 802.539185
minibatch accuracy 0.070312
validation accuracy 0.266525
step 5000
minibatch loss 115.885597
minibatch accuracy 1.000000
validation accuracy 0.695048
step 10000
minibatch loss 42.680996
minibatch accuracy 0.992188
validation accuracy 0.695832
step 15000
minibatch loss 15.681966
minibatch accuracy 1.000000
validation accuracy 0.704347
step 20000
minibatch loss 5.768995
minibatch accuracy 1.000000
validation accuracy 0.700202
step 25000
minibatch loss 2.128421
minibatch accuracy 1.000000
validation accuracy 0.705243
step 30000
minibatch loss 0.801452
minibatch accuracy 1.000000
validation accuracy 0.709724
step 35000
minibatch loss 0.315725
minibatch accuracy 1.000000
validation accuracy 0.712973
step 40000
minibatch loss 0.136453
minibatch accuracy 1.000000
validation accuracy 0.713646
step 45000
minibatch loss 0.070772
minibatch accuracy 1.000000
validation accuracy 0.712973
step 50000
minibatch loss 0.046814
minibatch accuracy 1.000000
validation accur

---
Problem 4
---------

Try to get the best performance you can using a multi-layer model! The best reported test accuracy using a deep network is [97.1%](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html?showComment=1391023266211#c8758720086795711595).

One avenue you can explore is to add multiple layers.

Another one is to use learning rate decay:

    global_step = tf.Variable(0)  # count the number of steps taken.
    learning_rate = tf.train.exponential_decay(0.5, global_step, ...)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
 
 ---


In [16]:
# Fully connected NN with two hidden ReLU layers + dropout + L2 regularization
batch_size = 100
n_hidden = 4096
n_epoch = 20
beta = 0.001  # weight of the L2 penalty in the loss
# Fed 0.5 for training steps and 1.0 (no dropout) for evaluation.
keep_prob = tf.placeholder("float")

# Input data and labels; the batch dimension is left open so the same graph
# serves minibatches and the full validation / test sets.
x = tf.placeholder(tf.float32, shape=(None, image_size * image_size))
y_true = tf.placeholder(tf.float32, shape=(None, num_labels))

# Hidden layer 1
W_1 = tf.Variable(tf.random_normal([image_size * image_size, n_hidden]))
b_1 = tf.Variable(tf.random_normal([n_hidden]))
h_1 = tf.nn.relu(tf.matmul(x, W_1) + b_1)
h_1_drop = tf.nn.dropout(h_1, keep_prob)

# Hidden layer 2
W_2 = tf.Variable(tf.random_normal([n_hidden, n_hidden]))
b_2 = tf.Variable(tf.random_normal([n_hidden]))
h_2 = tf.nn.relu(tf.matmul(h_1_drop, W_2) + b_2)
h_2_drop = tf.nn.dropout(h_2, keep_prob)

# Output layer
W_3 = tf.Variable(tf.random_normal([n_hidden, num_labels]))
b_3 = tf.Variable(tf.random_normal([num_labels]))
logits = tf.matmul(h_2_drop, W_3) + b_3
prob = tf.nn.softmax(logits)

# Loss function.
# labels/logits are passed by keyword: TensorFlow >= 1.0 rejects positional
# arguments here, and the keyword names are accepted by earlier releases too.
loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_true, logits=logits))
# tf.nn.l2_loss already returns a scalar, so no further reduction is needed.
l2 = tf.nn.l2_loss(W_1) + tf.nn.l2_loss(W_2) + tf.nn.l2_loss(W_3)
loss += beta * l2

# Accuracy (fraction in [0, 1])
correct_prediction = tf.equal(tf.argmax(prob, 1), tf.argmax(y_true, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# Training. The learning rate is a placeholder so it can be decayed from
# Python between epochs.
# Alternative tried: tf.train.MomentumOptimizer(learning_rate=lr, momentum=0.9).
lr = tf.placeholder("float")
optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr).minimize(loss)

with tf.Session() as sess:
    lr_val = 1e-4
    tf.initialize_all_variables().run()
    for epoch in range(1, n_epoch + 1):
        print("epoch:", epoch, ",learning rate:", lr_val)
        # Visit the training samples in a fresh random order every epoch.
        indices = np.random.permutation(train_dataset.shape[0])
        # Floor division keeps the step count an int on Python 3 as well;
        # samples that do not fill a whole batch are skipped for this epoch.
        for step in range(train_dataset.shape[0] // batch_size):
            offset = step * batch_size
            batch_indices = indices[offset:(offset + batch_size)]
            batch_data = train_dataset[batch_indices, :]
            batch_labels = train_labels[batch_indices, :]
            sess.run(optimizer, feed_dict={x: batch_data,
                                           y_true: batch_labels,
                                           keep_prob: 0.5,
                                           lr: lr_val})
        print("validation accuracy",
              sess.run(accuracy, feed_dict={x: valid_dataset,
                                            y_true: valid_labels,
                                            keep_prob: 1.0}))
        # Exponential decay: shrink the learning rate by 5% after each epoch.
        lr_val *= 0.95

    print("test accuracy",
          sess.run(accuracy, feed_dict={x: test_dataset,
                                        y_true: test_labels,
                                        keep_prob: 1.0}))

epoch: 1 ,learning rate: 0.0001
validation accuracy 0.815147
epoch: 2 ,learning rate: 9.5e-05
validation accuracy 0.824669
epoch: 3 ,learning rate: 9.025e-05
validation accuracy 0.8324
epoch: 4 ,learning rate: 8.57375e-05
validation accuracy 0.835089
epoch: 5 ,learning rate: 8.1450625e-05
validation accuracy 0.837665
epoch: 6 ,learning rate: 7.737809375e-05
validation accuracy 0.842371
epoch: 7 ,learning rate: 7.35091890625e-05
validation accuracy 0.842035
epoch: 8 ,learning rate: 6.98337296094e-05
validation accuracy 0.844163
epoch: 9 ,learning rate: 6.63420431289e-05
validation accuracy 0.847076
epoch: 10 ,learning rate: 6.30249409725e-05
validation accuracy 0.847524
epoch: 11 ,learning rate: 5.98736939238e-05
validation accuracy 0.849205
epoch: 12 ,learning rate: 5.68800092276e-05
validation accuracy 0.848308
epoch: 13 ,learning rate: 5.40360087663e-05
validation accuracy 0.84842
epoch: 14 ,learning rate: 5.1334208328e-05
validation accuracy 0.849429
epoch: 15 ,learning rate: 4.8767