Deep Learning
=============

Assignment 3
------------

Previously in `2_fullyconnected.ipynb`, you trained a logistic regression and a neural network model.

The goal of this assignment is to explore regularization techniques.

In [2]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle

import math

First reload the data we generated in `1_notmnist.ipynb`.

In [3]:
pickle_file = 'notMNIST.pickle'

# NOTE: unpickling can execute arbitrary code, so only load a pickle file that
# you generated yourself or otherwise trust.
with open(pickle_file, 'rb') as f:
  save = pickle.load(f)

# Pull the three dataset/label pairs out of the loaded dictionary.
train_dataset, train_labels = save['train_dataset'], save['train_labels']
valid_dataset, valid_labels = save['valid_dataset'], save['valid_labels']
test_dataset, test_labels = save['test_dataset'], save['test_labels']
del save  # hint to help gc free up memory

# Report the shape of every split as a quick sanity check.
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


Reformat into a shape that's more adapted to the models we're going to train:
- data as a flat matrix,
- labels as float 1-hot encodings.

In [4]:
image_size = 28
num_labels = 10
input_size = image_size * image_size

def reformat(dataset, labels):
  """Flatten every image into one row and one-hot encode the integer labels.

  Returns a pair: the data reshaped to `input_size` columns as float32, and a
  float32 matrix with `num_labels` columns holding a single 1.0 per row.
  """
  flat_dataset = dataset.reshape((-1, input_size)).astype(np.float32)
  # Compare each label (as a column vector) against the class indices, so that
  # label 1 becomes [0.0, 1.0, 0.0 ...] and label 2 becomes [0.0, 0.0, 1.0 ...].
  class_indices = np.arange(num_labels)
  one_hot_labels = (labels[:, None] == class_indices).astype(np.float32)
  return flat_dataset, one_hot_labels
# Flatten the images and one-hot encode the labels of every split.
# NOTE(review): the results are bound to the same names as the inputs, so this
# cell is meant to run once per kernel; running it again would pass the already
# one-hot encoded labels back into reformat.
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
# Print the new shapes as a sanity check.
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 784) (200000, 10)
Validation set (10000, 784) (10000, 10)
Test set (10000, 784) (10000, 10)


In [5]:
def accuracy(predictions, labels):
  """Return the percentage of rows whose arg-max class matches the label's.

  Both arguments are 2-D arrays with one row per example; the class of a row
  is the index of its largest value along axis 1.
  """
  predicted_classes = np.argmax(predictions, 1)
  true_classes = np.argmax(labels, 1)
  num_correct = np.sum(predicted_classes == true_classes)
  return 100.0 * num_correct / predictions.shape[0]

In [6]:
import time
def how_long(f, *args, **kwargs):
    """Call ``f`` with the given arguments and print how long the call took.

    Args:
        f: the callable to time.
        *args: positional arguments forwarded to ``f``.
        **kwargs: keyword arguments forwarded to ``f``.

    Returns:
        Whatever ``f(*args, **kwargs)`` returns. The elapsed wall-clock time
        (in seconds, from ``time.time()``) is only printed, not returned.
    """
    start = time.time()
    result = f(*args, **kwargs)
    elapsed = time.time() - start
    print ("tiempo utilizado = ", elapsed)
    return result

---
Problem 1
---------

Introduce and tune L2 regularization for both logistic and neural network models. Remember that L2 amounts to adding a penalty on the norm of the weights to the loss. In TensorFlow, you can compute the L2 loss for a tensor `t` using `nn.l2_loss(t)`. The right amount of regularization should improve your validation / test accuracy.

---

### LOGISTIC

In [6]:
batch_size = 128

learning_rate = 0.5
# Weight of the L2 penalty in the loss. Values tried: 5e-6, 5e-4, 5e-3, 5e-1.
beta = 5e-3

graph = tf.Graph()
with graph.as_default():

  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)

  # Variables.
  weights = tf.Variable(
    tf.truncated_normal([image_size * image_size, num_labels]))
  biases = tf.Variable(tf.zeros([num_labels]))

  # Training computation: mean cross-entropy of the minibatch plus the L2
  # penalty on the weights and biases, scaled by beta.
  logits = tf.matmul(tf_train_dataset, weights) + biases
  # Keyword arguments are used on purpose: TensorFlow 1.x rejects positional
  # arguments for this op, and the keywords are also valid in older releases.
  cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
    labels=tf_train_labels, logits=logits)
  l2_penalty = tf.nn.l2_loss(weights) + tf.nn.l2_loss(biases)
  loss = tf.reduce_mean(cross_entropy) + beta * l2_penalty

  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(tf.matmul(tf_valid_dataset, weights) + biases)
  test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases)

In [7]:
t1 = time.time()

num_steps = 3001

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print("Initialized")
  # Number of distinct start positions that still leave room for a full batch.
  num_offsets = train_labels.shape[0] - batch_size
  for step in range(num_steps):
    # Walk through the (already randomized) training data one minibatch at a
    # time, wrapping around when the end is reached.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % num_offsets
    batch_end = offset + batch_size
    batch_data = train_dataset[offset:batch_end, :]
    batch_labels = train_labels[offset:batch_end, :]
    # Map each placeholder node of the graph to the numpy array fed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if step % 500 != 0:
      continue
    # Periodic report: elapsed time, minibatch loss and the three accuracies.
    t2 = time.time()
    print ("tiempo utilizado = ",t2-t1)

    print("Minibatch loss at step %d: %f" % (step, l))
    print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
    print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

  # Total time for the whole run.
  t2 = time.time()
  print ("tiempo utilizado = ",t2-t1)

Initialized
tiempo utilizado =  0.757061958313
Minibatch loss at step 0: 35.464577
Minibatch accuracy: 7.0%
Validation accuracy: 10.0%
Test accuracy: 9.6%
tiempo utilizado =  2.35031604767
Minibatch loss at step 500: 1.545938
Minibatch accuracy: 86.7%
Validation accuracy: 79.7%
Test accuracy: 86.8%
tiempo utilizado =  3.69309902191
Minibatch loss at step 1000: 0.835385
Minibatch accuracy: 78.9%
Validation accuracy: 81.4%
Test accuracy: 88.5%
tiempo utilizado =  4.98722696304
Minibatch loss at step 1500: 0.528974
Minibatch accuracy: 82.8%
Validation accuracy: 81.5%
Test accuracy: 88.3%
tiempo utilizado =  6.38507390022
Minibatch loss at step 2000: 0.616269
Minibatch accuracy: 88.3%
Validation accuracy: 81.5%
Test accuracy: 88.2%
tiempo utilizado =  7.60038495064
Minibatch loss at step 2500: 0.760828
Minibatch accuracy: 79.7%
Validation accuracy: 81.3%
Test accuracy: 88.1%
tiempo utilizado =  8.73044300079
Minibatch loss at step 3000: 0.747785
Minibatch accuracy: 81.2%
Validation accurac

###### beta 5e-6
Validation accuracy: 78.9%
Test accuracy: 86.3%

###### beta 5e-4
Validation accuracy: 80.5%
Test accuracy: 88.3%

###### beta 5e-3
Validation accuracy: 81.6%
Test accuracy: 88.8%

###### beta 5e-1
Validation accuracy: 59.5%
Test accuracy: 64.2%

### 1LAYER DL

In [8]:
batch_size = 128
num_relus = 1024

learning_rate = 0.5
# Weight of the L2 penalty in the loss.
# Values tried: 5e-6, 5e-5, 5e-4, 5e-3, 5e-2, 5e-1.
beta = 5e-3

graph = tf.Graph()
with graph.as_default():

  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)

  # Variables: input -> hidden layer, then hidden layer -> output.
  weights = tf.Variable(tf.truncated_normal([image_size * image_size, num_relus]))
  biases = tf.Variable(tf.zeros([num_relus]))

  weights2 = tf.Variable(tf.truncated_normal([num_relus, num_labels]))
  biases2 = tf.Variable(tf.zeros([num_labels]))

  # Training computation.

  def doLogits(x):
    """Return the logits of the network (one hidden ReLU layer) for input x."""
    hidden = tf.nn.relu(tf.matmul(x, weights) + biases)
    return tf.matmul(hidden, weights2) + biases2

  logits = doLogits(tf_train_dataset)
  # L2 penalty over every weight matrix and bias vector.
  L2 = tf.nn.l2_loss(weights)+tf.nn.l2_loss(biases) + tf.nn.l2_loss(weights2)+tf.nn.l2_loss(biases2)
  # Keyword arguments are used on purpose: TensorFlow 1.x rejects positional
  # arguments for this op, and the keywords are also valid in older releases.
  cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
    labels=tf_train_labels, logits=logits)
  loss = tf.reduce_mean(cross_entropy) + beta*L2

  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(doLogits(tf_valid_dataset))
  test_prediction = tf.nn.softmax(doLogits(tf_test_dataset))

In [9]:
t1 = time.time()

num_steps = 3001

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print("Initialized")
  # Number of distinct start positions that still leave room for a full batch.
  num_offsets = train_labels.shape[0] - batch_size
  for step in range(num_steps):
    # Walk through the (already randomized) training data one minibatch at a
    # time, wrapping around when the end is reached.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % num_offsets
    batch_end = offset + batch_size
    batch_data = train_dataset[offset:batch_end, :]
    batch_labels = train_labels[offset:batch_end, :]
    # Map each placeholder node of the graph to the numpy array fed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
    if step % 500 != 0:
      continue
    # Periodic report: elapsed time, minibatch loss and the three accuracies.
    t2 = time.time()
    print ("tiempo utilizado = ",t2-t1)

    print("Minibatch loss at step %d: %f" % (step, l))
    print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
    print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

  # Total time for the whole run.
  t2 = time.time()
  print ("tiempo utilizado = ",t2-t1)

Initialized
tiempo utilizado =  0.805325984955
Minibatch loss at step 0: 2005.599121
Minibatch accuracy: 7.0%
Validation accuracy: 34.8%
Test accuracy: 37.8%
tiempo utilizado =  23.0172588825
Minibatch loss at step 500: 128.045639
Minibatch accuracy: 84.4%
Validation accuracy: 80.5%
Test accuracy: 87.8%
tiempo utilizado =  45.6365559101
Minibatch loss at step 1000: 10.971403
Minibatch accuracy: 84.4%
Validation accuracy: 85.2%
Test accuracy: 91.8%
tiempo utilizado =  67.8223907948
Minibatch loss at step 1500: 1.339024
Minibatch accuracy: 90.6%
Validation accuracy: 85.4%
Test accuracy: 92.1%
tiempo utilizado =  91.3725028038
Minibatch loss at step 2000: 0.595944
Minibatch accuracy: 92.2%
Validation accuracy: 84.8%
Test accuracy: 91.8%
tiempo utilizado =  114.272111893
Minibatch loss at step 2500: 0.650190
Minibatch accuracy: 85.2%
Validation accuracy: 84.3%
Test accuracy: 90.9%
tiempo utilizado =  137.049785852
Minibatch loss at step 3000: 0.658890
Minibatch accuracy: 85.2%
Validation a

###### beta 5e-6
Validation accuracy: 82.0%
Test accuracy: 89.5%

###### beta 5e-5
Validation accuracy: 82.8%
Test accuracy: 89.8%

###### beta 5e-4
Validation accuracy: 84.5%
Test accuracy: 91.4%

###### beta 5e-3
Validation accuracy: 84.9%
Test accuracy: 91.5%

###### beta 5e-2
Validation accuracy: 80.3%
Test accuracy: 87.3%

---
Problem 2
---------
Let's demonstrate an extreme case of overfitting. Restrict your training data to just a few batches. What happens?

---

In [16]:
# Keep only the first 1000 training examples (and their labels) so that the
# network sees very little data.
train_dataset_s, train_labels_s = train_dataset[:1000], train_labels[:1000]

In [17]:
batch_size = 128
num_relus = 1024

learning_rate = 0.5
# Weight of the L2 penalty in the loss.
beta = 5e-3

graph = tf.Graph()
with graph.as_default():

  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)

  # Variables: input -> hidden layer, then hidden layer -> output.
  weights = tf.Variable(tf.truncated_normal([image_size * image_size, num_relus]))
  biases = tf.Variable(tf.zeros([num_relus]))

  weights2 = tf.Variable(tf.truncated_normal([num_relus, num_labels]))
  biases2 = tf.Variable(tf.zeros([num_labels]))

  # Training computation.

  def doLogits(x):
    """Return the logits of the network (one hidden ReLU layer) for input x."""
    hidden = tf.nn.relu(tf.matmul(x, weights) + biases)
    return tf.matmul(hidden, weights2) + biases2

  logits = doLogits(tf_train_dataset)
  # L2 penalty over every weight matrix and bias vector.
  L2 = tf.nn.l2_loss(weights)+tf.nn.l2_loss(biases) + tf.nn.l2_loss(weights2)+tf.nn.l2_loss(biases2)
  # Keyword arguments are used on purpose: TensorFlow 1.x rejects positional
  # arguments for this op, and the keywords are also valid in older releases.
  cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
    labels=tf_train_labels, logits=logits)
  loss = tf.reduce_mean(cross_entropy) + beta*L2

  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(doLogits(tf_valid_dataset))
  test_prediction = tf.nn.softmax(doLogits(tf_test_dataset))

In [22]:
t1 = time.time()

num_steps = 101
report_interval = 50

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print("Initialized")
  # Number of distinct start positions that still leave room for a full batch
  # inside the reduced training set.
  num_offsets = train_labels_s.shape[0] - batch_size
  for step in range(num_steps):
    # Cycle through the small training subset one minibatch at a time.
    offset = (step * batch_size) % num_offsets
    batch_end = offset + batch_size
    batch_data = train_dataset_s[offset:batch_end, :]
    batch_labels = train_labels_s[offset:batch_end, :]
    # Map each placeholder node of the graph to the numpy array fed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
    if step % report_interval != 0:
      continue
    # Periodic report: elapsed time, minibatch loss and the three accuracies.
    t2 = time.time()
    print ("tiempo utilizado = ",t2-t1)

    print("Minibatch loss at step %d: %f" % (step, l))
    print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
    print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

  # Total time for the whole run.
  t2 = time.time()
  print ("tiempo utilizado = ",t2-t1)

Initialized
tiempo utilizado =  0.409085035324
Minibatch loss at step 0: 1856.027344
Minibatch accuracy: 10.2%
Validation accuracy: 25.6%
Test accuracy: 26.7%
tiempo utilizado =  2.74956202507
Minibatch loss at step 50: 1222.573486
Minibatch accuracy: 99.2%
Validation accuracy: 75.8%
Test accuracy: 83.0%
tiempo utilizado =  4.81802797318
Minibatch loss at step 100: 951.721863
Minibatch accuracy: 100.0%
Validation accuracy: 76.2%
Test accuracy: 83.5%
tiempo utilizado =  5.83613300323


##### Acierta demasiado en train (minibatch 100% accuracy) pero luego mucho menos en validación y test (76.2%, 83.5%)

---
Problem 3
---------
Introduce Dropout on the hidden layer of the neural network. Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides `nn.dropout()` for that, but you have to make sure it's only inserted during training.

What happens to our extreme overfitting case?

---

In [51]:
def DLmodel_1(train_dataset,
              train_labels,
              valid_dataset,
              valid_labels,
              test_dataset,
              test_labels,
              num_steps=3001,
              batch_size=128,
              num_relus=1024,
              learning_rate=0.5, # gradient-descent step size
              keep_prob=1.0, # probability of keeping a hidden unit; 1.0 disables dropout
              beta=5e-3, # weight of the L2 penalty in the loss
              report_interval=500,
              silent=False):
    """Train and evaluate a network with one hidden ReLU layer, dropout and L2.

    A fresh graph is built on every call, trained with minibatch gradient
    descent for ``num_steps`` steps, and the results are printed.

    Args:
        train_dataset, train_labels: training data (one row per example) and
            one-hot labels; minibatches are sliced from them in order.
        valid_dataset, valid_labels: validation data and one-hot labels.
        test_dataset, test_labels: test data and one-hot labels.
        num_steps: number of training steps to run.
        batch_size: number of examples per minibatch.
        num_relus: number of units in the hidden layer.
        learning_rate: step size of the gradient-descent optimizer.
        keep_prob: keep probability passed to ``tf.nn.dropout`` for the
            training logits; validation and test logits always use 1.0.
        beta: multiplier of the summed L2 loss of all weights and biases.
        report_interval: a progress report is printed every this many steps
            (only when ``silent`` is false).
        silent: when true, skip the periodic reports and print only the final
            validation and test accuracy.

    Returns:
        None. All results are printed.

    Relies on the module-level ``input_size``, ``num_labels`` and
    ``accuracy``.
    """
    print("learning_rate: %f, num_steps: %d\n" % (learning_rate,num_steps))

    ##################
    # GRAPH DEFINITION
    ##################

    mygraph = tf.Graph()
    with mygraph.as_default():

        # Input data. For the training data, we use a placeholder that will be
        # fed at run time with a training minibatch.
        tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, input_size))
        tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
        tf_valid_dataset = tf.constant(valid_dataset)
        tf_test_dataset = tf.constant(test_dataset)

        # Variables: input -> hidden layer, then hidden layer -> output.
        weights = tf.Variable(tf.truncated_normal([input_size, num_relus]))
        biases = tf.Variable(tf.zeros([num_relus]))
        weights2 = tf.Variable(tf.truncated_normal([num_relus, num_labels]))
        biases2 = tf.Variable(tf.zeros([num_labels]))

        # Training computation.

        def doLogits(x,k=1.0):
            """Return the logits for x; k is the dropout keep probability."""
            layer1 = tf.nn.dropout(tf.nn.relu(tf.matmul(x, weights) + biases),k)
            return tf.matmul(layer1, weights2) + biases2

        # Dropout is applied to the training logits only.
        logits = doLogits(tf_train_dataset,k=keep_prob)
        L2 = tf.nn.l2_loss(weights)+tf.nn.l2_loss(biases) + tf.nn.l2_loss(weights2)+tf.nn.l2_loss(biases2)
        # Keyword arguments are used on purpose: TensorFlow 1.x rejects
        # positional arguments for this op, and the keywords are also valid in
        # older releases.
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            labels=tf_train_labels, logits=logits)
        loss = tf.reduce_mean(cross_entropy) + beta*L2

        # Optimizer.
        optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

        # Predictions for the training, validation, and test data. The
        # evaluation logits use the default keep probability of 1.0.
        train_prediction = tf.nn.softmax(logits)
        valid_prediction = tf.nn.softmax(doLogits(tf_valid_dataset))
        test_prediction = tf.nn.softmax(doLogits(tf_test_dataset))

    ##################
    # EXECUTION
    ##################

    t1 = time.time()

    with tf.Session(graph=mygraph) as session:
        tf.global_variables_initializer().run()
        print("Initialized")
        for step in range(num_steps):
            # Slice the next minibatch, wrapping around the training data.
            offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
            batch_data = train_dataset[offset:(offset + batch_size), :]
            batch_labels = train_labels[offset:(offset + batch_size), :]
            feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
            _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)

            if step % report_interval == 0 and not silent:
                t2 = time.time()
                print ("\ntiempo utilizado = ",t2-t1)
                print("Minibatch loss at step %d: %f" % (step, l))
                print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
                print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
                print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

        t2 = time.time()
        print ("\ntiempo utilizado FINAL = ",t2-t1)
        if silent:
            # In silent mode the final accuracies are the only results shown.
            print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
            print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

In [52]:
# Baseline on the full training set: keep_prob=1.0, final accuracies only.
DLmodel_1(train_dataset,train_labels,valid_dataset,valid_labels,test_dataset,test_labels,
          num_steps=3001,keep_prob=1.0,silent=True) # no dropout

learning_rate: 0.500000, num_steps: 3001

Initialized

tiempo utilizado FINAL =  65.4630889893
Validation accuracy: 84.9%
Test accuracy: 91.5%


In [53]:
DLmodel_1(train_dataset,train_labels,valid_dataset,valid_labels,test_dataset,test_labels,
           num_steps=3001,keep_prob=0.75,silent=True)

learning_rate: 0.500000, num_steps: 3001

Initialized

tiempo utilizado FINAL =  69.8998930454
Validation accuracy: 84.5%
Test accuracy: 91.2%


In [54]:
DLmodel_1(train_dataset,train_labels,valid_dataset,valid_labels,test_dataset,test_labels,
          num_steps=3001,keep_prob=0.5,silent=True)

learning_rate: 0.500000, num_steps: 3001

Initialized

tiempo utilizado FINAL =  71.026679039
Validation accuracy: 84.1%
Test accuracy: 90.8%


In [55]:
DLmodel_1(train_dataset,train_labels,valid_dataset,valid_labels,test_dataset,test_labels,
           num_steps=3001,keep_prob=0.25,silent=True)

learning_rate: 0.500000, num_steps: 3001

Initialized

tiempo utilizado FINAL =  70.2131431103
Validation accuracy: 82.4%
Test accuracy: 89.3%


##### Con overfit, el dropout sí que tiene un efecto más llamativo

In [56]:
# Overfitting case: train on the reduced training subset, reporting every 50 steps.
DLmodel_1(train_dataset_s,train_labels_s,valid_dataset,valid_labels,test_dataset,test_labels,
            num_steps=101,keep_prob=1,report_interval=50,silent=False) # no dropout

learning_rate: 0.500000, num_steps: 101

Initialized

tiempo utilizado =  0.410835027695
Minibatch loss at step 0: 1916.162354
Minibatch accuracy: 7.8%
Validation accuracy: 29.6%
Test accuracy: 31.5%

tiempo utilizado =  2.61412882805
Minibatch loss at step 50: 1222.881348
Minibatch accuracy: 98.4%
Validation accuracy: 77.2%
Test accuracy: 84.7%

tiempo utilizado =  4.96532893181
Minibatch loss at step 100: 949.555176
Minibatch accuracy: 100.0%
Validation accuracy: 77.1%
Test accuracy: 84.7%

tiempo utilizado FINAL =  6.09228801727


In [57]:
DLmodel_1(train_dataset_s,train_labels_s,valid_dataset,valid_labels,test_dataset,test_labels,
            num_steps=101,keep_prob=0.75,report_interval=50,silent=False)

learning_rate: 0.500000, num_steps: 101

Initialized

tiempo utilizado =  0.42064499855
Minibatch loss at step 0: 1993.792236
Minibatch accuracy: 4.7%
Validation accuracy: 24.4%
Test accuracy: 26.3%

tiempo utilizado =  2.70606899261
Minibatch loss at step 50: 1231.486694
Minibatch accuracy: 97.7%
Validation accuracy: 77.0%
Test accuracy: 84.2%

tiempo utilizado =  4.89443397522
Minibatch loss at step 100: 955.035278
Minibatch accuracy: 98.4%
Validation accuracy: 78.2%
Test accuracy: 85.6%

tiempo utilizado FINAL =  5.9489569664


In [58]:
DLmodel_1(train_dataset_s,train_labels_s,valid_dataset,valid_labels,test_dataset,test_labels,
           num_steps=101,keep_prob=0.5,report_interval=50,silent=False)

learning_rate: 0.500000, num_steps: 101

Initialized

tiempo utilizado =  0.385205030441
Minibatch loss at step 0: 2072.815430
Minibatch accuracy: 8.6%
Validation accuracy: 27.1%
Test accuracy: 29.0%

tiempo utilizado =  2.68621993065
Minibatch loss at step 50: 1230.565308
Minibatch accuracy: 94.5%
Validation accuracy: 77.6%
Test accuracy: 84.7%

tiempo utilizado =  4.76832795143
Minibatch loss at step 100: 956.962280
Minibatch accuracy: 93.0%
Validation accuracy: 77.4%
Test accuracy: 84.5%

tiempo utilizado FINAL =  5.73939990997


In [59]:
DLmodel_1(train_dataset_s,train_labels_s,valid_dataset,valid_labels,test_dataset,test_labels,
           num_steps=101,keep_prob=0.25,report_interval=50,silent=False)

learning_rate: 0.500000, num_steps: 101

Initialized

tiempo utilizado =  0.417919874191
Minibatch loss at step 0: 2242.200195
Minibatch accuracy: 9.4%
Validation accuracy: 27.6%
Test accuracy: 30.1%

tiempo utilizado =  2.72339987755
Minibatch loss at step 50: 1291.609619
Minibatch accuracy: 81.2%
Validation accuracy: 75.2%
Test accuracy: 82.1%

tiempo utilizado =  4.89027309418
Minibatch loss at step 100: 1018.325195
Minibatch accuracy: 82.0%
Validation accuracy: 76.9%
Test accuracy: 83.8%

tiempo utilizado FINAL =  5.89979100227


---
Problem 4
---------

Try to get the best performance you can using a multi-layer model! The best reported test accuracy using a deep network is [97.1%](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html?showComment=1391023266211#c8758720086795711595).

One avenue you can explore is to add multiple layers.

Another one is to use learning rate decay:

    global_step = tf.Variable(0)  # count the number of steps taken.
    learning_rate = tf.train.exponential_decay(0.5, global_step, ...)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
 
 ---


In [7]:
#IMPORTANTE BAJAR MUCHO LEARNING RATE PARA QUE LOS PESOS NO SE DESMADREN A NAN
#EL NUMERO DE STEPS DEPENDE DE LA CURVA DE APRENDIZAJE, A MAS LAYERS MENOS STEPS HACEN FALTA

def DLmodel_N(num_steps, #learning iterations
              batch_size=128, #data minibatch size
              num_layers=1, #number of hidden layers, must be greater than zero
              num_relus=None, #hidden layer sizes (sequence of length num_layers)
              starter_learning_rate=None,
              learning_decay_steps=100,
              learning_decay_rate=1, #by default no decay
              learning_staircase=False,
              clip_limit=None, #global-norm limit for the gradients (exploding gradients)
              keep_prob=1.0, #probability of keeping a hidden unit; 1.0 disables dropout
              beta=5e-3, #weight of the L2 penalty in the loss
              report_interval=500,
              silent=False):
    """Train and evaluate a network with ``num_layers`` hidden ReLU layers.

    A fresh graph is built on every call. It uses dropout after every hidden
    layer, an L2 penalty on all weights and biases, an exponentially decayed
    learning rate and optional gradient clipping by global norm.

    Args:
        num_steps: number of training steps to run.
        batch_size: number of examples per minibatch.
        num_layers: number of hidden layers; must be greater than zero.
        num_relus: sequence with the size of each hidden layer; defaults to
            1024 units for every layer.
        starter_learning_rate: initial learning rate; defaults to
            ``0.5 / 10**num_layers``.
        learning_decay_steps, learning_decay_rate, learning_staircase:
            forwarded to ``tf.train.exponential_decay``.
        clip_limit: when not None, gradients are clipped with
            ``tf.clip_by_global_norm`` using this limit.
        keep_prob: dropout keep probability for the training logits;
            validation and test logits always use 1.0.
        beta: multiplier of the summed L2 loss of all weights and biases.
        report_interval: a progress report is printed every this many steps.
        silent: when true, each report is a single compact line and the final
            validation/test accuracies are printed at the end; otherwise a
            multi-line report is printed.

    Returns:
        None. All results are printed. Training stops early, after printing
        an error message, as soon as the minibatch loss becomes NaN.

    Relies on the module-level ``train_dataset``, ``train_labels``,
    ``valid_dataset``, ``valid_labels``, ``test_dataset``, ``test_labels``,
    ``input_size``, ``num_labels`` and ``accuracy``.
    """
    assert (num_layers > 0), 'Number of hidden layers must be greater than zero'

    if num_relus is None:
        num_relus = [1024]*num_layers #by default all hidden layers have 1024 nodes
    else:
        assert (len(num_relus) == num_layers), 'Invalid num_relus size, must be equal to num_layers'

    if starter_learning_rate is None:
        # Shrink the rate by 10x per hidden layer so training stays stable.
        starter_learning_rate = 0.5/math.pow(10,num_layers)

    print("starter_learning_rate: %.3e, num_steps: %d\n" % (starter_learning_rate,num_steps))

    ##################
    # GRAPH DEFINITION
    ##################

    graphN = tf.Graph()
    with graphN.as_default():

        # Input data. For the training data, we use a placeholder that will be
        # fed at run time with a training minibatch.
        tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, input_size))
        tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
        tf_valid_dataset = tf.constant(valid_dataset)
        tf_test_dataset = tf.constant(test_dataset)

        # Variables: one weight matrix and one bias vector per layer
        # (num_layers hidden layers plus the output layer). list() lets
        # callers pass num_relus as a tuple or any other sequence.
        sizes = [input_size] + list(num_relus) + [num_labels]
        w = list()
        b = list()
        for i in range(0,num_layers+1):
            w.append(tf.Variable(tf.truncated_normal([sizes[i], sizes[i+1]])))
            b.append(tf.Variable(tf.zeros([sizes[i+1]])))
        # Step counter that drives the learning-rate decay; not trained.
        global_step = tf.Variable(0,trainable=False)

        # Training computation.

        def doLogits(x,k=1.0):
            """Return the logits for x; k is the dropout keep probability."""
            layer = x
            for i in range(0,num_layers):
                layer = tf.nn.dropout(tf.nn.relu(tf.matmul(layer, w[i]) + b[i]),k)
            return tf.matmul(layer, w[num_layers]) + b[num_layers]

        # Dropout is applied to the training logits only.
        logits = doLogits(tf_train_dataset, k=keep_prob)
        # L2 penalty over every weight matrix and bias vector.
        L2 = tf.add_n([tf.nn.l2_loss(t) for pair in zip(w, b) for t in pair])
        # Keyword arguments are used on purpose: TensorFlow 1.x rejects
        # positional arguments for this op, and the keywords are also valid in
        # older releases.
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            labels=tf_train_labels, logits=logits)
        loss = tf.reduce_mean(cross_entropy) + beta*L2

        # Optimizer with an exponentially decayed learning rate.
        learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
                                                   learning_decay_steps, learning_decay_rate,
                                                   staircase=learning_staircase)

        sgd = tf.train.GradientDescentOptimizer(learning_rate)
        gradients, v = zip(*sgd.compute_gradients(loss))
        if clip_limit is not None:
            # Clip the gradients (not the weights) by their global norm to
            # counter exploding gradients.
            gradients, _ = tf.clip_by_global_norm(gradients, clip_limit)
        optimizer = sgd.apply_gradients(list(zip(gradients, v)), global_step=global_step)

        # Predictions for the training, validation, and test data. The
        # evaluation logits use the default keep probability of 1.0.
        train_prediction = tf.nn.softmax(logits)
        valid_prediction = tf.nn.softmax(doLogits(tf_valid_dataset))
        test_prediction = tf.nn.softmax(doLogits(tf_test_dataset))

    ##################
    # EXECUTION
    ##################
    t1 = time.time()

    with tf.Session(graph=graphN) as session:
        tf.global_variables_initializer().run()
        print("Initialized")
        for step in range(num_steps):
            # Slice the next minibatch, wrapping around the training data.
            offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
            batch_data = train_dataset[offset:(offset + batch_size), :]
            batch_labels = train_labels[offset:(offset + batch_size), :]
            feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
            lr,_, l, predictions = session.run([learning_rate, optimizer, loss, train_prediction], feed_dict=feed_dict)

            if step % report_interval == 0:
                t2 = time.time()
                if not silent:
                    print ("\ntiempo utilizado = ",t2-t1)
                    print("Minibatch loss at step %d: %f" % (step, l))
                    print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
                    print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
                    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
                    print("Learning rate at step %d: %.3e" % (step, lr))
                else:
                    # Compact one-line report.
                    print("step %d\t %ds\t Lr: %.3e, Vacc: %.1f%%, Tacc: %.1f%%, Mbacc: %0.1f%%" % (step, t2-t1,lr,
                                                    accuracy(valid_prediction.eval(), valid_labels),
                                                    accuracy(test_prediction.eval(), test_labels),
                                                    accuracy(predictions, batch_labels)))
            if math.isnan(l):
                # The loss diverged; further training is pointless.
                print("ERROR: los pesos se han desmadrado!!! step %d" % step)
                return

        t2 = time.time()
        print ("\ntiempo utilizado FINAL = ",t2-t1)
        if silent:
            print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
            print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))


In [119]:
# Starting from a lower learning rate, the learning curve flattens out later
# (around 1500 steps, per the author's observation).
DLmodel_N(num_steps=2001,
          num_layers=2,
          starter_learning_rate=1e-3,
          learning_decay_rate=0.96,
          learning_decay_steps=10000,
          report_interval=100,
          keep_prob=1.0,beta=0,silent=True) # no dropout and no L2 weight penalty

starter_learning_rate: 1.000e-03, num_steps: 2001

[784, 1024, 1024, 10]
Initialized
step 0	 0s	 Lr: 1.000e-03, Vacc: 13.7%, Tacc: 14.2%, Mbacc: 6.2%
step 100	 8s	 Lr: 9.996e-04, Vacc: 72.9%, Tacc: 79.5%, Mbacc: 75.0%
step 200	 15s	 Lr: 9.992e-04, Vacc: 75.6%, Tacc: 82.8%, Mbacc: 79.7%
step 300	 24s	 Lr: 9.988e-04, Vacc: 76.7%, Tacc: 83.4%, Mbacc: 68.8%
step 400	 32s	 Lr: 9.984e-04, Vacc: 77.6%, Tacc: 84.8%, Mbacc: 71.1%
step 500	 40s	 Lr: 9.980e-04, Vacc: 78.3%, Tacc: 85.4%, Mbacc: 75.8%
step 600	 48s	 Lr: 9.976e-04, Vacc: 77.8%, Tacc: 85.2%, Mbacc: 76.6%
step 700	 56s	 Lr: 9.971e-04, Vacc: 79.2%, Tacc: 86.4%, Mbacc: 77.3%
step 800	 63s	 Lr: 9.967e-04, Vacc: 78.6%, Tacc: 86.0%, Mbacc: 83.6%
step 900	 71s	 Lr: 9.963e-04, Vacc: 79.0%, Tacc: 86.2%, Mbacc: 81.2%
step 1000	 79s	 Lr: 9.959e-04, Vacc: 78.9%, Tacc: 86.6%, Mbacc: 78.1%
step 1100	 87s	 Lr: 9.955e-04, Vacc: 79.3%, Tacc: 86.5%, Mbacc: 81.2%
step 1200	 95s	 Lr: 9.951e-04, Vacc: 78.9%, Tacc: 86.4%, Mbacc: 83.6%
step 1300	 103s	 Lr:

In [149]:
# Try again with beta (it seems to learn more slowly, so the number of
# iterations has to go up).
DLmodel_N(num_steps=12001,
          num_layers=2,
          starter_learning_rate=1e-3,
          learning_decay_rate=0.8,
          learning_decay_steps=1000,
          report_interval=1000,
          keep_prob=1.0,beta=1e-2,silent=True) # no dropout; L2 weight penalty enabled (beta=1e-2)

starter_learning_rate: 1.000e-03, num_steps: 12001

Initialized
step 0	 0s	 Lr: 1.000e-03, Vacc: 12.9%, Tacc: 14.2%, Mbacc: 12.5%
step 1000	 53s	 Lr: 8.000e-04, Vacc: 79.3%, Tacc: 87.2%, Mbacc: 78.1%
step 2000	 106s	 Lr: 6.400e-04, Vacc: 80.3%, Tacc: 88.4%, Mbacc: 82.8%
step 3000	 159s	 Lr: 5.120e-04, Vacc: 80.8%, Tacc: 88.8%, Mbacc: 82.8%
step 4000	 211s	 Lr: 4.096e-04, Vacc: 81.0%, Tacc: 89.1%, Mbacc: 82.8%
step 5000	 262s	 Lr: 3.277e-04, Vacc: 81.2%, Tacc: 89.2%, Mbacc: 82.8%
step 6000	 314s	 Lr: 2.621e-04, Vacc: 81.2%, Tacc: 89.0%, Mbacc: 75.0%
step 7000	 367s	 Lr: 2.097e-04, Vacc: 81.7%, Tacc: 89.5%, Mbacc: 80.5%
step 8000	 419s	 Lr: 1.678e-04, Vacc: 81.8%, Tacc: 89.3%, Mbacc: 75.8%
step 9000	 471s	 Lr: 1.342e-04, Vacc: 81.8%, Tacc: 89.6%, Mbacc: 82.0%
step 10000	 522s	 Lr: 1.074e-04, Vacc: 82.0%, Tacc: 89.6%, Mbacc: 82.0%
step 11000	 575s	 Lr: 8.590e-05, Vacc: 81.9%, Tacc: 89.6%, Mbacc: 88.3%
step 12000	 626s	 Lr: 6.872e-05, Vacc: 81.8%, Tacc: 89.6%, Mbacc: 85.2%

tiempo utilizad

In [162]:
# Layers of uneven width: each hidden layer is half the size of the previous one.
DLmodel_N(
    # network shape
    num_layers=3,
    num_relus=[1024, 512, 256],
    # training length and reporting cadence
    num_steps=3001,
    report_interval=300,
    # learning-rate schedule
    starter_learning_rate=1e-5,
    learning_decay_rate=0.9,
    learning_decay_steps=1000,
    # learning_staircase=True,  # left disabled for this run
    # no dropout and no weight-based loss adjustment
    keep_prob=1.0,
    beta=0.0,
    silent=True)

starter_learning_rate: 1.000e-05, num_steps: 3001

Initialized
step 0	 0s	 Lr: 1.000e-05, Vacc: 12.6%, Tacc: 12.8%, Mbacc: 10.9%
step 300	 14s	 Lr: 9.689e-06, Vacc: 60.7%, Tacc: 68.6%, Mbacc: 52.3%
step 600	 27s	 Lr: 9.387e-06, Vacc: 66.9%, Tacc: 75.2%, Mbacc: 66.4%
step 900	 40s	 Lr: 9.095e-06, Vacc: 69.1%, Tacc: 77.7%, Mbacc: 65.6%
step 1200	 54s	 Lr: 8.812e-06, Vacc: 70.3%, Tacc: 78.9%, Mbacc: 74.2%
step 1500	 67s	 Lr: 8.538e-06, Vacc: 71.2%, Tacc: 79.8%, Mbacc: 77.3%
step 1800	 81s	 Lr: 8.272e-06, Vacc: 72.0%, Tacc: 80.5%, Mbacc: 71.1%
step 2100	 94s	 Lr: 8.015e-06, Vacc: 72.5%, Tacc: 81.0%, Mbacc: 74.2%
step 2400	 107s	 Lr: 7.766e-06, Vacc: 73.0%, Tacc: 81.5%, Mbacc: 76.6%
step 2700	 121s	 Lr: 7.524e-06, Vacc: 73.2%, Tacc: 81.8%, Mbacc: 67.2%
step 3000	 134s	 Lr: 7.290e-06, Vacc: 73.6%, Tacc: 82.1%, Mbacc: 68.8%

tiempo utilizado FINAL =  136.591145992
Validation accuracy: 73.6%
Test accuracy: 82.1%


In [163]:
# Try with more layers; beta is set large to keep the weights under control.
DLmodel_N(
    # network shape
    num_layers=6,
    # training length and reporting cadence
    num_steps=1001,
    report_interval=100,
    # learning-rate schedule
    starter_learning_rate=1e-8,
    learning_decay_rate=0.9,
    learning_decay_steps=1000,
    # no dropout; large beta (see note above)
    keep_prob=1.0,
    beta=0.1,
    silent=True)

starter_learning_rate: 1.000e-08, num_steps: 1001

Initialized
step 0	 0s	 Lr: 1.000e-08, Vacc: 12.5%, Tacc: 14.2%, Mbacc: 10.2%
step 100	 23s	 Lr: 9.895e-09, Vacc: 70.6%, Tacc: 79.0%, Mbacc: 73.4%
step 200	 47s	 Lr: 9.791e-09, Vacc: 73.4%, Tacc: 81.8%, Mbacc: 77.3%
step 300	 70s	 Lr: 9.689e-09, Vacc: 74.2%, Tacc: 82.3%, Mbacc: 70.3%
step 400	 93s	 Lr: 9.587e-09, Vacc: 75.5%, Tacc: 83.3%, Mbacc: 71.1%
step 500	 117s	 Lr: 9.487e-09, Vacc: 75.0%, Tacc: 83.0%, Mbacc: 79.7%
step 600	 140s	 Lr: 9.387e-09, Vacc: 75.0%, Tacc: 83.3%, Mbacc: 71.9%
step 700	 163s	 Lr: 9.289e-09, Vacc: 76.3%, Tacc: 84.4%, Mbacc: 75.0%
step 800	 186s	 Lr: 9.192e-09, Vacc: 76.3%, Tacc: 84.7%, Mbacc: 78.9%
step 900	 209s	 Lr: 9.095e-09, Vacc: 76.9%, Tacc: 85.0%, Mbacc: 71.9%
step 1000	 233s	 Lr: 9.000e-09, Vacc: 76.4%, Tacc: 84.8%, Mbacc: 76.6%

tiempo utilizado FINAL =  240.529004097
Validation accuracy: 76.4%
Test accuracy: 84.8%


In [8]:
# Same approach as Alex, going all out.
DLmodel_N(
    # network shape
    num_layers=3,
    num_relus=[4096, 2048, 1024],
    # training length, batch size and reporting cadence
    num_steps=60001,
    batch_size=1024,
    report_interval=1000,
    # learning-rate schedule
    starter_learning_rate=0.1,
    learning_decay_rate=0.96,
    learning_decay_steps=1000,
    # learning_staircase=True,  # left disabled for this run
    clip_limit=1.5,  # avoid exploding gradients
    # dropout on (keep_prob=0.5), beta off
    keep_prob=0.5,
    beta=0.0,
    silent=True)

starter_learning_rate: 1.000e-01, num_steps: 60001

Initialized
step 0	 3s	 Lr: 1.000e-01, Vacc: 6.1%, Tacc: 5.6%, Mbacc: 10.4%
step 1000	 2577s	 Lr: 9.600e-02, Vacc: 77.6%, Tacc: 85.7%, Mbacc: 52.6%
step 2000	 5176s	 Lr: 9.216e-02, Vacc: 79.6%, Tacc: 87.2%, Mbacc: 59.6%
step 3000	 7700s	 Lr: 8.847e-02, Vacc: 80.3%, Tacc: 87.9%, Mbacc: 64.6%
step 4000	 10231s	 Lr: 8.493e-02, Vacc: 80.3%, Tacc: 87.7%, Mbacc: 66.0%
step 5000	 12772s	 Lr: 8.154e-02, Vacc: 80.6%, Tacc: 88.0%, Mbacc: 63.4%
step 6000	 15289s	 Lr: 7.828e-02, Vacc: 80.4%, Tacc: 88.1%, Mbacc: 62.4%
step 7000	 17790s	 Lr: 7.514e-02, Vacc: 78.9%, Tacc: 86.4%, Mbacc: 58.9%
step 8000	 20293s	 Lr: 7.214e-02, Vacc: 79.0%, Tacc: 86.0%, Mbacc: 57.6%
step 9000	 22799s	 Lr: 6.925e-02, Vacc: 78.7%, Tacc: 86.2%, Mbacc: 58.0%


KeyboardInterrupt: 

In [9]:
# NEXT TEST: MORE LEARNING-RATE DECAY
# Same approach as Alex, going all out; only learning_decay_steps changes
# (100 instead of 1000), so the learning rate drops faster.
DLmodel_N(
    # network shape
    num_layers=3,
    num_relus=[4096, 2048, 1024],
    # training length, batch size and reporting cadence
    num_steps=60001,
    batch_size=1024,
    report_interval=1000,
    # learning-rate schedule
    starter_learning_rate=0.1,
    learning_decay_rate=0.96,
    learning_decay_steps=100,
    # learning_staircase=True,  # left disabled for this run
    clip_limit=1.5,  # avoid exploding gradients
    # dropout on (keep_prob=0.5), beta off
    keep_prob=0.5,
    beta=0.0,
    silent=True)

starter_learning_rate: 1.000e-01, num_steps: 60001

Initialized
step 0	 3s	 Lr: 1.000e-01, Vacc: 11.8%, Tacc: 12.5%, Mbacc: 9.6%
step 1000	 2806s	 Lr: 6.648e-02, Vacc: 75.8%, Tacc: 83.5%, Mbacc: 49.5%
step 2000	 7500s	 Lr: 4.420e-02, Vacc: 77.6%, Tacc: 85.2%, Mbacc: 57.9%
step 3000	 10105s	 Lr: 2.939e-02, Vacc: 78.4%, Tacc: 86.0%, Mbacc: 60.1%
step 4000	 12758s	 Lr: 1.954e-02, Vacc: 78.7%, Tacc: 86.3%, Mbacc: 64.1%
step 5000	 15329s	 Lr: 1.299e-02, Vacc: 78.9%, Tacc: 86.4%, Mbacc: 62.8%
step 6000	 17882s	 Lr: 8.635e-03, Vacc: 79.0%, Tacc: 86.5%, Mbacc: 63.9%
step 7000	 20440s	 Lr: 5.741e-03, Vacc: 79.1%, Tacc: 86.7%, Mbacc: 63.5%
step 8000	 23005s	 Lr: 3.817e-03, Vacc: 79.1%, Tacc: 86.7%, Mbacc: 63.5%
step 9000	 25563s	 Lr: 2.538e-03, Vacc: 79.1%, Tacc: 86.8%, Mbacc: 64.0%
step 10000	 28126s	 Lr: 1.687e-03, Vacc: 79.2%, Tacc: 86.7%, Mbacc: 66.0%
step 11000	 30686s	 Lr: 1.122e-03, Vacc: 79.1%, Tacc: 86.7%, Mbacc: 59.7%
step 12000	 33248s	 Lr: 7.457e-04, Vacc: 79.2%, Tacc: 86.7%, Mbacc: 

KeyboardInterrupt: 

In [None]:
# NEXT TEST: INTERMEDIATE
# Same approach as Alex, going all out; shorter run with a much stronger
# decay factor (0.5 per 1000 decay steps).
DLmodel_N(
    # network shape
    num_layers=3,
    num_relus=[4096, 2048, 1024],
    # training length, batch size and reporting cadence
    num_steps=10001,
    batch_size=1024,
    report_interval=1000,
    # learning-rate schedule
    starter_learning_rate=0.1,
    learning_decay_rate=0.5,
    learning_decay_steps=1000,
    # learning_staircase=True,  # left disabled for this run
    clip_limit=1.5,  # avoid exploding gradients
    # dropout on (keep_prob=0.5), beta off
    keep_prob=0.5,
    beta=0.0,
    silent=True)

starter_learning_rate: 1.000e-01, num_steps: 10001

Initialized
step 0	 3s	 Lr: 1.000e-01, Vacc: 11.9%, Tacc: 11.5%, Mbacc: 9.6%


### Compare with the 1-layer DL model to verify there are no errors

In [191]:
# Single-layer run of the N-layer model, used as a cross-check against DLmodel_1.
DLmodel_N(
    # network shape
    num_layers=1,
    # training length and reporting cadence
    num_steps=3001,
    report_interval=500,
    # learning-rate schedule: a decay rate of 1 keeps the rate fixed at 0.5
    starter_learning_rate=0.5,
    learning_decay_rate=1,
    learning_decay_steps=10000,
    # no dropout and no weight-based loss adjustment
    keep_prob=1.0,
    beta=0.0,
    silent=True)

starter_learning_rate: 5.000e-01, num_steps: 3001

Initialized
step 0	 0s	 Lr: 5.000e-01, Vacc: 35.8%, Tacc: 39.1%, Mbacc: 7.8%
step 500	 12s	 Lr: 5.000e-01, Vacc: 77.9%, Tacc: 84.8%, Mbacc: 78.9%
step 1000	 25s	 Lr: 5.000e-01, Vacc: 80.9%, Tacc: 87.3%, Mbacc: 78.9%
step 1500	 37s	 Lr: 5.000e-01, Vacc: 81.0%, Tacc: 87.9%, Mbacc: 85.9%
step 2000	 50s	 Lr: 5.000e-01, Vacc: 81.3%, Tacc: 88.8%, Mbacc: 88.3%
step 2500	 63s	 Lr: 5.000e-01, Vacc: 82.1%, Tacc: 89.0%, Mbacc: 84.4%
step 3000	 75s	 Lr: 5.000e-01, Vacc: 82.8%, Tacc: 89.2%, Mbacc: 82.8%

tiempo utilizado FINAL =  76.6125369072
Validation accuracy: 82.8%
Test accuracy: 89.2%


In [79]:
# Reference run of the original 1-layer model: no dropout and no weight-based
# loss adjustment. COMPARE its accuracies with the single-layer DLmodel_N run.
DLmodel_1(
    train_dataset, train_labels,
    valid_dataset, valid_labels,
    test_dataset, test_labels,
    keep_prob=1.0,
    beta=0.0,
    silent=True)

learning_rate: 0.500000, num_steps: 3001

Initialized

tiempo utilizado FINAL =  123.619668007
Validation accuracy: 82.3%
Test accuracy: 90.0%
