Deep Learning
=============

Assignment 4
------------

Previously in `2_fullyconnected.ipynb` and `3_regularization.ipynb`, we trained fully connected networks to classify [notMNIST](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html) characters.

The goal of this assignment is to make the neural network convolutional.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
from six.moves import range

In [2]:
pickle_file = 'notMNIST.pickle'

# Read the whole pickle first, then pull the six arrays out of the dict.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open(pickle_file, 'rb') as f:
  save = pickle.load(f)

train_dataset, train_labels = save['train_dataset'], save['train_labels']
valid_dataset, valid_labels = save['valid_dataset'], save['valid_labels']
test_dataset, test_labels = save['test_dataset'], save['test_labels']
del save  # drop the dict reference so the gc can reclaim it

# Report the shape of every split as a quick sanity check.
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


Reformat into a TensorFlow-friendly shape:
- convolutions need the image data formatted as a cube (width by height by #channels)
- labels as float 1-hot encodings.

In [3]:
image_size = 28
num_labels = 10
num_channels = 1 # grayscale

import numpy as np

def reformat(dataset, labels):
  """Reshape images to NHWC float32 and labels to float32 one-hot rows.

  Args:
    dataset: array reshapeable to (-1, image_size, image_size, num_channels).
    labels: either a 1-D array of integer class ids, or an array that is
      already one-hot encoded (more than one dimension).

  Returns:
    (dataset, labels): dataset with shape (N, image_size, image_size,
    num_channels); labels with shape (N, num_labels) when ids were given.
  """
  dataset = dataset.reshape(
    (-1, image_size, image_size, num_channels)).astype(np.float32)
  if labels.ndim == 1:
    # Broadcasting compares every class id against 0..num_labels-1, which
    # yields one boolean row per example with a single True.
    labels = (np.arange(num_labels) == labels[:, None]).astype(np.float32)
  else:
    # Already encoded (e.g. the notebook cell was run twice): encoding again
    # would produce a meaningless (N, 1, num_labels) array, so only cast.
    labels = labels.astype(np.float32)
  return dataset, labels
# Convert each split with reformat() and rebind the same names to the result,
# one split at a time.
# NOTE(review): because the names are rebound in place, re-running this cell
# passes already-converted arrays back into reformat().
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
# Print the post-conversion shapes of every split.
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28, 1) (200000, 10)
Validation set (10000, 28, 28, 1) (10000, 10)
Test set (10000, 28, 28, 1) (10000, 10)


In [4]:
def accuracy(predictions, labels):
  """Percentage of rows whose arg-max column agrees between the two arrays.

  Both arguments are indexed along axis 1 with argmax; the number of matching
  rows is divided by predictions.shape[0] and scaled to 0-100.
  """
  predicted_classes = np.argmax(predictions, 1)
  expected_classes = np.argmax(labels, 1)
  num_correct = np.sum(predicted_classes == expected_classes)
  return 100.0 * num_correct / predictions.shape[0]

In [5]:
import math
import time
def how_long(f, *args, **kwargs):
    """Call f(*args, **kwargs), print the elapsed wall-clock time, return its result.

    Args:
        f: callable to time.
        *args: positional arguments forwarded to f.
        **kwargs: keyword arguments forwarded to f.

    Returns:
        Whatever f returns. The elapsed time (seconds, from time.time()) is
        only printed, not returned.
    """
    t1 = time.time()
    res = f(*args, **kwargs)
    t2 = time.time()
    print ("tiempo utilizado = ",t2-t1)
    return res

Let's build a small network with two convolutional layers, followed by one fully connected layer. Convolutional networks are more expensive computationally, so we'll limit its depth and number of fully connected nodes.

In [6]:
##################
# DECLARATION
##################

batch_size = 16
patch_size = 5
depth = 16
num_hidden = 64
# conv2d strides in NHWC order: strides[0] and strides[3] must be 1. For the
# common case of equal horizontal and vertical strides use
# [1, stride, stride, 1].
conv_strides = [1, 2, 2, 1]
# A 'SAME'-padded convolution outputs ceil(side / stride) pixels per side.
# Apply that once per conv layer (two layers) instead of the unparenthesized
# `image_size // 4 * image_size // 4`, which is evaluated left to right and is
# only correct when image_size is a multiple of 4.
conv_stride = conv_strides[1]
side_after_conv1 = (image_size + conv_stride - 1) // conv_stride
side_after_conv2 = (side_after_conv1 + conv_stride - 1) // conv_stride
conv_out_size = side_after_conv2 * side_after_conv2 * depth
starter_learning_rate = 0.05

graph = tf.Graph()

with graph.as_default():

  # Input data: the training minibatch is fed at run time; the validation and
  # test sets are baked into the graph as constants.
  tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size, image_size, num_channels))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)

  # Variables: two convolution kernels, one hidden layer, one output layer.
  layer1_weights = tf.Variable(tf.truncated_normal([patch_size, patch_size, num_channels, depth], stddev=0.1))
  layer1_biases = tf.Variable(tf.zeros([depth]))
  layer2_weights = tf.Variable(tf.truncated_normal([patch_size, patch_size, depth, depth], stddev=0.1))
  layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth]))
  layer3_weights = tf.Variable(tf.truncated_normal([conv_out_size, num_hidden], stddev=0.1))
  layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
  layer4_weights = tf.Variable(tf.truncated_normal([num_hidden, num_labels], stddev=0.1))
  layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))

  # Model.
  def model(data):
    """Return the logits for a batch of NHWC images."""
    # Convolution layer 1 (stride 2) + ReLU.
    activations = tf.nn.relu(
        tf.nn.conv2d(data, layer1_weights, conv_strides, padding='SAME') + layer1_biases)
    # Convolution layer 2 (stride 2) + ReLU.
    activations = tf.nn.relu(
        tf.nn.conv2d(activations, layer2_weights, conv_strides, padding='SAME') + layer2_biases)
    # Flatten every example's feature maps into one row for the dense layer.
    batch, height, width, channels = activations.get_shape().as_list()
    flat = tf.reshape(activations, [batch, height * width * channels])
    hidden = tf.nn.relu(tf.matmul(flat, layer3_weights) + layer3_biases)
    # Output layer (raw logits; softmax is applied by the callers).
    return tf.matmul(hidden, layer4_weights) + layer4_biases

  # Training computation. Named arguments are used because TensorFlow >= 1.0
  # rejects positional logits/labels for this op.
  logits = model(tf_train_dataset)
  loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))

  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(starter_learning_rate).minimize(loss)

  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
  test_prediction = tf.nn.softmax(model(tf_test_dataset))

In [7]:
##################
# EXECUTION
##################

t1 = time.time()

num_steps = 1001
report_interval = 50
silent = True
# This graph trains with a constant learning rate, so the reported value never
# changes; set it once instead of on every step.
lr = starter_learning_rate

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  for step in range(num_steps):
    # Walk through the training set in consecutive minibatches, wrapping around.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    batch_data = train_dataset[offset:(offset + batch_size), :, :, :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)

    # Check for divergence on every step, and before the expensive
    # validation/test evaluations below, so a NaN loss stops training at once.
    if math.isnan(l):
      print("ERROR: los pesos se han desmadrado!!!")
      break

    if step % report_interval == 0:
      t2 = time.time()
      if not silent:
        print('Minibatch loss at step %d: %f' % (step, l))
        print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
        print('Validation accuracy: %.1f%%' % accuracy(valid_prediction.eval(), valid_labels))
        print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))
      else:
        # Compact one-line report.
        print("step %d\t %ds\t Lr: %.2e, Vacc: %.1f%%, Tacc: %.1f%%, Mbacc: %.1f%%" % (
            step, t2-t1, lr,
            accuracy(valid_prediction.eval(), valid_labels),
            accuracy(test_prediction.eval(), test_labels),
            accuracy(predictions, batch_labels)))
  print('End')
    

Initialized
step 0	 0s	 Lr: 5.00e-02, Vacc: 10.0%, Tacc: 10.0%, Mbacc: 18.8%
step 50	 5s	 Lr: 5.00e-02, Vacc: 45.3%, Tacc: 50.7%, Mbacc: 25.0%
step 100	 8s	 Lr: 5.00e-02, Vacc: 66.8%, Tacc: 73.6%, Mbacc: 68.8%
step 150	 11s	 Lr: 5.00e-02, Vacc: 76.2%, Tacc: 84.0%, Mbacc: 81.2%
step 200	 14s	 Lr: 5.00e-02, Vacc: 78.4%, Tacc: 85.3%, Mbacc: 68.8%
step 250	 18s	 Lr: 5.00e-02, Vacc: 77.9%, Tacc: 85.3%, Mbacc: 68.8%
step 300	 21s	 Lr: 5.00e-02, Vacc: 77.5%, Tacc: 85.1%, Mbacc: 87.5%
step 350	 24s	 Lr: 5.00e-02, Vacc: 77.2%, Tacc: 84.3%, Mbacc: 87.5%
step 400	 27s	 Lr: 5.00e-02, Vacc: 79.6%, Tacc: 86.7%, Mbacc: 93.8%
step 450	 31s	 Lr: 5.00e-02, Vacc: 79.0%, Tacc: 86.4%, Mbacc: 75.0%
step 500	 34s	 Lr: 5.00e-02, Vacc: 81.0%, Tacc: 87.8%, Mbacc: 87.5%
step 550	 38s	 Lr: 5.00e-02, Vacc: 80.7%, Tacc: 87.9%, Mbacc: 75.0%
step 600	 41s	 Lr: 5.00e-02, Vacc: 81.3%, Tacc: 88.4%, Mbacc: 87.5%
step 650	 44s	 Lr: 5.00e-02, Vacc: 82.5%, Tacc: 89.1%, Mbacc: 81.2%
step 700	 47s	 Lr: 5.00e-02, Vacc: 82.3%, 

---
Lo que les sale a ellos (mejor que a mí con el mismo código... ¡qué raro!):
========================================================================
Initialized
Minibatch loss at step 0 : 3.51275
Minibatch accuracy: 6.2%
Validation accuracy: 12.8%
Minibatch loss at step 50 : 1.48703
Minibatch accuracy: 43.8%
Validation accuracy: 50.4%
Minibatch loss at step 100 : 1.04377
Minibatch accuracy: 68.8%
Validation accuracy: 67.4%
Minibatch loss at step 150 : 0.601682
Minibatch accuracy: 68.8%
Validation accuracy: 73.0%
Minibatch loss at step 200 : 0.898649
Minibatch accuracy: 75.0%
Validation accuracy: 77.8%
Minibatch loss at step 250 : 1.3637
Minibatch accuracy: 56.2%
Validation accuracy: 75.4%
Minibatch loss at step 300 : 1.41968
Minibatch accuracy: 62.5%
Validation accuracy: 76.0%
Minibatch loss at step 350 : 0.300648
Minibatch accuracy: 81.2%
Validation accuracy: 80.2%
Minibatch loss at step 400 : 1.32092
Minibatch accuracy: 56.2%
Validation accuracy: 80.4%
Minibatch loss at step 450 : 0.556701
Minibatch accuracy: 81.2%
Validation accuracy: 79.4%
Minibatch loss at step 500 : 1.65595
Minibatch accuracy: 43.8%
Validation accuracy: 79.6%
Minibatch loss at step 550 : 1.06995
Minibatch accuracy: 75.0%
Validation accuracy: 81.2%
Minibatch loss at step 600 : 0.223684
Minibatch accuracy: 100.0%
Validation accuracy: 82.3%
Minibatch loss at step 650 : 0.619602
Minibatch accuracy: 87.5%
Validation accuracy: 81.8%
Minibatch loss at step 700 : 0.812091
Minibatch accuracy: 75.0%
Validation accuracy: 82.4%
Minibatch loss at step 750 : 0.276302
Minibatch accuracy: 87.5%
Validation accuracy: 82.3%
Minibatch loss at step 800 : 0.450241
Minibatch accuracy: 81.2%
Validation accuracy: 82.3%
Minibatch loss at step 850 : 0.137139
Minibatch accuracy: 93.8%
Validation accuracy: 82.3%
Minibatch loss at step 900 : 0.52664
Minibatch accuracy: 75.0%
Validation accuracy: 82.2%
Minibatch loss at step 950 : 0.623835
Minibatch accuracy: 87.5%
Validation accuracy: 82.1%
Minibatch loss at step 1000 : 0.243114
Minibatch accuracy: 93.8%
Validation accuracy: 82.9%
Test accuracy: 90.0%
---

---
Problem 1
---------

The convolutional model above uses convolutions with stride 2 to reduce the dimensionality. Replace the strides by a max pooling operation (`nn.max_pool()`) of stride 2 and kernel size 2.

---

###### tf.nn.max_pool(value, ksize, strides, padding, data_format='NHWC', name=None)

In [8]:
##################
# DECLARATION
##################

batch_size = 16
patch_size = 5
depth = 16
num_hidden = 64
k = 2
pooling_ksize = [1, k, k, 1] # max-pool window size in NHWC order
pooling_strides = [1, k, k, 1] # max-pool window step; halves each spatial side
conv_strides = [1, 1, 1, 1] # convolutions no longer subsample; max pooling does
# A 'SAME'-padded pooling op outputs ceil(side / stride) pixels per side.
# Apply that once per pooling layer (two layers) instead of the unparenthesized
# `image_size // 4 * image_size // 4`, which is evaluated left to right and is
# only correct when image_size is a multiple of 4.
pool_stride = pooling_strides[1]
side_after_pool1 = (image_size + pool_stride - 1) // pool_stride
side_after_pool2 = (side_after_pool1 + pool_stride - 1) // pool_stride
conv_out_size = side_after_pool2 * side_after_pool2 * depth
starter_learning_rate = 0.05

graph = tf.Graph()

with graph.as_default():

  # Input data: the training minibatch is fed at run time; the validation and
  # test sets are baked into the graph as constants.
  tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size, image_size, num_channels))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)

  # Variables: two convolution kernels, one hidden layer, one output layer.
  layer1_weights = tf.Variable(tf.truncated_normal([patch_size, patch_size, num_channels, depth], stddev=0.1))
  layer1_biases = tf.Variable(tf.zeros([depth]))
  layer2_weights = tf.Variable(tf.truncated_normal([patch_size, patch_size, depth, depth], stddev=0.1))
  layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth]))
  layer3_weights = tf.Variable(tf.truncated_normal([conv_out_size, num_hidden], stddev=0.1))
  layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
  layer4_weights = tf.Variable(tf.truncated_normal([num_hidden, num_labels], stddev=0.1))
  layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))

  # Model.
  def model(data):
    """Return the logits for a batch of NHWC images."""
    # Convolution (stride 1) + ReLU + max pooling, stage 1.
    activations = tf.nn.relu(
        tf.nn.conv2d(data, layer1_weights, conv_strides, padding='SAME') + layer1_biases)
    pooled = tf.nn.max_pool(activations, pooling_ksize, pooling_strides, padding='SAME')

    # Convolution (stride 1) + ReLU + max pooling, stage 2.
    activations = tf.nn.relu(
        tf.nn.conv2d(pooled, layer2_weights, conv_strides, padding='SAME') + layer2_biases)
    pooled = tf.nn.max_pool(activations, pooling_ksize, pooling_strides, padding='SAME')

    # Flatten every example's feature maps into one row for the dense layer.
    batch, height, width, channels = pooled.get_shape().as_list()
    flat = tf.reshape(pooled, [batch, height * width * channels])
    hidden = tf.nn.relu(tf.matmul(flat, layer3_weights) + layer3_biases)

    # Output layer (raw logits; softmax is applied by the callers).
    return tf.matmul(hidden, layer4_weights) + layer4_biases

  # Training computation. Named arguments are used because TensorFlow >= 1.0
  # rejects positional logits/labels for this op.
  logits = model(tf_train_dataset)
  loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))

  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(starter_learning_rate).minimize(loss)

  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
  test_prediction = tf.nn.softmax(model(tf_test_dataset))

In [9]:
##################
# EXECUTION
##################

t1 = time.time()

num_steps = 1001
report_interval = 50
silent = True
# This graph trains with a constant learning rate, so the reported value never
# changes; set it once instead of on every step.
lr = starter_learning_rate

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  for step in range(num_steps):
    # Walk through the training set in consecutive minibatches, wrapping around.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    batch_data = train_dataset[offset:(offset + batch_size), :, :, :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)

    # Check for divergence on every step, and before the expensive
    # validation/test evaluations below, so a NaN loss stops training at once.
    if math.isnan(l):
      print("ERROR: los pesos se han desmadrado!!!")
      break

    if step % report_interval == 0:
      t2 = time.time()
      if not silent:
        print('Minibatch loss at step %d: %f' % (step, l))
        print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
        print('Validation accuracy: %.1f%%' % accuracy(valid_prediction.eval(), valid_labels))
        print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))
      else:
        # Compact one-line report.
        print("step %d\t %ds\t Lr: %.2e, Vacc: %.1f%%, Tacc: %.1f%%, Mbacc: %.1f%%" % (
            step, t2-t1, lr,
            accuracy(valid_prediction.eval(), valid_labels),
            accuracy(test_prediction.eval(), test_labels),
            accuracy(predictions, batch_labels)))
  print('End')
    

Initialized
step 0	 0s	 Lr: 5.00e-02, Vacc: 9.9%, Tacc: 10.0%, Mbacc: 6.2%
step 50	 16s	 Lr: 5.00e-02, Vacc: 49.7%, Tacc: 55.8%, Mbacc: 50.0%
step 100	 28s	 Lr: 5.00e-02, Vacc: 71.3%, Tacc: 78.1%, Mbacc: 75.0%
step 150	 39s	 Lr: 5.00e-02, Vacc: 74.2%, Tacc: 81.8%, Mbacc: 81.2%
step 200	 50s	 Lr: 5.00e-02, Vacc: 78.2%, Tacc: 85.1%, Mbacc: 62.5%
step 250	 61s	 Lr: 5.00e-02, Vacc: 77.7%, Tacc: 85.1%, Mbacc: 62.5%
step 300	 73s	 Lr: 5.00e-02, Vacc: 80.0%, Tacc: 87.0%, Mbacc: 93.8%
step 350	 86s	 Lr: 5.00e-02, Vacc: 78.9%, Tacc: 86.0%, Mbacc: 93.8%
step 400	 98s	 Lr: 5.00e-02, Vacc: 81.0%, Tacc: 87.7%, Mbacc: 100.0%
step 450	 112s	 Lr: 5.00e-02, Vacc: 78.9%, Tacc: 86.2%, Mbacc: 75.0%
step 500	 136s	 Lr: 5.00e-02, Vacc: 80.9%, Tacc: 87.2%, Mbacc: 81.2%
step 550	 158s	 Lr: 5.00e-02, Vacc: 81.5%, Tacc: 88.5%, Mbacc: 75.0%
step 600	 182s	 Lr: 5.00e-02, Vacc: 82.8%, Tacc: 89.7%, Mbacc: 87.5%
step 650	 203s	 Lr: 5.00e-02, Vacc: 82.5%, Tacc: 89.4%, Mbacc: 81.2%
step 700	 225s	 Lr: 5.00e-02, Vacc: 

---
Problem 2
---------

Try to get the best performance you can using a convolutional net. Look for example at the classic [LeNet5](http://yann.lecun.com/exdb/lenet/) architecture, adding Dropout, and/or adding learning rate decay.

---

LENET:
<img src="lenet_architecture.png">

In [10]:
# Two conv+pool stages followed by one hidden fully connected layer.
# image_size = 28 and num_channels = 1 (grayscale) are defined at notebook level.

def CNmodel(num_steps=None, #learning iterations
            batch_size=16, #data minibatch size
            num_hidden=64, #hidden layers nodes
            patch_size = 5,
            depth = 16,
            k = 2, #maxpool reduction
            starter_learning_rate=0.05,
            learning_decay_steps=10000, 
            learning_decay_rate=1, #by default no decay
            learning_staircase=False,
            beta=5e-3, #presence of regularization with weights
            report_interval=50,
            silent=False):
    """Build, train and evaluate a two-stage conv + max-pool network.

    Reads the notebook-level names image_size, num_channels, num_labels,
    train/valid/test datasets and labels, and accuracy().

    Args:
        num_steps: number of minibatch training steps; None means do nothing.
        batch_size: number of examples per training minibatch.
        num_hidden: width of the hidden fully connected layer.
        patch_size: side of the square convolution kernels.
        depth: number of output channels of each convolution.
        k: max-pool window size and stride (spatial reduction per stage).
        starter_learning_rate: initial learning rate for gradient descent.
        learning_decay_steps: decay_steps passed to tf.train.exponential_decay.
        learning_decay_rate: decay_rate passed to tf.train.exponential_decay
            (1 keeps the learning rate constant).
        learning_staircase: staircase flag passed to exponential_decay.
        beta: multiplier of the L2 penalty (over all weights and biases) that
            is added to the cross-entropy loss.
        report_interval: print a progress report every this many steps.
        silent: if True print one compact line per report plus the final
            validation/test accuracy; otherwise print a verbose report.

    Returns:
        None. Results are only printed. Training stops early, after printing
        an error, as soon as the minibatch loss becomes NaN.
    """
    # Guard first: formatting None with '%d' below would raise TypeError.
    if num_steps is None:
        return

    print("starter_learning_rate: %.3e, num_steps: %d\n" % (starter_learning_rate,num_steps))

    ##################
    # DECLARATION
    ##################

    conv_levels = 2
    pooling_ksize = [1, k, k, 1] # max-pool window size in NHWC order
    pooling_strides = [1, k, k, 1] # max-pool window step
    conv_strides = [1, 1, 1, 1] # convolutions do not subsample; max pooling does
    # Every 'SAME'-padded pooling stage outputs ceil(side / k) pixels per side,
    # so the reduction compounds per stage (k ** conv_levels overall, rounded
    # up at each stage) rather than being conv_levels * k.
    pooled_side = image_size
    for _ in range(conv_levels):
        pooled_side = (pooled_side + k - 1) // k
    conv_out_size = pooled_side * pooled_side * depth

    mygraph = tf.Graph()

    with mygraph.as_default():

        # Input data: the training minibatch is fed at run time; validation
        # and test sets are baked into the graph as constants.
        tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size, image_size, num_channels))
        tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
        tf_valid_dataset = tf.constant(valid_dataset)
        tf_test_dataset = tf.constant(test_dataset)

        # Variables.
        layer1_weights = tf.Variable(tf.truncated_normal([patch_size, patch_size, num_channels, depth], stddev=0.1))
        layer1_biases = tf.Variable(tf.zeros([depth]))
        layer2_weights = tf.Variable(tf.truncated_normal([patch_size, patch_size, depth, depth], stddev=0.1))
        layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth]))
        layer3_weights = tf.Variable(tf.truncated_normal([conv_out_size, num_hidden], stddev=0.1))
        layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
        layer4_weights = tf.Variable(tf.truncated_normal([num_hidden, num_labels], stddev=0.1))
        layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))
        # Step counter incremented by the optimizer; drives the decay schedule.
        global_step = tf.Variable(0,trainable=False)

        # Model.
        def model(data):
            """Return the logits for a batch of NHWC images."""
            # Convolution + ReLU + max pooling, stage 1.
            conv = tf.nn.conv2d(data, layer1_weights, conv_strides, padding='SAME')
            hidden = tf.nn.relu(conv + layer1_biases)
            pool = tf.nn.max_pool(hidden,pooling_ksize,pooling_strides,padding='SAME')

            # Convolution + ReLU + max pooling, stage 2.
            conv = tf.nn.conv2d(pool, layer2_weights, conv_strides, padding='SAME')
            hidden = tf.nn.relu(conv + layer2_biases)
            pool = tf.nn.max_pool(hidden,pooling_ksize,pooling_strides,padding='SAME')

            # Flatten the feature maps and apply the fully connected layer.
            shape = pool.get_shape().as_list()
            reshape = tf.reshape(pool, [shape[0], shape[1] * shape[2] * shape[3]])
            hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)

            # Output layer (raw logits).
            return tf.matmul(hidden, layer4_weights) + layer4_biases

        # Training computation: cross entropy plus beta-weighted L2 penalty.
        logits = model(tf_train_dataset)
        L2 = tf.nn.l2_loss(layer1_weights)+tf.nn.l2_loss(layer1_biases) \
           + tf.nn.l2_loss(layer2_weights)+tf.nn.l2_loss(layer2_biases) \
           + tf.nn.l2_loss(layer3_weights)+tf.nn.l2_loss(layer3_biases) \
           + tf.nn.l2_loss(layer4_weights)+tf.nn.l2_loss(layer4_biases)
        # Named arguments: TensorFlow >= 1.0 rejects positional logits/labels.
        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits)) + beta*L2

        # Optimizer with exponentially decaying learning rate.
        learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
                                  learning_decay_steps, learning_decay_rate,staircase=learning_staircase)
        optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)

        # Predictions for the training, validation, and test data.
        train_prediction = tf.nn.softmax(logits)
        valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
        test_prediction = tf.nn.softmax(model(tf_test_dataset))

    ##################
    # EXECUTION
    ##################

    t1 = time.time()

    with tf.Session(graph=mygraph) as session:
        tf.global_variables_initializer().run()
        print('Initialized')
        for step in range(num_steps):
            # Consecutive minibatches, wrapping around the training set.
            offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
            batch_data = train_dataset[offset:(offset + batch_size), :, :, :]
            batch_labels = train_labels[offset:(offset + batch_size), :]
            feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
            lr,_, l, predictions = session.run([learning_rate,optimizer, loss, train_prediction], feed_dict=feed_dict)

            # Detect divergence on every step, before any costly evaluation.
            if math.isnan(l):
                print("ERROR: los pesos se han desmadrado!!!")
                return

            if step % report_interval == 0:
                t2 = time.time()
                if not silent:
                    print ("\ntiempo utilizado = ",t2-t1)
                    print("Learning rate at step %d: %.3e" % (step, lr))
                    print('Minibatch loss at step %d: %f' % (step, l))
                    print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
                    print('Validation accuracy: %.1f%%' % accuracy(valid_prediction.eval(), valid_labels))
                    print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))
                else:
                    print("step %d\t %ds\t Lr: %.5e, Vacc: %.1f%%, Tacc: %.1f%%" % (
                        step, t2-t1, lr,
                        accuracy(valid_prediction.eval(), valid_labels),
                        accuracy(test_prediction.eval(), test_labels)))

        t2 = time.time()
        print ("\ntiempo utilizado FINAL = ",t2-t1)
        if silent:
            print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
            print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

In [11]:
# First run, nothing changed: constant learning rate (decay rate 1) and no
# L2 penalty (beta=0).
CNmodel(
    num_steps=1001,  # learning iterations
    starter_learning_rate=0.05,
    learning_decay_steps=10000,
    learning_decay_rate=1,
    beta=0,
    report_interval=50,
    silent=True,
)

starter_learning_rate: 5.000e-02, num_steps: 1001

Initialized
step 0	 1s	 Lr: 5.00000e-02, Vacc: 10.0%, Tacc: 10.0%
step 50	 29s	 Lr: 5.00000e-02, Vacc: 44.1%, Tacc: 48.1%
step 100	 55s	 Lr: 5.00000e-02, Vacc: 59.7%, Tacc: 64.9%
step 150	 78s	 Lr: 5.00000e-02, Vacc: 73.8%, Tacc: 80.9%
step 200	 103s	 Lr: 5.00000e-02, Vacc: 76.1%, Tacc: 83.4%
step 250	 128s	 Lr: 5.00000e-02, Vacc: 78.0%, Tacc: 85.1%
step 300	 153s	 Lr: 5.00000e-02, Vacc: 80.0%, Tacc: 87.2%
step 350	 177s	 Lr: 5.00000e-02, Vacc: 79.1%, Tacc: 86.4%
step 400	 200s	 Lr: 5.00000e-02, Vacc: 80.7%, Tacc: 87.7%
step 450	 224s	 Lr: 5.00000e-02, Vacc: 79.2%, Tacc: 86.0%
step 500	 236s	 Lr: 5.00000e-02, Vacc: 81.8%, Tacc: 88.2%
step 550	 250s	 Lr: 5.00000e-02, Vacc: 82.7%, Tacc: 89.4%
step 600	 263s	 Lr: 5.00000e-02, Vacc: 82.7%, Tacc: 89.5%
step 650	 277s	 Lr: 5.00000e-02, Vacc: 82.8%, Tacc: 89.5%
step 700	 289s	 Lr: 5.00000e-02, Vacc: 83.6%, Tacc: 90.2%
step 750	 303s	 Lr: 5.00000e-02, Vacc: 83.8%, Tacc: 90.3%
step 800	 317s	 L

In [12]:
# Experiment with learning-rate decay and more iterations.
CNmodel(
    num_steps=30001,  # learning iterations
    starter_learning_rate=0.05,
    learning_decay_steps=1000,
    learning_decay_rate=0.9,
    beta=0,
    report_interval=3000,
    silent=True,
)

starter_learning_rate: 5.000e-02, num_steps: 30001

Initialized
step 0	 0s	 Lr: 5.00000e-02, Vacc: 10.0%, Tacc: 10.0%
step 3000	 118s	 Lr: 3.64500e-02, Vacc: 86.8%, Tacc: 93.2%
step 6000	 260s	 Lr: 2.65720e-02, Vacc: 88.1%, Tacc: 94.3%
step 9000	 401s	 Lr: 1.93710e-02, Vacc: 88.7%, Tacc: 94.8%
step 12000	 543s	 Lr: 1.41215e-02, Vacc: 89.2%, Tacc: 95.0%
step 15000	 683s	 Lr: 1.02946e-02, Vacc: 89.3%, Tacc: 95.1%
step 18000	 821s	 Lr: 7.50473e-03, Vacc: 89.8%, Tacc: 95.4%
step 21000	 955s	 Lr: 5.47095e-03, Vacc: 89.8%, Tacc: 95.3%
step 24000	 1083s	 Lr: 3.98832e-03, Vacc: 89.9%, Tacc: 95.4%
step 27000	 1222s	 Lr: 2.90748e-03, Vacc: 90.0%, Tacc: 95.6%
step 30000	 1360s	 Lr: 2.11956e-03, Vacc: 90.0%, Tacc: 95.6%

tiempo utilizado FINAL =  1383.39959216
Validation accuracy: 90.0%
Test accuracy: 95.6%


In [13]:
# Experiment with a slower learning-rate decay and even more iterations.
CNmodel(
    num_steps=60001,  # learning iterations
    starter_learning_rate=0.05,
    learning_decay_steps=100000,
    learning_decay_rate=0.9,
    beta=0,
    report_interval=6000,
    silent=True,
)

starter_learning_rate: 5.000e-02, num_steps: 60001

Initialized
step 0	 1s	 Lr: 5.00000e-02, Vacc: 10.0%, Tacc: 10.0%
step 6000	 224s	 Lr: 4.96849e-02, Vacc: 88.2%, Tacc: 94.4%
step 12000	 413s	 Lr: 4.93718e-02, Vacc: 89.1%, Tacc: 95.0%
step 18000	 602s	 Lr: 4.90607e-02, Vacc: 89.6%, Tacc: 95.4%
step 24000	 790s	 Lr: 4.87515e-02, Vacc: 90.1%, Tacc: 95.6%
step 30000	 979s	 Lr: 4.84443e-02, Vacc: 90.1%, Tacc: 95.7%
step 36000	 1199s	 Lr: 4.81390e-02, Vacc: 90.2%, Tacc: 95.9%
step 42000	 1458s	 Lr: 4.78357e-02, Vacc: 90.6%, Tacc: 95.9%
step 48000	 1719s	 Lr: 4.75342e-02, Vacc: 90.5%, Tacc: 96.0%
step 54000	 1982s	 Lr: 4.72347e-02, Vacc: 90.9%, Tacc: 96.3%
step 60000	 2251s	 Lr: 4.69370e-02, Vacc: 90.3%, Tacc: 95.9%

tiempo utilizado FINAL =  2275.46044922
Validation accuracy: 90.3%
Test accuracy: 95.9%


In [14]:
# Now with beta to push the weights towards smaller values. It does not help
# much; several values would have to be tried and tuned.
CNmodel(
    num_steps=60001,  # learning iterations
    starter_learning_rate=0.05,
    learning_decay_steps=1000,
    learning_decay_rate=0.9,
    beta=1e-3,
    report_interval=6000,
    silent=True,
)

starter_learning_rate: 5.000e-02, num_steps: 60001

Initialized
step 0	 1s	 Lr: 5.00000e-02, Vacc: 10.0%, Tacc: 10.0%
step 6000	 241s	 Lr: 2.65720e-02, Vacc: 87.6%, Tacc: 94.1%
step 12000	 503s	 Lr: 1.41215e-02, Vacc: 88.8%, Tacc: 95.1%
step 18000	 756s	 Lr: 7.50473e-03, Vacc: 89.5%, Tacc: 95.4%
step 24000	 946s	 Lr: 3.98832e-03, Vacc: 89.7%, Tacc: 95.5%
step 30000	 1166s	 Lr: 2.11956e-03, Vacc: 89.9%, Tacc: 95.6%
step 36000	 1353s	 Lr: 1.12642e-03, Vacc: 89.9%, Tacc: 95.6%
step 42000	 1541s	 Lr: 5.98625e-04, Vacc: 89.9%, Tacc: 95.6%
step 48000	 1730s	 Lr: 3.18134e-04, Vacc: 89.9%, Tacc: 95.6%
step 54000	 1920s	 Lr: 1.69069e-04, Vacc: 89.9%, Tacc: 95.7%
step 60000	 2107s	 Lr: 8.98504e-05, Vacc: 89.9%, Tacc: 95.6%

tiempo utilizado FINAL =  2117.68269587
Validation accuracy: 89.9%
Test accuracy: 95.6%
