In [1]:
from __future__ import print_function

import math

import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle

In [2]:
pickle_file = 'notMNIST.pickle'

# NOTE(review): unpickling can execute arbitrary code embedded in the file,
# so only load a pickle that comes from a trusted source.
with open(pickle_file, 'rb') as f:
    save = pickle.load(f)

# Pull the six entries out of the pickled dictionary, one split per line.
train_dataset, train_labels = save['train_dataset'], save['train_labels']
valid_dataset, valid_labels = save['valid_dataset'], save['valid_labels']
test_dataset, test_labels = save['test_dataset'], save['test_labels']
del save  # hint to help gc free up memory

# Report the shape of every split as a quick sanity check.
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


In [3]:
image_size = 28
num_labels = 10

def reformat(dataset, labels):
    """Flatten the images and one-hot encode the labels.

    dataset is reshaped into rows of image_size * image_size values and cast
    to float32. Each entry of labels is compared against the class indices
    0 .. num_labels - 1, which yields one float32 row per label holding 1.0 in
    the matching column, e.g. 1 -> [0.0, 1.0, 0.0 ...], 2 -> [0.0, 0.0, 1.0 ...].

    Returns the (dataset, labels) pair in the new layout.
    """
    pixels_per_image = image_size * image_size
    flat_images = dataset.reshape(-1, pixels_per_image)
    class_ids = np.arange(num_labels)
    # Broadcasting a column of labels against the row of class ids gives a
    # boolean (n_samples, num_labels) matrix with one True per row.
    one_hot = np.equal(class_ids, labels[:, np.newaxis])
    return flat_images.astype(np.float32), one_hot.astype(np.float32)

In [4]:
# Convert every split to the flat-image / one-hot-label layout.
# NOTE: the original names are rebound here, so run this cell once per kernel;
# running it again would pass the already one-hot labels back into reformat().
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)

# Report the new shapes, one line per split.
for _title, _images, _targets in (
        ('Training set', train_dataset, train_labels),
        ('Validation set', valid_dataset, valid_labels),
        ('Test set', test_dataset, test_labels)):
    print(_title, _images.shape, _targets.shape)

Training set (200000, 784) (200000, 10)
Validation set (10000, 784) (10000, 10)
Test set (10000, 784) (10000, 10)


In [5]:
def accuracy(predictions, labels):
    """Return the percentage of rows whose predicted class matches the label.

    The class of a row is the index of its largest entry (argmax along
    axis 1), both for the prediction scores and for the one-hot labels.
    The result is 100 * (number of matching rows) / (number of prediction rows).
    """
    predicted_classes = np.argmax(predictions, 1)
    true_classes = np.argmax(labels, 1)
    num_correct = np.sum(predicted_classes == true_classes)
    return 100.0 * num_correct / predictions.shape[0]

Adding L2 regularization to the network without a hidden layer:

In [6]:
# With gradient descent training, even this much data is prohibitive.
# Subset the training data for faster turnaround.
train_subset = 10000
# L2 regularization strength (starting value).
beta = 0.01

graph = tf.Graph()
with graph.as_default():
    # Input data.
    # Load the training, validation and test data into constants that are attached to the graph.
    tf_train_dataset = tf.constant(train_dataset[:train_subset, :])
    tf_train_labels = tf.constant(train_labels[:train_subset])
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Variables.
    # These are the parameters that we are going to be training.
    # The weight matrix is initialized using random values following a (truncated) normal distribution.
    weights = tf.Variable(tf.truncated_normal([image_size * image_size, num_labels]))
    # The biases get initialized to zero.
    biases = tf.Variable(tf.zeros([num_labels]))

    # Training computation.
    # We multiply the inputs with the weight matrix, and add biases.
    logits = tf.matmul(tf_train_dataset, weights) + biases
    # Softmax and cross-entropy are computed in one TensorFlow operation and
    # averaged across all training examples: this is the data term of the loss.
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))

    # L2 penalty on the weights (the biases are not regularized).
    regularizer = tf.nn.l2_loss(weights)
    # Total loss. Both terms are already scalars, so no further reduction is needed.
    loss = cross_entropy + beta * regularizer

    # Optimizer.
    # We are going to find the minimum of this loss using gradient descent.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    # Predictions for the training, validation, and test data.
    # These are not part of training, but merely here so that we can report accuracy figures as we train.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(tf.matmul(tf_valid_dataset, weights) + biases)
    test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases)

In [7]:
num_steps = 801

# accuracy() is defined once in an earlier cell; it is reused here instead of
# being redefined, so there is a single definition to maintain.

with tf.Session(graph=graph) as session:
    # This is a one-time operation which ensures the parameters get initialized as
    # we described in the graph: random weights for the matrix, zeros for the biases.
    tf.global_variables_initializer().run()
    print('Initialized')
    for step in range(num_steps):
        # Run the computations.
        # We tell .run() that we want to run the optimizer, and get the loss value and
        # the training predictions returned as numpy arrays.
        _, l, predictions = session.run([optimizer, loss, train_prediction])
        # Report progress every 100 steps.
        if (step % 100 == 0):
            print('Loss at step %d: %f' % (step, l))
            print('Training accuracy: %.1f%%' % accuracy(predictions, train_labels[:train_subset, :]))
            # Calling .eval() on valid_prediction is basically like calling run(), but
            # just to get that one numpy array. Note that it recomputes all its graph
            # dependencies.
            print('Validation accuracy: %.1f%%\n' % accuracy(valid_prediction.eval(), valid_labels))

    # Final evaluation on the test set once training has finished.
    print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))

Initialized
Loss at step 0: 51.068317
Training accuracy: 7.7%
Validation accuracy: 8.9%

Loss at step 100: 11.898849
Training accuracy: 73.5%
Validation accuracy: 71.8%

Loss at step 200: 4.491138
Training accuracy: 78.9%
Validation accuracy: 76.5%

Loss at step 300: 1.980109
Training accuracy: 82.2%
Validation accuracy: 79.8%

Loss at step 400: 1.128514
Training accuracy: 83.9%
Validation accuracy: 81.1%

Loss at step 500: 0.835942
Training accuracy: 84.3%
Validation accuracy: 81.7%

Loss at step 600: 0.734170
Training accuracy: 84.7%
Validation accuracy: 82.0%

Loss at step 700: 0.698424
Training accuracy: 84.8%
Validation accuracy: 82.0%

Loss at step 800: 0.685764
Training accuracy: 84.9%
Validation accuracy: 82.0%

Test accuracy: 88.8%


Adding regularization to the neural network with a hidden layer, trained with SGD

Conclusion:

It looks like, with strong L2 regularization (a high penalty when the weight values are large)
and light dropout (a high probability of keeping activations), the model barely changes; if those settings are changed, the accuracy starts going down.
I think this is because there is enough data and the algorithm is a good fit for it, so there is little risk of overfitting;
because of that, the regularization techniques do not help.

In [8]:
batch_size = 128
h1_neurons_size = 1024
# L2 regularization strength.
beta1 = 0.009

graph = tf.Graph()
with graph.as_default():
    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Network layout:
    #   input layer:  image_size * image_size values
    #   hidden layer: h1_neurons_size ReLU units
    #   output layer: num_labels logits

    # Input -> hidden 1.
    weights_In_h1 = tf.Variable(tf.truncated_normal([image_size * image_size, h1_neurons_size]))
    biases_In_h1 = tf.Variable(tf.zeros([h1_neurons_size]))

    preactivation_h1 = tf.matmul(tf_train_dataset, weights_In_h1) + biases_In_h1
    activation_h1 = tf.nn.relu(preactivation_h1)

    # Dropout on the hidden activations of the training path only;
    # keep_prob is fed at run time together with the minibatch.
    keep_prob = tf.placeholder(tf.float32)
    dropout_h1 = tf.nn.dropout(activation_h1, keep_prob)

    # Hidden 1 -> output.
    weights_h1_Out = tf.Variable(tf.truncated_normal([h1_neurons_size, num_labels]))
    biases_h1_Out = tf.Variable(tf.zeros([num_labels]))

    logits_h1_Out = tf.matmul(dropout_h1, weights_h1_Out) + biases_h1_Out

    # Data term: mean softmax cross-entropy over the minibatch.
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits_h1_Out))

    # L2 penalty on both weight matrices (the biases are not regularized).
    regularizers = tf.nn.l2_loss(weights_h1_Out) + tf.nn.l2_loss(weights_In_h1)

    # Total loss. Both terms are already scalars, so no further reduction is needed.
    loss = cross_entropy + beta1 * regularizers

    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    # Predictions for the training, validation, and test data:
    train_prediction = tf.nn.softmax(logits_h1_Out)

    # Validation path: same weights, no dropout.
    tf_valida_dataset_logits_h1 = tf.matmul(tf_valid_dataset, weights_In_h1) + biases_In_h1
    tf_valid_dataset_h1 = tf.nn.relu(tf_valida_dataset_logits_h1)
    tf_valid_dataset_logits_out = tf.matmul(tf_valid_dataset_h1, weights_h1_Out) + biases_h1_Out
    valid_prediction = tf.nn.softmax(tf_valid_dataset_logits_out)

    # Unregularized cross-entropy on the validation set, for monitoring only.
    valididation_loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=valid_labels, logits=tf_valid_dataset_logits_out))

    # Test path: same weights, no dropout.
    tf_test_dataset_h1 = tf.nn.relu(tf.matmul(tf_test_dataset, weights_In_h1) + biases_In_h1)
    test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset_h1, weights_h1_Out) + biases_h1_Out)

In [9]:
num_steps = 3001

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print("Initialized")
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        # keep_prob is the dropout keep probability used for this training step.
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, keep_prob:0.7}
        _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
        # Report progress every 500 steps.
        if (step % 500 == 0):
            # The offset is computed with the modulo operator, so the message shows "%".
            print("offset = (%d * %d) %% (%d - %d) = %d" % (step, batch_size, train_labels.shape[0], batch_size, offset))
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
            print('Validation Loss: %.1f\n' % valididation_loss.eval())
    # Final evaluation on the test set once training has finished.
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
offset = (0 * 128) // (200000 - 128) = 0
Minibatch loss at step 0: 3351.523193
Minibatch accuracy: 8.6%
Validation accuracy: 27.6%
Validation Loss: 1422.5

offset = (500 * 128) // (200000 - 128) = 64000
Minibatch loss at step 500: 31.269665
Minibatch accuracy: 86.7%
Validation accuracy: 83.8%
Validation Loss: 0.6

offset = (1000 * 128) // (200000 - 128) = 128000
Minibatch loss at step 1000: 1.181374
Minibatch accuracy: 81.2%
Validation accuracy: 83.2%
Validation Loss: 0.6

offset = (1500 * 128) // (200000 - 128) = 192000
Minibatch loss at step 1500: 0.624116
Minibatch accuracy: 86.7%
Validation accuracy: 83.7%
Validation Loss: 0.6

offset = (2000 * 128) // (200000 - 128) = 56128
Minibatch loss at step 2000: 0.646111
Minibatch accuracy: 89.8%
Validation accuracy: 83.3%
Validation Loss: 0.6

offset = (2500 * 128) // (200000 - 128) = 120128
Minibatch loss at step 2500: 0.734108
Minibatch accuracy: 82.0%
Validation accuracy: 83.3%
Validation Loss: 0.6

offset = (3000 * 128) // 

Let's demonstrate an extreme case of overfitting. Restrict your training data to just a few batches. What happens?

In [10]:
num_steps = 3001

# Restrict the training data to its first 500 examples, i.e. only a few
# minibatches, to provoke overfitting.
train_dataset_2 = train_dataset[:500, :]
train_labels_2 = train_labels[:500]

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print("Initialized")
    for step in range(num_steps):
        # Pick an offset within the (restricted) training data.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels_2.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset_2[offset:(offset + batch_size), :]
        batch_labels = train_labels_2[offset:(offset + batch_size), :]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        # keep_prob is the dropout keep probability used for this training step.
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, keep_prob:0.5}
        _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
        # Report progress every 500 steps.
        if (step % 500 == 0):
            # The offset is computed with the modulo operator, so the message shows "%".
            print("offset = (%d * %d) %% (%d - %d) = %d" % (step, batch_size, train_labels_2.shape[0], batch_size, offset))
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
            print('Validation Loss: %.1f\n' % valididation_loss.eval())
    # Final evaluation on the test set once training has finished.
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
offset = (0 * 128) // (500 - 128) = 0
Minibatch loss at step 0: 3271.212402
Minibatch accuracy: 15.6%
Validation accuracy: 20.8%
Validation Loss: 994.1

offset = (500 * 128) // (500 - 128) = 16
Minibatch loss at step 500: 31.299320
Minibatch accuracy: 100.0%
Validation accuracy: 78.5%
Validation Loss: 1.1

offset = (1000 * 128) // (500 - 128) = 32
Minibatch loss at step 1000: 0.711739
Minibatch accuracy: 98.4%
Validation accuracy: 79.5%
Validation Loss: 0.7

offset = (1500 * 128) // (500 - 128) = 48
Minibatch loss at step 1500: 0.321067
Minibatch accuracy: 99.2%
Validation accuracy: 79.5%
Validation Loss: 0.7

offset = (2000 * 128) // (500 - 128) = 64
Minibatch loss at step 2000: 0.296688
Minibatch accuracy: 100.0%
Validation accuracy: 79.6%
Validation Loss: 0.7

offset = (2500 * 128) // (500 - 128) = 80
Minibatch loss at step 2500: 0.289435
Minibatch accuracy: 99.2%
Validation accuracy: 79.7%
Validation Loss: 0.7

offset = (3000 * 128) // (500 - 128) = 96
Minibatch loss at

Adding more layers:

In [39]:
batch_size = 128
h1_neurons_size = 1024
# Every hidden layer is half as wide as the one before it.
h2_neurons_size = h1_neurons_size // 2
h3_neurons_size = h1_neurons_size // 4
h4_neurons_size = h1_neurons_size // 8
h5_neurons_size = h1_neurons_size // 16

# L2 regularization strength.
beta1 = 0.001


def _scaled_weights(fan_in, fan_out):
    """Create a [fan_in, fan_out] weight variable.

    Values are drawn from a truncated normal with stddev sqrt(2 / fan_in).
    Must be called while the target graph is the default graph.
    """
    return tf.Variable(tf.truncated_normal([fan_in, fan_out], stddev=math.sqrt(2.0 / fan_in)))


def _forward(inputs, layers, keep_prob=None):
    """Return the output logits of the network for `inputs`.

    layers is a sequence of (weights, biases) pairs; every pair but the last
    is a hidden layer followed by ReLU, the last pair is the linear output
    layer. When keep_prob is given, dropout with that keep probability is
    applied after every hidden activation (training path); when it is None
    no dropout is applied (evaluation path).
    """
    activations = inputs
    for layer_weights, layer_biases in layers[:-1]:
        activations = tf.nn.relu(tf.matmul(activations, layer_weights) + layer_biases)
        if keep_prob is not None:
            activations = tf.nn.dropout(activations, keep_prob)
    out_weights, out_biases = layers[-1]
    return tf.matmul(activations, out_weights) + out_biases


graph = tf.Graph()
with graph.as_default():
    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    # Dropout keep probability, fed at run time.
    keep_prob = tf.placeholder(tf.float32)

    # Network layout:
    #   input layer:   image_size * image_size values
    #   hidden layers: h1 .. h5, each half the width of the previous one
    #   output layer:  num_labels logits

    # Hidden_1 layer:
    weights_h1 = _scaled_weights(image_size * image_size, h1_neurons_size)
    biases_h1 = tf.Variable(tf.zeros([h1_neurons_size]))

    # Hidden_2 layer:
    weights_h2 = _scaled_weights(h1_neurons_size, h2_neurons_size)
    biases_h2 = tf.Variable(tf.zeros([h2_neurons_size]))

    # Hidden_3 layer:
    weights_h3 = _scaled_weights(h2_neurons_size, h3_neurons_size)
    biases_h3 = tf.Variable(tf.zeros([h3_neurons_size]))

    # Hidden_4 layer:
    weights_h4 = _scaled_weights(h3_neurons_size, h4_neurons_size)
    biases_h4 = tf.Variable(tf.zeros([h4_neurons_size]))

    # Hidden_5 layer:
    weights_h5 = _scaled_weights(h4_neurons_size, h5_neurons_size)
    biases_h5 = tf.Variable(tf.zeros([h5_neurons_size]))

    # Out layer:
    weights_out = _scaled_weights(h5_neurons_size, num_labels)
    biases_out = tf.Variable(tf.zeros([num_labels]))

    # All layers in forward order; shared by the training, validation and test paths.
    layers = [
        (weights_h1, biases_h1),
        (weights_h2, biases_h2),
        (weights_h3, biases_h3),
        (weights_h4, biases_h4),
        (weights_h5, biases_h5),
        (weights_out, biases_out),
    ]

    # Training path: dropout after every hidden layer.
    logits = _forward(tf_train_dataset, layers, keep_prob)

    # Data term: mean softmax cross-entropy over the minibatch.
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))

    # L2 penalty on every weight matrix (the biases are not regularized).
    regularizers = tf.add_n([
        tf.nn.l2_loss(weights_out),
        tf.nn.l2_loss(weights_h1),
        tf.nn.l2_loss(weights_h2),
        tf.nn.l2_loss(weights_h3),
        tf.nn.l2_loss(weights_h4),
        tf.nn.l2_loss(weights_h5),
    ])

    # Total loss. Both terms are already scalars, so no further reduction is needed.
    loss = cross_entropy + beta1 * regularizers

    # Optimizer: plain gradient descent with a fixed learning rate.
    # (A decaying rate built with tf.train.exponential_decay and a global_step
    # variable could be plugged in here instead.)
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    # Predictions for the training, validation, and test data:
    train_prediction = tf.nn.softmax(logits)

    # Validation path: same weights, no dropout.
    tf_valid_dataset_logits_out = _forward(tf_valid_dataset, layers)
    valid_prediction = tf.nn.softmax(tf_valid_dataset_logits_out)

    # Unregularized cross-entropy on the validation set, for monitoring only.
    valididation_loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=valid_labels, logits=tf_valid_dataset_logits_out))

    # Test path: same weights, no dropout.
    test_prediction = tf.nn.softmax(_forward(tf_test_dataset, layers))

In [40]:
num_steps = 15000

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print("Initialized")
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        # keep_prob is the dropout keep probability used for this training step.
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, keep_prob:0.7}
        _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
        # Report progress every 1000 steps.
        if (step % 1000 == 0):
            # The offset is computed with the modulo operator, so the message shows "%".
            print("offset = (%d * %d) %% (%d - %d) = %d" % (step, batch_size, train_labels.shape[0], batch_size, offset))
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
            print('Validation Loss: %.1f\n' % valididation_loss.eval())
    # Final evaluation on the test set once training has finished.
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
offset = (0 * 128) // (200000 - 128) = 0
Minibatch loss at step 0: 4.103625
Minibatch accuracy: 9.4%
Validation accuracy: 15.4%
Validation Loss: 2.3

offset = (1000 * 128) // (200000 - 128) = 128000
Minibatch loss at step 1000: 1.281305
Minibatch accuracy: 82.8%
Validation accuracy: 85.3%
Validation Loss: 0.5

offset = (2000 * 128) // (200000 - 128) = 56128
Minibatch loss at step 2000: 0.710922
Minibatch accuracy: 90.6%
Validation accuracy: 86.5%
Validation Loss: 0.4

offset = (3000 * 128) // (200000 - 128) = 184128
Minibatch loss at step 3000: 0.724169
Minibatch accuracy: 85.2%
Validation accuracy: 86.5%
Validation Loss: 0.4

offset = (4000 * 128) // (200000 - 128) = 112256
Minibatch loss at step 4000: 0.632468
Minibatch accuracy: 89.8%
Validation accuracy: 87.2%
Validation Loss: 0.4

offset = (5000 * 128) // (200000 - 128) = 40384
Minibatch loss at step 5000: 0.653072
Minibatch accuracy: 88.3%
Validation accuracy: 87.8%
Validation Loss: 0.4

offset = (6000 * 128) // (2000