# Deep Neural Network for MNIST classification:   
The dataset provides 28x28 images of handwritten digits (one digit per image), and the goal is to write an algorithm that detects which digit is written. This is a classification problem with 10 classes. We will build a network with 2 hidden layers between the inputs and the outputs.

## 1) Import the relevant packages:

In [2]:
import numpy as np
import tensorflow as tf
# Helper module shipped with the TensorFlow tutorials that fetches and parses MNIST.
from tensorflow.examples.tutorials.mnist import input_data
# Load the dataset from the local "MNIST_data/" directory (the cell output below
# shows the archives being downloaded and extracted there).
# one_hot=True requests one-hot encoded labels, which is what the 10-wide
# `targets` placeholder and the tf.argmax(targets, 1) comparison later rely on.
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)

Successfully downloaded train-images-idx3-ubyte.gz 9912422 bytes.
Extracting MNIST_data/train-images-idx3-ubyte.gz
Successfully downloaded train-labels-idx1-ubyte.gz 28881 bytes.
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Successfully downloaded t10k-images-idx3-ubyte.gz 1648877 bytes.
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Successfully downloaded t10k-labels-idx1-ubyte.gz 4542 bytes.
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


## 2) Outline the model:

In [61]:
# Start from an empty graph so that variables left in memory by previous runs
# of this cell do not clash with the ones created below.
tf.reset_default_graph()

# Layer widths: one input per pixel of a 28x28 image, one output per digit class.
input_size = 28 * 28
output_size = 10
hidden_layer_size = 400

# Placeholders are the entry points through which the data is fed at run time.
# The leading None leaves the number of rows (the batch size) unspecified, so
# the same graph accepts training batches and the full validation/test sets.
inputs = tf.placeholder(dtype=tf.float32, shape=[None, input_size])
targets = tf.placeholder(dtype=tf.float32, shape=[None, output_size])

# ---- Hidden layer 1: inputs -> first hidden layer ----
# tf.get_variable is called without an explicit initializer, so the default
# TensorFlow initializer (Xavier) is used for the weights and the biases.
weights_1 = tf.get_variable('weights_1', shape=[input_size, hidden_layer_size])
biases_1 = tf.get_variable('biases_1', shape=[hidden_layer_size])
# Linear combination followed by the activation. ReLU is the non-linearity
# chosen here; try playing with other ones. tf.nn is the module with
# neural-network support and contains the most commonly used activations:
#   tf.nn.sigmoid, tf.nn.tanh, tf.nn.relu, tf.nn.softmax
linear_1 = tf.matmul(inputs, weights_1) + biases_1
outputs_1 = tf.nn.relu(linear_1)

# ---- Hidden layer 2: first hidden layer -> second hidden layer ----
weights_2 = tf.get_variable('weights_2', shape=[hidden_layer_size, hidden_layer_size])
biases_2 = tf.get_variable('biases_2', shape=[hidden_layer_size])
linear_2 = tf.matmul(outputs_1, weights_2) + biases_2
outputs_2 = tf.nn.relu(linear_2)

# ---- Output layer: second hidden layer -> outputs ----
weights_3 = tf.get_variable('weights_3', shape=[hidden_layer_size, output_size])
biases_3 = tf.get_variable('biases_3', shape=[output_size])
# No activation function is applied here on purpose: `outputs` is only the
# linear combination. The activation is folded directly into the loss function
# instead, a trick that works for softmax and sigmoid activations.
outputs = tf.matmul(outputs_2, weights_3) + biases_3



# Loss for every output/target pair in the batch.
# softmax_cross_entropy_with_logits is equivalent to applying softmax to the
# last layer and then computing the cross-entropy error (an alternative to mean
# squared error), but it combines the two steps in a way that is both faster
# and more numerically stable.
#   logits: the unscaled values, i.e. the outputs before softmax scales them
#   labels: the targets
loss = tf.nn.softmax_cross_entropy_with_logits(labels=targets, logits=outputs)
# Average the per-example losses into a single scalar.
mean_loss = tf.reduce_mean(loss)

# Optimization step: an adaptive optimizer (Adam) is used instead of plain
# gradient descent.
adam = tf.train.AdamOptimizer(learning_rate=0.001)
optimize = adam.minimize(mean_loss)

# Correct/incorrect flag for every input in the batch.
# tf.argmax(x, 1) looks along axis 1 (across the columns of each row) and
# returns the index of the column holding the largest value; only the highest
# value of each output row matters for the prediction.
predicted_classes = tf.argmax(outputs, 1)
target_classes = tf.argmax(targets, 1)
# tf.equal compares the indices element-wise: True where the prediction matches
# the target, False where it does not. The result is a boolean vector with one
# entry per example.
out_equals_target = tf.equal(predicted_classes, target_classes)

# Average accuracy of the outputs: the mean of out_equals_target.
# tf.cast converts the boolean vector to float32 first (matches become 1.0,
# mismatches 0.0) so that tf.reduce_mean can average it.
accuracy = tf.reduce_mean(tf.cast(out_equals_target, tf.float32))


# Declare the session variable.
# NOTE(review): the session is never closed in the visible cells, and re-running
# this cell creates another one; consider calling sess.close() once finished.
sess = tf.InteractiveSession()

# Initialize all the variables declared in the graph.
initializer = tf.global_variables_initializer()
sess.run(initializer)


# Number of examples fed to the network per optimization step.
batch_size = 100

# Number of batches per epoch for the training set. The public `num_examples`
# property is used rather than the private `_num_examples` attribute.
batches_number = mnist.train.num_examples // batch_size

# Basic early stopping: set a maximum number of epochs.
max_epochs = 15

# Keep track of the validation loss of the previous epoch: if the validation
# loss starts increasing, we want to trigger early stopping.
# Start from +infinity so that no finite first-epoch loss can ever exceed it,
# i.e. early stopping can never trigger at the first epoch.
prev_validation_loss = float('inf')


# Learning:
# loop over the epochs; epoch_counter starts from 0 and stops before max_epochs.
for epoch_counter in range(max_epochs):
    # Accumulates the batch losses so the epoch's mean training loss can be reported.
    curr_epoch_loss = 0.

    # One optimization step per training batch.
    for batch_counter in range(batches_number):
        input_batch, target_batch = mnist.train.next_batch(batch_size)
        # Running `optimize` updates the weights; its own return value is not needed.
        _, batch_loss = sess.run([optimize, mean_loss],
            feed_dict={inputs: input_batch, targets: target_batch})
        curr_epoch_loss += batch_loss
    curr_epoch_loss /= batches_number

    # Evaluate on the whole validation set in a single forward pass. `optimize`
    # is not run here, so the weights are not updated by validation data.
    # The public `num_examples` property replaces the private `_num_examples`.
    input_batch, target_batch = mnist.validation.next_batch(mnist.validation.num_examples)
    validation_loss, validation_accuracy = sess.run([mean_loss, accuracy],
        feed_dict={inputs: input_batch, targets: target_batch})

    print('Epoch {0}. Mean loss (training loss): {1:.3f}. '
          'Validation loss: {2:.3f}. Validation accuracy: {3:.2f}%'.format(
              epoch_counter + 1, curr_epoch_loss,
              validation_loss, validation_accuracy * 100.))

    # Early stopping: stop as soon as the validation loss is higher than in the
    # previous epoch. Note that the weights of this (worse) epoch are the ones
    # left in the session, since its optimization steps have already run.
    if validation_loss > prev_validation_loss:
        break
    prev_validation_loss = validation_loss
print('End of training.')

Epoch 1. Mean loss (training loss): 0.228. Validation loss: 0.108. Validation accuracy: 96.90%
Epoch 2. Mean loss (training loss): 0.086. Validation loss: 0.091. Validation accuracy: 97.30%
Epoch 3. Mean loss (training loss): 0.055. Validation loss: 0.081. Validation accuracy: 97.42%
Epoch 4. Mean loss (training loss): 0.038. Validation loss: 0.069. Validation accuracy: 98.14%
Epoch 5. Mean loss (training loss): 0.029. Validation loss: 0.068. Validation accuracy: 98.16%
Epoch 6. Mean loss (training loss): 0.024. Validation loss: 0.074. Validation accuracy: 97.88%
End of training.


## 3) Test:

In [62]:
# Fetch the whole test set as one batch. The public `num_examples` property is
# used rather than the private `_num_examples` attribute.
input_batch, target_batch = mnist.test.next_batch(mnist.test.num_examples)
# Only `accuracy` is evaluated (no `optimize`), so the model is not trained on
# the test data. Passing a one-element list makes sess.run return a
# one-element list.
test_accuracy = sess.run([accuracy],
    feed_dict={inputs: input_batch, targets: target_batch})
print(test_accuracy)

# Unwrap the single value and express it as a percentage.
test_accuracy_percent = test_accuracy[0] * 100

print('test accuracy:{0:.2f}%'.format(test_accuracy_percent))

[0.9817]
test accuracy:98.17%


In [None]:
#hidden layer size = 50: test accuracy: 96.84
#hidden layer size = 200: test accuracy: 97.69 (longer time)
#hidden layer size = 400: test accuracy: 97.92 (takes much longer)
#hidden layer size = 200 + one more layer: test accur: 96.86
#hidden layer size = 200 + 5 hidden layers in  total: test accur: 96.91
#hidden layer size = 400 + 5 hidden layers in total: test accur: 97.14 (long)
#hidden layer size = 200 2 layers in total: with sigmoid in the last layer output (output_2): test accur: 97.58
#hidden layer size = 400 2 layers in total: with sigmoid " : test accur: 97.92
#hidden layer size = 400 2 layers in total: with softmax " : test accur: 70.21
#hidden layer size = 400 2 layers in total: with tanh " : test accur: 97.56
#hidden layer size = 400 2 layers in total: with softmax both outputs : test accur: 93.63 
#started low validation accuracy; took longer; more epochs; 
#hidden layer size = 400 2 layers in total: with sigmoid both outputs : test accur: 97.82
#hidden layer size = 400 2 layers in total: batch size = 1000: test accur: 97.73
#hidden layer size = 400 2 layers in total: batch size = 1: test accur: (takes extra extra long)
#hidden layer size = 400 2 layers in total: learning rate = 0.0001: test accur: (takes longer; more epoch) 97.88
#hidden layer size = 400 2 layers in total: learning rate = 0.02: test accur: 94.46 (takes less time; less epochs)
#hidden layer size = 400 2 layers in total: learning rate = 0.01: test accur: 96.49
#hidden layer size = 600 2 layers in total: learning rate = 0.001: test accur: 97.81
#hidden layer size = 400 2 layers in total: learning rate = 0.001: test accur: 98.02
#hidden layer size = 400 2 layers in total: batch size = 500: test accur: 97.94
#hidden layer size = 400 2 layers in total: learning rate = 0.001: test accur: 98.17