The dataset used for this project/experiment is the MNIST dataset. You can download it using TensorFlow's built-in functions, as shown below.


Network depth is of crucial importance in neural network architectures, but deeper networks are more difficult to train. The residual learning framework eases the training of these networks, and enables them to be substantially deeper — leading to improved performance in both visual and non-visual tasks. These residual networks are much deeper than their ‘plain’ counterparts, yet they require a similar number of parameters (weights).

Materials:

[Deep Residual Learning for Image Recognition](https://arxiv.org/pdf/1512.03385.pdf)

[Identity Mappings in Deep Residual Networks](https://arxiv.org/pdf/1603.05027.pdf)

This [Blog post](https://blog.waya.ai/deep-residual-learning-9610bb62c355) is great for intuition behind ResNet.

![](paper_net.png)

In [None]:
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import numpy as np
import time

In [None]:
# Read the MNIST data sets from the "MNIST_data" directory, with labels one-hot encoded.
mnist_data = input_data.read_data_sets("MNIST_data", one_hot=True)

### Step 1. Define helper functions

In [None]:
def weights_init(shape):
    '''
    Create a trainable weights tensor filled with small random values.
    
    Input(s): shape - Type: int list, Example: [5, 5, 32, 32], Dimensions of the weights tensor to create.
    
    Output: tf.Variable of the requested shape, initialised from a truncated normal distribution (stddev=0.05)
    '''
    initial_values = tf.truncated_normal(shape, stddev=0.05)
    return tf.Variable(initial_values)

In [None]:
def bias_init(shape, bias_value=0.01):
    '''
    Create a trainable bias tensor in which every element starts at the same constant.
    
    Input(s): shape - Type: int list, Example: [32], Dimensions of the bias tensor to create.
              bias_value - Type: float number, Example: 0.01, Initial value given to every element of the bias tensor.
    
    Output: tf.Variable of the requested shape, filled with bias_value
    '''
    initial_values = tf.constant(bias_value, shape=shape)
    return tf.Variable(initial_values)

In [None]:
def conv2d_custom(input, filter_size, num_of_channels, num_of_filters, activation=tf.nn.relu, dropout=None,
                  padding='SAME', max_pool=True, strides=(1, 1)):  
    '''
    This function is used to define a convolutional layer for a network.
    
    The layer is built in this order: convolution + bias, activation, 2x2 max pooling, dropout.
    Each of the last three steps is optional.
    
    Input(s): input - this is input into convolutional layer (Previous layer or an image)
              filter_size - also called kernel size, kernel is moved (convolved) across an image. Example: 3
              num_of_channels - how many channels the input tensor has
              num_of_filters - this is hyperparameter, and this will set one of dimensions of the output tensor from 
                               this layer. Note: this number will be num_of_channels for the layer after this one
              activation - the non-linear function used at this layer. Pass None to skip the activation.
              dropout - if not None, it is passed as the second argument to tf.nn.dropout, applied after
                        pooling (presumably the keep probability of TF 1.x - confirm for your TF version)
              padding - the way that we pad input tensor with zeros ("SAME" or "VALID")
              max_pool - if this is True, output tensor will be 2x smaller in size. Max pool is there to decrease spatial 
                         dimensions of our output tensor, so computation is less expensive.
              strides - pair (first, second) of convolution strides for the two middle dimensions of the input.
                        Example: (2, 2)
              
    Output: Convolutional layer with input parameters.
    '''
    weights = weights_init([filter_size, filter_size, num_of_channels, num_of_filters])
    bias = bias_init([num_of_filters])
    
    # tf.nn.conv2d takes a 4-element stride list; the first and last entries stay at 1.
    layer = tf.nn.conv2d(input, filter=weights, strides=[1, strides[0], strides[1], 1], padding=padding) + bias
    
    # Identity checks against None: `!=` may be overloaded by tensor-like objects.
    if activation is not None:
        layer = activation(layer)
    
    if max_pool:
        layer = tf.nn.max_pool(layer, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
    
    if dropout is not None:
        layer = tf.nn.dropout(layer, dropout)
        
    return layer

In [None]:
def flatten(layer):
    '''
    Collapse every dimension after the first one, turning a 4 dimensional conv output into a 2 dimensional tensor.
    
    Input(s): layer - the output from last conv layer in your network (4d tensor)
    
    Output(s): flattened_layer - layer reshaped to [-1, feature_count]
               feature_count - product of dimensions 1..3 of the input, i.e. number of features per sample
    '''
    # Dimensions 1..3 are multiplied together; dimension 0 is left to be inferred by reshape (-1).
    feature_count = layer.get_shape()[1:4].num_elements()
    return tf.reshape(layer, [-1, feature_count]), feature_count

In [None]:
def dense_custom(input, input_size, output_size, activation=tf.nn.relu, dropout=None):
    '''
    This function is used to define a fully connected layer for a network.
    
    The layer computes matmul(input, weights) + bias, then optionally applies the activation and dropout.
    
    Input(s): input - this is input into fully connected (Dense) layer (Previous layer or an image)
              input_size - how many neurons/features the input tensor has. Example: input.shape[1]
              output_size - how many neurons this layer will have
              activation - the non-linear function used at this layer. Pass None to skip the activation.
              dropout - the regularization method used to prevent overfitting. The way it works, we randomly turn off
                        some neurons in this layer. If not None, the value is passed as the second argument to
                        tf.nn.dropout (presumably the keep probability of TF 1.x - confirm for your TF version)
              
    Output: fully connected layer with input parameters.
    '''
    weights = weights_init([input_size, output_size])
    bias = bias_init([output_size])
    
    layer = tf.matmul(input, weights) + bias
    
    # Identity checks against None: `!=` may be overloaded by tensor-like objects.
    if activation is not None:
        layer = activation(layer)
    
    if dropout is not None:
        layer = tf.nn.dropout(layer, dropout)
        
    return layer

The resunit implemented in this notebook is explained in this [paper](https://arxiv.org/pdf/1603.05027.pdf).

This is a picture of the resunit used:

![](resunit.jpeg?raw=true)

Note: the implemented version is B.

In [None]:
def residual_unit(layer, num_of_filters=32):
    '''
    Build one residual unit: (batch norm -> ReLU -> 3x3 conv) applied twice, with the unit's input added to the
    result (skip connection).
    
    Input(s): layer - conv layer before this res unit. Its number of channels must equal num_of_filters,
                      because both convolutions keep the channel count and the result is added to `layer`.
              num_of_filters - number of channels going in and out of both convolutions (hyperparameter).
                               Defaults to 32.
    
    Output(s): ResUnit layer - implemented as described in the paper
    '''
    # NOTE(review): batch_normalization is called with its default arguments; in TF 1.x `training` defaults
    # to False (moving statistics are used) - confirm this is intended while training.
    step1 = tf.layers.batch_normalization(layer)
    step2 = tf.nn.relu(step1)
    # No activation or pooling inside the convs: the unit's non-linearities are the explicit ReLUs.
    step3 = conv2d_custom(step2, 3, num_of_filters, num_of_filters, activation=None, max_pool=False)
    step4 = tf.layers.batch_normalization(step3)
    step5 = tf.nn.relu(step4)
    step6 = conv2d_custom(step5, 3, num_of_filters, num_of_filters, activation=None, max_pool=False)
    # Skip connection: the unit's output is its input plus the transformed branch.
    return layer + step6

### Step 2. Residual Network (ResNet)

In [None]:
# Graph inputs: 28x28 single-channel images and 10-way target vectors; the first dimension is left open.
inputs = tf.placeholder(dtype=tf.float32, shape=[None, 28, 28, 1], name='inputs')
targets = tf.placeholder(dtype=tf.float32, shape=[None, 10], name='targets')

In [None]:
# Total number of residual units in the network.
num_of_layers = 20
# Number of residual units in each of the 5 groups of the network.
# Integer division keeps this an int, so it can be used directly as a loop bound.
between_strides = num_of_layers // 5

#### This is our network

In [None]:
# Stem: lift the single-channel input image to 32 feature maps.
prev1 = conv2d_custom(inputs, 3, 1, 32, activation=None, max_pool=False)
prev1 = tf.layers.batch_normalization(prev1)
for i in range(5): # this number * between_strides = number_of_layers
    for j in range(int(between_strides)):
        prev1 = residual_unit(prev1)
    # After each group of res units, a conv with 2x2 strides reduces the spatial size.
    # It has to read from and be assigned back to prev1 (32 -> 32 channels), otherwise the
    # downsampling never becomes part of the network. With 'SAME' padding: 28 -> 14 -> 7 -> 4 -> 2 -> 1.
    prev1 = conv2d_custom(prev1, 3, 32, 32, activation=None, max_pool=False, strides=[2, 2])
    prev1 = tf.layers.batch_normalization(prev1)
# After all res units we have the last conv layer, then flattening and the output layer.
last_conv = conv2d_custom(prev1, 3, 32, 10, activation=None, max_pool=False)
flat, features = flatten(last_conv)
# No activation here: `output` holds the raw logits.
output = dense_custom(flat, features, 10, activation=None)

In [None]:
# Accuracy of the model: fraction of samples whose highest-scoring predicted class equals the target class
pred_y = tf.nn.softmax(output)
pred_y_true = tf.argmax(pred_y, axis=1)
y_true = tf.argmax(targets, axis=1)
correct_prediction = tf.equal(pred_y_true, y_true)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, dtype=tf.float32))

In [None]:
# Loss: softmax cross-entropy between logits and targets, averaged with reduce_mean.
# Optimizer: Adam with learning rate 0.001, minimising that loss.
per_example_loss = tf.nn.softmax_cross_entropy_with_logits(labels=targets, logits=output)
cost = tf.reduce_mean(per_example_loss)
optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(cost)

### Step 3. Training and testing helper functions

In [None]:
batch_size = 32

total_number_trained = 0
epochs = 5
def optmizer():
    '''
    Train the network for `epochs` passes over the training set.
    
    Every epoch runs num_examples // batch_size optimisation steps, then prints the accuracy on the last
    batch of the epoch, the mean cost over the epoch and the time the epoch took.
    Uses the module-level `session`, which must exist before this function is called.
    '''
    for epoch_idx in range(epochs):
        losses = []
        epoch_started = time.time()
        steps = mnist_data.train.num_examples // batch_size
        for _ in range(steps):
            images, labels = mnist_data.train.next_batch(batch_size)
            # Reshape the batch images to the 4d shape expected by the `inputs` placeholder.
            feed = {inputs: images.reshape((-1, 28, 28, 1)), targets: labels}

            step_cost, _ = session.run([cost, optimizer], feed_dict=feed)
            losses.append(step_cost)

        # Accuracy is evaluated on the last batch fed during this epoch.
        epoch_label = "Epoche: {}/{}".format(epoch_idx + 1, epochs)
        last_batch_accuracy = session.run(accuracy, feed_dict=feed)
        cost_label = "| Cost: {}".format(np.mean(losses))
        time_label = " | Time for epoch: {:.2f}s".format(time.time() - epoch_started)
        print(epoch_label, "| Training accuracy: ", last_batch_accuracy, cost_label, time_label)

In [None]:
batch_size_valid = 1000
def validate_model():
    '''
    Evaluate the network on the validation set and print the results.
    
    The validation set is processed in num_examples // batch_size_valid batches. The accuracy of every
    batch is printed, followed by the average over all batches (as a percentage).
    Uses the module-level `session`, which must exist before this function is called.
    '''
    accuracy_per_batch = []
    for ii in range(mnist_data.validation.num_examples//batch_size_valid):
        batch = mnist_data.validation.next_batch(batch_size_valid)
        # Reshape the batch images to the 4d shape expected by the `inputs` placeholder.
        imgs = batch[0].reshape((-1, 28, 28, 1))
        labs = batch[1]

        accuracy_per_batch.append(session.run(accuracy, feed_dict={inputs:imgs, targets:labs}))

    print("Validation per batch accuracy {}".format(accuracy_per_batch))
    # Label the average as validation accuracy so it is not confused with test_model() output.
    print("Validation accuracy average: {:.2f}%".format(np.mean(accuracy_per_batch)*100))

In [None]:
batch_size_test = 1000
def test_model():
    '''
    Evaluate the network on the test set and print the results.
    
    The test set is processed in num_examples // batch_size_test batches. The accuracy of every batch is
    printed, followed by the average over all batches (as a percentage).
    Uses the module-level `session`, which must exist before this function is called.
    '''
    batch_count = mnist_data.test.num_examples // batch_size_test
    scores = []
    for _ in range(batch_count):
        images, labels = mnist_data.test.next_batch(batch_size_test)
        # Reshape the batch images to the 4d shape expected by the `inputs` placeholder.
        feed = {inputs: images.reshape((-1, 28, 28, 1)), targets: labels}
        scores.append(session.run(accuracy, feed_dict=feed))

    print("Test per batch accuracy {}".format(scores))
    average_percent = np.mean(scores) * 100
    print("Test accuracy average: {:.2f}%".format(average_percent))

### Step 4. Train/Test the network

In [None]:
# Build the op that initialises all variables, open a session and run that op in it.
init_op = tf.global_variables_initializer()
session = tf.Session()
session.run(init_op)

In [None]:
# Train the network (runs `epochs` passes over the training set).
optmizer()

In [None]:
# Print per-batch and average accuracy on the test set.
test_model()

In [None]:
# Print per-batch and average accuracy on the validation set.
validate_model()

In [None]:
# Close the session after testing the model.
session.close()