The dataset used in this project is the MNIST dataset; you can download it using TensorFlow's built-in functions.


For more information about Highway Networks read:

[Highway Networks](https://arxiv.org/pdf/1505.00387.pdf)

[This blog post](https://medium.com/jim-fleming/highway-networks-with-tensorflow-1e6dfa667daa)

In [1]:
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import numpy as np

In [2]:
# Read the MNIST data sets using the "MNIST_data" directory; one_hot=True requests one-hot encoded labels.
mnist_data = input_data.read_data_sets("MNIST_data", one_hot=True)

Extracting MNIST_data\train-images-idx3-ubyte.gz
Extracting MNIST_data\train-labels-idx1-ubyte.gz
Extracting MNIST_data\t10k-images-idx3-ubyte.gz
Extracting MNIST_data\t10k-labels-idx1-ubyte.gz


### Step 1. Define helper functions

In [3]:
def weights_init(shape):
    '''
    Create a weight variable with small random initial values.

    Input(s): shape - Type: int list, Example: [5, 5, 32, 32]; the dimensions of the weights tensor.

    Output: a tf.Variable of the requested shape, initialized from a truncated
            normal distribution with standard deviation 0.05.
    '''
    initial_values = tf.truncated_normal(shape, stddev=0.05)
    return tf.Variable(initial_values)

In [4]:
def bias_init(shape, bias_init=0.05):
    '''
    Create a bias variable filled with a single constant value.

    Input(s): shape - Type: int list, Example: [32]; the dimensions of the bias tensor.
              bias_init - Type: float number, Example: 0.01; the value every element of the
                          bias tensor starts at (defaults to 0.05).

    Output: a tf.Variable of the requested shape with every element set to bias_init.
    '''
    # NOTE: the parameter shares the function's name, so inside this body
    # `bias_init` refers to the float value, not to the function itself.
    initial_values = tf.constant(bias_init, shape=shape)
    return tf.Variable(initial_values)

In [5]:
def fully_connected_layer(input, input_shape, output_shape, activation=tf.nn.relu):
    '''
    This function is used to define a fully connected layer for a network.

    Input(s): input - this is input into fully connected (Dense) layer (Previous layer or an image)
              input_shape - how many neurons/features the input tensor has. Example: input.shape[1]
              output_shape - how many neurons this layer will have
              activation - the non-linear function used at this layer; pass None to get
                           the plain linear output instead.

    Output: activation(input * weights + bias), or just input * weights + bias
            when activation is None.
    '''
    weights = weights_init([input_shape, output_shape])
    bias = bias_init([output_shape])
    layer = tf.add(tf.matmul(input, weights), bias)  # x*w + b

    # `is None` is the idiomatic identity test and never invokes a custom __ne__.
    if activation is None:
        return layer
    return activation(layer)

In a **highway layer** we want two “gates” that control the flow of information. The “transform” gate controls how much of the activation we pass through, and the “carry” gate controls how much of the unmodified input we pass through.

The formula for the highway layer:
![](formula_highway_layer.png?raw=true)

In [6]:
def highway_fc_layer(input, hidden_layer_size, carry_b = -2.0, activation=tf.nn.relu):
    '''
    Create one fully connected highway layer: y = (H * T) + (x * (1 - T)).

    Inputs: input - tensor fed into the layer (x in the formula); it is combined element-wise
                    with the layer's activation, so presumably its feature dimension must
                    equal hidden_layer_size
            hidden_layer_size - number of neurons in the hidden (highway) layer
            carry_b - initial value of the bias used in the transform gate
            activation - non-linear function applied in the H branch

    Output: tensor y, the gated mix of the transformed signal and the unmodified input.
    '''
    square_shape = [hidden_layer_size, hidden_layer_size]
    vector_shape = [hidden_layer_size]

    # Parameters of the H branch (the ordinary dense transformation).
    h_weights = weights_init(square_shape)
    h_bias = bias_init(vector_shape)

    # Parameters of the transform gate; its bias starts at carry_b.
    t_weights = weights_init(square_shape)
    t_bias = bias_init(shape=vector_shape, bias_init=carry_b)

    # H: non-linear transformation of the input.
    h_linear = tf.matmul(input, h_weights) + h_bias
    H = activation(h_linear, name="Input_gate")

    # T: transform gate, squashed into (0, 1) by the sigmoid.
    t_linear = tf.matmul(input, t_weights) + t_bias
    T = tf.nn.sigmoid(t_linear, name="T_gate")

    # C: carry gate, the complement of the transform gate.
    C = tf.subtract(1.0, T, name='C_gate')

    # Blend the two paths: transformed signal weighted by T, raw input weighted by C.
    transformed = tf.multiply(H, T)
    carried = tf.multiply(input, C)
    return tf.add(transformed, carried, name='output_highway')

In [23]:
#defining hyperparams
input_shape = 784 #28x28x1 <- Number of pixels of MNIST image

hidden_size = 50 # This is the number of neurons used at EVERY hidden highway layer; you can experiment with this number,
                # but because we have a highway (deep) network it doesn't have to be very large

output_size = 10 # number of neurons at the output layer, 10 because we have 10 classes

number_of_layers = 18 # total number of layers (input + highway + output); another hyperparameter to experiment with in highway networks

# Carry bias: the initial bias of the transform gate inside every highway layer.
# NOTE(review): sigmoid(-20) is roughly 2e-9, so with this value the transform gate
# starts almost fully closed and each highway layer initially passes its input through
# nearly unchanged; highway_fc_layer's own default is -2.0 - confirm -20.0 is intended.
cary_bias = -20.0

epochs = 40 # How many times we are going to run through the whole dataset

batch_size = 64 # How many data samples to feed to the network at once

learning_rate = 0.01 # step size handed to the gradient descent optimizer

### Step 2. Define HIGHWAY network

In [8]:
#Defining inputs to the tensorflow graph: one placeholder for the images (inputs) and one for the classes (targets).
#The leading None dimension leaves the number of samples per feed unspecified.
inputs = tf.placeholder(tf.float32, shape=[None, input_shape], name='Input')
targets = tf.placeholder(tf.float32, shape=[None, output_size], name='output')

In [9]:
#Defining HIGHWAY NETWORK
# Layout: one dense input layer (input_shape -> hidden_size), then
# number_of_layers - 2 highway layers, then one linear output layer
# (hidden_size -> output_size) that produces the class logits.
if number_of_layers < 2:
    # With fewer than two layers no output layer would be built and `output`
    # would stay None, which only fails later, inside the loss, with an
    # unrelated-looking error.
    raise ValueError(
        "number_of_layers must be at least 2 (input and output layer), got {}".format(number_of_layers)
    )

# Input layer: projects the flattened image onto hidden_size features.
prev_layer = fully_connected_layer(inputs, input_shape, hidden_size)

# Every layer between the input and output layer is a highway layer.
for _ in range(number_of_layers - 2):
    prev_layer = highway_fc_layer(prev_layer, hidden_size, carry_b=cary_bias)

# Output layer: no activation, because the loss applies softmax to the raw logits.
output = fully_connected_layer(prev_layer, hidden_size, output_size, activation=None)

In [10]:
#Defining error/cost/loss function and optimizer
# Softmax cross entropy is computed from the raw logits and then summed (not averaged)
# with reduce_sum, so the loss magnitude grows with the number of samples fed.
cost = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(logits=output, labels=targets)) #this is standard cross entropy loss
# Plain gradient descent on that loss, using learning_rate as the step size.
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

In [11]:
#This is used only for testing
# Softmax turns the output logits into class probabilities.
y_pred = tf.nn.softmax(output)
# Despite the name, this holds the predicted class index (argmax along axis 1), not a score.
y_pred_scores = tf.argmax(y_pred, 1)
# True class index, recovered from the targets with the same argmax along axis 1.
y_true = tf.argmax(targets, 1)

In [12]:
#Getting accuracy
# Boolean tensor: True where the predicted class index equals the true one.
correct_prediction = tf.equal(y_pred_scores, y_true)
# Cast the booleans to floats and average them: the fraction of correct predictions.
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

In [13]:
# if you make some mistake or change the structure of your network, good practice is to reset default graph.
# tf.reset_default_graph()

In [14]:
# Create the session used by the training and evaluation functions below.
session = tf.Session()

In [15]:
# Initialize all graph variables (weights and biases) before they are used.
session.run(tf.global_variables_initializer())

### Time for training this network
    1. Define optimize function to train the network
    2. Define two more small functions for testing and validating the network
    3. Play AC/DC Highway to hell and train your first highway network :)

In [22]:
import time
def optimize():
    '''
    Train the network for `epochs` passes over the MNIST training set.

    Each epoch runs num_examples // batch_size optimizer steps on batches of
    batch_size samples. After every epoch it prints the mean batch loss, the
    epoch duration, and the accuracy on the MNIST test set.

    Output: None; progress is reported through print.
    '''
    steps_per_epoch = mnist_data.train.num_examples // batch_size

    for epoch in range(epochs):
        started_at = time.time()
        batch_losses = []

        for step in range(steps_per_epoch):
            images, labels = mnist_data.train.next_batch(batch_size)
            # One optimizer step; the loss of this batch is fetched in the same run.
            loss_value, _ = session.run([cost, optimizer],
                                        feed_dict={inputs: images, targets: labels})
            batch_losses.append(loss_value)

        progress_text = "Epoch: {}/{}".format(epoch + 1, epochs)
        loss_text = " | Current loss: {}".format(np.mean(batch_losses))
        time_text = "  |  Epoch time: {:.2f}s".format(time.time() - started_at)
        print(progress_text, loss_text, time_text)

        test_feed = {inputs: mnist_data.test.images, targets: mnist_data.test.labels}
        print("test accuracy %g" % session.run(accuracy, feed_dict=test_feed))

In [17]:
def test_model():
    '''
    Evaluate the network on the MNIST test set.

    Output: value of the accuracy tensor computed over all test images and labels.
    '''
    test_feed = {
        inputs: mnist_data.test.images,
        targets: mnist_data.test.labels,
    }
    return session.run(accuracy, feed_dict=test_feed)

In [18]:
def validate_model():
    '''
    Evaluate the network on the MNIST validation set.

    Output: value of the accuracy tensor computed over all validation images and labels.
    '''
    validation_feed = {
        inputs: mnist_data.validation.images,
        targets: mnist_data.validation.labels,
    }
    return session.run(accuracy, feed_dict=validation_feed)

In [24]:
# Run the training loop; it prints the loss, epoch time and test accuracy after every epoch.
optimize()

Epoch: 1/40  | Current loss: 1.0439070463180542   |  Epoch time: 4.13s
test accuracy 0.9704
Epoch: 2/40  | Current loss: 0.8419095873832703   |  Epoch time: 4.11s
test accuracy 0.9701
Epoch: 3/40  | Current loss: 0.6185269355773926   |  Epoch time: 4.16s
test accuracy 0.9684
Epoch: 4/40  | Current loss: 0.7163951396942139   |  Epoch time: 4.08s
test accuracy 0.9726
Epoch: 5/40  | Current loss: 0.6212238073348999   |  Epoch time: 4.17s
test accuracy 0.973
Epoch: 6/40  | Current loss: 0.5380747318267822   |  Epoch time: 4.13s
test accuracy 0.9712
Epoch: 7/40  | Current loss: 0.41859257221221924   |  Epoch time: 4.19s
test accuracy 0.9727
Epoch: 8/40  | Current loss: 0.2758252024650574   |  Epoch time: 4.08s
test accuracy 0.9725
Epoch: 9/40  | Current loss: 0.1436740756034851   |  Epoch time: 4.09s
test accuracy 0.9727
Epoch: 10/40  | Current loss: 0.1168537512421608   |  Epoch time: 4.11s
test accuracy 0.9724
Epoch: 11/40  | Current loss: 0.08351754397153854   |  Epoch time: 4.08s
test a

In [25]:
# Accuracy of the trained network on the MNIST test set.
test_model()

0.97380012

In [26]:
# Accuracy of the trained network on the MNIST validation set.
validate_model()

0.97719967

In [None]:
# close session after you finish with using your network
# session.close()