# Load required Libraries

In [1]:
%matplotlib inline

import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

# Load MNIST data

In [2]:
mnist = input_data.read_data_sets('MNIST_data', one_hot = True)

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


# Start TensorFlow interactive session

Interactive session helps to create tensor graph using interactive coding in IPython, Jupyter etc 

In [3]:
sess = tf.InteractiveSession()

Now let's create the graph. We need to define input variables, mulitple convolutional layers having it's own weights and biases and pooling, then a fully connected neural layer. 

# Create input variables

In [4]:
x = tf.placeholder(tf.float32, shape=[None,784]) 
y_ = tf.placeholder(tf.float32, shape=[None, 10])

# Here we are not defining number of rows. This will allow to input data with any number of rows
# Input has 784 columns (28x28 image) and output has 10 columns/classes. That means total 10 classes can be predicted with this DNN


# Defining Weights and Bias

Throught the network designing, we will require weights and biases to be defined multipe times, hence let's define functions to create weights and biases

In [5]:
def weight_variable(shape):
    initial = tf.truncated_normal(shape=shape, stddev=0.1)
    return tf.Variable(initial)
# tf.Variable is a data type in tensorflow for containing the data that can be or needs to be modified while processing the graph

In [6]:
def bias_varible(shape):
    bias = tf.constant(shape=shape, value=0.1)
    return tf.Variable(bias)

Let's create variables and initialize them in the session

In [7]:
W = weight_variable([784,10])
b = bias_varible([10])
sess.run(tf.initialize_all_variables())

# Prediction and Loss Operation

In order to train a model, we need to provide a prediction function, a loss function and a derivative function which will try to minimize the loss.

## Prediction Operation

We need to evaluate below expression

y = Wx + b

In [8]:
y = tf.matmul(a=x, b=W) + b

## Loss Operation

We need to find difference/cross entropy between y and y_ to find loss. However, since our output is of 10 dimension, we will have to apply softmax function. Also, the calculated loss will have shape of [n,10], hence we will have to take mean to get a loss value.

In [9]:
cross_entropy= tf.nn.softmax_cross_entropy_with_logits(y, y_)
loss=tf.reduce_mean(cross_entropy) # loss is 1D value representing total loss mean)

## Derivative Operation

In [10]:
train_step = tf.train.GradientDescentOptimizer(learning_rate=0.5).minimize(loss)

By now, we have created the required graph. It has input placeholders, trainable Weight and Bias variables, operation to predict y and function to calculate the loss.

Let's feed the graph with Data

# Train the Model

As part of training the Model, we will have to evaluate the graph by providing values for input placeholders, multiple times.

In [11]:
for i in range(1000):
    batch = mnist.train.next_batch(1000)
    train_step.run(feed_dict={x:batch[0], y_:batch[1]}) # This is to input data to input placeholders. feed_dict is not restricted to input placeholders only
    

Above will train our model and put the values in y. We will have to compare y with y_ to find the accuracy.

# Evaluate Model

If we recall, the output y had [n,10] dimensions where each of the 10 columns contains probability or [0,1] to depict where that column is the output. Hence, in order to find out, for each row, which column contains the output or max value. tf.argmax function is used for this purpose.

E.g. of outputs:
[0,0,1,0,0,0,0,0,0,0]
[0,0,0,0,0,0,0,0,0,1]
[1,0,0,0,0,0,0,0,0,0]
[0,0,0,0,0,0,1,0,0,0]

In [12]:
correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))
# This is provide list of booleans

Let's take mean of above booleans to find the accuracy%

In [13]:
accuracy = tf.reduce_mean(tf.cast(dtype=tf.float16, x=correct_prediction))

In [14]:
print accuracy.eval(feed_dict = {x:mnist.train.images, y_ : mnist.train.labels})

0.037231


# Defining Convolution and Pooling

In [15]:
def conv2d(x, W):
    conv = tf.nn.conv2d(x, W, strides=[1,1,1,1], padding = 'SAME')
    return conv
def max_pool_2x2(x):
    return tf.nn.max_pool(x, ksize=[1,2,2,1], strides=[1,2,2,1], padding="SAME")

In [16]:
x_image = tf.reshape(x, [-1,28,28,1]) 
# This dimension contains row count, image (28 x 28), color channel (1). 
# We are keeping -1 as row count because we will not know before hand how many rows of data we will be using for training.

In [17]:
print x.get_shape()
print x_image.get_shape()

(?, 784)
(?, 28, 28, 1)


### Convolve and Apply Relu
#### Layer 1

In [18]:
W_conv1 = weight_variable([5,5,1,32])
b_conv1 = bias_varible([32])

In [19]:
h_conv1 = conv2d(x_image, W_conv1) + b_conv1
print h_conv1.get_shape()

h_conv1_relu = tf.nn.relu(h_conv1)
print h_conv1_relu.get_shape()

h_pool1 = max_pool_2x2(h_conv1_relu)
print h_pool1.get_shape()

(?, 28, 28, 32)
(?, 28, 28, 32)
(?, 14, 14, 32)


After first convolution layer, an image of 28 x 28 x 1 size i.e. 784 when flatten, will convolve to 28 x 28 x 32 matrix, where 32 are different features of the image.  
Now we need to have 64 features in layer 2, hence we will create weights based on   
[  
  convolve matrix [5 x 5] x  
  number of existing channels/features [32] x  
  number of new features desired [64]  
]

#### Layer 2

In [20]:

W_conv2 = weight_variable([5,5,32,64])
b_conv2 = bias_varible([64])

In [21]:
h_conv2 = conv2d(W=W_conv2, x=h_pool1)
print h_conv2.get_shape()

h_conv2_relu = tf.nn.relu(h_conv2)
print h_conv2_relu.get_shape()

h_pool2 = max_pool_2x2(x=h_conv2_relu)
print h_pool2.get_shape()

(?, 14, 14, 64)
(?, 14, 14, 64)
(?, 7, 7, 64)


# Fully Connected Neural Network

Now the image size is 7 * 7 per feature and there are total 64 features. Let's pass this data through Neural Network. In order to do that we will be required to flatten current 3D image data to 2D

In [22]:
h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64])
# -1 indicates number of rows

We will be created a NN with 1 layer of 1024 neurons

In [23]:
W_fc1 = weight_variable([7 * 7 * 64, 1024])
b_fc1 = bias_varible([1024])
h_fc1 = tf.nn.relu(tf.matmul(b= W_fc1, a = h_pool2_flat ) + b_fc1) 

# Dropout

We need to drop some results to avoid overfitting

In [24]:
keep_prob = tf.placeholder(tf.float32)
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

# Readout layer

We wil aply softmax function to convert our 1024 outputs to 10 outputs (classes) using Softmax Regression


In [25]:
W_fc2 = weight_variable([1024,10])
b_fc2 = bias_varible([10])
y_conv = tf.matmul(b = W_fc2, a = h_fc1_drop) + b_fc2

# Calculate Error

We will calculate the error using cross entropy

In [26]:
error = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(y_conv, y_))

# Optimizer

In [27]:
step = tf.train.AdamOptimizer(1e-04).minimize(error)

# Accuracy

In [28]:
correct_prediction = tf.equal(tf.argmax(y_conv,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction,tf.float32))

# Training

The graph has been created. Let's start the training code

## Initialize Variables

In [29]:
sess.run(tf.initialize_all_variables())

## Train Model

### Create Feed Dictionary

In [30]:
feed_dict = {
                x:batch[0],
                y_:batch[1],
                keep_prob:0.5
            }

### Iteratively Train model

In [31]:

for i in range(20):
    batch = mnist.train.next_batch(50)
    if i % 5 == 0:
        train_accuracy = accuracy.eval(
        feed_dict = feed_dict
        )
        print ("Step %d, training accuracy %g" % (i, train_accuracy))
        
    step.run(feed_dict = feed_dict)


Step 0, training accuracy 0.111
Step 5, training accuracy 0.138
Step 10, training accuracy 0.214
Step 15, training accuracy 0.255


# Test Model

In [32]:
print("Test accuracy %g" % accuracy.eval(
        feed_dict = {
            x:mnist.test.images, 
            y_:mnist.test.labels,
            keep_prob:1
            }
        )
      )

Test accuracy 0.6337
