This notebook is also available at https://github.com/mersad95zd/Deep_Neural_Network.

In [2]:
import warnings
warnings.filterwarnings("ignore")
import os
from time import time
# Load MNIST dataset
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
# Import Tensorflow and start a session
import tensorflow as tf
sess = tf.InteractiveSession()

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


# 2.a. Build and Train a 4-layer DCN

In [3]:
def weight_variable(shape):
    '''
    Initialize weights
    :param shape: shape of weights, e.g. [w, h ,Cin, Cout] where
    w: width of the filters
    h: height of the filters
    Cin: the number of the channels of the filters
    Cout: the number of filters
    :return: a tensor variable for weights with initial values
    '''
    # IMPLEMENT YOUR WEIGHT_VARIABLE HERE
    W = tf.Variable( tf.truncated_normal(shape,stddev=0.05) )
    return W

In [4]:
def bias_variable(shape):
    '''
    Initialize biases
    :param shape: shape of biases, e.g. [Cout] where
    Cout: the number of filters
    :return: a tensor variable for biases with initial values
    '''
    # IMPLEMENT YOUR BIAS_VARIABLE HERE
    b = tf.Variable( tf.constant(0.05 , shape=shape) )
    return b

In [5]:
def conv2d(x, W):
    '''
    Perform 2-D convolution
    :param x: input tensor of size [N, W, H, Cin] where
    N: the number of images
    W: width of images
    H: height of images
    Cin: the number of channels of images
    :param W: weight tensor [w, h, Cin, Cout]
    w: width of the filters
    h: height of the filters
    Cin: the number of the channels of the filters = the number of channels of
    images
    Cout: the number of filters
    :return: a tensor of features extracted by the filters, a.k.a. the results
    after convolution
    '''
    # IMPLEMENT YOUR CONV2D HERE
    h_conv = tf.nn.conv2d(x,W,strides=[1,1,1,1],padding='SAME')
    return h_conv

In [6]:
def max_pool_2x2(x):
    '''
    Perform non-overlapping 2-D maxpooling on 2x2 regions in the input data
    :param x: input data
    :return: the results of maxpooling (max-marginalized + downsampling)
    '''
    # IMPLEMENT YOUR MAX_POOL_2X2 HERE
    h_max = tf.nn.max_pool(x,strides=[1,2,2,1],ksize=[1,2,2,1],padding='SAME')
    return h_max

In [7]:
def main():
    # Specify training parameters
    result_dir = './results/' # directory where the results from the training are saved
    max_step = 5500 # the maximum iterations. After max_step iterations, the training will stop no matter what
    start_time = time() # start timing
    # FILL IN THE CODE BELOW TO BUILD YOUR NETWORK
    # placeholders for input data and input labeles
    x = tf.placeholder(tf.float32,[None,784],name='x')
    y_ = tf.placeholder(tf.float32,[None,10],name='y_')
    # reshape the input image
    x_image = tf.reshape(x, [-1, 28, 28, 1])
    # first convolutional layer
    W_conv1 = weight_variable([5,5,1,32])
    b_conv1 = bias_variable([32])
    h_conv1 = tf.nn.relu(conv2d(x_image,W_conv1) + b_conv1)
    h_pool1 = max_pool_2x2(h_conv1)
    # second convolutional layer
    W_conv2 = weight_variable([5,5,32,64])
    b_conv2 = bias_variable([64])
    h_conv2 = tf.nn.relu(conv2d(h_pool1,W_conv2) + b_conv2)
    h_pool2 = max_pool_2x2(h_conv2)
    # densely connected layer
    W_fc1 = weight_variable([7*7*64,1024])
    b_fc1 = bias_variable([1024])
    h_pool2_flat = tf.reshape(h_pool2,[-1,7*7*64])
    h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat,W_fc1) + b_fc1)
    # dropout
    keep_prob = tf.placeholder(tf.float32)
    h_fc1_drop = tf.nn.dropout(h_fc1, rate = 1-keep_prob)
    # softmax
    W_fc2 = weight_variable([1024,10])
    b_fc2 = bias_variable([10])
    y_conv = tf.nn.softmax(tf.matmul(h_fc1_drop,W_fc2) + b_fc2, name='y_conv')
    # FILL IN THE FOLLOWING CODE TO SET UP THE TRAINING
    # setup training
    cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_*tf.log(y_conv),reduction_indices=[1]))
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
    correct_prediction = tf.equal(tf.argmax(y_conv,1),tf.argmax(y_,1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction,tf.float32),name='accuracy')
    # Add a scalar summary for the snapshot loss.
    loss_summary = tf.summary.scalar(cross_entropy.op.name, cross_entropy)
    
    # Build the summary operation based on the TF collection of Summaries.
    #summary_op = tf.summary.merge_all()
    summary_op = tf.summary.merge([loss_summary])
    
    # Add the variable initializer Op.
    init = tf.initialize_all_variables()
    # Create a saver for writing training checkpoints.
    saver = tf.train.Saver()
    # Instantiate a SummaryWriter to output summaries and the Graph.
    summary_writer = tf.summary.FileWriter(result_dir, sess.graph)
    # Run the Op to initialize the variables.
    sess.run(init)
    # run the training
    for i in range(max_step):
        batch = mnist.train.next_batch(50) # make the data batch, which is used in the training iteration.
        # the batch size is 50
        if i%100 == 0:
            # output the training accuracy every 100 iterations
            train_accuracy = accuracy.eval(feed_dict={x:batch[0], y_:batch[1], keep_prob: 1.0})
            print("step %d, training accuracy %g"%(i, train_accuracy))
            # Update the events file which is used to monitor the training (in this case,
            # only the training loss is monitored)
            summary_str = sess.run(summary_op, feed_dict={x: batch[0], y_:batch[1], keep_prob: 0.5})
            summary_writer.add_summary(summary_str, i)
            summary_writer.flush()
        # save the checkpoints every 1100 iterations
        if i % 1100 == 0 or i == max_step:
            checkpoint_file = os.path.join(result_dir, 'checkpoint')
            saver.save(sess, checkpoint_file, global_step=i)
        train_step.run(feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5}) # run one train_step
        #tf.reset_default_graph()
    # print test error
    print("test accuracy %g"%accuracy.eval(feed_dict={
    x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0}))
    stop_time = time()
    print('The training takes %f second to finish'%(stop_time - start_time))

In [123]:
if __name__ == "__main__":
    main()

step 0, training accuracy 0.1
step 100, training accuracy 0.8
step 200, training accuracy 0.92
step 300, training accuracy 0.96
step 400, training accuracy 0.88
step 500, training accuracy 0.92
step 600, training accuracy 0.94
step 700, training accuracy 0.92
step 800, training accuracy 0.96
step 900, training accuracy 0.96
step 1000, training accuracy 0.88
step 1100, training accuracy 0.98
step 1200, training accuracy 1
step 1300, training accuracy 0.96
step 1400, training accuracy 0.96
step 1500, training accuracy 0.96
step 1600, training accuracy 1
step 1700, training accuracy 0.98
step 1800, training accuracy 1
step 1900, training accuracy 1
step 2000, training accuracy 1
step 2100, training accuracy 0.96
step 2200, training accuracy 1
step 2300, training accuracy 0.96
step 2400, training accuracy 0.98
step 2500, training accuracy 0.98
step 2600, training accuracy 1
step 2700, training accuracy 0.96
step 2800, training accuracy 1
step 2900, training accuracy 0.98
step 3000, trainin

You may see the image generated by tensorboard below. I installed jupyter-tensorboard instead of using the command line.

<img src="loss.PNG" alt="Drawing" style="width: 2000px;"/>

# 2.b. More on Visualizing Your Training
Changes made:
1. defining the function variable_summaries which records the statistical summary of the desired variables.
2. changed the initialization step so that it sends eachvariable to variable_summaries before moving forward in the code.
3. changed summary_op such that it monitors the desired variables.
4. defined a new summary variable "summary_nop" which is used for monitoring test accuracy.
5. tf.summary.merge_all() doesn't work because it merges all placeholders and varibales before this command which is wrong! So you may remove it from the helper code.

In [8]:
def variable_summaries(var):
    """Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
    with tf.name_scope('summaries'):
        mean = tf.reduce_mean(var)
        m1 = tf.summary.scalar('mean', mean)
        with tf.name_scope('stddev'):
              stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
        m2 = tf.summary.scalar('stddev', stddev)
        m3 = tf.summary.scalar('max', tf.reduce_max(var))
        m4 = tf.summary.scalar('min', tf.reduce_min(var))
        m5 = tf.summary.histogram('histogram', var)
    return m1,m2,m3,m4,m5

In [9]:
def main():
    # Specify training parameters
    result_dir = './results/' # directory where the results from the training are saved
    max_step = 5500 # the maximum iterations. After max_step iterations, the training will stop no matter what
    start_time = time() # start timing
    # FILL IN THE CODE BELOW TO BUILD YOUR NETWORK
    # placeholders for input data and input labeles
    x = tf.placeholder(tf.float32,[None,784],name='x')
    y_ = tf.placeholder(tf.float32,[None,10],name='y_')
    # reshape the input image
    x_image = tf.reshape(x, [-1, 28, 28, 1])
    # first convolutional layer
    with tf.name_scope("layer1"):
        # This Variable will hold the state of the weights for the layer
        with tf.name_scope('weights'):
            W_conv1 = weight_variable([5,5,1,32])
            w1m1,w1m2,w1m3,w1m4,w1m5 = variable_summaries(W_conv1)
        with tf.name_scope('biases'):
            b_conv1 = bias_variable([32])
            b1m1,b1m2,b1m3,b1m4,b1m5 = variable_summaries(b_conv1)
        with tf.name_scope('preactivate'):
            preactivate1 = conv2d(x_image,W_conv1) + b_conv1
            p1m1,p1m2,p1m3,p1m4,p1m5 = variable_summaries(preactivate1)
            #tf.summary.histogram('pre_activations', preactivate)
        with tf.name_scope('postactivate'):
            h_conv1 = tf.nn.relu(preactivate1)
            hc1m1,hc1m2,hc1m3,hc1m4,hc1m5 = variable_summaries(h_conv1)
            #tf.summar.histogram
        with tf.name_scope("afterMaxPool"):
            h_pool1 = max_pool_2x2(h_conv1)
            hp1m1,hp1m2,hp1m3,hp1m4,hp1m5 = variable_summaries(h_pool1)
    
    # second convolutional layer
    with tf.name_scope("layer2"):
        # This Variable will hold the state of the weights for the layer
        with tf.name_scope('weights'):
            W_conv2 = weight_variable([5,5,32,64])
            w2m1,w2m2,w2m3,w2m4,w2m5 = variable_summaries(W_conv2)
        with tf.name_scope('biases'):
            b_conv2 = bias_variable([64])
            b2m1,b2m2,b2m3,b2m4,b2m5 = variable_summaries(b_conv2)
        with tf.name_scope('preactivate'):
            preactivate2 = conv2d(h_pool1,W_conv2) + b_conv2
            p2m1,p2m2,p2m3,p2m4,p2m5 = variable_summaries(preactivate2)
            #tf.summary.histogram('pre_activations', preactivate)
        with tf.name_scope('postactivate'):
            h_conv2 = tf.nn.relu(preactivate2)
            hc2m1,hc2m2,hc2m3,hc2m4,hc2m5 = variable_summaries(h_conv2)
            #tf.summar.histogram
        with tf.name_scope("afterMaxPool"):
            h_pool2 = max_pool_2x2(h_conv2)
            hp2m1,hp2m2,hp2m3,hp2m4,hp2m5 = variable_summaries(h_pool2)
    # densely connected layer
    with tf.name_scope("layerfc1"):
        # This Variable will hold the state of the weights for the layer
        with tf.name_scope('weights'):
            W_fc1 = weight_variable([7*7*64,1024])
            wf1m1,wf1m2,wf1m3,wf1m4,wf1m5 = variable_summaries(W_fc1)
        with tf.name_scope('biases'):
            b_fc1 = bias_variable([1024])
            bf1m1,bf1m2,bf1m3,bf1m4,bf1m5 = variable_summaries(b_fc1)
        with tf.name_scope('preactivate'):
            h_pool2_flat = tf.reshape(h_pool2,[-1,7*7*64])
            pf1m1,pf1m2,pf1m3,pf1m4,pf1m5 = variable_summaries(h_pool2_flat)
            #tf.summar.histogram
        with tf.name_scope("postactivate"):
            h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat,W_fc1) + b_fc1)
            hf1m1,hf1m2,hf1m3,hf1m4,hf1m5 = variable_summaries(h_fc1)
    # dropout
    keep_prob = tf.placeholder(tf.float32)
    h_fc1_drop = tf.nn.dropout(h_fc1, rate = 1-keep_prob)
    # softmax
    with tf.name_scope("layerfc2"):
        # This Variable will hold the state of the weights for the layer
        with tf.name_scope('weights'):
            W_fc2 = weight_variable([1024,10])
            wf2m1,wf2m2,wf2m3,wf2m4,wf2m5 = variable_summaries(W_fc2)
        with tf.name_scope('biases'):
            b_fc2 = bias_variable([10])
            bf2m1,bf2m2,bf2m3,bf2m4,bf2m5 = variable_summaries(b_fc2)
        with tf.name_scope('preactivate'):
            preactivatefc2 = tf.matmul(h_fc1_drop,W_fc2) + b_fc2
            pf2m1,pf2m2,pf2m3,pf2m4,pf2m5 = variable_summaries(preactivatefc2)
        with tf.name_scope("postactivate"):
            y_conv = tf.nn.softmax(preactivatefc2, name='y_conv')
            hf2m1,hf2m2,hf2m3,hf2m4,hf2m5 = variable_summaries(y_conv)
    # FILL IN THE FOLLOWING CODE TO SET UP THE TRAINING
    # setup training
    cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_*tf.log(y_conv),reduction_indices=[1]))
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
    correct_prediction = tf.equal(tf.argmax(y_conv,1),tf.argmax(y_,1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction,tf.float32),name='accuracy')
    # Add a scalar summary for the snapshot loss.
    loss_summary = tf.summary.scalar(cross_entropy.op.name, cross_entropy)
    
    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.summary.merge([loss_summary,w1m1,w1m2,w1m3,w1m4,w1m5, \
                                   b1m1,b1m2,b1m3,b1m4,b1m5,p1m1,p1m2,p1m3,p1m4,p1m5,\
                                   hc1m1,hc1m2,hc1m3,hc1m4,hc1m5,hp1m1,hp1m2,hp1m3,hp1m4,hp1m5, \
                                   w2m1,w2m2,w2m3,w2m4,w2m5, \
                                   b2m1,b2m2,b2m3,b2m4,b2m5,p2m1,p2m2,p2m3,p2m4,p2m5,\
                                   hc2m1,hc2m2,hc2m3,hc2m4,hc2m5,hp2m1,hp2m2,hp2m3,hp2m4,hp2m5, \
                                   wf1m1,wf1m2,wf1m3,wf1m4,wf1m5, \
                                   bf1m1,bf1m2,bf1m3,bf1m4,bf1m5,pf1m1,pf1m2,pf1m3,pf1m4,pf1m5,\
                                   hf1m1,hf1m2,hf1m3,hf1m4,hf1m5, \
                                   wf2m1,wf2m2,wf2m3,wf2m4,wf2m5, \
                                   bf2m1,bf2m2,bf2m3,bf2m4,bf2m5,pf2m1,pf2m2,pf2m3,pf2m4,pf2m5,\
                                   hf2m1,hf2m2,hf2m3,hf2m4,hf2m5])
    # summary_nop records summary of test accuracy
    summary_nop = tf.summary.merge([tf.summary.scalar("test_accuracy",accuracy)])
    # Add the variable initializer Op.
    init = tf.initialize_all_variables()
    # Create a saver for writing training checkpoints.
    saver = tf.train.Saver()
    # Instantiate a SummaryWriter to output summaries and the Graph.
    summary_writer = tf.summary.FileWriter(result_dir, sess.graph)
    # Run the Op to initialize the variables.
    sess.run(init)
    
    train_writer = tf.summary.FileWriter(result_dir + '/train', sess.graph)
    test_writer = tf.summary.FileWriter(result_dir + '/test')
    def feed_dict(train):
        """Make a TensorFlow feed_dict: maps data onto Tensor placeholders."""
        if train:
            xs, ys = mnist.train.next_batch(50)
            k = 0.5
        else:
            xs, ys = mnist.test.images, mnist.test.labels
            k = 1.0
        return {x: xs, y_: ys, keep_prob: k}
    # run the training
    for i in range(max_step):
        
        if i % 100 == 0:  # Record summaries and test-set accuracy
            train_accuracy = accuracy.eval(feed_dict=feed_dict(False))
            print("step %d, training accuracy %g"%(i, train_accuracy))
            summary, _ = sess.run([summary_op, train_step], feed_dict=feed_dict(True))
            train_writer.add_summary(summary, i)
        if i % 1100 == 0:  # Record train set summaries, and train
            summary, acc = sess.run([summary_nop,accuracy], feed_dict=feed_dict(False))
            test_writer.add_summary(summary, i)
        if i % 1100 == 0 or i == max_step-1:
            checkpoint_file = os.path.join(result_dir, 'checkpoint')
            saver.save(sess, checkpoint_file, global_step=i)
            
        train_step.run(feed_dict=feed_dict(True)) # run one train_step
    # print test error
    print("test accuracy %g"%accuracy.eval(feed_dict={
    x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0}))
    stop_time = time()
    print('The training takes %f second to finish'%(stop_time - start_time))

In [80]:
if __name__ == "__main__":
    main()

step 0, training accuracy 0.1009
step 100, training accuracy 0.8614
step 200, training accuracy 0.9134
step 300, training accuracy 0.9369
step 400, training accuracy 0.9453
step 500, training accuracy 0.9518
step 600, training accuracy 0.9559
step 700, training accuracy 0.9607
step 800, training accuracy 0.9635
step 900, training accuracy 0.9653
step 1000, training accuracy 0.9683
step 1100, training accuracy 0.9722
step 1200, training accuracy 0.9726
step 1300, training accuracy 0.9749
step 1400, training accuracy 0.9772
step 1500, training accuracy 0.9752
step 1600, training accuracy 0.9758
step 1700, training accuracy 0.9767
step 1800, training accuracy 0.9798
step 1900, training accuracy 0.9788
step 2000, training accuracy 0.9805
step 2100, training accuracy 0.98
step 2200, training accuracy 0.9806
step 2300, training accuracy 0.9811
step 2400, training accuracy 0.9823
step 2500, training accuracy 0.9838
step 2600, training accuracy 0.9838
step 2700, training accuracy 0.9841
step 2

In what follows, you may see a bunch of figures obtained from tensorboard. There were other tags (for different time intervals) of layer_1, but I included the last one here. The order of the images is follows. First, the statistical figures of one layer appears, then its histogram, the comes 2nd layer and so on. This order is also used for point c of this problem (next section).

<img src="layer1_1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="layer1_2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="hist_layer1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="layer2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="layer2_2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="hist_layer2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="layerfc1_1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="layerfc1_2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="hist_layerfc1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="layerfc2_1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="layerfc2_2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="hist_layer_fc2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="test_ac.PNG" alt="Drawing" style="width: 550px;"/>

# 2.c. Tome for More Fun!!! :(
In this section, I'm going to test the following setups:
1. Non-linearity: Tanh, Sigmoid, leaky-ReLu
2. Initialization technique: variance_scaling_initializer
3. Training algorithm: GradientDescentOptimizer, AdagradOptimizer

I will provide results of using any 3 combinations of these parameters, so there are goining to be a lot of figures.

In [10]:
def main_new(train_func,act_func):
    # Specify training parameters
    result_dir = './results/' # directory where the results from the training are saved
    max_step = 5500 # the maximum iterations. After max_step iterations, the training will stop no matter what
    start_time = time() # start timing
    # FILL IN THE CODE BELOW TO BUILD YOUR NETWORK
    # placeholders for input data and input labeles
    x = tf.placeholder(tf.float32,[None,784],name='x')
    y_ = tf.placeholder(tf.float32,[None,10],name='y_')
    # reshape the input image
    x_image = tf.reshape(x, [-1, 28, 28, 1])
    # first convolutional layer
    with tf.name_scope("layer1"):
        # This Variable will hold the state of the weights for the layer
        with tf.name_scope('weights'):
            W_conv1 = tf.get_variable("W_conv1", shape=[5,5,1,32], \
                                      initializer=tf.contrib.layers.variance_scaling_initializer(1,mode="FAN_AVG",uniform=True))
            w1m1,w1m2,w1m3,w1m4,w1m5 = variable_summaries(W_conv1)
        with tf.name_scope('biases'):
            b_conv1 = tf.Variable(tf.random_normal([32]))
            b1m1,b1m2,b1m3,b1m4,b1m5 = variable_summaries(b_conv1)
        with tf.name_scope('preactivate'):
            preactivate1 = conv2d(x_image,W_conv1) + b_conv1
            p1m1,p1m2,p1m3,p1m4,p1m5 = variable_summaries(preactivate1)
            #tf.summary.histogram('pre_activations', preactivate)
        with tf.name_scope('postactivate'):
            h_conv1 = act_func(preactivate1)
            hc1m1,hc1m2,hc1m3,hc1m4,hc1m5 = variable_summaries(h_conv1)
            #tf.summar.histogram
        with tf.name_scope("afterMaxPool"):
            h_pool1 = max_pool_2x2(h_conv1)
            hp1m1,hp1m2,hp1m3,hp1m4,hp1m5 = variable_summaries(h_pool1)
    
    # second convolutional layer
    with tf.name_scope("layer2"):
        # This Variable will hold the state of the weights for the layer
        with tf.name_scope('weights'):
            W_conv2 = tf.get_variable("W_conv2", shape=[5,5,32,64], \
                                      initializer=tf.contrib.layers.variance_scaling_initializer(2,mode="FAN_AVG",uniform=True))
            w2m1,w2m2,w2m3,w2m4,w2m5 = variable_summaries(W_conv2)
        with tf.name_scope('biases'):
            b_conv2 = tf.Variable(tf.random_normal([64]))
            b2m1,b2m2,b2m3,b2m4,b2m5 = variable_summaries(b_conv2)
        with tf.name_scope('preactivate'):
            preactivate2 = conv2d(h_pool1,W_conv2) + b_conv2
            p2m1,p2m2,p2m3,p2m4,p2m5 = variable_summaries(preactivate2)
            #tf.summary.histogram('pre_activations', preactivate)
        with tf.name_scope('postactivate'):
            h_conv2 = act_func(preactivate2)
            hc2m1,hc2m2,hc2m3,hc2m4,hc2m5 = variable_summaries(h_conv2)
            #tf.summar.histogram
        with tf.name_scope("afterMaxPool"):
            h_pool2 = max_pool_2x2(h_conv2)
            hp2m1,hp2m2,hp2m3,hp2m4,hp2m5 = variable_summaries(h_pool2)
    # densely connected layer
    with tf.name_scope("layerfc1"):
        # This Variable will hold the state of the weights for the layer
        with tf.name_scope('weights'):
            W_fc1 = tf.get_variable("W_fc1", shape=[7*7*64,1024], initializer=tf.contrib.layers.variance_scaling_initializer())
            wf1m1,wf1m2,wf1m3,wf1m4,wf1m5 = variable_summaries(W_fc1)
        with tf.name_scope('biases'):
            b_fc1 = tf.Variable(tf.random_normal([1024]))
            bf1m1,bf1m2,bf1m3,bf1m4,bf1m5 = variable_summaries(b_fc1)
        with tf.name_scope('preactivate'):
            h_pool2_flat = tf.reshape(h_pool2,[-1,7*7*64])
            pf1m1,pf1m2,pf1m3,pf1m4,pf1m5 = variable_summaries(h_pool2_flat)
            #tf.summar.histogram
        with tf.name_scope("postactivate"):
            h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat,W_fc1) + b_fc1)
            hf1m1,hf1m2,hf1m3,hf1m4,hf1m5 = variable_summaries(h_fc1)
    # dropout
    keep_prob = tf.placeholder(tf.float32)
    h_fc1_drop = tf.nn.dropout(h_fc1, rate = 1-keep_prob)
    # softmax
    with tf.name_scope("layerfc2"):
        # This Variable will hold the state of the weights for the layer
        with tf.name_scope('weights'):
            W_fc2 = tf.get_variable("W_fc2", shape=[1024,10], initializer=tf.contrib.layers.variance_scaling_initializer())
            wf2m1,wf2m2,wf2m3,wf2m4,wf2m5 = variable_summaries(W_fc2)
        with tf.name_scope('biases'):
            b_fc2 = tf.Variable(tf.random_normal([10]))
            bf2m1,bf2m2,bf2m3,bf2m4,bf2m5 = variable_summaries(b_fc2)
        with tf.name_scope('preactivate'):
            preactivatefc2 = tf.matmul(h_fc1_drop,W_fc2) + b_fc2
            pf2m1,pf2m2,pf2m3,pf2m4,pf2m5 = variable_summaries(preactivatefc2)
        with tf.name_scope("postactivate"):
            y_conv = tf.nn.softmax(preactivatefc2, name='y_conv')
            hf2m1,hf2m2,hf2m3,hf2m4,hf2m5 = variable_summaries(y_conv)
    # FILL IN THE FOLLOWING CODE TO SET UP THE TRAINING
    # setup training
    cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_*tf.log(y_conv),reduction_indices=[1]))
    train_step = train_func.minimize(cross_entropy)
    correct_prediction = tf.equal(tf.argmax(y_conv,1),tf.argmax(y_,1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction,tf.float32),name='accuracy')
    # Add a scalar summary for the snapshot loss.
    loss_summary = tf.summary.scalar(cross_entropy.op.name, cross_entropy)
    
    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.summary.merge([loss_summary,w1m1,w1m2,w1m3,w1m4,w1m5, \
                                   b1m1,b1m2,b1m3,b1m4,b1m5,p1m1,p1m2,p1m3,p1m4,p1m5,\
                                   hc1m1,hc1m2,hc1m3,hc1m4,hc1m5,hp1m1,hp1m2,hp1m3,hp1m4,hp1m5, \
                                   w2m1,w2m2,w2m3,w2m4,w2m5, \
                                   b2m1,b2m2,b2m3,b2m4,b2m5,p2m1,p2m2,p2m3,p2m4,p2m5,\
                                   hc2m1,hc2m2,hc2m3,hc2m4,hc2m5,hp2m1,hp2m2,hp2m3,hp2m4,hp2m5, \
                                   wf1m1,wf1m2,wf1m3,wf1m4,wf1m5, \
                                   bf1m1,bf1m2,bf1m3,bf1m4,bf1m5,pf1m1,pf1m2,pf1m3,pf1m4,pf1m5,\
                                   hf1m1,hf1m2,hf1m3,hf1m4,hf1m5, \
                                   wf2m1,wf2m2,wf2m3,wf2m4,wf2m5, \
                                   bf2m1,bf2m2,bf2m3,bf2m4,bf2m5,pf2m1,pf2m2,pf2m3,pf2m4,pf2m5,\
                                   hf2m1,hf2m2,hf2m3,hf2m4,hf2m5])
    # summary_nop records summary of test accuracy
    summary_nop = tf.summary.merge([tf.summary.scalar("test_accuracy",accuracy)])
    # Add the variable initializer Op.
    init = tf.global_variables_initializer()
    # Create a saver for writing training checkpoints.
    saver = tf.train.Saver()
    # Instantiate a SummaryWriter to output summaries and the Graph.
    summary_writer = tf.summary.FileWriter(result_dir, sess.graph)
    # Run the Op to initialize the variables.
    sess.run(init)
    
    train_writer = tf.summary.FileWriter(result_dir + '/train', sess.graph)
    test_writer = tf.summary.FileWriter(result_dir + '/test')
    def feed_dict(train):
        """Make a TensorFlow feed_dict: maps data onto Tensor placeholders."""
        if train:
            xs, ys = mnist.train.next_batch(50)
            k = 0.5
        else:
            xs, ys = mnist.test.images, mnist.test.labels
            k = 1.0
        return {x: xs, y_: ys, keep_prob: k}
    # run the training
    for i in range(max_step):
        
        if i % 100 == 0:  # Record summaries and test-set accuracy
            train_accuracy = accuracy.eval(feed_dict=feed_dict(False))
            print("step %d, training accuracy %g"%(i, train_accuracy))
            summary, _ = sess.run([summary_op, train_step], feed_dict=feed_dict(True))
            train_writer.add_summary(summary, i)
        if i % 1100 == 0:  # Record train set summaries, and train
            summary, acc = sess.run([summary_nop,accuracy], feed_dict=feed_dict(False))
            test_writer.add_summary(summary, i)
        if i % 1100 == 0 or i == max_step-1:
            checkpoint_file = os.path.join(result_dir, 'checkpoint')
            saver.save(sess, checkpoint_file, global_step=i)
            
        train_step.run(feed_dict=feed_dict(True)) # run one train_step
    # print test error
    print("test accuracy %g"%accuracy.eval(feed_dict={
    x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0}))
    stop_time = time()
    print('The training takes %f second to finish'%(stop_time - start_time))

In [11]:
red_color = "\033[91m {}\033[00m"
bold = "\033[1m"
bolde = "\033[0m"

In [114]:
if __name__ == "__main__":
    case = "{}{}{}".format(bold,"Setup #1:",bolde)
    print(case,red_color.format("GradientDescentOptimizer, tanh activation"))
    tf.reset_default_graph() 
    sess = tf.InteractiveSession()
    main_new(tf.train.GradientDescentOptimizer(5e-2),tf.nn.tanh)

[1mSetup #1:[0m [91m GradientDescentOptimizer, tanh activation[00m
step 0, training accuracy 0.0892
step 100, training accuracy 0.1339
step 200, training accuracy 0.7707
step 300, training accuracy 0.8863
step 400, training accuracy 0.9033
step 500, training accuracy 0.9438
step 600, training accuracy 0.9515
step 700, training accuracy 0.9569
step 800, training accuracy 0.9611
step 900, training accuracy 0.9615
step 1000, training accuracy 0.9655
step 1100, training accuracy 0.9688
step 1200, training accuracy 0.9719
step 1300, training accuracy 0.9741
step 1400, training accuracy 0.9729
step 1500, training accuracy 0.9759
step 1600, training accuracy 0.978
step 1700, training accuracy 0.9788
step 1800, training accuracy 0.9782
step 1900, training accuracy 0.981
step 2000, training accuracy 0.9802
step 2100, training accuracy 0.9809
step 2200, training accuracy 0.9817
step 2300, training accuracy 0.9818
step 2400, training accuracy 0.982
step 2500, training accuracy 0.9833
step 260

Here comes the figures:

<img src="./s1/layer1_1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s1/layer1_2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s1/hist_layer1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s1/layer2_1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s1/layer2_2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s1/hist_layer2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s1/layerfc1_1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s1/layerfc1_2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s1/hist_layerfc1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s1/layerfc2_1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s1/layerfc2_2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s1/hist_layerfc2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s1/test_ac.PNG" alt="Drawing" style="width: 550px;"/>

In [47]:
if __name__ == "__main__":
    case = "{}{}{}".format(bold,"Setup #2:",bolde)
    print(case,red_color.format("AdagradOptimizer, tanh activation"))
    tf.reset_default_graph() 
    sess = tf.InteractiveSession()
    main_new(tf.train.AdagradOptimizer(1e-2),tf.nn.tanh)

[1mSetup #2:[0m [91m AdagradOptimizer, tanh activation[00m
step 0, training accuracy 0.1028
step 100, training accuracy 0.4191
step 200, training accuracy 0.8625
step 300, training accuracy 0.8946
step 400, training accuracy 0.9214
step 500, training accuracy 0.9269
step 600, training accuracy 0.9378
step 700, training accuracy 0.9406
step 800, training accuracy 0.9516
step 900, training accuracy 0.952
step 1000, training accuracy 0.9571
step 1100, training accuracy 0.9578
step 1200, training accuracy 0.9578
step 1300, training accuracy 0.965
step 1400, training accuracy 0.9666
step 1500, training accuracy 0.9656
step 1600, training accuracy 0.9682
step 1700, training accuracy 0.9699
step 1800, training accuracy 0.9677
step 1900, training accuracy 0.9709
step 2000, training accuracy 0.972
step 2100, training accuracy 0.9735
step 2200, training accuracy 0.9742
step 2300, training accuracy 0.9758
step 2400, training accuracy 0.9744
step 2500, training accuracy 0.9763
step 2600, train

Here are the results:

<img src="./s2/layer1_1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s2/layer1_2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s2/hist_layer1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s2/layer2_1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s2/layer2_2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s2/hist_layer2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s2/layerfc1_1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s2/layerfc1_2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s2/hist_layerfc1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s2/layerfc2_1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s2/layerfc2_2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s2/hist_layerfc2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s2/test_ac.PNG" alt="Drawing" style="width: 550px;"/>

In [48]:
if __name__ == "__main__":
    case = "{}{}{}".format(bold,"Setup #3:",bolde)
    print(case,red_color.format("GradientDescentOptimizer, sigmoid activation"))
    tf.reset_default_graph() 
    sess = tf.InteractiveSession()
    main_new(tf.train.GradientDescentOptimizer(5e-2),tf.nn.sigmoid)

[1mSetup #3:[0m [91m GradientDescentOptimizer, sigmoid activation[00m
step 0, training accuracy 0.1135
step 100, training accuracy 0.1135
step 200, training accuracy 0.101
step 300, training accuracy 0.1135
step 400, training accuracy 0.1135
step 500, training accuracy 0.1028
step 600, training accuracy 0.0982
step 700, training accuracy 0.1032
step 800, training accuracy 0.1135
step 900, training accuracy 0.1135
step 1000, training accuracy 0.1135
step 1100, training accuracy 0.1032
step 1200, training accuracy 0.1032
step 1300, training accuracy 0.1893
step 1400, training accuracy 0.2079
step 1500, training accuracy 0.1031
step 1600, training accuracy 0.0989
step 1700, training accuracy 0.2703
step 1800, training accuracy 0.1601
step 1900, training accuracy 0.1403
step 2000, training accuracy 0.6055
step 2100, training accuracy 0.7121
step 2200, training accuracy 0.7743
step 2300, training accuracy 0.8169
step 2400, training accuracy 0.8008
step 2500, training accuracy 0.8411
ste

Results:

<img src="./s3/layer1_1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s3/layer1_2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s3/hist_layer1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s3/layer2_1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s3/layer2_2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s3/hist_layer2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s3/layerfc1_1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s3/layerfc1_2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s3/hist_layerfc1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s3/layerfc2_1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s3/layerfc2_2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s3/hist_layerfc2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s3/test_ac.PNG" alt="Drawing" style="width: 550px;"/>

In [13]:
if __name__ == "__main__":
    case = "{}{}{}".format(bold,"Setup #4:",bolde)
    print(case,red_color.format("AdagradOptimizer, sigmoid activation"))
    tf.reset_default_graph() 
    sess = tf.InteractiveSession()
    main_new(tf.train.AdagradOptimizer(5e-2),tf.nn.sigmoid)

[1mSetup #4:[0m [91m AdagradOptimizer, sigmoid activation[00m
step 0, training accuracy 0.0892
step 100, training accuracy 0.1135
step 200, training accuracy 0.1009
step 300, training accuracy 0.0974
step 400, training accuracy 0.1028
step 500, training accuracy 0.1028
step 600, training accuracy 0.1009
step 700, training accuracy 0.098
step 800, training accuracy 0.1028
step 900, training accuracy 0.1135
step 1000, training accuracy 0.1135
step 1100, training accuracy 0.113
step 1200, training accuracy 0.1009
step 1300, training accuracy 0.1814
step 1400, training accuracy 0.1135
step 1500, training accuracy 0.1135
step 1600, training accuracy 0.1586
step 1700, training accuracy 0.2388
step 1800, training accuracy 0.319
step 1900, training accuracy 0.3918
step 2000, training accuracy 0.3458
step 2100, training accuracy 0.6586
step 2200, training accuracy 0.8021
step 2300, training accuracy 0.8677
step 2400, training accuracy 0.8895
step 2500, training accuracy 0.9045
step 2600, tr

Results:

<img src="./s4/layer1_1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s4/layer1_2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s4/hist_layer1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s4/layer2_1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s4/layer2_2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s4/hist_layer2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s4/layerfc1_1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s4/layerfc1_2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s4/hist_layerfc1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s4/layerfc2_1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s4/layerfc2_2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s4/hist_layerfc2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s4/test_ac.PNG" alt="Drawing" style="width: 550px;"/>

In [16]:
if __name__ == "__main__":
    case = "{}{}{}".format(bold,"Setup #5:",bolde)
    print(case,red_color.format("GradientDescentOptimizer, leaky_relu activation"))
    tf.reset_default_graph() 
    sess = tf.InteractiveSession()
    main_new(tf.train.GradientDescentOptimizer(1e-2),tf.nn.leaky_relu)

[1mSetup #5:[0m [91m GradientDescentOptimizer, leaky_relu activation[00m
step 0, training accuracy 0.1009
step 100, training accuracy 0.1154
step 200, training accuracy 0.2205
step 300, training accuracy 0.775
step 400, training accuracy 0.8373
step 500, training accuracy 0.8967
step 600, training accuracy 0.9025
step 700, training accuracy 0.9228
step 800, training accuracy 0.929
step 900, training accuracy 0.9324
step 1000, training accuracy 0.9368
step 1100, training accuracy 0.9436
step 1200, training accuracy 0.9479
step 1300, training accuracy 0.9482
step 1400, training accuracy 0.9513
step 1500, training accuracy 0.9513
step 1600, training accuracy 0.953
step 1700, training accuracy 0.9535
step 1800, training accuracy 0.9552
step 1900, training accuracy 0.9601
step 2000, training accuracy 0.9621
step 2100, training accuracy 0.9628
step 2200, training accuracy 0.9658
step 2300, training accuracy 0.9641
step 2400, training accuracy 0.9669
step 2500, training accuracy 0.9675
st

Results:

<img src="./s5/layer1_1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s5/layer1_2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s5/hist_layer1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s5/layer2_1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s5/layer2_2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s5/hist_layer2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s5/layerfc1_1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s5/layerfc1_2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s5/hist_layerfc1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s5/layerfc2_1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s5/layerfc2_2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s5/hist_layerfc2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s5/test_ac.PNG" alt="Drawing" style="width: 550px;"/>

In [17]:
if __name__ == "__main__":
    case = "{}{}{}".format(bold,"Setup #6:",bolde)
    print(case,red_color.format("AdagradOptimizer, leaky_relu activation"))
    tf.reset_default_graph() 
    sess = tf.InteractiveSession()
    main_new(tf.train.AdagradOptimizer(5e-2,2),tf.nn.leaky_relu)

[1mSetup #6:[0m [91m AdagradOptimizer, leaky_relu activation[00m
step 0, training accuracy 0.0958
step 100, training accuracy 0.1219
step 200, training accuracy 0.7981
step 300, training accuracy 0.8798
step 400, training accuracy 0.9167
step 500, training accuracy 0.9264
step 600, training accuracy 0.9415
step 700, training accuracy 0.9479
step 800, training accuracy 0.9553
step 900, training accuracy 0.9556
step 1000, training accuracy 0.9627
step 1100, training accuracy 0.9641
step 1200, training accuracy 0.9549
step 1300, training accuracy 0.9687
step 1400, training accuracy 0.9675
step 1500, training accuracy 0.9691
step 1600, training accuracy 0.969
step 1700, training accuracy 0.9683
step 1800, training accuracy 0.975
step 1900, training accuracy 0.9736
step 2000, training accuracy 0.9737
step 2100, training accuracy 0.9778
step 2200, training accuracy 0.9759
step 2300, training accuracy 0.976
step 2400, training accuracy 0.9776
step 2500, training accuracy 0.9776
step 2600,

Results:

<img src="./s6/layer1_1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s6/layer1_2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s6/hist_layer1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s6/layer2_1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s6/layer2_2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s6/hist_layer2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s6/layerfc1_1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s6/layerfc1_2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s6/hist_layerfc1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s6/layerfc2_1.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s6/layerfc2_2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s6/hist_layerfc2.PNG" alt="Drawing" style="width: 1500px;"/>
<img src="./s6/test_ac.PNG" alt="Drawing" style="width: 550px;"/>

# 2.c. continued: Describe what you observe

This part really depends on the set of parameters (activation function, learning algorithm, ...) one has picked for comparison, so based on the parameters I chose, there are a couple of interesting things to note:
1. Adagrad and Gradient Descent perform almost the same in all setups. i.e., when everything is fixed, the final accuracy of these two methods is pretty close. This can be understood by comparing setup #1 with setup #2, or setup #3 with setup #4, or setup #5 with setup #6.
2. Sigmoid is the worst function among the ones I tested in terms of stability and final test as well as train accuracy. As you may see in the result of setup #3 or #4, for the first 1700 iterations, the training accuracy goes back and forth and wanders around %10.
3. Based on the result of setup #5, if leaky_ReLu is chosen as the activation function, a smaller learning rate is needed to use the Gradient Descent compared with setup #1 where tanh is employed.
4. To wrap up the accuracy discussion, GradientDecent alongside with tanh is the best choice of parameters when using the variance scaling initializer.