<a href="https://colab.research.google.com/github/tiensu/Coding-The-Deep-Learning-Revolution/blob/master/weight_initialization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import tensorflow as tf
import os
import numpy as np
from tensorflow import keras

In [0]:
# Root directory under which one TensorBoard log folder per experiment is created;
# sub-folder names are appended to it by plain string concatenation, hence the
# trailing separator.
# Every backslash is escaped explicitly: the former `PRACTICE\T...` relied on `\T`
# being an unrecognized escape sequence, which Python keeps verbatim but warns about.
# NOTE(review): this is an absolute, machine-specific Windows path - adjust it
# before running the notebook on another machine.
base_path = 'E:\\MACHINE_LEARNING\\CODING_THE_DEEP_LEARNING_REVOLUTION\\PRACTICE\\TensorBoard\\'

In [0]:
# Load the MNIST train and test splits through Keras.
# The training arrays are sampled by get_batch(): images are fed to a
# [None, 28, 28] placeholder and labels are reshaped to a column before feeding.
# x_test / y_test are not used by the code visible in this notebook.
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

In [0]:
def get_batch(x_data, y_data, batch_size):
    """Draw a random mini-batch of samples together with their labels.

    Indices are drawn independently with ``np.random.randint``, i.e. with
    replacement, so the same sample can occur more than once in a batch.

    Args:
        x_data: 3-D array indexed as ``x_data[sample, :, :]``.
        y_data: array of labels; its length defines the sampling range.
        batch_size: number of indices to draw.

    Returns:
        Tuple ``(x_batch, y_batch)`` selected by the same random indices.
    """
    sample_count = len(y_data)
    chosen = np.random.randint(low=0, high=sample_count, size=batch_size)
    x_batch = x_data[chosen, :, :]
    y_batch = y_data[chosen]
    return x_batch, y_batch

In [0]:
def maybe_create_folder_structure(sub_folders):
    """Make sure every given sub-folder exists below the module-level ``base_path``.

    Each target path is ``base_path + fold`` (plain string concatenation).
    Folders that already exist are left untouched, so the call is idempotent.

    Args:
        sub_folders: iterable of folder names to create under ``base_path``.

    Raises:
        OSError: if a directory cannot be created, e.g. because the path
            already exists as a regular file.
    """
    for fold in sub_folders:
        # exist_ok=True replaces the separate isdir() check and so removes the
        # window in which another process could create the folder in between
        os.makedirs(base_path + fold, exist_ok=True)

In [0]:
class Model(object):
    """Fully connected classifier for 28x28 images with TensorBoard instrumentation.

    The whole TF1 graph is built once, at construction time, in the current
    default graph: ``num_layers - 1`` hidden dense layers followed by a 10-unit
    linear output layer, a softmax cross-entropy loss and an Adam training op.

    Attributes created by ``_model_def``:
        input_images: float32 placeholder of shape [None, 28, 28].
        labels: int64 placeholder of shape [None, 1] holding class indices.
        loss: mean softmax cross entropy over the batch.
        optimizer: the ``AdamOptimizer().minimize(loss)`` op.
        accuracy: fraction of samples whose arg-max logit matches the label.
        merged: result of ``tf.summary.merge_all()``.
        init_op: result of ``tf.global_variables_initializer()``.
    """

    def __init__(self, initialization, activation, num_layers=3, hidden_size=100):
        """Store the hyper-parameters and build the graph.

        Args:
            initialization: kernel initializer passed to every hidden layer.
            activation: activation function passed to every hidden layer.
            num_layers: total number of dense layers, output layer included.
            hidden_size: number of units in each hidden layer.
        """
        self._init = initialization
        self._activation = activation
        # total number of dense layers INCLUDING the output layer: the graph gets
        # num_layers - 1 hidden layers plus one output layer named 'layer{num_layers}'
        self._num_layers = num_layers
        self._hidden_size = hidden_size
        self._model_def()

    def _model_def(self):
        """Create placeholders, layers, loss, optimizer, accuracy and summaries."""
        # create placeholder for input
        self.input_images = tf.placeholder(tf.float32, shape=[None, 28, 28])
        # reshape input x - for 28x28 pixels = 784
        x_rs = tf.reshape(self.input_images, shape=[-1, 784])
        # scale the input data
        net = tf.div(x_rs, 255.0)
        # create placeholder for label
        self.labels = tf.placeholder(tf.int64, shape=[None, 1])
        # convert label data to one hot values; the reshape drops the extra
        # axis that one_hot adds for the [None, 1] label column
        y_one_hot = tf.reshape(tf.one_hot(self.labels, 10), shape=[-1, 10])

        # create the hidden dense layers of the model
        tf.summary.scalar('input_var', self._calculate_variance(net))
        for i in range(self._num_layers-1):
            net = tf.layers.dense(net, self._hidden_size, kernel_initializer=self._init,
                                  activation=self._activation, name='layer{}'.format(i+1))
            # get the input to the nodes (output of the layer's MatMul op)
            mat_mul_in = tf.get_default_graph().get_tensor_by_name('layer{}/MatMul:0'.format(i+1))
            # log pre and post activation function histogram
            tf.summary.histogram('mat_mul_in_{}'.format(i+1), mat_mul_in)
            tf.summary.histogram('fc_out_{}'.format(i+1), net)
            # also log the variance of mat mul
            tf.summary.scalar('mat_mul_var_{}'.format(i+1), self._calculate_variance(mat_mul_in))

        # create output layer. Do not supply an activation for the output layer. The loss function definition will supply
        # softmax activation. This defaults to a linear activation i.e. f(x) = x
        # Note that no kernel_initializer is passed here either.
        logits = tf.layers.dense(net, 10, name='layer{}'.format(self._num_layers))
        mat_mul_in = tf.get_default_graph().get_tensor_by_name('layer{}/MatMul:0'.format(self._num_layers))
        # the tag prefix differs from the hidden layers ('mat_mul_hist_' vs 'mat_mul_in_');
        # it is kept as is so existing TensorBoard tags stay stable
        tf.summary.histogram('mat_mul_hist_{}'.format(self._num_layers), mat_mul_in)
        # log the output layer's own output (the logits); logging the previous
        # layer's activations here would merely duplicate the preceding 'fc_out' histogram
        tf.summary.histogram('fc_out_{}'.format(self._num_layers), logits)

        # define loss function, use softmax cross entropy with logits - no need to apply softmax activation to logits
        self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=y_one_hot))
        # define optimizer function
        self.optimizer = tf.train.AdamOptimizer().minimize(self.loss)
        # define accuracy function
        self.accuracy = self._compute_accuracy(logits, y_one_hot)

        # add the loss, accuracy to the summary
        tf.summary.scalar('loss', self.loss)
        tf.summary.scalar('acc', self.accuracy)
        self.merged = tf.summary.merge_all()

        # define init variables function
        self.init_op = tf.global_variables_initializer()

    def _compute_accuracy(self, logits, labels):
        """Return the mean of arg-max matches between ``logits`` and one-hot ``labels``."""
        prediction = tf.argmax(logits, 1)
        equality = tf.equal(prediction, tf.argmax(labels, 1))
        accuracy = tf.reduce_mean(tf.cast(equality, tf.float32))
        return accuracy

    def _calculate_variance(self, x):
        """Return the mean squared deviation of ``x`` from its overall mean."""
        mean = tf.reduce_mean(x)
        sqr = tf.square(x-mean)
        return tf.reduce_mean(sqr)

In [0]:
def init_pass_through(model, fold):
    """Log the model's summaries for one batch right after initialization, without training.

    Initializes the variables, draws 100 random training samples, evaluates
    ``model.merged`` on them and writes the result (step 0) plus the graph to
    ``base_path + fold``.

    Args:
        model: object exposing ``init_op``, ``merged``, ``input_images`` and ``labels``.
        fold: name of the log sub-folder below ``base_path``.
    """
    with tf.Session() as sess:
        sess.run(model.init_op)
        train_writer = tf.summary.FileWriter(base_path+fold, sess.graph)
        try:
            image_batch, label_batch = get_batch(x_train, y_train, 100)
            # labels are fed as a column to match the [None, 1] placeholder
            summary = sess.run(model.merged, feed_dict={model.input_images: image_batch, model.labels: label_batch.reshape(-1, 1)})
            train_writer.add_summary(summary, 0)
        finally:
            # close the writer so the single summary is flushed to disk even
            # though the function returns immediately afterwards
            train_writer.close()

In [0]:
def train_model(model, fold, batch_size, epochs):
    """Train the model on random mini-batches and log summaries every 50 steps.

    Note that ``epochs`` counts training iterations (one random batch each),
    not full passes over the training set.

    Args:
        model: object exposing ``init_op``, ``loss``, ``optimizer``, ``accuracy``,
            ``merged``, ``input_images`` and ``labels``.
        fold: name of the log sub-folder below ``base_path``.
        batch_size: number of samples drawn per iteration.
        epochs: number of training iterations to run.
    """
    with tf.Session() as sess:
        sess.run(model.init_op)
        train_writer = tf.summary.FileWriter(base_path+fold, sess.graph)
        try:
            for i in range(epochs):
                image_batch, label_batch = get_batch(x_train, y_train, batch_size)
                # labels are fed as a column to match the [None, 1] placeholder
                feed = {model.input_images: image_batch, model.labels: label_batch.reshape(-1, 1)}
                loss, _, acc = sess.run([model.loss, model.optimizer, model.accuracy],
                                        feed_dict=feed)
                if i%50==0:
                    print('Iteration {} of {} - loss: {:.3f}, training accuracy: {:.2f}%'.format(i, epochs, loss, acc*100))
                    # evaluated in a second run, i.e. after this step's weight update
                    summary = sess.run(model.merged, feed_dict=feed)
                    train_writer.add_summary(summary, i)
        finally:
            # close the writer so buffered events reach the log file, also
            # when training is interrupted by an exception
            train_writer.close()

In [0]:
if __name__ == '__main__':
    # Three parallel lists describe the experiments: entry k of each list gives
    # the log folder, the hidden-layer kernel initializer and the hidden-layer
    # activation of experiment k.
    sub_folders = [
        'first_pass_normal',
        'first_pass_variance',
        'full_train_normal',
        'full_train_variance',
        'full_train_normal_relu',
        'full_train_variance_relu',
        'full_train_he_relu',
    ]
    initializers = [
        tf.random_normal_initializer,
        tf.contrib.layers.variance_scaling_initializer(factor=1.0, mode='FAN_AVG', uniform=False),
        tf.random_normal_initializer,
        tf.contrib.layers.variance_scaling_initializer(factor=1.0, mode='FAN_AVG', uniform=False),
        tf.random_normal_initializer,
        tf.contrib.layers.variance_scaling_initializer(factor=1.0, mode='FAN_AVG', uniform=False),
        tf.contrib.layers.variance_scaling_initializer(factor=2.0, mode='FAN_IN', uniform=False),
    ]
    activations = [
        tf.sigmoid,
        tf.sigmoid,
        tf.sigmoid,
        tf.sigmoid,
        tf.nn.relu,
        tf.nn.relu,
        tf.nn.relu,
    ]

    # the lists are consumed in lock-step below, so they must line up
    assert len(sub_folders) == len(initializers) == len(activations)

    maybe_create_folder_structure(sub_folders)

    for fold, init, act in zip(sub_folders, initializers, activations):
        # each experiment gets a fresh graph so layer names can be reused
        tf.reset_default_graph()
        model = Model(init, act)
        if 'first_pass' in fold:
            # 'first_pass' runs only log the freshly initialized network
            init_pass_through(model, fold)
        else:
            # all other runs train for 5000 iterations with batches of 32
            train_model(model, fold, 32, 5000)

Iteration 0 of 5000 - loss: 2.557, training accuracy: 12.50%
Iteration 50 of 5000 - loss: 1.960, training accuracy: 37.50%
Iteration 100 of 5000 - loss: 1.732, training accuracy: 43.75%
Iteration 150 of 5000 - loss: 1.265, training accuracy: 81.25%
Iteration 200 of 5000 - loss: 1.415, training accuracy: 53.12%
Iteration 250 of 5000 - loss: 1.000, training accuracy: 71.88%
Iteration 300 of 5000 - loss: 1.078, training accuracy: 71.88%
Iteration 350 of 5000 - loss: 0.766, training accuracy: 81.25%
Iteration 400 of 5000 - loss: 0.760, training accuracy: 78.12%
Iteration 450 of 5000 - loss: 0.549, training accuracy: 87.50%
Iteration 500 of 5000 - loss: 0.843, training accuracy: 71.88%
Iteration 550 of 5000 - loss: 0.545, training accuracy: 84.38%
Iteration 600 of 5000 - loss: 0.584, training accuracy: 84.38%
Iteration 650 of 5000 - loss: 0.355, training accuracy: 93.75%
Iteration 700 of 5000 - loss: 0.675, training accuracy: 78.12%
Iteration 750 of 5000 - loss: 0.649, training accuracy: 84

Iteration 1500 of 5000 - loss: 0.237, training accuracy: 93.75%
Iteration 1550 of 5000 - loss: 0.096, training accuracy: 100.00%
Iteration 1600 of 5000 - loss: 0.116, training accuracy: 96.88%
Iteration 1650 of 5000 - loss: 0.119, training accuracy: 96.88%
Iteration 1700 of 5000 - loss: 0.090, training accuracy: 100.00%
Iteration 1750 of 5000 - loss: 0.204, training accuracy: 96.88%
Iteration 1800 of 5000 - loss: 0.205, training accuracy: 90.62%
Iteration 1850 of 5000 - loss: 0.110, training accuracy: 96.88%
Iteration 1900 of 5000 - loss: 0.368, training accuracy: 84.38%
Iteration 1950 of 5000 - loss: 0.408, training accuracy: 87.50%
Iteration 2000 of 5000 - loss: 0.316, training accuracy: 87.50%
Iteration 2050 of 5000 - loss: 0.049, training accuracy: 96.88%
Iteration 2100 of 5000 - loss: 0.215, training accuracy: 90.62%
Iteration 2150 of 5000 - loss: 0.493, training accuracy: 87.50%
Iteration 2200 of 5000 - loss: 0.292, training accuracy: 87.50%
Iteration 2250 of 5000 - loss: 0.193, 

Iteration 3000 of 5000 - loss: 0.477, training accuracy: 87.50%
Iteration 3050 of 5000 - loss: 0.423, training accuracy: 75.00%
Iteration 3100 of 5000 - loss: 1.140, training accuracy: 81.25%
Iteration 3150 of 5000 - loss: 0.692, training accuracy: 90.62%
Iteration 3200 of 5000 - loss: 1.234, training accuracy: 93.75%
Iteration 3250 of 5000 - loss: 0.220, training accuracy: 90.62%
Iteration 3300 of 5000 - loss: 0.723, training accuracy: 81.25%
Iteration 3350 of 5000 - loss: 0.127, training accuracy: 96.88%
Iteration 3400 of 5000 - loss: 1.146, training accuracy: 75.00%
Iteration 3450 of 5000 - loss: 1.431, training accuracy: 87.50%
Iteration 3500 of 5000 - loss: 0.270, training accuracy: 90.62%
Iteration 3550 of 5000 - loss: 0.781, training accuracy: 87.50%
Iteration 3600 of 5000 - loss: 0.869, training accuracy: 93.75%
Iteration 3650 of 5000 - loss: 1.385, training accuracy: 78.12%
Iteration 3700 of 5000 - loss: 0.580, training accuracy: 87.50%
Iteration 3750 of 5000 - loss: 0.307, tr

Iteration 4450 of 5000 - loss: 0.013, training accuracy: 100.00%
Iteration 4500 of 5000 - loss: 0.101, training accuracy: 96.88%
Iteration 4550 of 5000 - loss: 0.010, training accuracy: 100.00%
Iteration 4600 of 5000 - loss: 0.020, training accuracy: 100.00%
Iteration 4650 of 5000 - loss: 0.060, training accuracy: 96.88%
Iteration 4700 of 5000 - loss: 0.012, training accuracy: 100.00%
Iteration 4750 of 5000 - loss: 0.104, training accuracy: 96.88%
Iteration 4800 of 5000 - loss: 0.041, training accuracy: 96.88%
Iteration 4850 of 5000 - loss: 0.008, training accuracy: 100.00%
Iteration 4900 of 5000 - loss: 0.022, training accuracy: 100.00%
Iteration 4950 of 5000 - loss: 0.031, training accuracy: 100.00%
Iteration 0 of 5000 - loss: 2.548, training accuracy: 12.50%
Iteration 50 of 5000 - loss: 0.529, training accuracy: 81.25%
Iteration 100 of 5000 - loss: 0.118, training accuracy: 100.00%
Iteration 150 of 5000 - loss: 0.450, training accuracy: 90.62%
Iteration 200 of 5000 - loss: 0.317, tr