In [1]:
import warnings
warnings.filterwarnings("ignore")
import math
import sys
import time
import numpy as np
import os
import tensorflow as tf

def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

In [17]:
height = 28
width = 28
channels = 1
n_inputs = height * width

conv1_fmaps = 32
conv1_ksize = 3
conv1_stride = 1
conv1_pad = "SAME"

conv2_fmaps = 64
conv2_ksize = 3
conv2_stride = 2
conv2_pad = "SAME"

pool3_fmaps = conv2_fmaps

n_fc1 = 64
n_outputs = 10

reset_graph()

with tf.name_scope("inputs"):
    X = tf.placeholder(tf.float32, shape=[None, n_inputs], name="X")
    X_reshaped = tf.reshape(X, shape=[-1, height, width, channels])
    y = tf.placeholder(tf.int32, shape=[None], name="y")

conv1 = tf.layers.conv2d(X_reshaped, filters=conv1_fmaps, kernel_size=conv1_ksize,
                         strides=conv1_stride, padding=conv1_pad,
                         activation=tf.nn.relu, name="conv1")
conv2 = tf.layers.conv2d(conv1, filters=conv2_fmaps, kernel_size=conv2_ksize,
                         strides=conv2_stride, padding=conv2_pad,
                         activation=tf.nn.relu, name="conv2")

with tf.name_scope("pool3"):
    pool3 = tf.nn.max_pool(conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID")
    pool3_flat = tf.reshape(pool3, shape=[-1, pool3_fmaps * 7 * 7])

with tf.name_scope("fc1"):
    fc1 = tf.layers.dense(pool3_flat, n_fc1, activation=tf.nn.relu, name="fc1")

with tf.name_scope("output"):
    logits = tf.layers.dense(fc1, n_outputs, name="output")
    Y_proba = tf.nn.softmax(logits, name="Y_proba")

with tf.name_scope("train"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y)
    loss = tf.reduce_mean(xentropy)
    optimizer = tf.train.AdamOptimizer()
    training_op = optimizer.minimize(loss)

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

with tf.name_scope("init_and_save"):
    init = tf.global_variables_initializer()
    saver = tf.train.Saver()

In [4]:
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()
X_train = X_train.astype(np.float32).reshape(-1, 28*28) / 255.0
X_test = X_test.astype(np.float32).reshape(-1, 28*28) / 255.0
y_train = y_train.astype(np.int32)
y_test = y_test.astype(np.int32)
X_valid, X_train = X_train[:5000], X_train[5000:]
y_valid, y_train = y_train[:5000], y_train[5000:]

In [5]:
def shuffle_batch(X, y, batch_size):
    rnd_idx = np.random.permutation(len(X))
    n_batches = len(X) // batch_size
    for batch_idx in np.array_split(rnd_idx, n_batches):
        X_batch, y_batch = X[batch_idx], y[batch_idx]
        yield X_batch, y_batch

In [20]:
from datetime import datetime
now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
root_logdir = "tf_logs"
logdir = "{}/run-{}".format(root_logdir, now)

loss_summary = tf.summary.scalar("Loss", loss)
accuracy_summary = tf.summary.scalar("Accuracy", accuracy)
file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())

In [21]:
n_epochs = 5
batch_size = 100
n_batch = np.ceil(len(X_train) / batch_size)

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        summary_loss, summary_acc = sess.run([loss_summary, accuracy_summary], feed_dict={X: X_batch, y: y_batch})
        file_writer.add_summary(summary_loss)
        file_writer.add_summary(summary_acc)
        acc_batch = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
        acc_test = accuracy.eval(feed_dict={X: X_test, y: y_test})
        print(epoch, "Last batch accuracy:", acc_batch, "Test accuracy:", acc_test)

        save_path = saver.save(sess, "./checkpoints/my_mnist_model")

0 Last batch accuracy: 0.99 Test accuracy: 0.9781
1 Last batch accuracy: 0.97 Test accuracy: 0.9831
2 Last batch accuracy: 0.98 Test accuracy: 0.9799
3 Last batch accuracy: 0.98 Test accuracy: 0.9881
4 Last batch accuracy: 0.99 Test accuracy: 0.9876


In [22]:
file_writer.close()

------

In [3]:
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()
X_train = X_train.astype(np.float32) / 255.0
X_test = X_test.astype(np.float32) / 255.0
y_train = y_train.astype(np.int32)
y_test = y_test.astype(np.int32)
X_valid, X_train = X_train[:5000], X_train[5000:]
y_valid, y_train = y_train[:5000], y_train[5000:]

In [4]:
class mnist:
    def __init__(self, X, y):
        indices = np.random.permutation(len(X))
        self.X = X[indices]
        self.y = y[indices]
        self.i = 0

    def next_batch(self, batch_size):
        X_batch = self.X[self.i * batch_size : (self.i + 1) * batch_size]
        y_batch = self.y[self.i * batch_size : (self.i + 1) * batch_size]
        self.i += 1
        return X_batch, y_batch

In [65]:
reset_graph()

In [14]:
height = 28
width = 28
inputs = 28 * 28
n_outputs = 10
learning_rate = 0.001


with tf.name_scope("inputs"):
    X = tf.placeholder(tf.float32, shape=[None, height, width], name="X")
    X_reshaped = tf.reshape(X, shape=[-1, height, width, 1])
    y = tf.placeholder(tf.int32, shape=[None], name="y")

with tf.name_scope("model"):
    conv1 = tf.layers.conv2d(X_reshaped, filters=32, kernel_size=3, strides=[1, 1], 
                        padding="SAME", activation=tf.nn.relu, name="conv1")
    conv2 = tf.layers.conv2d(conv1, filters=64, kernel_size=3, strides=[1, 1],
                        padding="same", activation=tf.nn.relu, name="conv2")
    max_pool = tf.nn.max_pool(conv2, ksize=[1,2,2,1], strides=[1,2,2,1], 
                        padding="VALID", name="max_pooling")
    max_pool_flat = tf.reshape(max_pool, shape=[-1, 64 * 14 * 14])
    dense = tf.layers.dense(max_pool_flat, units=128, activation=tf.nn.relu, name="dense")
    logits = tf.layers.dense(dense, units=n_outputs, name="dense_layer")

with tf.name_scope("train"):
    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y))
    optimizer = tf.train.AdamOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)
    
with tf.name_scope("train"):
    correct = tf.equal(tf.argmax(logits, 1), tf.cast(y, tf.int64))
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    init = tf.global_variables_initializer()

In [5]:
n_epochs = 20
batch_size = 128

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        start = time.time()
        data = mnist(X_train, y_train)
        n_batches = math.ceil(len(X_train) / batch_size)
        for i in range(n_batches):
            X_batch, y_batch = data.next_batch(batch_size)
            print("\r{}%".format(100 * i // n_batches), end="")
            sys.stdout.flush()
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        print("\rtraining time: %.2fs" % (time.time() - start))
            
        loss_train, acc = sess.run([loss, accuracy], feed_dict={ 
                                X: X_batch, y: y_batch})
        acc_test = accuracy.eval({X: X_test, y: y_test})
        print("\rEpoch: %s" % (epoch + 1),
                "\ttraining loss: %.4f" % loss_train, 
                "\ttraining accuracy: %.4f" % acc, 
                "\ttest accuracy: %.4f" % acc_test, 
                "\ttime: %.2fs" % (time.time() - start))

     #   batch *= 2  # conv linear + tf.nn.relu  # batchnorm

training time: 54.73s
Epoch: 1 	training loss: 0.0184 	training accuracy: 1.0000 	test accuracy: 0.9816 	time: 56.11s
training time: 5.83s
Epoch: 2 	training loss: 0.0185 	training accuracy: 1.0000 	test accuracy: 0.9852 	time: 6.11s
training time: 5.83s
Epoch: 3 	training loss: 0.0653 	training accuracy: 0.9886 	test accuracy: 0.9871 	time: 6.10s
training time: 5.83s
Epoch: 4 	training loss: 0.0016 	training accuracy: 1.0000 	test accuracy: 0.9878 	time: 6.11s
training time: 5.86s
Epoch: 5 	training loss: 0.0024 	training accuracy: 1.0000 	test accuracy: 0.9875 	time: 6.14s
training time: 5.86s
Epoch: 6 	training loss: 0.0025 	training accuracy: 1.0000 	test accuracy: 0.9885 	time: 6.13s
training time: 5.90s
Epoch: 7 	training loss: 0.0010 	training accuracy: 1.0000 	test accuracy: 0.9897 	time: 6.17s
training time: 5.87s
Epoch: 8 	training loss: 0.0053 	training accuracy: 1.0000 	test accuracy: 0.9887 	time: 6.14s
training time: 5.88s
Epoch: 9 	training loss: 0.0034 	training accurac

In [13]:
reset_graph()

In [15]:
n_epochs = 20
batch_size = 128

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        start = time.time()
        data = mnist(X_train, y_train)
        n_batches = math.ceil(len(X_train) / batch_size)
        print("batch size: ", batch_size, "n_batches: ", n_batches)
        for i in range(n_batches):
            X_batch, y_batch = data.next_batch(batch_size)
            print("\r{}%".format(100 * i // n_batches), end="")
            sys.stdout.flush()
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        print("\rtraining time: %.2fs" % (time.time() - start))    
        
        loss_train, acc = sess.run([loss, accuracy], feed_dict={ 
                                X: X_batch, y: y_batch})
        acc_test = accuracy.eval({X: X_test, y: y_test})
        batch_size += 5                                ################ batch * 2,  check test ACCURACY & PAPER
        if batch_size >= (len(X_train) // 2):
            break
        print("\rEpoch: %s" % (epoch + 1),
                "\ttraining loss: %.4f" % loss_train, 
                "\ttraining accuracy: %.4f" % acc, 
                "\ttest accuracy: %.4f" % acc_test, 
                "\ttime: %.2fs" % (time.time() - start))

batch size:  128 n_batches:  430
training time: 6.02s
Epoch: 1 	training loss: 0.0159 	training accuracy: 1.0000 	test accuracy: 0.9850 	time: 6.32s
batch size:  133 n_batches:  414
training time: 6.13s
Epoch: 2 	training loss: 0.0272 	training accuracy: 0.9859 	test accuracy: 0.9885 	time: 6.40s
batch size:  138 n_batches:  399
training time: 6.06s
Epoch: 3 	training loss: 0.0572 	training accuracy: 0.9868 	test accuracy: 0.9803 	time: 6.33s
batch size:  143 n_batches:  385
training time: 5.98s
Epoch: 4 	training loss: 0.0098 	training accuracy: 1.0000 	test accuracy: 0.9886 	time: 6.25s
batch size:  148 n_batches:  372
training time: 5.95s
Epoch: 5 	training loss: 0.0049 	training accuracy: 1.0000 	test accuracy: 0.9864 	time: 6.22s
batch size:  153 n_batches:  360
training time: 5.84s
Epoch: 6 	training loss: 0.0011 	training accuracy: 1.0000 	test accuracy: 0.9908 	time: 6.11s
batch size:  158 n_batches:  349
training time: 5.77s
Epoch: 7 	training loss: 0.0010 	training accuracy: 

In [13]:
reset_graph()

In [14]:
height = 28   ######### conv linear + relu
width = 28
inputs = 28 * 28
n_outputs = 10
learning_rate = 0.001


with tf.name_scope("inputs"):
    X = tf.placeholder(tf.float32, shape=[None, height, width], name="X")
    X_reshaped = tf.reshape(X, shape=[-1, height, width, 1])
    y = tf.placeholder(tf.int32, shape=[None], name="y")

with tf.name_scope("model"):
    conv1 = tf.layers.conv2d(X_reshaped, filters=32, kernel_size=3, strides=[1, 1], 
                        padding="SAME", activation=None, name="conv1")
    relu1 = tf.nn.relu(conv1)
    conv2 = tf.layers.conv2d(relu1, filters=64, kernel_size=3, strides=[1, 1],
                        padding="same", activation=None, name="conv2")
    relu2 = tf.nn.relu(conv2)
    max_pool = tf.nn.max_pool(relu2, ksize=[1,2,2,1], strides=[1,2,2,1], 
                        padding="VALID", name="max_pooling")
    max_pool_flat = tf.reshape(max_pool, shape=[-1, 64 * 14 * 14])
    dense = tf.layers.dense(max_pool_flat, units=128, activation=tf.nn.relu, name="dense")
    logits = tf.layers.dense(dense, units=n_outputs, name="dense_layer")

with tf.name_scope("train"):
    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y))
    optimizer = tf.train.AdamOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)
    
with tf.name_scope("train"):
    correct = tf.equal(tf.argmax(logits, 1), tf.cast(y, tf.int64))
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    init = tf.global_variables_initializer()

In [15]:
n_epochs = 20
batch_size = 128

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        start = time.time()
        data = mnist(X_train, y_train)
        n_batches = math.ceil(len(X_train) / batch_size)
        for i in range(n_batches):
            X_batch, y_batch = data.next_batch(batch_size)
            print("\r{}%".format(100 * i // n_batches), end="")
            sys.stdout.flush()
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        loss_train, acc = sess.run([loss, accuracy], feed_dict={ 
                                X: X_batch, y: y_batch})
        acc_test = accuracy.eval({X: X_test, y: y_test})
        print("\rEpoch: %s" % (epoch + 1),
                "\ttraining loss: %.4f" % loss_train, 
                "\ttraining accuracy: %.4f" % acc, 
                "\ttest accuracy: %.4f" % acc_test, 
                "\ttime: %.2fs" % (time.time() - start))

Epoch: 1 	training loss: 0.0150 	training accuracy: 1.0000 	test accuracy: 0.9844 	time: 9.98s
Epoch: 2 	training loss: 0.0311 	training accuracy: 0.9886 	test accuracy: 0.9885 	time: 6.19s
Epoch: 3 	training loss: 0.0783 	training accuracy: 0.9773 	test accuracy: 0.9812 	time: 6.21s
Epoch: 4 	training loss: 0.0101 	training accuracy: 1.0000 	test accuracy: 0.9884 	time: 6.23s
Epoch: 5 	training loss: 0.0024 	training accuracy: 1.0000 	test accuracy: 0.9891 	time: 6.23s
Epoch: 6 	training loss: 0.0027 	training accuracy: 1.0000 	test accuracy: 0.9902 	time: 6.23s
Epoch: 7 	training loss: 0.0025 	training accuracy: 1.0000 	test accuracy: 0.9905 	time: 6.24s
Epoch: 8 	training loss: 0.0020 	training accuracy: 1.0000 	test accuracy: 0.9888 	time: 6.25s
Epoch: 9 	training loss: 0.0012 	training accuracy: 1.0000 	test accuracy: 0.9889 	time: 6.27s
Epoch: 10 	training loss: 0.0009 	training accuracy: 1.0000 	test accuracy: 0.9874 	time: 6.25s
Epoch: 11 	training loss: 0.0025 	training accura

In [24]:
reset_graph()

In [25]:
height = 28   ######### conv linear + relu  tf.layers.batch_normalization
width = 28
inputs = 28 * 28
n_outputs = 10
learning_rate = 0.001


with tf.name_scope("inputs"):
    X = tf.placeholder(tf.float32, shape=[None, height, width], name="X")
    X_reshaped = tf.reshape(X, shape=[-1, height, width, 1])
    y = tf.placeholder(tf.int32, shape=[None], name="y")
    training = tf.placeholder_with_default(False, shape=[], name='training')

with tf.name_scope("model"):
    conv1 = tf.layers.conv2d(X_reshaped, filters=32, kernel_size=3, strides=[1, 1], 
                        padding="SAME", activation=None, name="conv1")
    bn1 = tf.layers.batch_normalization(conv1, training=training, momentum=0.9)
    relu1 = tf.nn.relu(bn1)
    conv2 = tf.layers.conv2d(relu1, filters=64, kernel_size=3, strides=[1, 1],
                        padding="same", activation=None, name="conv2")
    bn2 = tf.layers.batch_normalization(conv2, training=training, momentum=0.9)
    relu2 = tf.nn.relu(bn2)
    max_pool = tf.nn.max_pool(relu2, ksize=[1,2,2,1], strides=[1,2,2,1], 
                        padding="VALID", name="max_pooling")
    max_pool_flat = tf.reshape(max_pool, shape=[-1, 64 * 14 * 14])
    dense = tf.layers.dense(max_pool_flat, units=128, activation=tf.nn.relu, name="dense")
    logits = tf.layers.dense(dense, units=n_outputs, name="dense_layer")
    logits_bn = tf.layers.batch_normalization(logits, training=training, momentum=0.9)

with tf.name_scope("train"):
    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits_bn, labels=y))
    optimizer = tf.train.AdamOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)
 #   extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
 #   with tf.control_dependencies(extra_update_ops):
 #       training_op = optimizer.minimize(loss)
    
with tf.name_scope("train"):
    correct = tf.equal(tf.argmax(logits, 1), tf.cast(y, tf.int64))
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    init = tf.global_variables_initializer()

In [26]:
n_epochs = 20
batch_size = 128
# extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        start = time.time()
        data = mnist(X_train, y_train)
        n_batches = math.ceil(len(X_train) / batch_size)
        for i in range(n_batches):
            X_batch, y_batch = data.next_batch(batch_size)
            print("\r{}%".format(100 * i // n_batches), end="")
            sys.stdout.flush()
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch,    # , extra_update_ops]
                                            training: True})
        loss_train, acc = sess.run([loss, accuracy], feed_dict={ 
                                X: X_batch, y: y_batch})
        acc_test = accuracy.eval({X: X_test, y: y_test})
        print("\rEpoch: %s" % (epoch + 1),
                "\ttraining loss: %.4f" % loss_train, 
                "\ttraining accuracy: %.4f" % acc, 
                "\ttest accuracy: %.4f" % acc_test, 
                "\ttime: %.2fs" % (time.time() - start))

Epoch: 1 	training loss: 2.1515 	training accuracy: 0.4318 	test accuracy: 0.5079 	time: 9.01s
Epoch: 2 	training loss: 2.2164 	training accuracy: 0.7614 	test accuracy: 0.7449 	time: 8.64s
Epoch: 3 	training loss: 2.2626 	training accuracy: 0.5000 	test accuracy: 0.5189 	time: 8.65s
Epoch: 4 	training loss: 2.3004 	training accuracy: 0.1136 	test accuracy: 0.1141 	time: 8.64s
Epoch: 5 	training loss: 2.3080 	training accuracy: 0.1136 	test accuracy: 0.1135 	time: 8.69s
Epoch: 6 	training loss: 2.3100 	training accuracy: 0.1136 	test accuracy: 0.1135 	time: 8.69s
Epoch: 7 	training loss: 2.2970 	training accuracy: 0.1250 	test accuracy: 0.1135 	time: 8.66s
Epoch: 8 	training loss: 2.2996 	training accuracy: 0.1364 	test accuracy: 0.1135 	time: 8.66s
Epoch: 9 	training loss: 2.2609 	training accuracy: 0.1705 	test accuracy: 0.1135 	time: 8.70s
Epoch: 10 	training loss: 2.3266 	training accuracy: 0.0909 	test accuracy: 0.1135 	time: 8.69s
Epoch: 11 	training loss: 2.3174 	training accura

KeyboardInterrupt: 

In [38]:
reset_graph()

In [39]:
height = 28   ######### tf.nn.batch_normalization
width = 28
inputs = 28 * 28
n_outputs = 10
learning_rate = 0.001


with tf.name_scope("inputs"):
    X = tf.placeholder(tf.float32, shape=[None, height, width], name="X")
    X_reshaped = tf.reshape(X, shape=[-1, height, width, 1])
    y = tf.placeholder(tf.int32, shape=[None], name="y")
    training = tf.placeholder_with_default(False, shape=[], name="training")


conv1 = tf.layers.conv2d(X_reshaped, filters=32, kernel_size=3, strides=[1, 1], 
                    padding="SAME", activation=None, name="conv1")

with tf.name_scope("batch_norm1"):
    prev_mean, prev_var = tf.nn.moments(conv1, axes=[0,1,2]) # axes=[0]
    scale = tf.Variable(tf.ones([32]))
    offset = tf.Variable(tf.zeros([32]))
    variance_epsilon = 0.001
    exponential_moving_average = tf.train.ExponentialMovingAverage(decay=0.9)
    apply_op = exponential_moving_average.apply([prev_mean, prev_var])
    with tf.control_dependencies([apply_op]):
    #    mean, var = tf.identity(prev_mean), tf.identity(prev_var)
        (mean, var) = tf.cond(training, lambda : (tf.identity(prev_mean), tf.identity(prev_var)), 
                              lambda : (exponential_moving_average.average(prev_mean), 
                                        exponential_moving_average.average(prev_var)))
    bn1 = tf.nn.batch_normalization(conv1, mean, var, offset, scale, variance_epsilon)

relu1 = tf.nn.relu(bn1)
conv2 = tf.layers.conv2d(relu1, filters=64, kernel_size=3, strides=[1, 1],
                    padding="same", activation=None, name="conv2")

with tf.name_scope("batch_norm2"):
    prev_mean, prev_var = tf.nn.moments(conv2, axes=[0,1,2]) # axes=[0]
    scale = tf.Variable(tf.ones([64]))
    offset = tf.Variable(tf.zeros([64]))
    variance_epsilon = 0.001
    exponential_moving_average = tf.train.ExponentialMovingAverage(decay=0.9)
    apply_op = exponential_moving_average.apply([prev_mean, prev_var])
    with tf.control_dependencies([apply_op]):
    #    mean, var = tf.identity(prev_mean), tf.identity(prev_var)
        (mean, var) = tf.cond(training, lambda : (tf.identity(prev_mean), tf.identity(prev_var)), 
                              lambda : (exponential_moving_average.average(prev_mean), 
                                        exponential_moving_average.average(prev_var)))
    bn2 = tf.nn.batch_normalization(conv2, mean, var, offset, scale, variance_epsilon)

relu2 = tf.nn.relu(bn2)
max_pool = tf.nn.max_pool(relu2, ksize=[1,2,2,1], strides=[1,2,2,1], 
                    padding="VALID", name="max_pooling")
max_pool_flat = tf.reshape(max_pool, shape=[-1, 64 * 14 * 14])
dense = tf.layers.dense(max_pool_flat, units=128, activation=tf.nn.relu, name="dense")
logits = tf.layers.dense(dense, units=n_outputs, name="dense_layer")

with tf.name_scope("batch_norm3"):
    prev_mean, prev_var = tf.nn.moments(logits, axes=[0])
    scale = tf.Variable(tf.ones([n_outputs]))
    offset = tf.Variable(tf.zeros([n_outputs]))
    variance_epsilon = 0.001
    exponential_moving_average = tf.train.ExponentialMovingAverage(decay=0.9)
    apply_op = exponential_moving_average.apply([prev_mean, prev_var])
    with tf.control_dependencies([apply_op]):
    #    mean, var = tf.identity(prev_mean), tf.identity(prev_var)
        (mean, var) = tf.cond(training, lambda : (tf.identity(prev_mean), tf.identity(prev_var)), 
                              lambda : (exponential_moving_average.average(prev_mean), 
                                        exponential_moving_average.average(prev_var)))
    logits_bn = tf.nn.batch_normalization(logits, mean, var, offset, scale, variance_epsilon)

with tf.name_scope("train"):
    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits_bn, labels=y))
    optimizer = tf.train.AdamOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)
    
with tf.name_scope("train"):
    correct = tf.equal(tf.argmax(logits, 1), tf.cast(y, tf.int64))
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    init = tf.global_variables_initializer()

In [None]:
a = lambda x: x**2
a(2)

In [40]:
n_epochs = 20
batch_size = 128

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        start = time.time()
        data = mnist(X_train, y_train)
        n_batches = math.ceil(len(X_train) / batch_size)
        for i in range(n_batches):
            X_batch, y_batch = data.next_batch(batch_size)
            print("\r{}%".format(100 * i // n_batches), end="")
            sys.stdout.flush()
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch, training: True})
        loss_train, acc = sess.run([loss, accuracy], feed_dict={ 
                                X: X_batch, y: y_batch})
        acc_test = accuracy.eval({X: X_test, y: y_test})
        print("\rEpoch: %s" % (epoch + 1),
                "\ttraining loss: %.4f" % loss_train, 
                "\ttraining accuracy: %.4f" % acc, 
                "\ttest accuracy: %.4f" % acc_test, 
                "\ttime: %.2fs" % (time.time() - start))

Epoch: 1 	training loss: 0.2020 	training accuracy: 0.8977 	test accuracy: 0.9408 	time: 14.55s
Epoch: 2 	training loss: 0.1362 	training accuracy: 0.9318 	test accuracy: 0.9241 	time: 14.21s
Epoch: 3 	training loss: 0.0702 	training accuracy: 1.0000 	test accuracy: 0.9797 	time: 14.20s
Epoch: 4 	training loss: 0.0474 	training accuracy: 0.9659 	test accuracy: 0.9722 	time: 14.17s
Epoch: 5 	training loss: 0.0298 	training accuracy: 1.0000 	test accuracy: 0.9779 	time: 14.19s
Epoch: 6 	training loss: 0.0272 	training accuracy: 1.0000 	test accuracy: 0.9815 	time: 14.19s
Epoch: 7 	training loss: 0.0150 	training accuracy: 1.0000 	test accuracy: 0.9882 	time: 14.23s
Epoch: 8 	training loss: 0.0100 	training accuracy: 0.9886 	test accuracy: 0.9782 	time: 14.29s
Epoch: 9 	training loss: 0.0076 	training accuracy: 0.9886 	test accuracy: 0.9855 	time: 14.23s
Epoch: 10 	training loss: 0.0083 	training accuracy: 0.9886 	test accuracy: 0.9882 	time: 14.20s
Epoch: 11 	training loss: 0.0038 	train

In [53]:
reset_graph()

In [54]:
height = 28   ######### MAXOUT
width = 28
inputs = 28 * 28
n_outputs = 10
learning_rate = 0.001


with tf.name_scope("inputs"):
    X = tf.placeholder(tf.float32, shape=[None, height, width], name="X")
    X_reshaped = tf.reshape(X, shape=[-1, height, width, 1])
    y = tf.placeholder(tf.int32, shape=[None], name="y")

with tf.name_scope("model"):
    conv1 = tf.layers.conv2d(X_reshaped, filters=32, kernel_size=3, strides=[1, 1], 
                        padding="SAME", activation=None, name="conv1")
    maxout1 = tf.contrib.layers.maxout(conv1, num_units=16)
    conv2 = tf.layers.conv2d(maxout1, filters=64, kernel_size=3, strides=[1, 1],
                        padding="same", activation=None, name="conv2")
    maxout2 = tf.contrib.layers.maxout(conv2, num_units=16)
    max_pool = tf.nn.max_pool(maxout2, ksize=[1,2,2,1], strides=[1,2,2,1], 
                        padding="VALID", name="max_pooling")
    max_pool_flat = tf.reshape(max_pool, shape=[-1, 16 * 14 * 14])  # 64 * 14 * 14
    dense = tf.layers.dense(max_pool_flat, units=128, activation=tf.nn.relu, name="dense")
    logits = tf.layers.dense(dense, units=n_outputs, name="dense_layer")

with tf.name_scope("train"):
    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y))
    optimizer = tf.train.AdamOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)
    
with tf.name_scope("train"):
    correct = tf.equal(tf.argmax(logits, 1), tf.cast(y, tf.int64))
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    init = tf.global_variables_initializer()

In [55]:
n_epochs = 20
batch_size = 128

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        start = time.time()
        data = mnist(X_train, y_train)
        n_batches = math.ceil(len(X_train) / batch_size)
        for i in range(n_batches):
            X_batch, y_batch = data.next_batch(batch_size)
            print("\r{}%".format(100 * i // n_batches), end="")
            sys.stdout.flush()
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        loss_train, acc = sess.run([loss, accuracy], feed_dict={ 
                                X: X_batch, y: y_batch})
        acc_test = accuracy.eval({X: X_test, y: y_test})
        print("\rEpoch: %s" % (epoch + 1),
                "\ttraining loss: %.4f" % loss_train, 
                "\ttraining accuracy: %.4f" % acc, 
                "\ttest accuracy: %.4f" % acc_test, 
                "\ttime: %.2fs" % (time.time() - start))

Epoch: 1 	training loss: 0.0678 	training accuracy: 0.9886 	test accuracy: 0.9775 	time: 12.20s
Epoch: 2 	training loss: 0.0479 	training accuracy: 0.9773 	test accuracy: 0.9855 	time: 11.40s
Epoch: 3 	training loss: 0.0688 	training accuracy: 0.9886 	test accuracy: 0.9855 	time: 11.46s
Epoch: 4 	training loss: 0.0174 	training accuracy: 1.0000 	test accuracy: 0.9893 	time: 11.44s
Epoch: 5 	training loss: 0.0253 	training accuracy: 0.9886 	test accuracy: 0.9886 	time: 11.46s
Epoch: 6 	training loss: 0.0095 	training accuracy: 1.0000 	test accuracy: 0.9903 	time: 11.45s
Epoch: 7 	training loss: 0.0049 	training accuracy: 1.0000 	test accuracy: 0.9889 	time: 11.50s
Epoch: 8 	training loss: 0.0026 	training accuracy: 1.0000 	test accuracy: 0.9903 	time: 11.48s
Epoch: 9 	training loss: 0.0011 	training accuracy: 1.0000 	test accuracy: 0.9895 	time: 11.49s
Epoch: 10 	training loss: 0.0022 	training accuracy: 1.0000 	test accuracy: 0.9870 	time: 11.48s
Epoch: 11 	training loss: 0.0032 	train

In [79]:
reset_graph()

In [80]:
height = 28   ######### learning rate decay Adam Highest accuracy
width = 28
inputs = 28 * 28
n_outputs = 10
learning_rate = 0.001
batch_size = 128

initial_learning_rate = 0.001
decay_rate = 0.9
decay_steps = int(len(X_train) / batch_size)
global_step = tf.Variable(0, trainable=False, name="global_step")
modified_learning_rate = tf.train.exponential_decay(initial_learning_rate, global_step=global_step, 
                                                    decay_steps=decay_steps, decay_rate=decay_rate)


with tf.name_scope("inputs"):
    X = tf.placeholder(tf.float32, shape=[None, height, width], name="X")
    X_reshaped = tf.reshape(X, shape=[-1, height, width, 1])
    y = tf.placeholder(tf.int32, shape=[None], name="y")

with tf.name_scope("model"):
    conv1 = tf.layers.conv2d(X_reshaped, filters=32, kernel_size=3, strides=[1, 1], 
                        padding="SAME", activation=None, name="conv1")
    relu1 = tf.nn.relu(conv1)
    conv2 = tf.layers.conv2d(relu1, filters=64, kernel_size=3, strides=[1, 1],
                        padding="same", activation=None, name="conv2")
    relu2 = tf.nn.relu(conv2)
    max_pool = tf.nn.max_pool(relu2, ksize=[1,2,2,1], strides=[1,2,2,1], 
                        padding="VALID", name="max_pooling")
    max_pool_flat = tf.reshape(max_pool, shape=[-1, 64 * 14 * 14])
    dense = tf.layers.dense(max_pool_flat, units=128, activation=tf.nn.relu, name="dense")
    logits = tf.layers.dense(dense, units=n_outputs, name="dense_layer")

with tf.name_scope("train"):
    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y))
  #  optimizer = tf.train.GradientDescentOptimizer(learning_rate)  
  #  optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=0.9)
    
  #  optimizer = tf.train.GradientDescentOptimizer(modified_learning_rate)  
  #  optimizer = tf.train.MomentumOptimizer(learning_rate=modified_learning_rate, momentum=0.9)
    optimizer = tf.train.AdamOptimizer(learning_rate=modified_learning_rate)
    training_op = optimizer.minimize(loss, global_step=global_step)  ## global_step=global_step
    
with tf.name_scope("train"):
    correct = tf.equal(tf.argmax(logits, 1), tf.cast(y, tf.int64))
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    init = tf.global_variables_initializer()

In [81]:
n_epochs = 20
batch_size = 128

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        start = time.time()
        data = mnist(X_train, y_train)
        n_batches = math.ceil(len(X_train) / batch_size)
        for i in range(n_batches):
            X_batch, y_batch = data.next_batch(batch_size)
            print("\r{}%".format(100 * i // n_batches), end="")
            sys.stdout.flush()
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        loss_train, acc = sess.run([loss, accuracy], feed_dict={ 
                                X: X_batch, y: y_batch})
        acc_test = accuracy.eval({X: X_test, y: y_test})
        print("\rEpoch: %s" % (epoch + 1),
                "\ttraining loss: %.4f" % loss_train, 
                "\ttraining accuracy: %.4f" % acc, 
                "\ttest accuracy: %.4f" % acc_test, 
                "\ttime: %.2fs" % (time.time() - start))

Epoch: 1 	training loss: 0.0216 	training accuracy: 1.0000 	test accuracy: 0.9850 	time: 6.56s
Epoch: 2 	training loss: 0.0353 	training accuracy: 0.9773 	test accuracy: 0.9886 	time: 6.37s
Epoch: 3 	training loss: 0.0421 	training accuracy: 0.9886 	test accuracy: 0.9875 	time: 6.39s
Epoch: 4 	training loss: 0.0088 	training accuracy: 1.0000 	test accuracy: 0.9904 	time: 6.39s
Epoch: 5 	training loss: 0.0033 	training accuracy: 1.0000 	test accuracy: 0.9900 	time: 6.38s
Epoch: 6 	training loss: 0.0026 	training accuracy: 1.0000 	test accuracy: 0.9906 	time: 6.38s
Epoch: 7 	training loss: 0.0041 	training accuracy: 1.0000 	test accuracy: 0.9906 	time: 6.39s
Epoch: 8 	training loss: 0.0009 	training accuracy: 1.0000 	test accuracy: 0.9905 	time: 6.40s
Epoch: 9 	training loss: 0.0001 	training accuracy: 1.0000 	test accuracy: 0.9898 	time: 6.39s
Epoch: 10 	training loss: 0.0002 	training accuracy: 1.0000 	test accuracy: 0.9920 	time: 6.34s
Epoch: 11 	training loss: 0.0000 	training accura

In [17]:
reset_graph()

In [18]:
height = 28
width = 28
inputs = 28 * 28
n_outputs = 10
learning_rate = 0.001
conv2_dropout_rate = 0.25
dense_dropout_rate = 0.5


with tf.name_scope("inputs"):
    X = tf.placeholder(tf.float32, shape=[None, height, width], name="X")
    X_reshaped = tf.reshape(X, shape=[-1, height, width, 1])
    y = tf.placeholder(tf.int32, shape=[None], name="y")
    training = tf.placeholder_with_default(False, shape=[], name="training")

with tf.name_scope("model"):
    conv1 = tf.layers.conv2d(X_reshaped, filters=32, kernel_size=3, strides=[1, 1], 
                        padding="SAME", activation=tf.nn.relu, name="conv1")
    conv2 = tf.layers.conv2d(conv1, filters=64, kernel_size=3, strides=[1, 1],
                        padding="same", activation=tf.nn.relu, name="conv2")
    max_pool = tf.nn.max_pool(conv2, ksize=[1,2,2,1], strides=[1,2,2,1], 
                        padding="VALID", name="max_pooling")
    max_pool_flat = tf.reshape(max_pool, shape=[-1, 64 * 14 * 14])
    max_pool_drop = tf.layers.dropout(max_pool_flat, conv2_dropout_rate, training=training)
    dense = tf.layers.dense(max_pool_flat, units=128, activation=tf.nn.relu, name="dense")
    dense_drop = tf.layers.dropout(dense, dense_dropout_rate, training=training)
    logits = tf.layers.dense(dense, units=n_outputs, name="dense_layer")

with tf.name_scope("train"):
    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y))
    optimizer = tf.train.AdamOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)
    
with tf.name_scope("train"):
    correct = tf.equal(tf.argmax(logits, 1), tf.cast(y, tf.int64))
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    init = tf.global_variables_initializer()

In [19]:
def get_model_params():
    gvars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
    return {gvar.op.name: value for gvar, value in zip(gvars, tf.get_default_session().run(gvars))}

def restore_model_params(model_params):
    gvar_names = list(model_params.keys())
    assign_ops = {gvar_name: tf.get_default_graph().get_operation_by_name(gvar_name + "/Assign")
                  for gvar_name in gvar_names}
    init_values = {gvar_name: assign_op.inputs[1] for gvar_name, assign_op in assign_ops.items()}
    feed_dict = {init_values[gvar_name]: model_params[gvar_name] for gvar_name in gvar_names}
    tf.get_default_session().run(assign_ops, feed_dict=feed_dict)

In [20]:
n_epochs = 10000
batch_size = 128
iteration = 0
best_loss = np.infty
check_interval = 500
check_since_last_progress = 0
max_checks = 20
best_model_params = None

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        start = time.time()
        data = mnist(X_train, y_train)
        n_batches = math.ceil(len(X_train) / batch_size)
        for i in range(n_batches):
            iteration += 1
            X_batch, y_batch = data.next_batch(batch_size)
            print("\r{}%".format(100 * i // n_batches), end="")
            sys.stdout.flush()
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch, training: True})
            if iteration % check_interval == 0:
                loss_val = loss.eval({X: X_valid, y: y_valid})
                if loss_val < best_loss:
                    best_loss = loss_val
                    check_since_last_progress = 0
                    best_model_params = get_model_params()
                else:
                    check_since_last_progress += 1
        
        acc_batch = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
        acc_val = accuracy.eval(feed_dict={X: X_valid, y: y_valid})
        acc_test = accuracy.eval(feed_dict={X: X_test, y: y_test})
        print("\rEpoch: %s" % (epoch + 1),
                "\ttraining accuracy: %.4f" % acc_batch, 
                "\tvalid accuracy: %.4f" % acc_val, 
                "\ttest accuracy: %.4f" % acc_test, 
                "\ttime: %.2fs" % (time.time() - start))
        if check_since_last_progress > max_checks:
            print("Early Stopping")
            break
            
    if best_model_params:
        restore_model_params(best_model_params)
    acc_test = accuracy.eval(feed_dict={X: X_test, y: y_test})
    print("Final accuracy on test set:", acc_test)

Epoch: 1 	training accuracy: 1.0000 	valid accuracy: 0.9814 	test accuracy: 0.9841 	time: 7.09s
Epoch: 2 	training accuracy: 0.9773 	valid accuracy: 0.9888 	test accuracy: 0.9878 	time: 6.46s
Epoch: 3 	training accuracy: 0.9773 	valid accuracy: 0.9770 	test accuracy: 0.9753 	time: 6.48s
Epoch: 4 	training accuracy: 1.0000 	valid accuracy: 0.9882 	test accuracy: 0.9883 	time: 6.43s
Epoch: 5 	training accuracy: 1.0000 	valid accuracy: 0.9878 	test accuracy: 0.9885 	time: 6.42s
Epoch: 6 	training accuracy: 1.0000 	valid accuracy: 0.9892 	test accuracy: 0.9882 	time: 6.44s
Epoch: 7 	training accuracy: 1.0000 	valid accuracy: 0.9908 	test accuracy: 0.9887 	time: 6.43s
Epoch: 8 	training accuracy: 1.0000 	valid accuracy: 0.9892 	test accuracy: 0.9897 	time: 6.39s
Epoch: 9 	training accuracy: 1.0000 	valid accuracy: 0.9896 	test accuracy: 0.9893 	time: 6.48s
Epoch: 10 	training accuracy: 1.0000 	valid accuracy: 0.9878 	test accuracy: 0.9879 	time: 6.50s
Epoch: 11 	training accuracy: 1.0000 	v