### A binary to train MNIST using multiple GPUs with synchronous updates.

In [1]:
# TF and tf.keras
import tensorflow as tf
from tensorflow import keras

from keras import backend as K

import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

tf.set_random_seed(19)

print('TF Version : ', tf.__version__)
print(K.tensorflow_backend._get_available_gpus())

Using TensorFlow backend.


TF Version :  1.12.0
['/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3']


### Load MNIST Dataset 

In [2]:
mnist = keras.datasets.mnist
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# Img size : 28 x 28 x 1
# Num classes : 10

print('Training data shape : ', train_images.shape)
print('Class dataset : ', train_labels.shape)
print('Testing data shape : ', test_images.shape)
print('Class dataset : ', test_labels.shape)

Training data shape :  (60000, 28, 28)
Class dataset :  (60000,)
Testing data shape :  (10000, 28, 28)
Class dataset :  (10000,)


### Define model

### Run

In [3]:

# Network Topologies
n_hidden_1 = 128
n_input    = 784
n_classes  = 10
display_step = 5


class Model:
    def __init__(self, sess, name='aa', learning_rate=1e-3, batch_size=100):
        self.sess = sess
        self.name = name
        
        self.learning_rate = learning_rate
        self.batch_size    = batch_size
        
        self.weights  = {
            'wc1': tf.Variable(tf.random_normal([3, 3, 1, 64], stddev=0.1)),
            'wd1': tf.Variable(tf.random_normal([14*14*64, n_classes], stddev=0.1))
        }
        self.biases   = {
            'bc1': tf.Variable(tf.random_normal([64], stddev=0.1)),
            'bd1': tf.Variable(tf.random_normal([n_classes], stddev=0.1))
        }
        
        self.Build()
        # Saver
        self.save_step = 1;
        self.savedir = "nets/"
        #self.saver = tf.train.Saver(max_to_keep=3) 
        
        self.sess.run(tf.global_variables_initializer())
    
    
    # Set the convolution process
    def conv_simple(self, _input, _w, _b):
        # Reshape input
        _input_r = tf.reshape(_input, shape=[-1, 28, 28, 1])
        # Convolution
        _conv1 = tf.nn.conv2d(_input_r, _w['wc1'], strides=[1, 1, 1, 1], padding='SAME')
        # Add-bias
        _conv2 = tf.nn.bias_add(_conv1, _b['bc1'])
        # Pass ReLu
        _conv3 = tf.nn.relu(_conv2)
        # Max-pooling
        _pool  = tf.nn.max_pool(_conv3, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
        # Vectorize
        _dense = tf.reshape(_pool, [-1, _w['wd1'].get_shape().as_list()[0]])
        # Fully-connected layer
        _out = tf.add(tf.matmul(_dense, _w['wd1']), _b['bd1'])
        # Return everything
        out = {
            'input_r': _input_r, 'conv1': _conv1, 'conv2': _conv2, 'conv3': _conv3
            , 'pool': _pool, 'dense': _dense, 'out': _out
        }
        return out
    
    # Build model and loss function
    def Build(self):
        # Set Inputs and Outputs
        self.X = tf.placeholder(tf.float32, shape=[None, n_input], name='X')
        self.Y = tf.placeholder(tf.float32, shape=[None, n_classes], name='Y')

        # Set Layers
        self._pred = self.conv_simple(self.X, self.weights, self.biases)['out']
        
        # Loss and Optimizers
        self.cost = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self._pred, labels=self.Y)
        self.cost = tf.reduce_mean(self.cost)
        
        self.optm = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.cost)
        
        self.corr = tf.equal(tf.argmax(self._pred, 1), tf.argmax(self.Y, 1))
        self.accr = tf.reduce_mean(tf.cast(self.corr, tf.float32))

        print('Model definition completed') # Check the grammar error above
    
    # Execute the forward process with batch dataset
    def run_single_step(self, batch_xs, batch_ys):
        _, avg_cost = sess.run(
                        [self.optm, self.cost],
                        feed_dict={self.X:batch_xs, self.Y:batch_ys})
        return avg_cost
    
    def run_inside(self, tag, batch_xs):
        N_TRN = len(batch_xs)
        batch_xs = self.sess.run(tf.reshape(batch_xs, shape=(N_TRN,-1)))
        print(batch_xs.shape)
        result = self.sess.run(self.conv_simple(self.X, self.weights, self.biases), feed_dict={self.X:batch_xs})
        return result[tag]
    
    def get_prediction(self, x_test, y_test):
        print("Test Label: ", self.sess.run(tf.argmax(y_test, 1)))
        print("Prediction Label: ", self.sess.run(tf.argmax(self.logits, 1), feed_dict={self.X: x_test}))
        return 
    
    def get_accuracy(self, x_test, y_test):
        print(x_test.shape)
        return self.sess.run(self.accr, feed_dict={self.X: x_test, self.Y: y_test})

In [4]:
def trainer(train_images, train_labels, test_images, test_lables, 
            sess, name, learning_rate=1e-3, batch_size=100, training_epochs=100):
    with tf.device('/cpu:0'):
        N_TRN = len(train_labels)
        N_TST = len(test_labels)

        train_images_flat = sess.run(tf.reshape(train_images, shape=(N_TRN,-1)))
        test_images_flat  = sess.run(tf.reshape(test_images, shape=(N_TST,-1)))
        train_labels_1h   = sess.run(tf.one_hot(train_labels, depth=n_classes))
        test_labels_1h    = sess.run(tf.one_hot(test_labels, depth=n_classes))
        
        #############
        
        with tf.variable_scope(tf.get_variable_scope()): 
            avg_cost = 0.0
            num_batch = int(N_TRN / batch_size)

            # Get random minibatch for each epoch
            randindices = np.random.permutation(len(train_labels_1h))    

            model = Model(sess, name, learning_rate=learning_rate, batch_size=batch_size)
            
            for i in range(num_batch):
                # Obtain a batch
                cur_indices = randindices[i*batch_size:(i+1)*batch_size]
                batch_xs    = train_images_flat[cur_indices, :]
                batch_ys    = train_labels_1h[cur_indices, :]            
            
                for i in range(4):
                    with tf.device('/GPU:%d' % i):
                        with tf.name_scope('%s_%d' % ('TOWER', i)) as scope:

                            avg_cost_tmp = model.run_single_step(batch_xs, batch_ys)
                            avg_cost += avg_cost_tmp / num_batch                    
                            # Display training steps
                            if True: #epoch % display_step == 0:
                                train_acc = model.get_accuracy(batch_xs, batch_ys)
                                test_acc  = model.get_accuracy(test_images_flat, test_labels_1h)

                                print('Epoch: %03d/%03d cost: %0.9f train_acc: %.3f test_acc %.3f'
                                      % (-1, training_epochs, avg_cost, train_acc, test_acc))                    

#             # Minibatch learning    
#             for epoch in range(training_epochs): 


#                 # Display training steps
#                 if epoch % display_step == 0:
#                     train_acc = model.get_accuracy(batch_xs, batch_ys)
#                     test_acc  = model.get_accuracy(test_images_flat, test_labels_1h)

#                     print('Epoch: %03d/%03d cost: %0.9f train_acc: %.3f test_acc %.3f'
#                           % (epoch, training_epochs, avg_cost, train_acc, test_acc))

#                 # Save net
#                 if epoch % model.save_step == 0:
#                     pass
#                     #model.saver.save(sess, 'nets/cnn_mnist_simple.ckpt-' + str(epoch))
        print('Done')
    return model

### Main function

In [5]:
%%time
NAME = 'MNIST'

# Session
tf.reset_default_graph()
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))

# Train the model
model = trainer(train_images, train_labels, 
                test_images, test_labels, 
                sess, NAME,
                learning_rate=1e-4,  batch_size=100, training_epochs=100)

Model definition completed
(100, 784)
(10000, 784)
Epoch: -01/100 cost: 0.523303833 train_acc: 0.040 test_acc 0.097
(100, 784)
(10000, 784)
Epoch: -01/100 cost: 1.024244741 train_acc: 0.050 test_acc 0.099
(100, 784)
(10000, 784)
Epoch: -01/100 cost: 1.503737590 train_acc: 0.050 test_acc 0.100
(100, 784)
(10000, 784)
Epoch: -01/100 cost: 1.962734019 train_acc: 0.050 test_acc 0.101
(100, 784)
(10000, 784)
Epoch: -01/100 cost: 2.391527913 train_acc: 0.080 test_acc 0.102
(100, 784)
(10000, 784)
Epoch: -01/100 cost: 2.802261709 train_acc: 0.100 test_acc 0.106
(100, 784)
(10000, 784)
Epoch: -01/100 cost: 3.195162404 train_acc: 0.100 test_acc 0.111
(100, 784)
(10000, 784)
Epoch: -01/100 cost: 3.570871023 train_acc: 0.100 test_acc 0.114
(100, 784)
(10000, 784)
Epoch: -01/100 cost: 3.952977982 train_acc: 0.090 test_acc 0.119
(100, 784)
(10000, 784)
Epoch: -01/100 cost: 4.324473546 train_acc: 0.100 test_acc 0.124
(100, 784)
(10000, 784)
Epoch: -01/100 cost: 4.685608953 train_acc: 0.110 test_acc 

Epoch: -01/100 cost: 19.907477824 train_acc: 0.490 test_acc 0.493
(100, 784)
(10000, 784)
Epoch: -01/100 cost: 20.032794609 train_acc: 0.440 test_acc 0.496
(100, 784)
(10000, 784)
Epoch: -01/100 cost: 20.156305885 train_acc: 0.440 test_acc 0.499
(100, 784)
(10000, 784)
Epoch: -01/100 cost: 20.277592723 train_acc: 0.450 test_acc 0.501
(100, 784)
(10000, 784)
Epoch: -01/100 cost: 20.396321615 train_acc: 0.450 test_acc 0.503
(100, 784)
(10000, 784)
Epoch: -01/100 cost: 20.496707935 train_acc: 0.510 test_acc 0.501
(100, 784)
(10000, 784)
Epoch: -01/100 cost: 20.595291138 train_acc: 0.530 test_acc 0.503
(100, 784)
(10000, 784)
Epoch: -01/100 cost: 20.691830076 train_acc: 0.510 test_acc 0.506
(100, 784)
(10000, 784)
Epoch: -01/100 cost: 20.786230570 train_acc: 0.520 test_acc 0.507
(100, 784)
(10000, 784)
Epoch: -01/100 cost: 20.895679398 train_acc: 0.530 test_acc 0.508
(100, 784)
(10000, 784)
Epoch: -01/100 cost: 21.003919830 train_acc: 0.530 test_acc 0.512
(100, 784)
(10000, 784)
Epoch: -01

KeyboardInterrupt: 