### CNN MNIST for multi-gpu version

In [1]:
import os
import numpy as np
from scipy import ndimage
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
import datetime

from tensorflow.examples.tutorials.mnist import input_data
%matplotlib inline  
print ("CURRENT TF VERSION IS [%s]" % (tf.__version__))
print ("PACKAGES LOADED")

CURRENT TF VERSION IS [1.12.0]
PACKAGES LOADED


### Check the available GPUs in the notebook

In [2]:
from tensorflow.python.client import device_lib

def check_available_pus(dev_type='GPU'):
    local_devices = device_lib.list_local_devices()
    gpu_names = [x.name for x in local_devices if x.device_type == dev_type]
    gpu_num = len(gpu_names)

    print('{0} {1}(s) are detected : {2}'.format(gpu_num, dev_type, gpu_names))

    return gpu_num

### Load MNIST Dataset

In [3]:
mnist = keras.datasets.mnist
(trainimg, trainlabel), (testimg, testlabel) = mnist.load_data()

# Img size : 28 x 28 x 1
# Num classes : 10

print('Training data shape : ', trainimg.shape)
print('Class dataset : ', trainlabel.shape)
print('Testing data shape : ', testimg.shape)
print('Class dataset : ', testlabel.shape)

Training data shape :  (60000, 28, 28)
Class dataset :  (60000,)
Testing data shape :  (10000, 28, 28)
Class dataset :  (10000,)


### Data preprocessing

In [4]:
N_TRN = len(trainlabel)
N_TST = len(testlabel)

onehot_mat = np.eye(10)

#trainimg  = np.reshape(trainimg, [-1, 784])
trainimg  = np.reshape(trainimg, [-1, 28, 28, 1])
#testimg   = np.reshape(testimg, [-1, 784])
testimg   = np.reshape(testimg, [-1, 28, 28, 1])

trainlabel = np.concatenate([[onehot_mat[int(x),:]] for x in trainlabel], axis=0)
testlabel   = np.concatenate([[onehot_mat[int(x),:]] for x in testlabel], axis=0)

print('Training data shape : ', trainimg.shape)
print('Class dataset : ', trainlabel.shape)
print('Testing data shape : ', testimg.shape)
print('Class dataset : ', testlabel.shape)

Training data shape :  (60000, 28, 28, 1)
Class dataset :  (60000, 10)
Testing data shape :  (10000, 28, 28, 1)
Class dataset :  (10000, 10)


### Define CNN model

In [5]:
def model(X, reuse=False, is_trn=True):
    with tf.variable_scope('L1', reuse=reuse):
        X  = tf.layers.batch_normalization(inputs=X, reuse=reuse)
        L1 = tf.layers.conv2d(inputs=X
                              , filters=64
                              , kernel_size=[3, 3]
                              , reuse=reuse
                              , activation=tf.nn.relu
                              , padding='SAME')
        L1 = tf.layers.max_pooling2d(inputs=L1
                                     , pool_size=[2, 2]
                                     , strides=[2, 2])
        L1 = tf.layers.dropout(inputs=L1
                               , rate=0.7
                               , training=is_trn)

    with tf.variable_scope('L2', reuse=reuse):
        L1 = tf.layers.batch_normalization(inputs=L1, reuse=reuse)
        L2 = tf.layers.conv2d(inputs=L1
                              , filters=128
                              , kernel_size=[3, 3]
                              , reuse=reuse
                              , activation=tf.nn.relu
                              , padding='SAME')
        L2 = tf.layers.max_pooling2d(inputs=L2
                                     , pool_size=[2, 2]
                                     , strides=[2, 2])
        L2 = tf.layers.dropout(inputs=L2
                               , rate=0.7
                               , training=is_trn)

    with tf.variable_scope('L2-1', reuse=reuse):
        L2   = tf.layers.batch_normalization(inputs=L2, reuse=reuse)
        L2_1 = tf.layers.conv2d(inputs=L2
                                , filters=128
                                , kernel_size=[3, 3]
                                , reuse=reuse
                                , activation=tf.nn.relu
                                , padding='SAME')
        L2_1 = tf.layers.max_pooling2d(inputs=L2_1
                                       , pool_size=[2, 2]
                                       , strides=[2, 2])
        L2_1 = tf.layers.dropout(inputs=L2_1
                                 , rate=0.7
                                 , training=is_trn)
        
    with tf.variable_scope('L3', reuse=reuse):
        L2_1 = tf.layers.batch_normalization(inputs=L2, reuse=reuse)
        L3   = tf.contrib.layers.flatten(inputs=L2_1)
        L3   = tf.layers.dense(L3, 1024, activation=tf.nn.relu)
        L3   = tf.layers.dropout(L3, 0.5, is_trn)

    with tf.variable_scope('L4', reuse=reuse):
        L3 = tf.layers.batch_normalization(inputs=L3, reuse=reuse)
        L4 = tf.layers.dense(L3, 256, activation=tf.nn.relu)

    with tf.variable_scope('LF', reuse=reuse):
        LF = tf.layers.dense(L4, 10, activation=None)

    return LF

### Image Augmentation

In [6]:
def augment_img(xs):
    out  = np.copy(xs)
    xs_r = np.reshape(xs, [-1, 28, 28])
    for i in range(xs_r.shape[0]):
        xs_img = xs_r[i, :, :]
        bg_value = 0
        # ROTATE
        angle = np.random.randint(-15, 15, 1).astype(float)
        xs_img = ndimage.rotate(xs_img, angle, reshape=False, cval=bg_value)
        # ZOOM
        rg = 0.1
        zoom_factor = np.random.uniform(1., 1.+rg)
        h, w = xs_img.shape[:2]
        zh   = int(np.round(zoom_factor * h))
        zw   = int(np.round(zoom_factor * w))
        top  = (zh - h) // 2
        left = (zw - w) // 2
        zoom_tuple = (zoom_factor,) * 2 + (1,) * (xs_img.ndim - 2)
        temp = ndimage.zoom(xs_img[top:top+zh, left:left+zw], zoom_tuple)
        trim_top  = ((temp.shape[0] - h) // 2)
        trim_left = ((temp.shape[1] - w) // 2)
        xs_img = temp[trim_top:trim_top+h, trim_left:trim_left+w]
        # SHIFT
        shift = np.random.randint(-3, 3, 2)
        xs_img = ndimage.shift(xs_img, shift, cval=bg_value)
        # RESHAPE
        xs_v = np.reshape(xs_img, [1, -1])
        out[i, :] = xs_v
    return out

### Build Graph

In [7]:
tf.reset_default_graph()

with tf.device(tf.DeviceSpec(device_type="CPU", device_index=0)):
    # need to change learning rates and batch size by number of GPU
    BATCH_SIZE    = 256
    LEARNING_RATE = 0.001
    TOTAL_EPOCHS  = 100
    DISPLAY_STEP  = 4
    N_CLASSES     = 10
    NUM_GPUS      = check_available_pus('GPU')
    NUM_CPUS      = check_available_pus('CPU')
    NUM_GPUS      = 1

    # Placeholders
    X      = tf.placeholder(tf.float32, [None, 28, 28, 1])
    Y      = tf.placeholder(tf.float32, [None, N_CLASSES])
    IS_TRN = tf.placeholder(tf.bool)

    losses = []
    accres = []
    # Split placeholders for each GPU operation
    X_A = tf.split(X, int(NUM_GPUS))
    Y_A = tf.split(Y, int(NUM_GPUS))

    for gpu_id in range(int(NUM_GPUS)):
        with tf.device(tf.DeviceSpec(device_type="GPU", device_index=gpu_id)):
            with tf.variable_scope(tf.get_variable_scope(), reuse=(gpu_id > 0)):
                pred = model(X_A[gpu_id], gpu_id > 0)
                cost = tf.nn.softmax_cross_entropy_with_logits_v2(
                                logits=pred,
                                labels=Y_A[gpu_id])
                corr = tf.equal(tf.argmax(pred, 1), tf.argmax(Y_A[gpu_id], 1))    
                accr = tf.reduce_mean(tf.cast(corr, "float"))

                losses.append(cost)
                accres.append(accr)


    loss = tf.reduce_mean(tf.concat(losses, axis=0))
    accr = tf.reduce_mean(accres)

    optimizer = tf.train.AdamOptimizer(LEARNING_RATE).minimize(
        loss, colocate_gradients_with_ops=True)  # Important!

print('Graph Ready!')

4 GPU(s) are detected : ['/device:GPU:0', '/device:GPU:1', '/device:GPU:2', '/device:GPU:3']
1 CPU(s) are detected : ['/device:CPU:0']
Graph Ready!


### Run the model

In [8]:
init = tf.global_variables_initializer()
sess = tf.Session(config=tf.ConfigProto(log_device_placement=False))
sess.run(init)

start_time = datetime.datetime.now()

'''
--- Training time : 0:04:37.169936 seconds /w 4 GPUs ---
--- Training time : 0:05:00.693084 seconds /w 4 GPUs ---
--- Training time : 0:03:47.201289 seconds /w 1 GPUs ---b
--- Training time : 0:03:17.412582 seconds /w 1 GPUs ---
--- Training time : 0:03:44.552861 seconds /w 1 GPUs ---
'''


for epoch in range(TOTAL_EPOCHS):
    total_cost = 0.
    total_batch = int(N_TRN/BATCH_SIZE)

    # Get random minibatch for each epoch
    randindices = np.random.permutation(len(trainlabel))

    # Iteration
    for i in range(total_batch):
        # Obtain a batch
        cur_indices = randindices[i*BATCH_SIZE:(i+1)*BATCH_SIZE]
        batch_xs    = trainimg[cur_indices, :]
        batch_ys    = trainlabel[cur_indices, :]

        # Augment Dataset
        #batch_xs = augment_img(batch_xs)

        #batch_xs = batch_xs.reshape(-1, 28, 28, 1)
        feeds = {X: batch_xs, Y: batch_ys, IS_TRN: True }

        _, cost_val = sess.run([optimizer, loss],
                               feed_dict=feeds)
        total_cost += cost_val
    total_cost = total_cost / total_batch

    # Display learning process
    if (epoch+1) % DISPLAY_STEP == 0:
        print ("Epoch: %03d/%03d cost: %.9f" % (epoch+1, TOTAL_EPOCHS, total_cost))
        randidx = np.random.permutation(trainimg.shape[0])[:500]

        #trn_sample = trainimg[randidx].reshape(-1, 28, 28, 1)
        #feeds = {X: trn_sample, Y: trainlabel[randidx], IS_TRN: False}
        feeds = {X: trainimg[randidx], Y: trainlabel[randidx], IS_TRN: False}
        train_acc = sess.run(accr, feed_dict=feeds)
        print (" TRAIN ACCURACY: %.5f" % (train_acc))

        feeds = {X: testimg, Y: testlabel, IS_TRN: False}
        val_acc = sess.run(accr, feed_dict=feeds)
        print (" VALIDATION ACCURACY: %.5f" % (val_acc))    

print("--- Training time : {0} seconds /w {1} GPUs ---".format(
    datetime.datetime.now() - start_time, NUM_GPUS))

Epoch: 004/100 cost: 0.233077529
 TRAIN ACCURACY: 0.94200
 VALIDATION ACCURACY: 0.93340
Epoch: 008/100 cost: 0.136628641
 TRAIN ACCURACY: 0.96800
 VALIDATION ACCURACY: 0.95860
Epoch: 012/100 cost: 0.100919831
 TRAIN ACCURACY: 0.97200
 VALIDATION ACCURACY: 0.97010
Epoch: 016/100 cost: 0.079937519
 TRAIN ACCURACY: 0.97800
 VALIDATION ACCURACY: 0.97080
Epoch: 020/100 cost: 0.069139902
 TRAIN ACCURACY: 0.97000
 VALIDATION ACCURACY: 0.97460
Epoch: 024/100 cost: 0.059150019
 TRAIN ACCURACY: 0.98200
 VALIDATION ACCURACY: 0.97750
Epoch: 028/100 cost: 0.053326347
 TRAIN ACCURACY: 0.98000
 VALIDATION ACCURACY: 0.97990
Epoch: 032/100 cost: 0.049237883
 TRAIN ACCURACY: 0.99000
 VALIDATION ACCURACY: 0.98000
Epoch: 036/100 cost: 0.044035270
 TRAIN ACCURACY: 0.98800
 VALIDATION ACCURACY: 0.98210
Epoch: 040/100 cost: 0.040775237
 TRAIN ACCURACY: 0.99400
 VALIDATION ACCURACY: 0.98310
Epoch: 044/100 cost: 0.037874635
 TRAIN ACCURACY: 0.99000
 VALIDATION ACCURACY: 0.98210
Epoch: 048/100 cost: 0.036475991