In [1]:
import keras
import numpy as np
mnist = keras.datasets.mnist
(trainimg, trainlabel), (testimg, testlabel) = mnist.load_data()

# Img size : 28 x 28 x 1
# Num classes : 10

print('Training data shape : ', trainimg.shape)
print('Class dataset : ', trainlabel.shape)
print('Testing data shape : ', testimg.shape)
print('Class dataset : ', testlabel.shape)

N_TRN = len(trainlabel)
N_TST = len(testlabel)

onehot_mat = np.eye(10)

#trainimg  = np.reshape(trainimg, [-1, 784])
#testimg   = np.reshape(testimg, [-1, 784])
trainimg  = np.reshape(trainimg, [-1, 28, 28, 1])
testimg   = np.reshape(testimg, [-1, 28, 28, 1])

trainlabel = np.concatenate([[onehot_mat[int(x),:]] for x in trainlabel], axis=0)
testlabel   = np.concatenate([[onehot_mat[int(x),:]] for x in testlabel], axis=0)

print('Training data shape : ', trainimg.shape)
print('Class dataset : ', trainlabel.shape)
print('Testing data shape : ', testimg.shape)
print('Class dataset : ', testlabel.shape)

Using TensorFlow backend.


Training data shape :  (60000, 28, 28)
Class dataset :  (60000,)
Testing data shape :  (10000, 28, 28)
Class dataset :  (10000,)
Training data shape :  (60000, 28, 28, 1)
Class dataset :  (60000, 10)
Testing data shape :  (10000, 28, 28, 1)
Class dataset :  (10000, 10)


In [2]:
from scipy import ndimage

def augment_img(xs):
    out  = np.copy(xs)
    xs_r = np.reshape(xs, [-1, 28, 28])
    for i in range(xs_r.shape[0]):
        xs_img = xs_r[i, :, :]
        bg_value = 0
        # ROTATE
        angle = np.random.randint(-15, 15, 1).astype(float)
        xs_img = ndimage.rotate(xs_img, angle, reshape=False, cval=bg_value)
        # ZOOM
        rg = 0.1
        zoom_factor = np.random.uniform(1., 1.+rg)
        h, w = xs_img.shape[:2]
        zh   = int(np.round(zoom_factor * h))
        zw   = int(np.round(zoom_factor * w))
        top  = (zh - h) // 2
        left = (zw - w) // 2
        zoom_tuple = (zoom_factor,) * 2 + (1,) * (xs_img.ndim - 2)
        temp = ndimage.zoom(xs_img[top:top+zh, left:left+zw], zoom_tuple)
        trim_top  = ((temp.shape[0] - h) // 2)
        trim_left = ((temp.shape[1] - w) // 2)
        xs_img = temp[trim_top:trim_top+h, trim_left:trim_left+w]
        # SHIFT
        shift = np.random.randint(-3, 3, 2)
        xs_img = ndimage.shift(xs_img, shift, cval=bg_value)
        # RESHAPE
        xs_v = np.reshape(xs_img, [1, -1])
        out[i, :] = xs_v
    return out

In [3]:
import datetime

import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
from tensorflow.python.client import device_lib


def check_available_gpus():
    local_devices = device_lib.list_local_devices()
    gpu_names = [x.name for x in local_devices if x.device_type == 'GPU']
    gpu_num = len(gpu_names)

    print('{0} GPUs are detected : {1}'.format(gpu_num, gpu_names))

    return gpu_num


def model(X, reuse=False, is_trn=True):
    with tf.variable_scope('L1', reuse=reuse):
        X  = tf.layers.batch_normalization(inputs=X, reuse=reuse)
        L1 = tf.layers.conv2d(inputs=X
                              , filters=64
                              , kernel_size=[3, 3]
                              , reuse=reuse
                              , activation=tf.nn.relu
                              , padding='SAME')
        L1 = tf.layers.max_pooling2d(inputs=L1
                                     , pool_size=[2, 2]
                                     , strides=[2, 2])
        L1 = tf.layers.dropout(inputs=L1
                               , rate=0.7
                               , training=is_trn)

    with tf.variable_scope('L2', reuse=reuse):
        L1 = tf.layers.batch_normalization(inputs=L1, reuse=reuse)
        L2 = tf.layers.conv2d(inputs=L1
                              , filters=128
                              , kernel_size=[3, 3]
                              , reuse=reuse
                              , activation=tf.nn.relu
                              , padding='SAME')
        L2 = tf.layers.max_pooling2d(inputs=L2
                                     , pool_size=[2, 2]
                                     , strides=[2, 2])
        L2 = tf.layers.dropout(inputs=L2
                               , rate=0.7
                               , training=is_trn)

    with tf.variable_scope('L2-1', reuse=reuse):
        L2   = tf.layers.batch_normalization(inputs=L2, reuse=reuse)
        L2_1 = tf.layers.conv2d(inputs=L2
                                , filters=128
                                , kernel_size=[3, 3]
                                , reuse=reuse
                                , activation=tf.nn.relu
                                , padding='SAME')
        L2_1 = tf.layers.max_pooling2d(inputs=L2_1
                                       , pool_size=[2, 2]
                                       , strides=[2, 2])
        L2_1 = tf.layers.dropout(inputs=L2_1
                                 , rate=0.7
                                 , training=is_trn)
        
    with tf.variable_scope('L3', reuse=reuse):
        L2_1 = tf.layers.batch_normalization(inputs=L2, reuse=reuse)
        L3   = tf.contrib.layers.flatten(inputs=L2_1)
        L3   = tf.layers.dense(L3, 1024, activation=tf.nn.relu)
        L3   = tf.layers.dropout(L3, 0.5, is_trn)

    with tf.variable_scope('L4', reuse=reuse):
        L3 = tf.layers.batch_normalization(inputs=L3, reuse=reuse)
        L4 = tf.layers.dense(L3, 256, activation=tf.nn.relu)

    with tf.variable_scope('LF', reuse=reuse):
        LF = tf.layers.dense(L4, 10, activation=None)

    return LF


if __name__ == '__main__':
    # need to change learning rates and batch size by number of GPU
    batch_size = 256
    learning_rate = 0.001
    total_epoch = 100

    gpu_num = check_available_gpus()

    X = tf.placeholder(tf.float32, [None, 28, 28, 1])
    Y = tf.placeholder(tf.float32, [None, 10])

    losses = []
    X_A = tf.split(X, int(gpu_num))
    Y_A = tf.split(Y, int(gpu_num))

    '''
    Multi GPUs Usage
    Results on P40
     * Single GPU computation time: 0:00:22.252533
     * 2 GPU computation time: 0:00:12.632623
     * 4 GPU computation time: 0:00:11.083071
     * 8 GPU computation time: 0:00:11.990167
     
    Need to change batch size and learning rates
         for training more efficiently
    
    Reference: https://research.fb.com/wp-content/uploads/2017/06/imagenet1kin1h5.pdf
    '''
    for gpu_id in range(int(gpu_num)):
        with tf.device(tf.DeviceSpec(device_type="GPU", device_index=gpu_id)):
            with tf.variable_scope(tf.get_variable_scope(), reuse=(gpu_id > 0)):
                cost = tf.nn.softmax_cross_entropy_with_logits_v2(
                                logits=model(X_A[gpu_id], gpu_id > 0),
                                labels=Y_A[gpu_id])
                losses.append(cost)

    loss = tf.reduce_mean(tf.concat(losses, axis=0))

    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(
        loss, colocate_gradients_with_ops=True)  # Important!

    init = tf.global_variables_initializer()
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=False))
    sess.run(init)

    #mnist = input_data.read_data_sets('/tmp/tensorflow/mnist/input_data', one_hot=True)
    #total_batch = int(mnist.train.num_examples/batch_size)
    total_batch = int(N_TRN/batch_size)
    #print("total: %s, %s, %s" % (mnist.train.num_examples, total_batch, batch_size))
    
    start_time = datetime.datetime.now()

    for epoch in range(total_epoch):
        total_cost = 0
        # Get random minibatch for each epoch
        randindices = np.random.permutation(len(trainlabel))
        
        for i in range(total_batch):
            #batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            
            cur_indices = randindices[i*batch_size:(i+1)*batch_size]
            batch_xs    = trainimg[cur_indices, :]
            batch_ys    = trainlabel[cur_indices, :]  
            batch_xs = augment_img(batch_xs)

            batch_xs = batch_xs.reshape(-1, 28, 28, 1)
            #batch_xs = batch_xs.reshape(-1, 28, 28, 1)
            _, cost_val = sess.run([optimizer, loss],
                                   feed_dict={X: batch_xs,
                                              Y: batch_ys})
            total_cost += cost_val

        print("total cost : %s" % total_cost)

    print("--- Training time : {0} seconds /w {1} GPUs ---".format(
        datetime.datetime.now() - start_time, gpu_num))

4 GPUs are detected : ['/device:GPU:0', '/device:GPU:1', '/device:GPU:2', '/device:GPU:3']


ValueError: could not broadcast input array from shape (1,784) into shape (28,28,1)