# Multi-GPU Training Example

Train a convolutional neural network on multiple GPUs with TensorFlow.

This example uses the TensorFlow layers API; see the 'convolutional_network_raw' example
for a raw TensorFlow implementation with variables.


- Original source: https://github.com/aymericdamien/TensorFlow-Examples/

## Training with multiple GPU cards

In this example, we use data parallelism to split the training across multiple GPUs. Each GPU holds a full replica of the neural network model, and the weights (i.e. variables) are updated synchronously: the update waits until every GPU has processed its batch of data.

First, each GPU processes a distinct batch of data and computes the corresponding gradients. Then, all gradients are gathered on the CPU and averaged. Finally, the model weights are updated with the averaged gradients, and the new weights are sent back to each GPU to repeat the training process.

<img src="https://www.tensorflow.org/images/Parallelism.png" alt="Parallelism" style="width: 400px;"/>

## CIFAR-10 Dataset Overview



In [None]:
# Build a convolutional neural network
def conv_net(tf, x, n_classes, dropout, reuse, is_training):
    """Build the convolutional classifier used by every GPU tower.

    Args:
        tf: The TensorFlow module, handed in by the caller rather than
            imported here.
        x: Input tensor; it is reshaped to [batch, 32, 32, 3] before the
            first convolution.
        n_classes: Number of units of the final dense layer.
        dropout: Value forwarded as ``rate`` to ``tf.layers.dropout``.
        reuse: Forwarded to ``tf.variable_scope('ConvNet', ...)`` so several
            graphs can share one set of weights.
        is_training: Forwarded as ``training`` to the dropout layers; also
            selects whether softmax is applied to the output.

    Returns:
        The raw logits when ``is_training`` is true, otherwise the softmax
        of the logits.
    """
    relu = tf.nn.relu

    # All variables live under one scope so they can be shared via `reuse`.
    with tf.variable_scope('ConvNet', reuse=reuse):
        # Force the 4-D layout [batch, height, width, channels] of 32x32 RGB
        # images, whatever shape the caller supplied.
        net = tf.reshape(x, shape=[-1, 32, 32, 3])

        # 64 filters of size 5x5, then 2x2 max pooling with stride 2.
        net = tf.layers.conv2d(net, filters=64, kernel_size=5, activation=relu)
        net = tf.layers.max_pooling2d(net, pool_size=2, strides=2)

        # Two 3x3 convolutions (256 then 512 filters), then 2x2 max pooling.
        net = tf.layers.conv2d(net, filters=256, kernel_size=3, activation=relu)
        net = tf.layers.conv2d(net, filters=512, kernel_size=3, activation=relu)
        net = tf.layers.max_pooling2d(net, pool_size=2, strides=2)

        # Collapse the feature maps into one vector per example.
        net = tf.layers.flatten(net)

        # Two dense layers (2048 then 1024 units, no activation), each
        # followed by dropout that is driven by `is_training`.
        for units in (2048, 1024):
            net = tf.layers.dense(net, units=units)
            net = tf.layers.dropout(net, rate=dropout, training=is_training)

        # Class scores.
        logits = tf.layers.dense(net, units=n_classes)

        # The training loss applies softmax itself, so the training graph
        # returns raw logits; only the inference graph gets probabilities.
        if is_training:
            out = logits
        else:
            out = tf.nn.softmax(logits)

    return out

In [None]:
def data_input_fn(tf, filenames, batch_size=128, shuffle=False, repeat=None, num_classes=10, num_gpus=0,
                  is_training=True):
    """Build a one-shot iterator over TFRecord files of 32x32 RGB images.

    Every element produced by the iterator is a batch, so it always carries a
    leading batch axis. The global batch holds ``batch_size`` examples per
    GPU; a ``num_gpus`` below 1 is treated as a single replica so that the
    prefetch, shuffle and batch sizes never collapse to zero.

    Args:
        tf: The TensorFlow module, handed in by the caller rather than
            imported here.
        filenames: TFRecord file name(s) passed to ``tf.data.TFRecordDataset``.
        batch_size: Number of examples per GPU in one batch.
        shuffle: When true, shuffle with a buffer of one global batch.
        repeat: Forwarded to ``Dataset.repeat``.
        num_classes: Depth of the one-hot label encoding.
        num_gpus: Number of GPUs the global batch is split across.
        is_training: When true, an incomplete final batch is dropped; when
            false, the incomplete final batch is kept.

    Returns:
        A one-shot iterator whose ``get_next()`` yields ``(images, labels)``.
    """
    # Treat "no GPU" as one replica; a zero-sized batch or buffer is invalid.
    global_batch_size = batch_size * max(num_gpus, 1)

    def parser(serialized_example):
        """Parses a single tf.Example into image and label tensors."""
        features = tf.parse_single_example(
            serialized_example,
            features={
                'image': tf.FixedLenFeature([], tf.string),
                'label': tf.FixedLenFeature([], tf.int64),
            })
        # The raw bytes are read as [channel, height, width]; transpose them
        # to [height, width, channel].
        image = tf.decode_raw(features['image'], tf.uint8)
        image = tf.transpose(tf.reshape(image, [3, 32, 32]), [1, 2, 0])

        # Normalize the values of the image from the range [0, 255] to [-0.5, 0.5]
        image = tf.cast(image, tf.float32) / 255 - 0.5
        label = tf.one_hot(tf.cast(features['label'], tf.int32), num_classes)
        return image, label

    dataset = tf.data.TFRecordDataset(filenames)

    # Decode every record and keep a few global batches buffered.
    dataset = dataset.map(parser).prefetch(global_batch_size * 5)
    if shuffle:
        dataset = dataset.shuffle(buffer_size=global_batch_size)

    if is_training:
        # Fixed-size batches only: the incomplete last batch is discarded.
        dataset = dataset.apply(
            tf.contrib.data.batch_and_drop_remainder(global_batch_size))
    else:
        # Evaluation data must be batched too; otherwise the iterator yields
        # single rank-3 images that cannot be fed where a batch is expected.
        dataset = dataset.batch(global_batch_size)

    dataset = dataset.repeat(repeat)
    return dataset.make_one_shot_iterator()

In [None]:
# Build the function to average the gradients
def average_gradients(tf, tower_grads):
    """Average each variable's gradient over all towers.

    Args:
        tf: The TensorFlow module, handed in by the caller rather than
            imported here.
        tower_grads: One list per tower of ``(gradient, variable)`` pairs,
            with the variables in the same order in every tower.

    Returns:
        A list of ``(averaged_gradient, variable)`` pairs, one per variable,
        where the variable is the one reported by the first tower.
    """
    averaged = []

    # zip(*...) regroups the per-tower lists so that each item is
    #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
    for per_variable in zip(*tower_grads):
        # Give every gradient a leading 'tower' axis, join them along it and
        # take the mean over that axis.
        stacked = tf.concat(
            [tf.expand_dims(grad, 0) for grad, _ in per_variable], 0)
        mean_grad = tf.reduce_mean(stacked, 0)

        # The variables are shared across towers, so the first tower's
        # reference stands in for all of them.
        shared_var = per_variable[0][1]
        averaged.append((mean_grad, shared_var))

    return averaged

In [None]:
def wrapper():
    """Train the CIFAR-10 ConvNet with synchronous multi-GPU data parallelism.

    One model replica ("tower") is built per GPU. Each tower computes
    gradients on its own slice of the fed batch, the gradients are averaged
    and a single update is applied to the shared weights. Summaries computed
    on validation data are written periodically to ``<logdir>/train`` and one
    final summary computed on test data is written to ``<logdir>/test``.

    All imports are local to this function, which is passed as a callable to
    the launcher rather than being called directly.

    Raises:
        RuntimeError: If no GPU is reported, since no tower could be built.
    """
    from hops import devices, tensorboard, hdfs
    import numpy as np
    import tensorflow as tf
    import time
    import os

    # Parameters
    num_gpus = devices.get_num_gpus()
    if num_gpus < 1:
        # Without a tower there are no gradients to apply; fail with a clear
        # message instead of an unbound-variable error further down.
        raise RuntimeError('wrapper() needs at least one GPU, found %d' % num_gpus)
    num_steps = 2000000
    learning_rate = 0.001
    batch_size = 2056  # examples per GPU
    display_step = 10

    # Network Parameters
    num_classes = 10  # CIFAR-10 total classes
    keep_prob = 0.75  # Dropout, probability to keep units
    # tf.layers.dropout takes the fraction of units to DROP, not to keep.
    dropout_rate = 1.0 - keep_prob

    logdir = tensorboard.logdir()
    # NOTE(review): the concatenation presumes hdfs.project_path() ends with
    # a path separator -- confirm.
    data_dir = hdfs.project_path() + "TestJob/data/cifar10/"
    train_filenames = [data_dir + "/train/train.tfrecords"]
    validation_filenames = [data_dir + "/validation/validation.tfrecords"]
    test_filenames = [data_dir + "eval/eval.tfrecords"]

    train_iterator = data_input_fn(tf, train_filenames,
                                   batch_size=batch_size,
                                   num_gpus=num_gpus)
    eval_iterator = data_input_fn(tf, validation_filenames,
                                  batch_size=batch_size,
                                  num_gpus=num_gpus)
    test_iterator = data_input_fn(tf, test_filenames,
                                  is_training=False,
                                  shuffle=False,
                                  repeat=1,
                                  num_gpus=num_gpus)

    # Place all ops on CPU by default
    with tf.device('/cpu:0'):
        for subdir in ('/train', '/test'):
            if not os.path.exists(logdir + subdir):
                os.mkdir(logdir + subdir)

        # Create each get_next() op exactly once. Calling get_next() inside
        # the training loop would add a new node to the graph on every step.
        next_train = train_iterator.get_next()
        next_eval = eval_iterator.get_next()
        next_test = test_iterator.get_next()

        with tf.name_scope('input'):
            # tf Graph input
            X = tf.placeholder(tf.float32, [None, 32, 32, 3], name="images")
            Y = tf.placeholder(tf.float32, [None, num_classes], name="labels")

        # A single optimizer computes every tower's gradients and applies
        # the averaged update.
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        tower_grads = []

        # Loop over all GPUs and construct their own computation graph
        for i in range(num_gpus):
            with tf.device('/gpu:%d' % i):
                # Split data between GPUs
                _x = X[i * batch_size: (i + 1) * batch_size]
                _y = Y[i * batch_size: (i + 1) * batch_size]

                # Training graph; the first tower creates the weights and
                # every later tower reuses them.
                logits_train = conv_net(tf, _x, num_classes, dropout_rate,
                                        reuse=(i > 0), is_training=True)

                with tf.name_scope('Loss'):
                    # Loss on the train logits, for dropout to take effect
                    loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
                        logits=logits_train, labels=_y))

                tower_grads.append(optimizer.compute_gradients(loss_op))

                # Only the first GPU computes accuracy, so only it needs the
                # inference graph (same weights, dropout disabled).
                if i == 0:
                    logits_test = conv_net(tf, _x, num_classes, dropout_rate,
                                           reuse=True, is_training=False)
                    with tf.name_scope('Accuracy'):
                        correct_pred = tf.equal(tf.argmax(logits_test, 1), tf.argmax(_y, 1))
                        accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

            if i == 0:
                with tf.device('/cpu:0'):
                    tf.summary.scalar("accuracy", accuracy)
                    tf.summary.scalar('loss', loss_op)
                    tf.summary.image("images", _x)

        train_op = optimizer.apply_gradients(average_gradients(tf, tower_grads))

        # Merge all the summaries defined above into a single op
        merged = tf.summary.merge_all()

        # Initializing the variables
        init = tf.global_variables_initializer()

        # Launch the graph
        with tf.Session() as sess:
            print('Initialziing')
            sess.run(init)

            train_writer = tf.summary.FileWriter(logdir + '/train', sess.graph)
            test_writer = tf.summary.FileWriter(logdir + '/test', sess.graph)

            try:
                for step in range(1, num_steps + 1):
                    # Get a batch for each GPU
                    batch_x, batch_y = sess.run(next_train)

                    # Run optimization op (backprop) and time it
                    ts = time.time()
                    sess.run(train_op, feed_dict={X: batch_x, Y: batch_y})
                    te = time.time() - ts

                    if step % display_step == 0 or step == 1:
                        # Summaries of loss/accuracy on a validation batch
                        batch_x_eval, batch_y_eval = sess.run(next_eval)
                        summary = sess.run(merged,
                                           feed_dict={X: batch_x_eval,
                                                      Y: batch_y_eval})
                        train_writer.add_summary(summary, step)

                        # Throughput is a plain Python number, so it is
                        # written as a hand-built summary instead of adding
                        # a new graph op on every display step.
                        images_per_sec = len(batch_x) / max(te, 1e-9)
                        throughput = tf.Summary(value=[
                            tf.Summary.Value(tag="images/sec",
                                             simple_value=images_per_sec)])
                        train_writer.add_summary(throughput, step)

                # Final summary on the first element of the test pipeline.
                batch_x_test, batch_y_test = sess.run(next_test)
                # The test pipeline may yield a single unbatched example;
                # reshaping guarantees the leading batch axis the
                # placeholders require and is a no-op for batched data.
                batch_x_test = np.reshape(batch_x_test, [-1, 32, 32, 3])
                batch_y_test = np.reshape(batch_y_test, [-1, num_classes])
                summary = sess.run(merged,
                                   feed_dict={X: batch_x_test,
                                              Y: batch_y_test})
                # Recorded at the last training step.
                test_writer.add_summary(summary, num_steps)
            finally:
                # Flush pending events to disk even if training fails.
                train_writer.close()
                test_writer.close()


In [None]:
# Entry point of the notebook: hand the `wrapper` function (not its result)
# to the Hops launcher together with `spark`.
# NOTE(review): `spark` is not defined in this file -- presumably the Spark
# session injected by the notebook kernel; confirm.
from hops import tflauncher
tflauncher.launch(spark, wrapper)