# LeNet on MNIST Dataset Using Multiple GPUs

In [1]:
# List the compute devices (CPU and GPUs) that TensorFlow can see on this machine.
# The bare last expression is what the notebook displays as the cell output.

from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/cpu:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 16562518561336613333, name: "/gpu:0"
 device_type: "GPU"
 memory_limit: 67108864
 locality {
   bus_id: 1
 }
 incarnation: 7913761063815264308
 physical_device_desc: "device: 0, name: GRID K520, pci bus id: 0000:00:03.0", name: "/gpu:1"
 device_type: "GPU"
 memory_limit: 67108864
 locality {
   bus_id: 1
 }
 incarnation: 10782718246840404907
 physical_device_desc: "device: 1, name: GRID K520, pci bus id: 0000:00:04.0", name: "/gpu:2"
 device_type: "GPU"
 memory_limit: 67108864
 locality {
   bus_id: 1
 }
 incarnation: 4133597402929815830
 physical_device_desc: "device: 2, name: GRID K520, pci bus id: 0000:00:05.0", name: "/gpu:3"
 device_type: "GPU"
 memory_limit: 67108864
 locality {
   bus_id: 1
 }
 incarnation: 16752761831954155716
 physical_device_desc: "device: 3, name: GRID K520, pci bus id: 0000:00:06.0"]

## Loading Dataset

In [2]:
from tensorflow.examples.tutorials.mnist import input_data

# Load MNIST from the local "MNIST_data/" directory. reshape=False keeps every
# image as a (28, 28, 1) array instead of a flat vector.
mnist = input_data.read_data_sets("MNIST_data/", reshape=False)

# Unpack the three predefined splits into (images, labels) pairs.
X_train, y_train = mnist.train.images, mnist.train.labels
X_validation, y_validation = mnist.validation.images, mnist.validation.labels
X_test, y_test = mnist.test.images, mnist.test.labels

# Sanity check: every image must come with exactly one label.
assert len(X_train) == len(y_train)
assert len(X_validation) == len(y_validation)
assert len(X_test) == len(y_test)

# Summarise the image shape and the size of each split.
print()
print("Image Shape: {}".format(X_train[0].shape))
print()
print("Training Set:   {} samples".format(len(X_train)))
print("Validation Set: {} samples".format(len(X_validation)))
print("Test Set:       {} samples".format(len(X_test)))

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz

Image Shape: (28, 28, 1)

Training Set:   55000 samples
Validation Set: 5000 samples
Test Set:       10000 samples


Pad the MNIST images with zeros: MNIST provides 28x28x1 images, while the LeNet architecture expects 32x32xC inputs, where C is the number of color channels.

In [3]:
import numpy as np

# Zero-pad the images up to the 32x32 spatial size that LeNet expects.
# The amount of padding is derived from each array's *current* shape, so
# re-running this cell leaves already-padded arrays untouched instead of
# growing them to 36x36 (which would no longer fit the model's input).
LENET_INPUT_SIZE = 32


def pad_to_lenet_input(images, target_size=LENET_INPUT_SIZE):
    """Zero-pad a batch of images to ``target_size`` x ``target_size``.

    Args:
        images: 4-D array laid out as (batch, height, width, channels).
        target_size: desired height and width after padding.

    Returns:
        A new array whose height and width both equal ``target_size``; the
        batch and channel axes are not padded. For 28x28 inputs this adds
        2 rows/columns of zeros on every side.

    Raises:
        ValueError: if the images are already larger than ``target_size``.
    """
    height, width = images.shape[1], images.shape[2]
    if height > target_size or width > target_size:
        raise ValueError(
            "images of size {}x{} are larger than the target size {}".format(
                height, width, target_size))

    extra_rows = target_size - height
    extra_cols = target_size - width
    # Split the padding as evenly as possible between the two sides.
    pad_width = (
        (0, 0),
        (extra_rows // 2, extra_rows - extra_rows // 2),
        (extra_cols // 2, extra_cols - extra_cols // 2),
        (0, 0),
    )
    return np.pad(images, pad_width, 'constant')


X_train      = pad_to_lenet_input(X_train)
X_validation = pad_to_lenet_input(X_validation)
X_test       = pad_to_lenet_input(X_test)

## Shuffle Dataset

In [4]:
import random
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

# Shuffle images and labels together (both arrays are passed to the same
# shuffle call) so the training order is randomised before the first epoch.
X_train, y_train = shuffle(X_train, y_train)

## Setup Parameters

In [5]:
import tensorflow as tf

# Training: number of examples each GPU tower receives per step (a step feeds
# num_gpus * batch_size examples). Validation: size of the whole batch that is
# then split across the GPUs.
batch_size = 512
# Number of GPU towers to build; must not exceed the GPUs listed above.
num_gpus = 4
# Number of full passes over the training loop.
num_epochs = 20
# Step size passed to the Adam optimizer.
learning_rate = 0.001

## Setup LeNet Architecture

![LeNet Architecture](lenet.png)
Source: Yann LeCun

Here we set up the LeNet model so that all the variables live on the CPU. Later on, we will reuse the same variables on each GPU to compute gradients. The CPU then averages the gradients returned from each GPU and updates the variables.

In [6]:
from tensorflow.contrib.layers import flatten

def LeNetModel(images):
    """Build the LeNet forward pass for a batch of images and return the logits.

    Layer stack: conv(5x5, 6 filters) -> ReLU -> 2x2 max-pool ->
    conv(5x5, 16 filters) -> ReLU -> 2x2 max-pool -> flatten ->
    fc(400 -> 120) -> ReLU -> fc(120 -> 84) -> ReLU -> fc(84 -> 10).

    Every weight and bias is created with ``tf.get_variable`` under an explicit
    ``/cpu:0`` device block, while the compute ops are placed on whatever
    device is active at the call site. Calling this function again inside a
    variable scope with ``reuse=True`` therefore shares the same parameters.

    Args:
        images: batch of single-channel images. The 400-wide first
            fully-connected layer implies 32x32 spatial input.

    Returns:
        Tensor of unnormalised class scores with 10 columns.
    """
    init_mean = 0
    init_stddev = 0.1

    def cpu_params(weight_name, bias_name, weight_shape):
        # Create one layer's weights (truncated normal) and biases (zeros),
        # pinned to the CPU. The bias length is the weight's last dimension.
        with tf.device("/cpu:0"):
            weights = tf.get_variable(
                weight_name,
                initializer=tf.truncated_normal(
                    shape=weight_shape, mean=init_mean, stddev=init_stddev))
            biases = tf.get_variable(
                bias_name, initializer=tf.zeros(weight_shape[-1]))
        return weights, biases

    unit_strides = [1, 1, 1, 1]
    pool_window = [1, 2, 2, 1]

    # Convolution 1: 1 input channel -> 6 feature maps.
    with tf.variable_scope('conv1') as scope:
        kernel, bias = cpu_params("conv1_W", "conv1_b", (5, 5, 1, 6))
        pre_activation = tf.nn.conv2d(
            images, kernel, strides=unit_strides, padding='VALID') + bias
        conv1 = tf.nn.relu(pre_activation, name=scope.name)

    pool1 = tf.nn.max_pool(
        conv1, ksize=pool_window, strides=pool_window, padding='VALID', name='pool1')

    # Convolution 2: 6 -> 16 feature maps.
    with tf.variable_scope('conv2') as scope:
        kernel, bias = cpu_params("conv2_W", "conv2_b", (5, 5, 6, 16))
        pre_activation = tf.nn.conv2d(
            pool1, kernel, strides=unit_strides, padding='VALID') + bias
        conv2 = tf.nn.relu(pre_activation, name=scope.name)

    pool2 = tf.nn.max_pool(
        conv2, ksize=pool_window, strides=pool_window, padding='VALID', name='pool2')

    # Fully connected 1: flattened features (400) -> 120.
    with tf.variable_scope('fc1') as scope:
        flat = flatten(pool2)
        weights, bias = cpu_params("fc1_W", "fc1_b", (400, 120))
        fc1 = tf.nn.relu(tf.matmul(flat, weights) + bias, name=scope.name)

    # Fully connected 2: 120 -> 84.
    with tf.variable_scope('fc2') as scope:
        weights, bias = cpu_params("fc2_W", "fc2_b", (120, 84))
        fc2 = tf.nn.relu(tf.matmul(fc1, weights) + bias, name=scope.name)

    # Output layer: 84 -> 10 class scores (no activation).
    with tf.variable_scope('fc3') as scope:
        weights, bias = cpu_params("fc3_W", "fc3_b", (84, 10))
        logits = tf.add(tf.matmul(fc2, weights), bias, name=scope.name)

    return logits

## Loss, Averaging Gradients and Make Parallel Function

In [7]:
# Loss: mean softmax cross-entropy between the logits and the class labels.

def loss(logits, labels):
    """Return the batch-mean softmax cross-entropy.

    Args:
        logits: unnormalised class scores, one row of 10 values per example.
        labels: integer class ids; they are one-hot encoded with depth 10.

    Returns:
        Scalar tensor named 'cross_entropy_mean'.
    """
    targets = tf.one_hot(labels, 10)
    per_example = tf.nn.softmax_cross_entropy_with_logits(labels=targets, logits=logits)
    return tf.reduce_mean(per_example, name='cross_entropy_mean')

In [8]:
# Average the per-GPU gradients of each shared variable.
# Adapted from the TensorFlow multi-GPU tutorial.

def average_gradients(tower_grads):
    """Average gradients variable-by-variable across all towers.

    Args:
        tower_grads: one list per tower of (gradient, variable) pairs. Every
            tower must list the variables in the same order.

    Returns:
        A list of (averaged_gradient, variable) pairs, one per variable. The
        variable is taken from the first tower.
    """
    averaged = []
    # zip(*...) regroups the data so each item holds one variable's
    # (gradient, variable) pair from every tower.
    for per_variable in zip(*tower_grads):
        # Add a leading "tower" axis to every gradient, then average over it.
        stacked = tf.concat(
            0, [tf.expand_dims(tower_grad, 0) for tower_grad, _ in per_variable])
        mean_grad = tf.reduce_mean(stacked, 0)

        shared_var = per_variable[0][1]
        averaged.append((mean_grad, shared_var))
    return averaged

In [9]:
# Run `fn` data-parallel: split every keyword tensor into num_gpus chunks along
# axis 0, apply `fn` to one chunk per GPU, then concatenate the results.
# Adapted from EffectiveTensorflow.

def make_parallel(fn, num_gpus, **kwargs):
    """Apply ``fn`` to equal slices of its inputs, one slice per GPU.

    Args:
        fn: callable that accepts the same keyword names as ``kwargs`` and
            returns a tensor.
        num_gpus: number of chunks to split into and of GPUs to use.
        **kwargs: tensors to split along axis 0 into ``num_gpus`` parts.

    Returns:
        The per-GPU outputs concatenated along axis 0.
    """
    shards = {name: tf.split(0, num_gpus, tensor) for name, tensor in kwargs.items()}

    per_gpu_outputs = []
    for gpu_index in range(num_gpus):
        shard_kwargs = {name: pieces[gpu_index] for name, pieces in shards.items()}
        with tf.device("/gpu:" + str(gpu_index)):
            # reuse=True: fn must look up existing variables, not create new ones.
            with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                per_gpu_outputs.append(fn(**shard_kwargs))

    return tf.concat(0, per_gpu_outputs)

## Training

In [None]:
def train(X_train, y_train):
    """Train LeNet data-parallel on ``num_gpus`` GPUs, validating after each epoch.

    A fresh graph is built with one tower per GPU. All towers share the same
    variables (created by the first tower, reused by the rest). Each training
    step feeds ``num_gpus * batch_size`` examples, split evenly across the
    towers; the per-tower gradients are averaged and applied once with Adam.
    After every epoch the accuracy on the module-level ``X_validation`` /
    ``y_validation`` arrays is printed.

    Args:
        X_train: training images, fed to a float32 placeholder of shape
            (None, 32, 32, 1).
        y_train: integer class labels aligned with ``X_train``.

    Returns:
        None. Progress is reported through ``print``.
    """
    # Number of examples consumed by one training step (batch_size per tower).
    global_batch_size = num_gpus * batch_size

    with tf.Graph().as_default(), tf.device("/cpu:0"):
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

        whole_batch_x = tf.placeholder(tf.float32, (None, 32, 32, 1))
        whole_batch_y = tf.placeholder(tf.int32, (None))

        # Slice the fed batch into one equal part per GPU.
        images_per_gpu = tf.split(0, num_gpus, whole_batch_x)
        labels_per_gpu = tf.split(0, num_gpus, whole_batch_y)

        tower_grads = []
        with tf.variable_scope(tf.get_variable_scope()):
            for gpu_idx in range(num_gpus):
                with tf.device("/gpu:" + str(gpu_idx)):
                    with tf.name_scope('%s_%d' % ("tower_gpu", gpu_idx)):
                        # The first tower creates the variables; later towers reuse them.
                        with tf.variable_scope("same_on_each_device", reuse=(gpu_idx>0)):
                            loss_v = loss(LeNetModel(images_per_gpu[gpu_idx]), labels_per_gpu[gpu_idx])
                            grads_and_vars = optimizer.compute_gradients(loss_v)
                            tower_grads.append(grads_and_vars)

        avg_grad = average_gradients(tower_grads)
        apply_grad_op = optimizer.apply_gradients(avg_grad)

        # Evaluation graph: same variables, with the batch spread over all GPUs.
        with tf.variable_scope("same_on_each_device", reuse=True):
            val_batch_x = tf.placeholder(tf.float32, (None, 32, 32, 1))
            val_batch_y = tf.placeholder(tf.int32, (None))
            val_batch_one_hot_y = tf.one_hot(val_batch_y, 10)
            val_batch_logits =  make_parallel(LeNetModel, num_gpus, images = val_batch_x)
            correct_prediction = tf.equal(tf.argmax(val_batch_logits, 1), tf.argmax(val_batch_one_hot_y, 1))
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

        init = tf.global_variables_initializer()

        session_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=True)

        # The context manager guarantees the session (and the GPU memory it
        # holds) is released even if training raises.
        with tf.Session(config=session_config) as sess:
            sess.run(init)
            print("Training...")
            print()
            for i in range(num_epochs):
                X_train, y_train = shuffle(X_train, y_train)
                num_examples = len(X_train)
                # Only full batches are used: tf.split needs the fed batch to
                # divide evenly across the GPUs, so the remainder is dropped.
                max_steps = num_examples // global_batch_size
                for step in range(max_steps):
                    # Advance by a whole batch per step so the batches are
                    # disjoint and together cover the training set.
                    offset = step * global_batch_size
                    end = offset + global_batch_size
                    batch_x, batch_y = X_train[offset:end], y_train[offset:end]
                    sess.run(apply_grad_op, feed_dict={whole_batch_x: batch_x, whole_batch_y: batch_y})

                # NOTE: make_parallel splits each fed batch into num_gpus equal
                # parts, so every validation batch (including the final, shorter
                # one) must have a length divisible by num_gpus.
                num_validation = len(X_validation)
                total_accuracy = 0
                for offset in range(0, num_validation, batch_size):
                    batch_x, batch_y = X_validation[offset:offset+batch_size], y_validation[offset:offset+batch_size]
                    acc = sess.run(accuracy, feed_dict={val_batch_x: batch_x, val_batch_y: batch_y})
                    # Weight by batch length so the short last batch is counted correctly.
                    total_accuracy += (acc * len(batch_x))
                validation_accuracy = total_accuracy / num_validation

                print("EPOCH {} ...".format(i+1))
                print("Validation Accuracy = {:.3f}".format(validation_accuracy))
                print()

In [None]:
import time

# Wall-clock time of the whole train() call.
start = time.time()
train(X_train, y_train)
end = time.time()

elapsed = end - start
print("it takes %f (s)" % elapsed)

## It took around 17s to train on 4 GPUs, compared to 42s on a single GPU

Training...

EPOCH 1 ...
Validation Accuracy = 0.811

EPOCH 2 ...
Validation Accuracy = 0.914

EPOCH 3 ...
Validation Accuracy = 0.943

