## Make *tf.slim* work with *tf.data* in *parallel*.
### STEPS
    * create data pipeline
    * load models (Inception V4, VGG19 and DenseNet121)
    * load ckpt(only weights)
    * make it into parallel framework for gpu

In [2]:
import tensorflow as tf
import numpy as np
from glob import glob

from datasets.utils import get_imgdir_label, label_to_index, parser, preprocessors
from models import nets_factory


import tensorflow.contrib.slim as slim
import csv, os, functools, itertools, collections

# Restrict CUDA (and therefore TensorFlow) to the GPU with index 0.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [7]:
# model config
lr = 0.001 # learning rate
batch_size = 32 # number of samples per batch
weight_decay = 0.0 # regularizer constant handed to nets_factory.get_network_fn
model_name = ["densenet121","inception_v4","vgg19"][0] # model selection: change the index to pick another network
variable_to_exclude = ['InceptionV4/AuxLogits','InceptionV4/Logits'] # variable-name prefixes that are skipped when restoring pretrained weights
num_classes = 7 # number of classes
weights_loc = "./weights/" # root folder of the checkpoint files
pretrained_dir = glob(weights_loc + "pretrained/" + "*{}*.ckpt".format(model_name)) # list of pretrained checkpoint paths whose file name contains model_name
trained_dir =  glob(weights_loc + "trained/" + "*{}*.ckpt".format(model_name))  # list of post-training checkpoint paths whose file name contains model_name

# data config
dataset_dir = "/home/jacob/intel_project_temp/datasets/ISIC_2018/" # point this to your dataset location
class_list = ["MEL","NV","BCC","AKIEC","BKL","DF","VASC"] # the 7 class names in the dataset
class_dict = {c:i for i,c in enumerate(class_list)} # maps str::class -> int::class for train/test
class_frequency = {c:None for c in class_list} # prevalence of each class in the dataset (None placeholders for now); intended to be used to adjust the loss
ratio = 0.8 # fraction of the samples that goes into the training split


In [5]:
# 1. create the data pipeline
tf.reset_default_graph()
img_lab = get_imgdir_label(dataset_dir) # list of tuples [(img_dir_i, str::label_i) for i in range(len(dataset))]
img_lab = label_to_index(img_lab, class_dict) # list of tuples [(img_dir_i, int::label_i) for i in range(len(dataset))]

# Split by position: the two parts are disjoint, nothing is dropped, and the
# test set keeps a deterministic order from run to run.
# NOTE(review): the list is not shuffled before the split -- confirm that
# get_imgdir_label does not return the samples grouped by class.
split_index = int(len(img_lab) * ratio)
img_lab_train = img_lab[:split_index] # train set: first `ratio` share of img_lab
img_lab_test = img_lab[split_index:] # test set: the remaining samples

imgs_train, labels_train = zip(*img_lab_train) # transpose train data into (paths, labels)
imgs_test, labels_test = zip(*img_lab_test) # transpose test data into (paths, labels)

# data pipeline for the train set: parse, batch, then iterate 32 times over the data
dataset_train = tf.data.Dataset.from_tensor_slices((list(imgs_train), list(labels_train)))
dataset_train = (dataset_train
                 .map(parser)
                 .batch(batch_size)
                 .repeat(32))

# data pipeline for the test set: parse, batch, single pass over the data
dataset_test = tf.data.Dataset.from_tensor_slices((list(imgs_test), list(labels_test)))
dataset_test = (dataset_test
                .map(parser)
                .batch(batch_size)
                .repeat(1))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use tf.cast instead.


In [8]:
# Build the network constructor for the selected model, in training mode.
network_fn = nets_factory.get_network_fn(
    name = model_name,
    weight_decay = weight_decay,
    num_classes = num_classes,
    is_training = True)

# One-shot iterator over the training pipeline; x and y are the tensors of the
# next batch (presumably images and labels as produced by `parser` -- confirm).
train_generator = dataset_train.make_one_shot_iterator()
x , y = train_generator.get_next()
# Forward pass on x; y is not consumed in this cell.
logits, end_points = network_fn(x)

INFO:tensorflow:Scale of 0 disables regularizer.


In [None]:
# Scope prefixes whose variables must not be restored from the checkpoint.
exclusions = [scope.strip() for scope in variable_to_exclude]

# Keep every model variable whose op name does not start with an excluded scope.
# TODO(sguada) variables.filter_variables()
variables_to_restore = [
    var
    for var in slim.get_model_variables()
    if not any(var.op.name.startswith(exclusion) for exclusion in exclusions)
]

# NOTE(review): the checkpoint path is hard-coded to InceptionV4 while
# `model_name` selects the network that was built; with
# ignore_missing_vars=True a mismatch only skips the variables that are not
# found. Confirm the intended checkpoint for the selected model.
checkpoint_path = './weights/pretrained/inception_v4.ckpt'
weights_loader = slim.assign_from_checkpoint_fn(
    checkpoint_path,
    variables_to_restore,
    ignore_missing_vars=True)


In [None]:
# Saver for the graph's variables; it is created here but not used in this cell.
saver = tf.train.Saver()

# Open a session and run the checkpoint-assignment function in it.
# NOTE(review): the session is closed when the `with` block ends, so the
# restored values are presumably not available to sessions opened later.
with tf.Session() as sess:
    weights_loader(sess)
    

In [None]:
# make it into parallel framework for gpu
def training_model(input_fn):
    """Build the forward pass for one batch and return its mean loss.

    Args:
        input_fn: Zero-argument callable. Element 0 of its result is fed to
            the model, element 1 is cast to int32 and used as sparse labels.

    Returns:
        Scalar tensor holding the sparse softmax cross-entropy averaged over
        the batch.

    NOTE(review): `model` is not defined in the visible source; it must exist
    at module level (with a `forward` method) before this function is called.
    """
    batch = input_fn()
    images = batch[0]
    targets = tf.cast(batch[1], tf.int32)
    predictions = model.forward(images)
    per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=targets, logits=predictions)
    return tf.reduce_mean(per_example_loss)

def parallel_training(model_fn, dataset, learning_rate=1E-3):
    """Train `model_fn` on `dataset` with one tower per available device.

    Args:
        model_fn: Callable that takes an input function and returns a loss
            tensor (for example `training_model`).
        dataset: A `tf.data.Dataset`; a one-shot iterator is created from it
            and shared by all towers.
        learning_rate: Learning rate of the Adam optimizer. Defaults to 1E-3.
    """
    iterator = dataset.make_one_shot_iterator()

    def input_fn():
        with tf.device(None):
            # remove any device specifications for the input data
            return iterator.get_next()

    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

    update_op, loss = create_parallel_optimization(model_fn,
                                                   input_fn,
                                                   optimizer)

    do_training(update_op, loss)
    


def assign_to_device(device, ps_device):
    """Build a device function that separates variable ops from all other ops.

    Args:
        device: Device returned for every op that is not listed below.
        ps_device: Device returned for variable and hash-table ops. Example
            values are /GPU:0 and /CPU:0.

    Returns:
        A function mapping an op (or its NodeDef) to `ps_device` when its op
        type is one of the listed types, and to `device` otherwise. It is
        meant to be passed to `tf.device`.
    """
    variable_op_types = (
        'Variable', 'VariableV2', 'AutoReloadVariable', 'MutableHashTable',
        'MutableHashTableOfTensors', 'MutableDenseHashTable',
    )

    def _assign(op):
        # Accept both a raw NodeDef and an Operation wrapping one.
        node_def = op.node_def if not isinstance(op, tf.NodeDef) else op
        return ps_device if node_def.op in variable_op_types else device

    return _assign
    
def do_training(update_op, loss):
    """Run the training loop until the input pipeline is exhausted.

    Args:
        update_op: Op that applies one optimization step.
        loss: Loss tensor evaluated together with `update_op`.

    All global variables are (re-)initialized in a fresh session, the loss is
    printed every 10 steps, and the last loss is printed at the end.

    NOTE(review): because of the re-initialization, weights restored in an
    earlier session are presumably not used here -- confirm.
    """
    # Initialized up front so the final report works even when the dataset
    # yields no batch at all.
    loss_value = None
    step = 0
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        try:
            while True:
                _, loss_value = sess.run((update_op, loss))
                if step % 10 == 0:
                    print('Step {} with loss {}'.format(step, loss_value))
                step += 1
        except tf.errors.OutOfRangeError:
            # we're through the dataset
            pass
    if loss_value is None:
        print('No training step was run: the input pipeline produced no batches.')
    else:
        print('Final loss: {}'.format(loss_value))
    
def create_parallel_optimization(model_fn, input_fn, optimizer, controller="/cpu:0"):
    """Build one model tower per device and a single averaged update op.

    Args:
        model_fn: Callable that takes `input_fn` and returns a loss tensor.
        input_fn: Callable returning the next input batch.
        optimizer: Optimizer used to compute and apply the gradients.
        controller: Device that holds the variables and applies the averaged
            gradients.

    Returns:
        A tuple `(apply_gradient_op, avg_loss)`: the op applying the averaged
        gradients and the mean of the per-tower losses.
    """
    # This function is defined below; it returns a list of device ids like
    # `['/gpu:0', '/gpu:1']`. Without any visible GPU, build a single tower on
    # the controller so that there is still something to optimize.
    devices = get_available_gpus() or [controller]

    # This list keeps track of the gradients per tower and the losses
    tower_grads = []
    losses = []

    # Get the current variable scope so we can reuse all variables we need once we get
    # to the second iteration of the loop below
    with tf.variable_scope(tf.get_variable_scope()) as outer_scope:
        for i, device_id in enumerate(devices):
            name = 'tower_{}'.format(i)
            # Use the assign_to_device function to ensure that variables are created on the
            # controller.
            with tf.device(assign_to_device(device_id, controller)), tf.name_scope(name):

                # Compute loss and gradients, but don't apply them yet
                loss = model_fn(input_fn)

                with tf.name_scope("compute_gradients"):
                    # `compute_gradients` returns a list of (gradient, variable) pairs
                    grads = optimizer.compute_gradients(loss)
                    tower_grads.append(grads)

                losses.append(loss)

            # After the first iteration, we want to reuse the variables.
            outer_scope.reuse_variables()

    # Apply the gradients on the controlling device
    with tf.name_scope("apply_gradients"), tf.device(controller):
        # Note that what we are doing here mathematically is equivalent to returning the
        # average loss over the towers and compute the gradients relative to that.
        # Unfortunately, this would place all gradient-computations on one device, which is
        # why we had to compute the gradients above per tower and need to average them here.

        # This function is defined below; it takes the list of (gradient, variable) lists
        # and turns it into a single (gradient, variables) list.
        gradients = average_gradients(tower_grads)
        global_step = tf.train.get_or_create_global_step()
        apply_gradient_op = optimizer.apply_gradients(gradients, global_step)
        avg_loss = tf.reduce_mean(losses)

    return apply_gradient_op, avg_loss

# Source:
# https://stackoverflow.com/questions/38559755/how-to-get-current-available-gpus-in-tensorflow
def get_available_gpus():
    """Return the names of all local devices whose device type is 'GPU'."""
    from tensorflow.python.client import device_lib

    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names


# Source:
# https://github.com/tensorflow/models/blob/master/tutorials/image/cifar10/cifar10_multi_gpu_train.py#L101
def average_gradients(tower_grads):
    """Average the gradients of each shared variable over all towers.

    Args:
        tower_grads: List with one entry per tower; each entry is the list of
            (gradient, variable) pairs returned by `compute_gradients`.

    Returns:
        A list of (averaged_gradient, variable) pairs. Gradients that are
        `None` are left out of the average, and a variable for which no tower
        produced a gradient is omitted from the result.
    """
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # Note that each grad_and_vars looks like the following:
        #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
        # Add a leading 'tower' dimension to every available gradient; None
        # gradients are skipped instead of re-using a previous tensor.
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars if g is not None]
        if not grads:
            # No tower produced a gradient for this variable.
            continue

        # Average over the 'tower' dimension.
        grad = tf.reduce_mean(tf.concat(grads, 0), 0)

        # Keep in mind that the Variables are redundant because they are shared
        # across towers. So .. we will just return the first tower's pointer to
        # the Variable.
        v = grad_and_vars[0][1]
        average_grads.append((grad, v))
    return average_grads