In [1]:
import contextlib
import functools
import gc
import multiprocessing
import os
import shutil
import tarfile
import time
import timeit
import urllib.request

import cv2
import numpy as np
import pandas as pd
import tensorflow as tf
import json

print(tf.__version__)
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
with tf.device("GPU:0"):
  tf.ones(())  # Make sure we can run on GPU

#data_root = "/tmp/demo_images"
#profile_dir = os.path.join(data_root, "profiles")
#os.makedirs(profile_dir, exist_ok=True)

# This ensures that XLA and ptxas work well together, and helps with scaling.
print("XLA_FLAGS='{}'".format(os.getenv("XLA_FLAGS")))

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log
42,application_1587025294218_0040,pyspark,idle,Link,Link


SparkSession available as 'spark'.
2.2.0-rc2
Num GPUs Available:  0
XLA_FLAGS='None'

## Configure task

In [2]:
num_images_per_label = 50000
RESOLUTION = (224, 224)
NUM_CHANNELS = 3
NUM_TOTAL_IMAGES = num_images_per_label * 2

## Model and data setup.

In [3]:
#def get_input_shape():
#  if tf.keras.backend.image_data_format() == "channels_last":
#    return RESOLUTION + (NUM_CHANNELS,)
#  elif tf.keras.backend.image_data_format() == "channels_first":
#    return (NUM_CHANNELS,) + RESOLUTION
#  raise ValueError("Unknown format.")
#
## Native jpg layout.
#NHWC_INPUT_SHAPE = RESOLUTION + (NUM_CHANNELS,)

def get_input_shape():
    # Input image dimensions
    img_rows, img_cols = 28, 28
    return (img_rows, img_cols, 1)


def transform_image(image):
  image = image / 255.0  # Scale occurs in random transformation
  
  image = tf.image.random_flip_left_right(image)
  image = tf.image.random_flip_up_down(image)
  image += tf.random.normal(tf.shape(image), mean=-0.5, stddev=0.1, dtype=image.dtype)
  return image


def make_model(input_dtype=tf.float32, transform_on_device=False):
    
  kernel = 3
  pool = 2
  dropout = 0.5
  num_classes = 10

  input_shape = get_input_shape()

  input_layer = tf.keras.layers.Conv2D(32, kernel_size=(kernel, kernel),
                                       padding='same',
                                       activation='relu',
                                       input_shape=input_shape,
                                       dtype=input_dtype
                                      )

  layer = input_layer
  if transform_on_device:
    layer = tf.keras.layers.Lambda(transform_image)(layer)
    
  model = tf.keras.Sequential()
  model.add(layer)
  model.add(tf.keras.layers.Conv2D(64, (kernel, kernel),  padding='same',activation='relu'))
  model.add(tf.keras.layers.MaxPooling2D(pool_size=(pool, pool)))
  model.add(tf.keras.layers.Dropout(dropout))
  model.add(tf.keras.layers.Flatten())
  model.add(tf.keras.layers.Dense(1024, activation='relu'))        
  model.add(tf.keras.layers.Dropout(dropout))
  model.add(tf.keras.layers.Dense(num_classes))

  return model


## Training loop

But wait, this is much more complicated than the slides...

Indeed. This is because it runs all of the options discussed, and tries to clean up after itsef (Hence the context managers), profiles, and provide accurate steady state measurements.

### Toggles for various optimizations:

In [4]:
@contextlib.contextmanager
def stop_profiler():
  """Used to guarantee that the TensorFlow profiler does not remain on.
  
  We don't want to mix traces from different runs as it would make them hard
  to interpret, so this ensures that the profiler is disabled even if our
  training loop crashes.
  """
  try:
    yield
  finally:
    tf.summary.trace_off()

@contextlib.contextmanager
def use_mixed_precision(loss_scale):
  """Enable mixed precision, and reset the policy after training."""
  old_policy = tf.keras.mixed_precision.experimental.global_policy()

  try:
    policy = tf.compat.v2.keras.mixed_precision.experimental.Policy(
        "mixed_float16", loss_scale=loss_scale)
    tf.keras.mixed_precision.experimental.set_policy(policy)
    yield
  finally:
    tf.keras.mixed_precision.experimental.set_policy(old_policy)


@contextlib.contextmanager
def enable_xla():
  """Enable XLA, and disable it after training is complete."""
  try:
    tf.config.optimizer.set_jit(True)
    yield
  finally:
    tf.config.optimizer.set_jit(False)


_THREADS_PER_GPU = 2
@contextlib.contextmanager
def tuning_context():
  """Hand tuned model configurations.
  
  Historically these knobs have improved performance, but as of 10/28/2019 they
  actually hurt performance. However they are provided simply for completeness
  to show some of the lower level knobs.
  """
  try:
    os.environ['TF_GPU_THREAD_MODE'] = "gpu_private"
    os.environ['TF_GPU_THREAD_COUNT'] = str(_THREADS_PER_GPU)
    tf.keras.backend.set_image_data_format("channels_first")
    yield
  finally:
    os.environ.pop('TF_GPU_THREAD_MODE', None)
    os.environ.pop('TF_GPU_THREAD_COUNT', None)
    tf.keras.backend.set_image_data_format("channels_last")

In [5]:
@contextlib.contextmanager
def null_context():
  """Implementation detail. Used if a given toggle is disabled."""
  yield

def null_decorator(f):
  """Implementation detail. Used if tf.function is disabled."""
  return f

def train_model(data_fn, global_batch_size, use_tf_function=False, 
                strategy=None, xla=False, mixed_precision=False, 
                loss_scale="dynamic", collect_profile=False, tuned=False, 
                transform_on_device=False):

  # Ensure runs are independent.
  tf.keras.backend.clear_session()
  gc.collect()
  time.sleep(3)

  if tuned:
    assert strategy, "Tuned version assumes a distribuation strategy is present."

  dtype = tf.float16 if mixed_precision else tf.float32
  step_decorator = tf.function if use_tf_function and not strategy else null_decorator
  strategy_scope = strategy.scope() if strategy else null_context()
  xla_scope = enable_xla() if xla else null_context()
  precision_scope = use_mixed_precision(loss_scale) if mixed_precision else null_context()
  tuning_scope = tuning_context() if tuned else null_context()

  with strategy_scope, xla_scope, precision_scope, stop_profiler(), tuning_scope:
    model = make_model(input_dtype=dtype, transform_on_device=transform_on_device)
    
    optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)
    if mixed_precision:
      optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, loss_scale=loss_scale)

#    model.compile(
#            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
#            optimizer=tf.keras.optimizers.Adam(0.001),
#            metrics=['accuracy'],
#        )
    
    @step_decorator
    def replica_step(features, labels):
      with tf.GradientTape() as tape:
        logits = model(features, training=True)
#        replica_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels, logits)
        replica_loss = tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True, axis=-1)
        loss = tf.nn.compute_average_loss(replica_loss, global_batch_size=global_batch_size)
      grads = tape.gradient(loss, model.trainable_variables)
      optimizer.apply_gradients(zip(grads, model.trainable_variables))
      return loss

    step_fn = replica_step
    if strategy and use_tf_function:
      @tf.function
      def replicated_step(features, labels):
        loss = strategy.experimental_run_v2(replica_step, (features, labels))
        return loss

      step_fn = replicated_step

    if strategy:
      dataset = data_fn(batch_size=global_batch_size, dtype=dtype, 
                        transform_on_device=transform_on_device)
      if tuned:
        options = tf.data.Options()
        private_threads = (multiprocessing.cpu_count() - 
                           strategy.num_replicas_in_sync * (_THREADS_PER_GPU + 1))
        options.experimental_threading.private_threadpool_size = private_threads
        dataset = dataset.with_options(options)
      data = strategy.experimental_distribute_dataset(dataset)
    else:
      assert not transform_on_device
      data = data_fn(batch_size=global_batch_size, dtype=dtype)

    schedule = [
        5,                             # Burn in
        5 if collect_profile else 0,   # Profiling
        30,                            # Steady state throughput
    ]
    times = []
    
    for step_number, inputs in enumerate(data):
      loss = step_fn(*inputs)

      # Burn in
      if schedule[0]:
        schedule[0] -= 1
        if not schedule[0]:
          print("Burn in complete.")
          if schedule[1]:
            time.sleep(2)  # Let running ops finish to start from a clean trace.
            tf.summary.trace_on(profiler=True)
          else:
            # Skip straight to steady state throughput
            start_time = timeit.default_timer()
            iter_count = 0
        continue

      # Op profiler
      if schedule[1]:
        schedule[1] -= 1
        if not schedule[1]:
          tf.summary.trace_export(name="my_trace", profiler_outdir=profile_dir)
          start_time = timeit.default_timer()
          iter_count = 0
        continue

      # Profile steady state execution
      schedule[2] -= 1
      iter_count += 1
      times.append(timeit.default_timer())
      if not schedule[2]:
        break
        
    run_time = timeit.default_timer() - start_time
    step_time = run_time / iter_count
    # print(np.array(times[1:]) - np.array(times[:-1]))
    data = {}
    data[global_batch_size] = {"batch_size": global_batch_size, "steps": iter_count,"Mean_step_time_sec": step_time, "Images_per_sec": int(global_batch_size / step_time)}
    json_data = json.dumps(data)
    with open("stats.json", "w") as f:
        json.dump(data, f)
        
    print("{} steps".format(iter_count))
    print("Mean step time: {:>6.2f} sec".format(step_time))
    print("Images / sec:   {:>6d}".format(int(global_batch_size / step_time)))


## First pass at a training function.

### Define a generator based data pipeline

In [6]:
def random_flip(image):
  hflip = np.random.random() > 0.5
  vflip = np.random.random() > 0.5
  if hflip and vflip:
    image = cv2.flip(image, -1)
  elif hflip:
    image = cv2.flip(image, -1)
  elif vflip:
    image = cv2.flip(image, 1)
  return image

def normalize_and_add_noise(image):
  image = image.astype(np.float32) / 255 - 0.5
  image += np.random.normal(loc=0, scale=0.1, size=image.shape)
  return image

def make_batch(features, labels):
  x = tf.convert_to_tensor(np.stack(features, axis=0))
  y = tf.convert_to_tensor(np.array(labels, dtype=np.float32)[:, np.newaxis])
  features.clear()
  labels.clear()
  return x, y

def data_generator(batch_size, **kwargs):
  epoch_order = np.random.permutation(get_paths_and_labels())
  features, labels = [], []
  for image_path, label in epoch_order:
    image = cv2.imread(image_path)

    # Resize to training resolution
    image = cv2.resize(image, RESOLUTION)

    # Randomly horizontal and vertical flip
    image = random_flip(image)

    # Normalize, center, and add Gaussian noise
    image = normalize_and_add_noise(image)
    
    features.append(image)
    labels.append(label)
    if len(features) == batch_size:
      yield make_batch(features, labels)

In [7]:
#for batch_size in [32, 64, 128]:
#  print("Batch size: {}".format(batch_size))
#  train_model(data_generator, batch_size)
#  print()

## Add tf.data

### Use the images directly

In [8]:
#def make_jpg_dataset(batch_size, dtype=tf.float32, transform_on_device=False, 
#                     already_resized=False):
#  if already_resized:
#    raise NotImplementedError(
#        "`already_resized` is only implemented for the TFRecords path.")
#
#  def get_bytes_and_label(filepath):
#    image_bytes = tf.io.read_file(filepath)
#    label = tf.strings.regex_full_match(filepath, pos_dir + ".+")
#    return image_bytes, tf.expand_dims(label, 0)
#
#  def process_image(image_bytes, label):
#    image = tf.io.decode_jpeg(image_bytes)
#    image = tf.cast(image, dtype)
#    image = tf.image.resize(image, RESOLUTION)
#
#    if tf.shape(image)[2] == 1:
#      # Some images are greyscale.
#      image = tf.tile(image, (1, 1, 3))
#
#    image.set_shape(NHWC_INPUT_SHAPE)
#
#    if not transform_on_device:
#      image = transform_image(image)
#
#    if tf.keras.backend.image_data_format() == "channels_first":
#      image = tf.transpose(image, (2, 0, 1))
#    
#    return image, tf.cast(label, dtype)
#
#  dataset = tf.data.Dataset.list_files([pos_dir + "/*", neg_dir + "/*"])
#  dataset = dataset.shuffle(NUM_TOTAL_IMAGES)
#  dataset = dataset.map(get_bytes_and_label, num_parallel_calls=tf.data.experimental.AUTOTUNE)
#  dataset = dataset.map(process_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
#  dataset = dataset.batch(batch_size, drop_remainder=True)
#  return dataset.prefetch(tf.data.experimental.AUTOTUNE)

### Use TFRecords

In [9]:
def make_dataset(batch_size, dtype=tf.float32, transform_on_device=False, 
                 already_resized=True):

  # put already_resized false if you would like to measure reading jpg files directly

#  def parse_fn(record):
#    RECORD_SCHEMA = {
#     "image": tf.io.FixedLenFeature([], dtype=tf.string),
#     "label": tf.io.FixedLenFeature([1], dtype=tf.int64)
#    }
#    record = tf.io.parse_single_example(record, RECORD_SCHEMA)
#    image = tf.io.decode_jpeg(record["image"])
#    image = tf.cast(image, dtype)
#    if not already_resized:
#      image = tf.image.resize(image, RESOLUTION)
#
#    if tf.shape(image)[2] == 1:
#      # Some images are greyscale.
#      image = tf.tile(image, (1, 1, 3))
#
#    image.set_shape(NHWC_INPUT_SHAPE)
#    
#    if not transform_on_device:
#      image = transform_image(image)
#
#    if tf.keras.backend.image_data_format() == "channels_first":
#      image = tf.transpose(image, (2, 0, 1))
#    
#    return image, tf.cast(record["label"], dtype)

    
  def parse_fn(record):
    img_rows, img_cols = 28, 28
    RECORD_SCHEMA = {
        'image_raw': tf.io.FixedLenFeature([img_rows * img_cols], tf.float32),
        'label': tf.io.FixedLenFeature([], tf.int64)
    }
    record = tf.io.parse_single_example(record, RECORD_SCHEMA)
    image = record['image_raw']
    label = record['label']   
    return image, label

  def _reshape_img(image, label):
      image = tf.reshape(image, [28, 28, 1])
      # label = tf.one_hot(label, num_classes)
      return image, label


  from hops import hdfs
  import pydoop.hdfs as pydoop

#  data_dir = hdfs.project_path()
#  filenames = pydoop.path.abspath(data_dir + "TourData/tfrecord_data/")
#  pattern = tf.io.gfile.glob(filenames + "*.tfrecords")

  data_dir = hdfs.project_path()
  filenames = pydoop.path.abspath(data_dir + "TourData/mnist/train/df-mnist_train.tfrecord")
  pattern = tf.io.gfile.glob(filenames + "/part-r-*")

  dataset = tf.data.Dataset.list_files(pattern)
  dataset = dataset.interleave(tf.data.TFRecordDataset, num_parallel_calls=tf.data.experimental.AUTOTUNE)
  dataset = dataset.shuffle(4 * batch_size)
  dataset = dataset.map(parse_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
  dataset = dataset.map(
        _reshape_img, num_parallel_calls=tf.data.experimental.AUTOTUNE)
  dataset = dataset.batch(batch_size, drop_remainder=True)
  return dataset.prefetch(tf.data.experimental.AUTOTUNE)

### *already_resized* and *transform_on_device*

Even with maximum parallelization, the CPU can only produce a bit over 3000 examples per second. This is fine for 1 GPU training since the GPU maxes out in the low 2000's, but would prevent reasonable scaling to more GPUs. This is due to two principle bottlenecks:

#### Native image size

The downloaded thumbnails tend to be around 400x600 resolution, whereas we're training at 224x224. This means that we have to move approximately 6x as many bytes into memory, spend a correspondingly long time decoding the jpg's, and incur an extra memcpy for the resize. It turns out to be quite important to resize the images to 224x224 and use those resized images in the input pipeline.

#### Random augmentation

The CPU simply cannot add noise to the images quickly enough to keep up with the GPU, so to maintain performance for the multi-GPU case we have to move those transformations from the input pipeline to the start of the model. Even though that puts them on the critical path, the GPU can process the augmentation so quickly that it isn't an issue.

In [10]:
def measure_dataset_throughput(dataset_fn, label=""):
  count = 0
  bluh = 0
  batch_size = 32
#  for i, _ in enumerate(dataset_fn(batch_size=batch_size).take(50)):
  for i, _ in enumerate(dataset_fn(batch_size=batch_size)):
    bluh += 1    
    if i == 3:
      st = timeit.default_timer()
    if i > 3:
      count += 1
  step_time = (timeit.default_timer() - st) / count
  print(step_time, (timeit.default_timer() - st), count, bluh, _[0].shape)  
  print("{:<45}  {:>6.0f} Images / sec".format(label, batch_size / step_time // 100 * 100))


def make_synthetic_dataset(batch_size, dtype=tf.float32, **kwargs):
  num_images_per_label  = 50000
  dataset = tf.data.Dataset.range(2 * num_images_per_label)
  def map_fn(_):
    x = tf.zeros(shape=get_input_shape(), dtype=dtype)
    y = tf.zeros(shape=(1,), dtype=dtype)
    return x, y
  dataset = dataset.map(map_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
  dataset = dataset.batch(batch_size, drop_remainder=True)
  return dataset.prefetch(tf.data.experimental.AUTOTUNE)


#measure_dataset_throughput(
#    functools.partial(make_jpg_dataset, dtype=tf.float16),
#    "Use JPEGs directly")

measure_dataset_throughput(
    functools.partial(make_dataset, dtype=tf.float16),
    "Use TFRecords")

measure_dataset_throughput(
    functools.partial(make_dataset, dtype=tf.float16, transform_on_device=True, 
                      already_resized=True),
    "Use TFRecords with scaling optimizations")

# Use synthetic data to ensure model is not input bound.
measure_dataset_throughput(
    functools.partial(make_synthetic_dataset, dtype=tf.float16),
    "Synthetic data.")


0.0158557931850241 4.883587235934101 308 312 (32, 28, 28, 1)
Use TFRecords                                    2000 Images / sec
0.012512271983818114 3.853782159043476 308 312 (32, 28, 28, 1)
Use TFRecords with scaling optimizations         2500 Images / sec
0.00542479583306327 16.93079052399844 3121 3125 (32, 28, 28, 1)
Synthetic data.                                  5800 Images / sec

In [11]:
def debugger_example():
    from hops import hdfs
    from hops import tensorboard    
    # How you easily get the TensorBoard logdir for summaries
    tensorboard_logdir = tensorboard.logdir()
    
    for batch_size in [32, 64, 128]:
      print("Batch size: {}".format(batch_size))
      train_model(make_dataset, batch_size)
      hdfs.copy_to_hdfs("stats.json", "Logs/tf_performance/no_extra_opt-{}.json".format(batch_size), overwrite=True)  
      print()
    

from hops import experiment
experiment.launch(debugger_example, name='tensorboard debugger', local_logdir=True)

Finished Experiment 

('hdfs://10.128.0.3:8020/Projects/demo_deep_learning_admin000/Experiments/application_1587025294218_0040_1', {'log': 'Experiments/application_1587025294218_0040_1/output.log'})

## Add tf.function

In [12]:
def debugger_example():

    from hops import hdfs
    from hops import tensorboard    
    # How you easily get the TensorBoard logdir for summaries
    tensorboard_logdir = tensorboard.logdir()
    
    for batch_size in [32, 64, 128]:
      print("Batch size: {}".format(batch_size))
      train_model(make_dataset, batch_size, use_tf_function=True)
      hdfs.copy_to_hdfs("stats.json", "Logs/tf_performance/tf_function_{}.json".format(batch_size), overwrite=True)  
      print()

from hops import experiment
experiment.launch(debugger_example, name='tensorboard debugger', local_logdir=True)


Finished Experiment 

('hdfs://10.128.0.3:8020/Projects/demo_deep_learning_admin000/Experiments/application_1587025294218_0040_2', {'log': 'Experiments/application_1587025294218_0040_2/output.log'})

## Add XLA

In [13]:
def debugger_example():

    from hops import hdfs
    from hops import tensorboard    
    # How you easily get the TensorBoard logdir for summaries
    tensorboard_logdir = tensorboard.logdir()
    
    for batch_size in [32 , 64, 128]:
      print("Batch size: {}".format(batch_size))
      train_model(make_dataset, batch_size, use_tf_function=True, xla=True)
      hdfs.copy_to_hdfs("stats.json", "Logs/tf_performance/tf_function_xla_{}.json".format(batch_size), overwrite=True)  
      print()

from hops import experiment
experiment.launch(debugger_example, name='tensorboard debugger', local_logdir=True)

Finished Experiment 

('hdfs://10.128.0.3:8020/Projects/demo_deep_learning_admin000/Experiments/application_1587025294218_0040_3', {'log': 'Experiments/application_1587025294218_0040_3/output.log'})

## Add mixed precision

In [14]:
def debugger_example():
     
    from hops import hdfs
    from hops import tensorboard    
    # How you easily get the TensorBoard logdir for summaries
    tensorboard_logdir = tensorboard.logdir()
    
    
    for batch_size in [32, 64, 128, 256]:
      print("Batch size: {}".format(batch_size))
      train_model(make_dataset, batch_size, use_tf_function=True, xla=True, mixed_precision=True)
      hdfs.copy_to_hdfs("stats.json", "Logs/tf_performance/tf_function_xla_mixprec_{}.json".format(batch_size), overwrite=True)  
      print()

from hops import experiment
experiment.launch(debugger_example, name='tensorboard debugger', local_logdir=True)

Finished Experiment 

('hdfs://10.128.0.3:8020/Projects/demo_deep_learning_admin000/Experiments/application_1587025294218_0040_4', {'log': 'Experiments/application_1587025294218_0040_4/output.log'})

In [18]:
import json
from hops import hdfs
import pydoop.hdfs as pydoop

import json
from hops import hdfs
import pydoop.hdfs as pydoop

data_dir = hdfs.project_path()
final_stats = {}
for i in pydoop.hdfs().list_directory(data_dir + "Logs/tf_performance/"):
    path_list = i['name'].split("/")
    method = path_list[len(path_list)-1].split(".")[0]
    method = method.split("-")[0]
    for z in pydoop.hdfs().list_directory(i['name']):
        stats = json.loads(hdfs.load(z['name']))
        stats[method] = stats.pop(list(stats.keys())[0])
        final_stats.update(stats)
        
#print(final_stats)        
pd.options.display.max_columns = None
dataframe = pd.DataFrame.from_dict(final_stats, orient="index").sort_values(by=['batch_size'])        
dataframe        

                             batch_size  steps  Mean_step_time_sec  \
tf_function_32                       32     30            0.023410   
tf_function_xla_32                   32     30            0.002274   
tf_function_xla_mixprec_32           32     30            0.027946   
no_extra_opt                         64     30            0.060711   
tf_function_64                       64     30            0.028070   
tf_function_xla_64                   64     30            0.024084   
tf_function_xla_mixprec_64           64     30            0.033215   
tf_function_128                     128     30            0.034588   
tf_function_xla_128                 128     30            0.055679   
tf_function_xla_mixprec_128         128     30            0.035541   
tf_function_xla_mixprec_256         256     30            0.123290   

                             Images_per_sec  
tf_function_32                         1366  
tf_function_xla_32                    14069  
tf_function_xla_mixpr