In [None]:
# config
import os
PROJECT_ID = 'PROJECT_ID'  # change here
BUCKET = PROJECT_ID + '_accelerator_demo'
REGION = 'us-central1'
os.environ["PROJECT_ID"] = PROJECT_ID
os.environ["BUCKET"] = BUCKET

!gcloud config set project {PROJECT_ID}
!gsutil mb gs://{BUCKET}
!gcloud config set compute/region {REGION}

In [None]:
%%bash
# Create the training package and config directories.
# -p makes the cell safe to re-run when the directories already exist.
mkdir -p train config
# Turn `train` into a Python package so it can be passed to --package-path.
echo "" > train/__init__.py

In [None]:
%%writefile train/model_definition.py
import tensorflow as tf

def create_model(input_shape):
    """Build the (uncompiled) CNN classifier.

    The network is three identical convolutional blocks with 64, 128 and 256
    filters (BatchNormalization -> 5x5 Conv2D with ELU -> 2x2 max-pooling ->
    Dropout(0.25)), followed by a 256-unit ELU dense layer, Dropout(0.5) and
    a 10-way softmax output.

    Args:
        input_shape: Shape of a single input sample, without the batch
            dimension. It is forwarded to the first layer only.

    Returns:
        A `tf.keras.models.Sequential` model. The caller is responsible for
        compiling it.
    """
    layers = tf.keras.layers
    model = tf.keras.models.Sequential()

    for block_index, filters in enumerate((64, 128, 256)):
        if block_index == 0:
            # Only the first layer of a Sequential model consumes
            # `input_shape`; later layers infer their input from the
            # previous layer's output, so it is not passed to them.
            model.add(layers.BatchNormalization(input_shape=input_shape))
        else:
            model.add(layers.BatchNormalization())
        model.add(layers.Conv2D(filters, (5, 5), padding='same', activation='elu'))
        # strides defaults to pool_size; it is spelled out so that every
        # block is declared the same way.
        model.add(layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
        model.add(layers.Dropout(0.25))

    model.add(layers.Flatten())
    model.add(layers.Dense(256))
    model.add(layers.Activation('elu'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(10))
    model.add(layers.Activation('softmax'))
    return model

# Train on single Processor

In [None]:
%%writefile train/train_single_cpu_gpu.py
import os
import time
import tensorflow as tf
import numpy as np
from . import model_definition

# Load the Fashion-MNIST train/test splits that ship with Keras.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()

# Append a trailing (colour) axis of size 1 to both image arrays.
x_train, x_test = (np.expand_dims(images, -1) for images in (x_train, x_test))

# Build the model, then compile it with Adam and sparse categorical cross-entropy.
NUM_EPOCHS = 17
model = model_definition.create_model(input_shape=x_train.shape[1:])
adam = tf.keras.optimizers.Adam(learning_rate=1e-3)
model.compile(
    optimizer=adam,
    loss='sparse_categorical_crossentropy',
    metrics=['sparse_categorical_accuracy'],
)

# The timer starts before the float32 casts, so they are part of the
# reported training time.
start = time.time()
train_data = (x_train.astype(np.float32), y_train.astype(np.float32))
eval_data = (x_test.astype(np.float32), y_test.astype(np.float32))
model.fit(
    *train_data,
    epochs=NUM_EPOCHS,
    steps_per_epoch=60,
    validation_data=eval_data,
    # Same value as `epochs`: validation runs once, after the last epoch.
    validation_freq=NUM_EPOCHS,
)
elapsed = time.time() - start
print("Training time with single GPUs: {}".format(elapsed))

model.save_weights('./fashion_mnist_single.h5', overwrite=True)

## Submit single worker with CPU

TODO 1: set module name defined above

In [None]:
%%bash

# Submit the single-worker CPU training job with a timestamped job name.
now=$(date +"%Y%m%d_%H%M%S")
JOB_NAME="single_cpu_fashion_minst_$now"

# TODO 1: set MODULE_NAME to the training module defined above.
# The placeholder lives in a variable because a comment placed after a
# trailing backslash breaks the line continuation of the gcloud command.
MODULE_NAME=""

# Stop with a clear message while the placeholder is still empty.
: "${MODULE_NAME:?TODO 1: set MODULE_NAME before submitting the job}"

gcloud ai-platform jobs submit training "$JOB_NAME" \
  --staging-bucket="gs://$BUCKET" \
  --package-path=train \
  --module-name="$MODULE_NAME" \
  --runtime-version=2.1 \
  --python-version=3.7 \
  --region=us-central1

## Submit single worker with single GPU

TODO 2: set module name defined above

TODO 3: set `scale-tier` to the predefined single-GPU tier. Refer to [this page](https://cloud.google.com/ai-platform/training/docs/machine-types#scale_tiers)

In [None]:
%%bash

# Submit the single-worker, single-GPU training job with a timestamped job name.
now=$(date +"%Y%m%d_%H%M%S")
JOB_NAME="single_gpu_fashion_minst_$now"

# TODO 2: set MODULE_NAME to the training module defined above.
# TODO 3: set SCALE_TIER to the predefined single-GPU scale tier.
# The placeholders live in variables because a comment placed after a
# trailing backslash breaks the line continuation of the gcloud command.
MODULE_NAME=""
SCALE_TIER=""

# Stop with a clear message while a placeholder is still empty.
: "${MODULE_NAME:?TODO 2: set MODULE_NAME before submitting the job}"
: "${SCALE_TIER:?TODO 3: set SCALE_TIER before submitting the job}"

gcloud ai-platform jobs submit training "$JOB_NAME" \
  --staging-bucket="gs://$BUCKET" \
  --package-path=train \
  --module-name="$MODULE_NAME" \
  --runtime-version=2.1 \
  --python-version=3.7 \
  --region=us-central1 \
  --scale-tier="$SCALE_TIER"

# Train on multiple GPUs

TODO 4: Define the distribution strategy for multiple GPUs on a single machine. Refer to [this page](https://www.tensorflow.org/guide/distributed_training)

TODO 5: Define scope of distribution strategy.

TODO 6: Specify 4 NVIDIA Tesla K80 GPUs in the config YAML file. Refer to [this page](https://cloud.google.com/ai-platform/training/docs/using-gpus)

TODO 7: Specify the module in the training submit command

TODO 8: Specify the config file in the training submit command

In [None]:
%%writefile train/train_mult_cpu_gpu.py
import os
import os
import time
import tensorflow as tf
import numpy as np
from . import model_definition

# Exercise template: train the CNN on several GPUs of one machine.
# The statements marked TODO 4 and TODO 5 are intentionally incomplete; the
# script is not valid Python until they are filled in.

#Get data

(x_train, y_train), (x_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()

# add empty color dimension
x_train = np.expand_dims(x_train, -1)
x_test = np.expand_dims(x_test, -1)

##################### Multiple GPUs or CPUs ###################
strategy = #TODO 4

with # TODO 5:
###############################################################
    # The model is created and compiled inside the `with` block.
    model = model_definition.create_model(input_shape=x_train.shape[1:])
    model.compile(
      optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3, ),
      loss='sparse_categorical_crossentropy',
      metrics=['sparse_categorical_accuracy'])
# Timing starts after compile; fit() itself is called outside the `with` block.
start = time.time()
model.fit(
    x_train.astype(np.float32), y_train.astype(np.float32),
    epochs=17,
    steps_per_epoch=60,
    validation_data=(x_test.astype(np.float32), y_test.astype(np.float32)),
    # Same value as `epochs`: validation runs once, after the last epoch.
    validation_freq=17
)
print("Training time with multiple GPUs: {}".format(time.time() - start))
# Weights are written to the current working directory of the process.
model.save_weights('./fashion_mnist_mult_gpu.h5', overwrite=True)

In [None]:
%%writefile config/multi_gpu_config.yaml
# Training job configuration for the single-machine, multi-GPU run.
trainingInput:
  scaleTier: CUSTOM
  # Configure a master worker with 4 NVIDIA Tesla K80 GPUs
  masterType: n1-highcpu-16
  masterConfig:
    acceleratorConfig:
      count: # TODO 6: number of GPUs attached to the master
      type: # TODO 6: accelerator type of those GPUs

In [None]:
%%bash

# Submit the single-machine, multi-GPU training job with a timestamped job name.
now=$(date +"%Y%m%d_%H%M%S")
JOB_NAME="multi_gpu_fashion_minst_$now"

# TODO 7: set MODULE_NAME to the multi-GPU training module defined above.
# TODO 8: set CONFIG_FILE to the multi-GPU config YAML file written above.
# The placeholders live in variables because a comment placed after a
# trailing backslash breaks the line continuation of the gcloud command.
MODULE_NAME=""
CONFIG_FILE=""

# Stop with a clear message while a placeholder is still empty.
: "${MODULE_NAME:?TODO 7: set MODULE_NAME before submitting the job}"
: "${CONFIG_FILE:?TODO 8: set CONFIG_FILE before submitting the job}"

gcloud ai-platform jobs submit training "$JOB_NAME" \
  --staging-bucket="gs://$BUCKET" \
  --package-path=train \
  --module-name="$MODULE_NAME" \
  --runtime-version=2.1 \
  --python-version=3.7 \
  --region=us-central1 \
  --config="$CONFIG_FILE"

# Example training on TPU

TODO 9: Write the code for TPU training by referring to [this page](https://www.tensorflow.org/guide/tpu). There is no need to pass any arguments to the `TPUClusterResolver` class.

In [None]:
%%writefile train/train_tpu.py
import os
import os
import time
import tensorflow as tf
import numpy as np
from . import model_definition

# Exercise template: train the CNN on TPUs.
# TODO 9 is intentionally left open; the script fails at `strategy.scope()`
# until that section defines `strategy`.

#Get data

(x_train, y_train), (x_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()

# add empty color dimension
x_train = np.expand_dims(x_train, -1)
x_test = np.expand_dims(x_test, -1)

##################### Run on TPUs ###################
# TODO 9
# NOTE: the code written for TODO 9 must bind the name `strategy`, which the
# `with` statement below uses.

with strategy.scope():
###############################################################
    # The model is created and compiled inside the strategy scope.
    model = model_definition.create_model(input_shape=x_train.shape[1:])
    model.compile(
      optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3, ),
      loss='sparse_categorical_crossentropy',
      metrics=['sparse_categorical_accuracy'])
# Timing starts after compile; fit() itself is called outside the scope block.
start = time.time()
model.fit(
    x_train.astype(np.float32), y_train.astype(np.float32),
    epochs=17,
    steps_per_epoch=60,
    validation_data=(x_test.astype(np.float32), y_test.astype(np.float32)),
    # Same value as `epochs`: validation runs once, after the last epoch.
    validation_freq=17
)
print("Training time with TPUs: {}".format(time.time() - start))
# Weights are written to the current working directory of the process.
model.save_weights('./fashion_mnist_tpu.h5', overwrite=True)

## Submit single worker with TPUv2 Pod (8 cores)

TODO 10: set module name defined above

TODO 11: set `scale-tier` to the predefined single-TPU tier. Refer to [this page](https://cloud.google.com/ai-platform/training/docs/machine-types#scale_tiers)

In [None]:
%%bash

# Submit the TPU training job with a timestamped job name.
now=$(date +"%Y%m%d_%H%M%S")
JOB_NAME="tpu_fashion_minst_$now"

# TODO 10: set MODULE_NAME to the TPU training module defined above.
# TODO 11: set SCALE_TIER to the predefined single-TPU scale tier.
# The placeholders live in variables because a comment placed after a
# trailing backslash breaks the line continuation of the gcloud command.
MODULE_NAME=""
SCALE_TIER=""

# Stop with a clear message while a placeholder is still empty.
: "${MODULE_NAME:?TODO 10: set MODULE_NAME before submitting the job}"
: "${SCALE_TIER:?TODO 11: set SCALE_TIER before submitting the job}"

gcloud ai-platform jobs submit training "$JOB_NAME" \
  --staging-bucket="gs://$BUCKET" \
  --package-path=train \
  --module-name="$MODULE_NAME" \
  --runtime-version=2.1 \
  --python-version=3.7 \
  --scale-tier="$SCALE_TIER" \
  --region=us-central1

## Submit single worker TPUv3 (8 cores)

TODO 12: Specify 8 TPU V3 as the accelerator by referring to [this page](https://cloud.google.com/ai-platform/training/docs/reference/rest/v1/AcceleratorType)

TODO 13: Specify the module and config file in training submit command

In [None]:
%%writefile config/tpuv3_config.yaml
# Training job configuration for the TPU v3 run: an n1-highcpu-16 master
# plus a single `cloud_tpu` worker whose accelerator is set in TODO 12.
trainingInput:
  scaleTier: CUSTOM
  masterType: n1-highcpu-16
  workerType: cloud_tpu
  workerCount: 1
  workerConfig:
    acceleratorConfig:
      type: # TODO 12: accelerator type of the TPU worker
      count: # TODO 12: accelerator count of the TPU worker

In [None]:
%%bash

# Submit the TPU v3 training job with a timestamped job name.
now=$(date +"%Y%m%d_%H%M%S")
JOB_NAME="tpu_v3_fashion_minst_$now"

# TODO 13: set MODULE_NAME to the TPU training module defined above and
#          CONFIG_FILE to the TPU v3 config YAML file written above.
# The placeholders live in variables because a comment placed after a
# trailing backslash breaks the line continuation of the gcloud command.
MODULE_NAME=""
CONFIG_FILE=""

# Stop with a clear message while a placeholder is still empty.
: "${MODULE_NAME:?TODO 13: set MODULE_NAME before submitting the job}"
: "${CONFIG_FILE:?TODO 13: set CONFIG_FILE before submitting the job}"

gcloud ai-platform jobs submit training "$JOB_NAME" \
  --staging-bucket="gs://$BUCKET" \
  --package-path=train \
  --module-name="$MODULE_NAME" \
  --runtime-version=2.1 \
  --python-version=3.7 \
  --region=us-central1 \
  --config="$CONFIG_FILE"

# Train on multiple devices with GPUs

TODO 14: Specify the distribution strategy for multiple-device GPU training

TODO 15: Specify master node with 4 NVIDIA TESLA K80, 3 worker node with 4 NVIDIA TESLA K80.

TODO 16: Specify the module and config file in training submit command

In [None]:
%%writefile train/train_mult_worker_mirrored.py
import os
import os
import time
import tensorflow as tf
import numpy as np
from . import model_definition

# Exercise template: train the CNN on multiple workers with GPUs.
# The assignment marked TODO 14 is intentionally incomplete; the script is
# not valid Python until it is filled in.

#Get data

(x_train, y_train), (x_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()

# add empty color dimension
x_train = np.expand_dims(x_train, -1)
x_test = np.expand_dims(x_test, -1)

##################### Run on multiple workers with GPU ###################
strategy = # TODO 14

with strategy.scope():
###############################################################
    # The model is created and compiled inside the strategy scope.
    model = model_definition.create_model(input_shape=x_train.shape[1:])
    model.compile(
      optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3, ),
      loss='sparse_categorical_crossentropy',
      metrics=['sparse_categorical_accuracy'])
# Timing starts after compile; fit() itself is called outside the scope block.
start = time.time()
model.fit(
    x_train.astype(np.float32), y_train.astype(np.float32),
    epochs=17,
    steps_per_epoch=60,
    validation_data=(x_test.astype(np.float32), y_test.astype(np.float32)),
    # Same value as `epochs`: validation runs once, after the last epoch.
    validation_freq=17
)
print("Training time with multiple GPUs: {}".format(time.time() - start))
# Weights are written to the current working directory of the process.
model.save_weights('./fashion_mnist_mult_mirrored.h5', overwrite=True)

In [None]:
%%writefile config/multi_worker_gpu.yaml
# Training job configuration for the multi-worker GPU run: one master plus
# additional workers, all on n1-highcpu-16 machines.
trainingInput:
  scaleTier: CUSTOM
  # Configure a master worker with 4 NVIDIA Tesla K80 GPUs
  masterType: n1-highcpu-16
  masterConfig:
    acceleratorConfig:
      count: # TODO 15: number of GPUs attached to the master
      type: # TODO 15: accelerator type of the master's GPUs
  # Configure 3 workers, each with 4 NVIDIA Tesla K80 GPUs
  workerCount: # TODO 15: number of worker nodes
  workerType: n1-highcpu-16
  workerConfig:
    acceleratorConfig:
      count: # TODO 15: number of GPUs attached to each worker
      type: # TODO 15: accelerator type of the workers' GPUs

In [None]:
%%bash

# Submit the multi-worker GPU training job with a timestamped job name.
now=$(date +"%Y%m%d_%H%M%S")
JOB_NAME="multi_worker_gpu_fashion_minst_$now"

# TODO 16: set MODULE_NAME to the multi-worker training module defined above
#          and CONFIG_FILE to the multi-worker config YAML file written above.
# The placeholders live in variables because a comment placed after a
# trailing backslash breaks the line continuation of the gcloud command.
MODULE_NAME=""
CONFIG_FILE=""

# Stop with a clear message while a placeholder is still empty.
: "${MODULE_NAME:?TODO 16: set MODULE_NAME before submitting the job}"
: "${CONFIG_FILE:?TODO 16: set CONFIG_FILE before submitting the job}"

gcloud ai-platform jobs submit training "$JOB_NAME" \
  --staging-bucket="gs://$BUCKET" \
  --package-path=train \
  --module-name="$MODULE_NAME" \
  --runtime-version=2.1 \
  --python-version=3.7 \
  --region=us-central1 \
  --config="$CONFIG_FILE"

Copyright 2020 Google Inc. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License