# Examples of Launching a Vertex AI Training Job

Contributor: michaelmenzel@google.com

Disclaimer: This is a code example and is not intended for production use. The author accepts no liability for the use of this code example.

In [None]:
BUCKET='gs://sandbox-michael-menzel-training-europe-west4/trainings/mnist-distributed-vertex'
SERVICE_ACCOUNT='928871478446-compute@developer.gserviceaccount.com'
REGION='europe-west4'

!pip3 install --user --quiet ipywidgets tensorflow tensorflow-datasets pyyaml

## Run locally

In [None]:
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow_datasets as tfds

# Load MNIST as (image, label) pairs together with the dataset metadata.
# The 'train' split is used for fitting and the 'test' split for validation.
mnist_splits, mnist_info = tfds.load(
    "mnist",
    split=['train', 'test'],
    as_supervised=True,
    with_info=True,
    try_gcs=True,
)
train_data, val_data = mnist_splits

@tf.function
def norm_data(image, label):
    """Cast the image to float32 and divide it by 255; the label is returned unchanged."""
    scaled_image = tf.cast(image, tf.float32) / 255.
    return scaled_image, label

def make_batches(dataset, batch_size=128, drop_remainder=False):
    """Normalize, batch, cache and prefetch a dataset of (image, label) pairs.

    Args:
        dataset: tf.data.Dataset yielding (image, label) pairs.
        batch_size: number of examples per batch.
        drop_remainder: whether to discard a final batch smaller than batch_size.

    Returns:
        The transformed tf.data.Dataset.
    """
    return (dataset
            .map(norm_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
            .batch(batch_size, drop_remainder=drop_remainder)
            .cache()
            .prefetch(tf.data.experimental.AUTOTUNE))


# Training keeps the original behaviour of dropping the short final batch.
train_ds = make_batches(train_data, drop_remainder=True)
# Validation must cover every example: with drop_remainder=True the last partial
# batch (16 of MNIST's 10,000 test images at batch size 128) was silently left
# out of the validation and evaluation metrics.
val_ds = make_batches(val_data)

# CNN classifier: three (5x5 conv -> 2x2 max-pool) stages with 64/128/256 filters,
# followed by a 256-unit dense layer and a 10-way softmax output.
model = keras.Sequential()
model.add(keras.layers.Reshape(target_shape=(28, 28, 1), input_shape=(28, 28)))
for n_filters in (64, 128, 256):
    model.add(keras.layers.Conv2D(filters=n_filters, kernel_size=(5, 5),
                                  padding='same', activation='elu'))
    model.add(keras.layers.MaxPool2D(pool_size=(2, 2), strides=(2, 2)))
model.add(keras.layers.Flatten())
model.add(keras.layers.Dense(256, activation='elu'))
model.add(keras.layers.Dense(10, activation='softmax'))

# Optimizer, loss and metric used for this quick local sanity run.
compile_options = {
    'optimizer': 'adam',
    'loss': 'sparse_categorical_crossentropy',
    'metrics': ['sparse_categorical_accuracy'],
}
model.compile(**compile_options)

# One epoch of training, validated against the held-out split after the epoch.
model.fit(x=train_ds, epochs=1, validation_data=val_ds)
model.evaluate(x=val_ds)

# NOTE(review): the path has no file extension; newer Keras releases reportedly
# require a '.keras' or '.h5' suffix here -- confirm against the installed version.
model.save(filepath='my_model')

## Automation Level 1: Submit a training job with Cloud SDK CLI

In [None]:
# Switch into the trainer directory so the relative --config file below resolves
# (presumably the Cloud Build configuration is picked up from here too -- TODO confirm).
%cd ../trainers/mnist-distributed-vertex/

# Submit a Cloud Build with the given substitutions; build logs go to $BUCKET/build/.
# IPython replaces $BUCKET with the Python variable set in the configuration cell.
!gcloud builds submit \
    --substitutions=_TRAINER_NAME=mnist-training,_REGISTRY=eu.gcr.io,_DOCKERFILE=Dockerfile,TAG_NAME=latest \
    --gcs-log-dir=$BUCKET/build/

# Create a Vertex AI custom job in $REGION from train-config-single-manual.yaml.
# $(date +%s) is evaluated by the shell and appends the current Unix time to the display name.
!gcloud ai custom-jobs create --display-name mnist-training-job-$(date +%s) \
    --region=$REGION \
    --config=train-config-single-manual.yaml

# Return to the directory the notebook was in before the first %cd.
%cd -

## Automation Level 2: Scripting!

In [None]:
# Invoke the experiment launch script without any arguments.
# NOTE(review): presumably the script falls back to its own defaults -- verify in launch_experiment.sh.
!../trainers/mnist-distributed-vertex/launch_experiment.sh

## MNIST model benchmark on a single machine

In [None]:
# Launch the "mnist-single" experiment with train-config-single.yaml.
# NOTE(review): flag semantics are defined in launch_experiment.sh (not visible here); presumably
# -e = experiment name, -B = batch size(s), -E = epoch count(s), -c = job config,
# -j = GCS job directory, -r = region, -s = service account. -t is passed only in this
# cell -- confirm what it toggles and whether the other runs should set it too.
!../trainers/mnist-distributed-vertex/launch_experiment.sh -e mnist-single -B 128 -E 10,20 -c train-config-single.yaml -j $BUCKET -r $REGION -t -s $SERVICE_ACCOUNT

## MNIST model benchmark on a single machine with GPUs

In [None]:
# Launch the "mnist-single-gpu" experiment with train-config-single-gpu.yaml and Dockerfile.gpu.
# NOTE(review): presumably -d selects the Dockerfile used for the trainer image and
# -E 20,30 lists the epoch counts to run -- confirm against launch_experiment.sh.
!../trainers/mnist-distributed-vertex/launch_experiment.sh -e mnist-single-gpu -B 128 -E 20,30 -c train-config-single-gpu.yaml -d Dockerfile.gpu -j $BUCKET -r $REGION -s $SERVICE_ACCOUNT

## MNIST model trained on a distributed cluster

In [None]:
# Launch the "mnist-multi" experiment with train-config-cluster.yaml.
# NOTE(review): flag meanings (-e, -B, -E, -c, -j, -r, -s) are defined in launch_experiment.sh,
# which is not visible here -- confirm them there.
!../trainers/mnist-distributed-vertex/launch_experiment.sh -e mnist-multi -B 128 -E 10,20 -c train-config-cluster.yaml -j $BUCKET -r $REGION -s $SERVICE_ACCOUNT

## MNIST model trained on a distributed cluster with GPUs

In [None]:
# Launch the "mnist-multi-gpu" experiment with train-config-cluster-gpu.yaml and Dockerfile.gpu.
# NOTE(review): presumably -E 10,50,100 lists three epoch settings to run and -d selects the
# Dockerfile for the trainer image -- confirm against launch_experiment.sh.
!../trainers/mnist-distributed-vertex/launch_experiment.sh -e mnist-multi-gpu -B 128 -E 10,50,100 -c train-config-cluster-gpu.yaml -d Dockerfile.gpu -j $BUCKET -r $REGION -s $SERVICE_ACCOUNT

## Automation Level 3: Python Client

In [None]:
!pip3 install --user --quiet google-cloud-aiplatform
from google.cloud import aiplatform
from datetime import datetime
import os

# A Unix timestamp keeps the display name and staging folder unique per run.
timestamp = str(int(datetime.now().timestamp()))
job_name = 'mnist-from-python-' + timestamp

aiplatform.init(location=REGION)
custom_job = aiplatform.CustomJob.from_local_script(
        display_name=job_name,
        # GCS URIs always use '/'. os.path.join would insert the OS-specific
        # separator (a backslash on Windows), so the path is joined explicitly.
        staging_bucket=f"{BUCKET.rstrip('/')}/{job_name}",
        script_path='../trainers/mnist-distributed-vertex/trainer/main.py',
        container_uri='europe-docker.pkg.dev/vertex-ai/training/tf-cpu.2-9:latest',
        # Extra pip packages listed for the training script.
        requirements=['cloudml-hypertune', 'tensorflow-datasets'],
        replica_count=1,
        args=['--batch-size', '128', '--num-epochs', '5']
    )

# sync=False submits the job without blocking the notebook until it finishes.
custom_job.run(sync=False)