# Examples of Launching Vertex AI Training Jobs with the Python SDK

Contributor: michaelmenzel@google.com

Disclaimer: This is a code example and is not intended for production use. The author assumes no liability for any use of this code example.

In [None]:
# Install the Vertex AI SDK for Python (quiet output, user-level install).
!pip install --user -q google-cloud-aiplatform

In [None]:
from datetime import datetime
from google.cloud import aiplatform

In [None]:
# Google Cloud project and Cloud Storage staging location used by the jobs
# in this notebook.
PROJECT_ID = 'sandbox-michael-menzel'
STAGING_BUCKET = 'gs://sandbox-michael-menzel-training-europe-west4/trainings/mnist-distributed-vertex'

# Timestamp taken once per notebook run; it is embedded in job display names
# and in the container image tags so that separate runs can be told apart.
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")
EXPERIMENT = f'{PROJECT_ID}-mnist-pysdk'
JOB_NAME = f'{EXPERIMENT}-{TIMESTAMP}'
TRAINER_NAME = 'mnist-training'

# Pass the project explicitly: the container image URIs below are built from
# PROJECT_ID, so the SDK must target the same project instead of whatever
# default the environment happens to provide.
aiplatform.init(
    project=PROJECT_ID,
    location='europe-west4',
    experiment=EXPERIMENT,
)

## Launch Training Job from Script

In [None]:
# Create a custom job straight from the local trainer script, executed inside
# the TensorFlow 2.9 GPU training container.
vertex_ai_custom_job = aiplatform.CustomJob.from_local_script(
    display_name=JOB_NAME,
    staging_bucket=STAGING_BUCKET,
    # What to run.
    script_path='../trainers/mnist-distributed-vertex/trainer/main.py',
    args=['--num-epochs=20'],
    # Where to run it.
    container_uri='europe-docker.pkg.dev/vertex-ai/training/tf-gpu.2-9:latest',
    requirements=['google-cloud-aiplatform[cloud_profiler]==1.15.0'],
    # Hardware: one n1-standard-8 replica with a single V100.
    machine_type='n1-standard-8',
    accelerator_type='NVIDIA_TESLA_V100',
    accelerator_count=1,
    replica_count=1,
)

# sync=False is passed so the call is submitted without waiting here.
vertex_ai_custom_job.run(sync=False)

## Build Container Image

In [None]:
!cd ../trainers/mnist-distributed-vertex; gcloud builds submit --substitutions TAG_NAME=$TIMESTAMP,_TRAINER_NAME=$TRAINER_NAME

In [None]:
!cd ../trainers/mnist-distributed-vertex; gcloud builds submit --substitutions TAG_NAME=$TIMESTAMP-gpu,_TRAINER_NAME=$TRAINER_NAME,_DOCKERFILE=Dockerfile.gpu

In [None]:
!cd ../trainers/mnist-distributed-vertex; gcloud builds submit --substitutions TAG_NAME=$TIMESTAMP-tpu,_TRAINER_NAME=$TRAINER_NAME,_DOCKERFILE=Dockerfile.tpu

## Launch Training Jobs from Container

In [None]:
# Worker pool for the CPU-only run: one n1-standard-8 replica running the
# trainer image tagged with this run's timestamp.
cpu_worker_pool = {
    'machine_spec': {'machine_type': 'n1-standard-8'},
    'container_spec': {
        'image_uri': f'eu.gcr.io/{PROJECT_ID}/{TRAINER_NAME}:{TIMESTAMP}',
        'args': ['--num-epochs=15'],
    },
    'replica_count': 1,
}

vertex_ai_custom_job_cpu = aiplatform.CustomJob(
    display_name=f'{JOB_NAME}-cpu',
    worker_pool_specs=[cpu_worker_pool],
    staging_bucket=STAGING_BUCKET,
)

# sync=False is passed so the call is submitted without waiting here.
vertex_ai_custom_job_cpu.run(sync=False)

In [None]:
# Worker pool for the GPU run: one n1-standard-8 replica with a single V100,
# running the trainer image tagged "<TIMESTAMP>-gpu".
gpu_worker_pool = {
    'machine_spec': {
        'machine_type': 'n1-standard-8',
        'accelerator_type': 'NVIDIA_TESLA_V100',
        'accelerator_count': 1,
    },
    'container_spec': {
        'image_uri': f'eu.gcr.io/{PROJECT_ID}/{TRAINER_NAME}:{TIMESTAMP}-gpu',
        'args': ['--num-epochs=15'],
    },
    'replica_count': 1,
}

vertex_ai_custom_job_gpu = aiplatform.CustomJob(
    display_name=f'{JOB_NAME}-gpu',
    worker_pool_specs=[gpu_worker_pool],
    staging_bucket=STAGING_BUCKET,
)

# sync=False is passed so the call is submitted without waiting here.
vertex_ai_custom_job_gpu.run(sync=False)

## Launch Training Pipeline

In [None]:
# Define a training pipeline backed by the trainer image tagged
# "<TIMESTAMP>-tpu". Nothing is launched until .run() is called on it.
tpu_image_uri = f'eu.gcr.io/{PROJECT_ID}/{TRAINER_NAME}:{TIMESTAMP}-tpu'

vertex_ai_custom_job = aiplatform.CustomContainerTrainingJob(
    display_name=f'{JOB_NAME}-tpu',
    staging_bucket=STAGING_BUCKET,
    container_uri=tpu_image_uri,
)

In [None]:
# Launch the training pipeline on a single 'cloud-tpu' replica with eight
# TPU_V2 accelerators; sync=False is passed so the call does not wait here.
vertex_ai_model = vertex_ai_custom_job.run(
    args=['--num-epochs=50'],
    replica_count=1,
    machine_type='cloud-tpu',
    accelerator_type='TPU_V2',
    accelerator_count=8,
    sync=False,
)

## Launch Hyperparameter Tuning

In [None]:
# Short alias for the module that provides the parameter spec classes.
hpt = aiplatform.hyperparameter_tuning

# Search space, keyed by hyperparameter name.
tuning_parameters = {
    'batch-size': hpt.IntegerParameterSpec(min=2, max=24, scale='linear'),
    'learning-rate': hpt.DoubleParameterSpec(min=0.0005, max=0.01, scale='linear'),
    'num-epochs': hpt.IntegerParameterSpec(min=10, max=100, scale='linear'),
    'long-runner': hpt.CategoricalParameterSpec(['True', 'False']),
}

# Objective: minimise the metric named 'loss'.
tuning_metrics = {'loss': 'minimize'}

# Tune on top of the GPU custom job: 30 trials in total, three at a time.
vertex_ai_tuning_job = aiplatform.HyperparameterTuningJob(
    display_name=f'{JOB_NAME}-hp',
    custom_job=vertex_ai_custom_job_gpu,
    metric_spec=tuning_metrics,
    parameter_spec=tuning_parameters,
    max_trial_count=30,
    parallel_trial_count=3,
)

# NOTE(review): HyperparameterTuningJob.run() appears to return None, so this
# name may not hold a model - confirm against the SDK reference.
vertex_ai_tuned_model = vertex_ai_tuning_job.run(sync=False)