## Running custom model training on Vertex Pipelines

In [1]:
# Extra flag interpolated into the `pip3 install` commands of the next cell.
USER_FLAG = "--user"

In [None]:
# Install pinned versions of the Vertex AI SDK, the KFP SDK and the Google Cloud
# pipeline components. IPython substitutes {USER_FLAG} with the Python variable's value.
!pip3 install {USER_FLAG} google-cloud-aiplatform==1.7.0 --upgrade
!pip3 install {USER_FLAG} kfp==1.8.9 google-cloud-pipeline-components==0.2.0

### Vertex Pipelines setup

In [3]:
import os

# Restart the kernel so the freshly installed packages become importable.
# The restart is skipped when the IS_TESTING environment variable is set to a non-empty value.
running_under_test = bool(os.getenv("IS_TESTING"))
if not running_under_test:
    import IPython

    kernel_app = IPython.Application.instance()
    kernel_app.kernel.do_shutdown(True)

In [1]:
# Print the KFP SDK and google_cloud_pipeline_components versions as seen by a
# fresh `python3` process (not by the running kernel).
!python3 -c "import kfp; print('KFP SDK version: {}'.format(kfp.__version__))"
!python3 -c "import google_cloud_pipeline_components; print('google_cloud_pipeline_components version: {}'.format(google_cloud_pipeline_components.__version__))"

KFP SDK version: 1.8.9
google_cloud_pipeline_components version: 0.2.0


In [2]:
import os
PROJECT_ID = "impact-analytics-sandbox"

# Get your Google Cloud project ID from gcloud
if not os.getenv("IS_TESTING"):
    shell_output=!gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID: ", PROJECT_ID)

#OR
# if PROJECT_ID == "" or PROJECT_ID is None:
#     PROJECT_ID = "your-project-id"  # @param {type:"string"}

Project ID:  impact-analytics-sandbox


In [1]:
# Cloud Storage bucket (gs:// URI, no trailing slash) used to derive PIPELINE_ROOT below.
BUCKET_NAME="gs://impact-analytics-experiments-bucket01"

In [2]:
from kfp.v2 import compiler, dsl
from kfp.v2.dsl import pipeline

from google.cloud import aiplatform
from google_cloud_pipeline_components import aiplatform as gcc_aip

In [3]:
import os

# Directory appended to PATH for subsequent shell commands.
USER_BIN_DIR = "/home/jupyter/.local/bin"

# Value of PATH before this cell modifies it (same meaning as the original `PATH=%env PATH`).
PATH = os.environ.get("PATH", "")

# Append only when missing, so re-running the cell does not keep growing PATH
# with duplicate entries.
if USER_BIN_DIR not in PATH.split(os.pathsep):
    os.environ["PATH"] = f"{PATH}{os.pathsep}{USER_BIN_DIR}" if PATH else USER_BIN_DIR

REGION = "us-central1"

# Pipeline artifacts are written under <bucket>/pipeline_root/.
PIPELINE_ROOT = f"{BUCKET_NAME}/pipeline_root/"
PIPELINE_ROOT

env: PATH=/opt/conda/bin:/opt/conda/condabin:/usr/local/bin:/usr/bin:/bin:/usr/local/games:/usr/games:/home/jupyter/.local/bin


'gs://impact-analytics-experiments-bucket01/pipeline_root/'

### Configuring a custom model training job

In [23]:
# Relative `cd`: the result depends on the kernel's working directory when this cell runs.
cd jupyter/

/home/jupyter


In [6]:
%%bash
# -p keeps the cell re-runnable: plain `mkdir` exits non-zero when the directory already exists.
mkdir -p traincontainer

In [39]:
# Container Registry URI for the LSTM image; read as $IMAGE_URI by the
# `docker build` / `docker push` cells that follow.
IMAGE_URI="gcr.io/impact-analytics-sandbox/lstm:v1"

In [40]:
# Enter the directory used as the Docker build context for the LSTM image.
# Relative `cd`: fails if the kernel is already inside traincontainer (see recorded output).
cd traincontainer

[Errno 2] No such file or directory: 'traincontainer'
/home/jupyter/traincontainer


In [45]:
# Build an image from the current directory and tag it with IMAGE_URI.
!docker build ./ -t $IMAGE_URI

Sending build context to Docker daemon  19.97kB
Step 1/13 : FROM python:3.7
 ---> 6bcd1cc24497
Step 2/13 : USER root
 ---> Using cache
 ---> 85bf2d7eea6d
Step 3/13 : RUN apt-get update
 ---> Using cache
 ---> be037e0c38a8
Step 4/13 : RUN apt-get install ffmpeg libsm6 libxext6  -y
 ---> Using cache
 ---> 8244cf8c1625
Step 5/13 : RUN apt-get --assume-yes install wget
 ---> Using cache
 ---> 7c67716328cb
Step 6/13 : RUN apt-get -y install git
 ---> Using cache
 ---> e6a19dbffd58
Step 7/13 : RUN wget -nv https://dl.google.com/dl/cloudsdk/release/google-cloud-sdk.tar.gz &&         mkdir /root/tools &&         tar xvzf google-cloud-sdk.tar.gz -C /root/tools &&         rm google-cloud-sdk.tar.gz &&         /root/tools/google-cloud-sdk/install.sh --usage-reporting=false         --path-update=false --bash-completion=false         --disable-installation-options &&         rm -rf /root/.config/* &&         ln -s /root/.config /config &&         rm -rf /root/tools/google-cloud-sdk/.install/.backup

In [48]:
# Push the image tagged IMAGE_URI to its registry.
!docker push $IMAGE_URI

The push refers to repository [gcr.io/impact-analytics-sandbox/lstm]

[1B59a14291: Preparing 
[1B01c7de11: Preparing 
[1B843899a8: Preparing 
[1B42a0e767: Preparing 
[1B0eb184d5: Preparing 
[1Bc6fca19a: Preparing 
[1B75841daa: Preparing 
[1B5ba7c779: Preparing 
[1B1565d249: Preparing 
[1B5b508dcf: Preparing 
[1B20347499: Preparing 
[1B2401de62: Preparing 
[1B42b3d47b: Preparing 
[1Bb28cf833: Preparing 
[1B76ceab7a: Preparing 
[1Bd5cccd2c: Preparing 
[17B9a14291: Pushed   4.441GB/4.383GB[15A[2K[17A[2K[17A[2K[17A[2K[17A[2K[10A[2K[17A[2K[17A[2K[9A[2K[17A[2K[8A[2K[17A[2K[14A[2K[7A[2K[17A[2K[14A[2K[17A[2K[4A[2K[17A[2K[11A[2K[17A[2K[2A[2K[14A[2K[16A[2K[11A[2K[11A[2K[17A[2K[11A[2K[17A[2K[11A[2K[17A[2K[14A[2K[17A[2K[11A[2K[14A[2K[17A[2K[14A[2K[11A[2K[14A[2K[11A[2K[17A[2K[14A[2K[11A[2K[14A[2K[11A[2K[17A[2K[11A[2K[17A[2K[11A[2K[14A[2K[11A[2K[17A[2K[14A[2K[17A[2K[11A[2K[

In [49]:
# Go back up to the parent directory.
cd ..

/home/jupyter


In [1]:
# Enter the directory used as the Docker build context for the GRU image.
# NOTE(review): no visible cell creates traincontainer01 — it must already exist.
cd traincontainer01

/home/jupyter/traincontainer01


In [2]:
# Container Registry URI for the GRU image; read as $IMAGE_URI by the
# `docker build` / `docker push` cells that follow (replaces the LSTM URI set earlier).
IMAGE_URI="gcr.io/impact-analytics-sandbox/gru:v1"

In [3]:
# Build an image from the current directory and tag it with IMAGE_URI.
!docker build ./ -t $IMAGE_URI

Sending build context to Docker daemon  18.94kB
Step 1/13 : FROM python:3.7
 ---> 6bcd1cc24497
Step 2/13 : USER root
 ---> Using cache
 ---> 85bf2d7eea6d
Step 3/13 : RUN apt-get update
 ---> Using cache
 ---> be037e0c38a8
Step 4/13 : RUN apt-get install ffmpeg libsm6 libxext6  -y
 ---> Using cache
 ---> 8244cf8c1625
Step 5/13 : RUN apt-get --assume-yes install wget
 ---> Using cache
 ---> 7c67716328cb
Step 6/13 : RUN apt-get -y install git
 ---> Using cache
 ---> e6a19dbffd58
Step 7/13 : RUN wget -nv https://dl.google.com/dl/cloudsdk/release/google-cloud-sdk.tar.gz &&         mkdir /root/tools &&         tar xvzf google-cloud-sdk.tar.gz -C /root/tools &&         rm google-cloud-sdk.tar.gz &&         /root/tools/google-cloud-sdk/install.sh --usage-reporting=false         --path-update=false --bash-completion=false         --disable-installation-options &&         rm -rf /root/.config/* &&         ln -s /root/.config /config &&         rm -rf /root/tools/google-cloud-sdk/.install/.backup

In [4]:
# Push the image tagged IMAGE_URI to its registry.
!docker push $IMAGE_URI

The push refers to repository [gcr.io/impact-analytics-sandbox/gru]

[1B73580cd2: Preparing 
[1B43934e3d: Preparing 
[1B843899a8: Preparing 
[1B42a0e767: Preparing 
[1B0eb184d5: Preparing 
[1Bc6fca19a: Preparing 
[1B75841daa: Preparing 
[1B5ba7c779: Preparing 
[1B1565d249: Preparing 
[1B5b508dcf: Preparing 
[1B20347499: Preparing 
[1B2401de62: Preparing 
[1B42b3d47b: Preparing 
[1Bb28cf833: Preparing 
[1B76ceab7a: Preparing 
[1Bd5cccd2c: Preparing 
[17B3580cd2: Pushed   4.441GB/4.383GB[13A[2K[17A[2K[9A[2K[6A[2K[4A[2K[3A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[17A[2K[

In [4]:
# Region and Google Cloud project ID constants for the pipeline section.
REGION, PROJECT_ID = "us-central1", "impact-analytics-sandbox"

In [5]:
# Pipeline root and staging bucket both point at the bucket root (trailing slash included).
BUCKET_NAME = "gs://impact-analytics-experiments-bucket01/"
PIPELINE_ROOT = BUCKET_NAME

### Building a pipeline with pre-built components

In [6]:
# Variant 1: two custom-container training tasks with no ordering between them.
#
# `dsl.pipeline` is used instead of the bare `pipeline` import because the function
# below is itself named `pipeline`: once it is defined it shadows the imported
# decorator, so re-running this cell (or any other `@pipeline(...)` cell) in the same
# kernel would call the function instead of the decorator and fail.
@dsl.pipeline(pipeline_root=PIPELINE_ROOT, name="custom-model-sequence")
def pipeline(
    project: str = 'impact-analytics-sandbox',
    gcp_region: str = 'us-central1',
    container_uri_lstm: str = "",
    container_uri_gru: str = "",
    bucket: str = BUCKET_NAME,
):
    """Run the LSTM and GRU custom-container training jobs.

    Args:
        project: Passed as `project` to both training ops.
        gcp_region: Passed as `location` to both training ops.
        container_uri_lstm: Training image for the LSTM job.
        container_uri_gru: Training image for the GRU job.
        bucket: Passed as `staging_bucket` to both training ops.
    """
    training_op_lstm = gcc_aip.CustomContainerTrainingJobRunOp(
        display_name="pipeline-lstm-custom-train",
        container_uri=container_uri_lstm,
        project=project,
        location=gcp_region,
        staging_bucket=bucket,
    )

    # Debug output: inspect the LSTM task's "model" output while the pipeline function runs.
    print(training_op_lstm.outputs["model"])
    print(training_op_lstm.outputs["model"].name)
    print(type(training_op_lstm.outputs["model"].name))

    training_op_gru = gcc_aip.CustomContainerTrainingJobRunOp(
        display_name="pipeline-gru-custom-train",
        container_uri=container_uri_gru,
        project=project,
        location=gcp_region,
        staging_bucket=bucket,
    )

In [6]:
# Variant 2: sequential execution — the GRU job starts only after the LSTM job.
#
# `dsl.pipeline` is used instead of the bare `pipeline` import because the function
# below is itself named `pipeline`: once it is defined it shadows the imported
# decorator, so re-running this cell (or any other `@pipeline(...)` cell) in the same
# kernel would call the function instead of the decorator and fail.
@dsl.pipeline(pipeline_root=PIPELINE_ROOT, name="custom-model-sequence")
def pipeline(
    project: str = 'impact-analytics-sandbox',
    gcp_region: str = 'us-central1',
    container_uri_lstm: str = "",
    container_uri_gru: str = "",
    bucket: str = BUCKET_NAME,
):
    """Run the LSTM training job, then the GRU training job.

    Args:
        project: Passed as `project` to both training ops.
        gcp_region: Passed as `location` to both training ops.
        container_uri_lstm: Training image for the LSTM job.
        container_uri_gru: Training image for the GRU job.
        bucket: Passed as `staging_bucket` to both training ops.
    """
    training_op_lstm = gcc_aip.CustomContainerTrainingJobRunOp(
        display_name="pipeline-lstm-custom-train",
        container_uri=container_uri_lstm,
        project=project,
        location=gcp_region,
        staging_bucket=bucket,
    )

    # Debug output: the output's `.name` is a plain string (recorded output: "model").
    print(training_op_lstm.outputs["model"].name)
    print(type(training_op_lstm.outputs["model"].name))

    # `.after(...)` adds the explicit ordering; the description is built from the
    # LSTM output's name string, not from the trained model itself.
    training_op_gru = gcc_aip.CustomContainerTrainingJobRunOp(
        display_name="pipeline-gru-custom-train",
        container_uri=container_uri_gru,
        project=project,
        location=gcp_region,
        staging_bucket=bucket,
        model_description='after the ' + training_op_lstm.outputs["model"].name,
    ).after(training_op_lstm)

In [None]:
# Variant 3: a single custom-container training task (LSTM only).
#
# `dsl.pipeline` is used instead of the bare `pipeline` import because the function
# below is itself named `pipeline`: once it is defined it shadows the imported
# decorator, so re-running this cell (or any other `@pipeline(...)` cell) in the same
# kernel would call the function instead of the decorator and fail.
@dsl.pipeline(pipeline_root=PIPELINE_ROOT, name="custom-model-sequence")
def pipeline(
    project: str = 'impact-analytics-sandbox',
    gcp_region: str = 'us-central1',
    container_uri_lstm: str = "",
    bucket: str = BUCKET_NAME,
):
    """Run only the LSTM custom-container training job.

    Args:
        project: Passed as `project` to the training op.
        gcp_region: Passed as `location` to the training op.
        container_uri_lstm: Training image for the LSTM job.
        bucket: Passed as `staging_bucket` to the training op.
    """
    training_op_lstm = gcc_aip.CustomContainerTrainingJobRunOp(
        display_name="pipeline-lstm-custom-train",
        container_uri=container_uri_lstm,
        project=project,
        location=gcp_region,
        staging_bucket=bucket,
    )

In [7]:
# Compile the currently defined `pipeline` function into a job spec file.
pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(
    pipeline_func=pipeline,
    package_path="custom_train_pipeline.json",
)

model
<class 'str'>




In [8]:
from datetime import datetime

# Current local time as a 14-digit YYYYMMDDHHMMSS string; used to build the job ID.
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"

In [9]:
# Job definition for the two-model pipeline variants: supplies both container URIs.
pipeline_job = aiplatform.PipelineJob(
    display_name="custom-train-pipeline",
    template_path="custom_train_pipeline.json",
    job_id=f"custom-train-pipeline-{TIMESTAMP}",
    parameter_values=dict(
        container_uri_lstm=f"gcr.io/{PROJECT_ID}/lstm:v1",
        container_uri_gru=f"gcr.io/{PROJECT_ID}/gru:v1",
        bucket=BUCKET_NAME,
    ),
    enable_caching=False,
)

In [9]:
# Job definition for the single-model pipeline variant: supplies only the LSTM container URI.
#
# `enable_caching` was previously passed twice (False and True), which Python rejects
# with "SyntaxError: keyword argument repeated". Only the trailing value is kept.
# NOTE(review): confirm that caching should be enabled for this job.
pipeline_job = aiplatform.PipelineJob(
    display_name="custom-train-pipeline",
    template_path="custom_train_pipeline.json",
    job_id="custom-train-pipeline-{0}".format(TIMESTAMP),
    parameter_values={
        "container_uri_lstm": "gcr.io/{0}/lstm:v1".format(PROJECT_ID),
        "bucket": BUCKET_NAME,
    },
    enable_caching=True,
)

In [10]:
# Submit the pipeline job; the log output includes the resource name and a console link.
pipeline_job.submit()

INFO:google.cloud.aiplatform.pipeline_jobs:Creating PipelineJob
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob created. Resource name: projects/90786640424/locations/us-central1/pipelineJobs/custom-train-pipeline-20220228144410
INFO:google.cloud.aiplatform.pipeline_jobs:To use this PipelineJob in another session:
INFO:google.cloud.aiplatform.pipeline_jobs:pipeline_job = aiplatform.PipelineJob.get('projects/90786640424/locations/us-central1/pipelineJobs/custom-train-pipeline-20220228144410')
INFO:google.cloud.aiplatform.pipeline_jobs:View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/custom-train-pipeline-20220228144410?project=90786640424


In [6]:
# List the Docker images available locally.
!docker image ls

REPOSITORY                                              TAG       IMAGE ID       CREATED          SIZE
gcr.io/impact-analytics-sandbox/gru                     v1        ebd6a86a8a72   37 minutes ago   6.28GB
gcr.io/impact-analytics-sandbox/lstm                    v1        a3745c04fddf   5 hours ago      6.28GB
air_rnn_model                                           latest    331ededc11d3   9 hours ago      9.49GB
python                                                  3.7       6bcd1cc24497   28 hours ago     903MB
us-docker.pkg.dev/vertex-ai/training/pytorch-xla.1-10   latest    905f5c33f870   2 months ago     13.8GB
gcr.io/inverting-proxy/agent                            <none>    fe507176d0e6   12 months ago    1.73GB
gcr.io/deeplearning-platform-release/pytorch-cpu        latest    827eae6ea7a8   17 months ago    8.03GB


In [7]:
# Run an image locally by ID; in the recorded listing above, ebd6a86a8a72 is
# gcr.io/impact-analytics-sandbox/gru:v1.
# NOTE(review): the hard-coded ID is a snapshot and will not match after a rebuild.
!docker run ebd6a86a8a72

[2022-02-27 04:39:16,515] INFO | darts.models.forecasting.torch_forecasting_model | Train dataset contains 101 samples.
[2022-02-27 04:39:16,515] INFO | darts.models.forecasting.torch_forecasting_model | Train dataset contains 101 samples.
[2022-02-27 04:39:16,525] INFO | darts.models.forecasting.torch_forecasting_model | Time series values are 64-bits; casting model to float64.
[2022-02-27 04:39:16,525] INFO | darts.models.forecasting.torch_forecasting_model | Time series values are 64-bits; casting model to float64.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"
[2022-02-27 04:39:46,570] INFO | darts.models.forecasting.torch_forecasting_model | loading best-epoch=289-val_loss=0.00.ckpt
[2022-02-27 04:39:46,570] INFO | darts.models.forecasting.torch_forecasting_model | loading best-epoch=289-val_loss=0.00.ckpt
GPU availa

In [44]:
# Force-remove a local image by ID; the recorded output shows it was tagged
# gcr.io/impact-analytics-sandbox/lstm:v1.
# NOTE(review): the hard-coded ID is a snapshot and will not match after a rebuild.
!docker image rm -f 5e0f8e1bf4c8

Untagged: gcr.io/impact-analytics-sandbox/lstm:v1
Deleted: sha256:5e0f8e1bf4c8f0c6d5648b67eec10b36fe05ce9353f150f7909c789ad45d930a
Deleted: sha256:c3281a4fc1423c526e547849c741151e725913713541b9bc56f215f2d89d5603
Deleted: sha256:c0ae6ed9c55ec4bb3398f14cfeedca4dd845af231789ced4932eb58b69c71c6d
Deleted: sha256:f8ccc71ebb91336053d7d48020acaf67418b154bef75e2622c5fd5c9487a8188
Deleted: sha256:641b8123c1b99de3f7e8c67f2f0074219949c5adae48817b89086aa20884507a


In [13]:
# Container Registry URI for the create_dataset image; read as $IMAGE_URI by the
# `docker build` cell that follows.
IMAGE_URI="gcr.io/impact-analytics-sandbox/create_dataset:v1"

In [14]:
# Enter the directory used as the Docker build context for the create_dataset image.
# NOTE(review): no visible cell creates create_dataset — it must already exist.
cd create_dataset

[Errno 2] No such file or directory: 'create_dataset'
/home/jupyter/create_dataset


In [None]:
# Build an image from the current directory and tag it with IMAGE_URI.
!docker build ./ -t $IMAGE_URI