# ML Ops Pipeline Example

In [1]:
!pip install --user --quiet google-cloud-pipeline-components

In [2]:
from datetime import datetime
import os

from google.cloud import aiplatform as aip

from typing import NamedTuple

from kfp.v2 import dsl
from kfp.v2 import compiler

from google_cloud_pipeline_components import aiplatform as gcc_aip
from google_cloud_pipeline_components.types import artifact_types

In [3]:
# Project-level settings; every bucket name below is derived from these two values.
PROJECT = 'sandbox-michael-menzel'
REGION = 'europe-west4'
BUCKET = f'gs://{PROJECT}-training-{REGION}/staging'

# Cloud Storage URIs always use '/' as separator, so they are built with
# f-strings instead of os.path.join (which follows the host OS convention).
DATA_BUCKET = f'gs://{PROJECT}-data-{REGION}/tf-iris-classifier-data'
DATA_TEST = f'{DATA_BUCKET}/test.csv'
TRAINING_BUCKET = f'gs://{PROJECT}-training-{REGION}/tf-iris-classifier-training'
TRAINING_SCRIPT = f'{TRAINING_BUCKET}/tf_train.py'

# The timestamp suffix gives each kernel session its own pipeline display name.
PIPELINE_DISPLAY_NAME = f'training-pipeline-{int(datetime.now().timestamp())}'
PIPELINE_ROOT = f'{BUCKET}/pipeline_root/training_example'

# Later cells write build/tf_train.py and build/training_pipeline.json; make sure
# the directory exists so the notebook survives a run from a clean checkout.
os.makedirs('build', exist_ok=True)

# Pass the project explicitly so the SDK does not fall back to whatever project
# the ambient credentials default to.
aip.init(project=PROJECT, location=REGION, staging_bucket=BUCKET)

In [4]:
%%writefile build/tf_train.py

import numpy as np

import tensorflow as tf
import tensorflow_datasets as tfds

import argparse
import joblib
import os

parser = argparse.ArgumentParser()
parser.add_argument('--epochs', dest='epochs',
                    default=20, type=int,
                    help='Epocsh to run the model fit function.')
args = parser.parse_args()

iris_data, iris_info = tfds.load('iris', split='train', with_info=True)

iris_ds = iris_data.map(lambda x: (x['features'], tf.one_hot(x['label'], iris_info.features['label'].num_classes))).batch(5)
iris_train = iris_ds.skip(15)
iris_test = iris_ds.take(15)

model = tf.keras.Sequential([
    tf.keras.layers.Dense(8, input_shape=iris_info.features['features'].shape),
    tf.keras.layers.Dense(iris_info.features['label'].num_classes, activation='softmax')  
])
model.summary()

model.compile(optimizer='adam', loss='categorical_crossentropy', 
              metrics=['categorical_accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
model.fit(iris_train, epochs=args.epochs, verbose=2)

model.evaluate(iris_test, verbose=2)
model.save(os.getenv("AIP_MODEL_DIR"))

Writing build/tf_train.py


In [5]:
!echo "Copying training script to $TRAINING_SCRIPT ..."
!gsutil cp build/tf_train.py "$TRAINING_SCRIPT"

Copying training script to gs://sandbox-michael-menzel-training-europe-west4/tf-iris-classifier-training/tf_train.py ...
Copying file://build/tf_train.py [Content-Type=text/x-python]...
/ [1 files][  1.1 KiB/  1.1 KiB]                                                
Operation completed over 1 objects/1.1 KiB.                                      


In [6]:
import tensorflow as tf
import tensorflow_datasets as tfds

# Export the feature vectors of the iris 'train' split as CSV, one example per line.
iris_data, iris_info = tfds.load('iris', split='train', with_info=True)
with tf.io.gfile.GFile(DATA_TEST, 'w') as testfile:
    for example in iris_data:
        feature_row = tf.squeeze(example['features']).numpy().astype(str)
        # Terminate every record with a newline; without it all rows end up on a
        # single line and the last value of one row fuses with the first of the next.
        testfile.write(','.join(feature_row) + '\n')

INFO:absl:Load dataset info from /home/jupyter/tensorflow_datasets/iris/2.0.0
INFO:absl:Reusing dataset iris (/home/jupyter/tensorflow_datasets/iris/2.0.0)
INFO:absl:Constructing tf.data.Dataset iris for split train, from /home/jupyter/tensorflow_datasets/iris/2.0.0


In [7]:
@dsl.component
def extract_path(artifact: dsl.Input[dsl.Artifact]) -> str:
    """Log the incoming artifact and hand back its ``path`` attribute.

    Args:
        artifact: Input artifact whose path should be exposed as a string.

    Returns:
        The value of ``artifact.path``.
    """
    print(artifact)
    artifact_path = artifact.path
    return artifact_path

@dsl.component(packages_to_install=['google-cloud-aiplatform', 'google-cloud-pipeline-components'])
def train_model(project_id: str, location: str, display_name: str, staging_bucket: str, training_script: dsl.Input[dsl.Artifact], model: dsl.Output[dsl.Model]):
    """Run a Vertex AI custom training job and publish the resulting model.

    Args:
        project_id: Project passed to ``aip.init``.
        location: Region passed to ``aip.init``; also used to build the model URI.
        display_name: Display name of the custom training job.
        staging_bucket: Staging bucket handed to the training job.
        training_script: Artifact whose ``path`` is used as the job's script path.
        model: Output artifact; receives the model URI and a ``resourceName``
            metadata entry.

    Raises:
        RuntimeError: If the training job finishes without returning a model.
    """
    from google.cloud import aiplatform as aip

    aip.init(location=location, project=project_id)

    vertex_ai_job = aip.CustomTrainingJob(
        display_name=display_name,
        script_path=training_script.path,
        container_uri='europe-docker.pkg.dev/vertex-ai/training/tf-cpu.2-6:latest',
        staging_bucket=staging_bucket,
        requirements=['tensorflow-datasets'],
        model_serving_container_image_uri='europe-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-6:latest',
    )

    vertex_ai_model = vertex_ai_job.run(
        replica_count=1,
        machine_type='e2-standard-8',
        args=[
            '--epochs=100',
        ]
    )

    # `run` can come back without a model; fail with a clear message instead of
    # an AttributeError on `None.resource_name` below.
    if vertex_ai_model is None:
        raise RuntimeError(
            f'Custom training job {display_name!r} finished without producing a model.')

    model.uri = f'https://{location}-aiplatform.googleapis.com/v1/' + vertex_ai_model.resource_name
    model.metadata = {'resourceName': vertex_ai_model.resource_name}
    
@dsl.component(base_image='tensorflow/tensorflow:2.7.0', packages_to_install=['tensorflow-datasets', 'google-cloud-aiplatform'])
def online_evaluation(vertexEndpoint: dsl.Input[dsl.Artifact], model: dsl.Output[dsl.Model]):
    """Score a deployed endpoint against iris samples and record the metrics.

    The first 15 batches (5 examples each) of the TFDS iris 'train' split are
    sent to the endpoint. Precision and recall are accumulated over all
    batches, printed, and stored in ``model.metadata``.

    Args:
        vertexEndpoint: Endpoint artifact; its ``resourceName`` metadata entry
            identifies the endpoint to query.
        model: Output artifact that receives the ``precision`` and ``recall``
            metadata values.
    """
    import tensorflow as tf
    import tensorflow_datasets as tfds

    from google.cloud import aiplatform as aip

    endpoint = aip.Endpoint(vertexEndpoint.metadata['resourceName'])

    iris_data, iris_info = tfds.load('iris', split='train', with_info=True)
    num_classes = iris_info.features['label'].num_classes
    labelled_batches = iris_data.map(
        lambda x: (x['features'], tf.one_hot(x['label'], num_classes))
    ).batch(5)
    evaluation_batches = labelled_batches.take(15)

    # Insertion order (precision, then recall) determines the reporting order below.
    metrics = {
        'precision': tf.keras.metrics.Precision(),
        'recall': tf.keras.metrics.Recall(),
    }

    for features, labels in evaluation_batches.as_numpy_iterator():
        response = endpoint.predict(features.tolist())
        for metric in metrics.values():
            metric.update_state(labels, response.predictions)

    for metric_name, metric in metrics.items():
        print(f'{metric_name}: {metric.result()}')
        model.metadata[metric_name] = float(metric.result().numpy())

In [8]:
from google_cloud_pipeline_components.aiplatform import utils

# Build a pipeline op from `aip.CustomTrainingJob` and its `run` method.
_training_job_class = aip.CustomTrainingJob
CustomTrainingJobRunOp = utils.convert_method_to_component(_training_job_class, _training_job_class.run)

In [9]:
@dsl.pipeline(
    name=PIPELINE_DISPLAY_NAME,
    description="A simple model training pipeline",
    pipeline_root=PIPELINE_ROOT,
)
def pipeline(project_id: str, location: str, training_script: str):
    """Train the iris classifier in two ways, deploy both models, and evaluate.

    The training script is imported as an artifact and trained once through
    the `train_model` component and once through `CustomTrainingJobRunOp`.
    Both models are deployed to a single new endpoint, and `online_evaluation`
    runs after the `train_model` variant has been deployed.

    Args:
        project_id: Project in which the training jobs and endpoint are created.
        location: Region used for the training jobs and the endpoint.
        training_script: URI of the training script to import.
    """
    # Both training variants share the same job display name.
    training_display_name = f'{PIPELINE_DISPLAY_NAME}-tf-iris-classifier-training'

    training_script_import = dsl.importer(
        artifact_uri=training_script,
        artifact_class=dsl.Artifact,
        reimport=True)

    train_model_fct_op = train_model(project_id, location,
                                     training_display_name,
                                     TRAINING_BUCKET,
                                     training_script_import.output)

    training_script_path = extract_path(training_script_import.output)

    train_model_yaml_op = CustomTrainingJobRunOp(
        project=project_id,
        location=location,
        display_name=training_display_name,
        script_path=training_script_path.output,
        container_uri='europe-docker.pkg.dev/vertex-ai/training/tf-cpu.2-6:latest',
        # Use the shared constant, as the train_model op above does, instead of
        # repeating the bucket as a literal.
        staging_bucket=TRAINING_BUCKET,
        requirements=['tensorflow-datasets'],
        model_serving_container_image_uri='europe-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-6:latest',
        replica_count=1,
        machine_type='e2-standard-8',
        args=['--epochs=100'])

    create_endpoint_op = gcc_aip.EndpointCreateOp(
        project=project_id,
        # Honour the pipeline parameter rather than the notebook-level REGION so
        # the endpoint is created in the same region as the training jobs.
        location=location,
        display_name=f'{PIPELINE_DISPLAY_NAME}-ep',
    )

    model_deploy_fct_op = gcc_aip.ModelDeployOp(
        model=train_model_fct_op.output,
        endpoint=create_endpoint_op.outputs['endpoint'],
        traffic_split={"0": 100},
        dedicated_resources_machine_type='n1-standard-8',
        dedicated_resources_min_replica_count=1,
        dedicated_resources_max_replica_count=1,
    )

    model_deploy_yaml_op = gcc_aip.ModelDeployOp(
        model=train_model_yaml_op.output,
        endpoint=create_endpoint_op.outputs['endpoint'],
        dedicated_resources_machine_type='n1-standard-8',
        dedicated_resources_min_replica_count=1,
        dedicated_resources_max_replica_count=1,
    )

    # Evaluation has no data dependency on the deployment, so order it explicitly.
    online_evaluation_op = online_evaluation(create_endpoint_op.outputs['endpoint']).after(model_deploy_fct_op)
    
# Compile the pipeline definition to a local JSON package (type checking disabled),
# then create a Vertex AI pipeline job from that package and run it.
pipeline_package_path = "build/training_pipeline.json"
pipeline_parameters = {
    'project_id': PROJECT,
    'location': REGION,
    'training_script': TRAINING_SCRIPT,
}

pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(pipeline_func=pipeline, package_path=pipeline_package_path, type_check=False)

job = aip.PipelineJob(
    display_name=PIPELINE_DISPLAY_NAME,
    template_path=pipeline_package_path,
    pipeline_root=PIPELINE_ROOT,
    parameter_values=pipeline_parameters,
)
job.run()

type name Model is different from expected: google.VertexModel
type name Model is different from expected: google.VertexModel
INFO:google.cloud.aiplatform.pipeline_jobs:Creating PipelineJob




INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob created. Resource name: projects/928871478446/locations/europe-west4/pipelineJobs/training-pipeline-1642950977-20220123151621
INFO:google.cloud.aiplatform.pipeline_jobs:To use this PipelineJob in another session:
INFO:google.cloud.aiplatform.pipeline_jobs:pipeline_job = aiplatform.PipelineJob.get('projects/928871478446/locations/europe-west4/pipelineJobs/training-pipeline-1642950977-20220123151621')
INFO:google.cloud.aiplatform.pipeline_jobs:View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/europe-west4/pipelines/runs/training-pipeline-1642950977-20220123151621?project=928871478446
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/928871478446/locations/europe-west4/pipelineJobs/training-pipeline-1642950977-20220123151621 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/928871478446/locations/europe-west4/pipelineJobs/training-pip