In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vision Workshop - ML Pipeline

## Overview

[Vision Workshop](https://github.com/mblanc/vision-workshop) is a series of labs on how to build an image classification system on Google Cloud. Throughout the Vision Workshop labs, you will learn how to read image data stored in a data lake, perform exploratory data analysis (EDA), train a model, register your model in a model registry, evaluate your model, deploy your model to an endpoint, and run real-time inference on your model.

### Objective

This notebook shows how to pull features from Feature Store for training, run exploratory data analysis on features, build a machine learning model locally, experiment with various hyperparameters, evaluate the model and deploy it to a Vertex AI endpoint.

This lab uses the following Google Cloud services and resources:

- [Vertex AI](https://cloud.google.com/vertex-ai/)

Steps performed in this notebook:

- Create and deploy a Vertex AI Pipeline

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI
pricing](https://cloud.google.com/vertex-ai/pricing) and use the [Pricing
Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.

### Load configuration settings from the setup notebook

Set the constants used in this notebook and load the config settings from the `00_environment_setup.ipynb` notebook.

In [None]:
# Ask gcloud for the active project; the `!` capture yields the command's output lines.
GCP_PROJECTS = !gcloud config get-value project
# The first output line is taken as the project id.
PROJECT_ID = GCP_PROJECTS[0]
# Workshop bucket name, stored WITHOUT the gs:// scheme (callers add it where needed).
BUCKET_NAME = f"{PROJECT_ID}-vision-workshop"
# Download the shared settings file from the bucket as captured shell output.
config = !gsutil cat gs://{BUCKET_NAME}/config/notebook_env.py
# `.n` is the captured output joined into a single newline-separated string.
print(config.n)
# NOTE(review): exec() runs whatever Python the bucket object contains; only use a
# bucket you control. Later cells reference ID, REGION and ENDPOINT_NAME, which are
# not defined anywhere visible in this notebook - presumably this file sets them;
# confirm against 00_environment_setup.ipynb.
exec(config.n)

### Import libraries and define constants

#### Libraries

In [None]:
# General
import os
import sys
import random
from datetime import datetime, timedelta
import json

from google.cloud import aiplatform as vertex_ai
from google_cloud_pipeline_components import aiplatform as vertex_ai_components
from google_cloud_pipeline_components.v1.batch_predict_job import ModelBatchPredictOp
from google_cloud_pipeline_components.types import artifact_types

import tensorflow as tf
import kfp
from kfp import dsl
from kfp.v2 import compiler
from kfp.v2.dsl import component
from kfp.v2.components import importer_node

In [None]:
# Report the installed Kubeflow Pipelines SDK version (the imports above rely on the kfp.v2 namespace).
print("kfp version:", kfp.__version__)

In [None]:
# The top-level package is imported here only to read its __version__ attribute.
import google_cloud_pipeline_components
# Print the versions of the libraries this notebook uses, for reproducibility.
print("tf version:", tf.__version__)
print("kfp version:", kfp.__version__)
print("google-cloud-pipeline-components version:", google_cloud_pipeline_components.__version__)

In [None]:
# Display the version of google.cloud.aiplatform (imported as vertex_ai) as the cell output.
vertex_ai.__version__

#### Variables

In [None]:
# --- Pipeline ---
# Display name, local output folder, GCS root and compiled-spec path of the pipeline.
PIPELINE_NAME = f"vision-workshop-tf-pipeline-{ID}"
PIPELINE_DIR = os.path.join(os.curdir, "pipelines")
PIPELINE_ROOT = f"gs://{BUCKET_NAME}/pipelines"
PIPELINE_PACKAGE_PATH = f"{PIPELINE_DIR}/pipeline_{ID}.json"
COMPONENTS_DIR = os.path.join(PIPELINE_DIR, "components")

# --- Dataset component ---
DATASET_NAME = "flowers"

# --- Training component ---
# Display names for the training job and the model it produces.
JOB_NAME = f"image-classifier-train-tf-{ID}"
MODEL_NAME = f"image-classifier-tf-{ID}"
TRAIN_MACHINE_TYPE = "n1-standard-4"
# Prebuilt container used to serve the trained model.
MODEL_SERVING_IMAGE_URI = "europe-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-8:latest"
# Command-line arguments forwarded to the training container.
ARGS = ["--lr=0.003", "--epochs=10"]
# Location of the custom training image: repository, image name and tag.
IMAGE_REPOSITORY = f"vision-{ID}"
IMAGE_NAME = "image-classifier"
IMAGE_TAG = "v1"
IMAGE_URI = f"europe-west4-docker.pkg.dev/{PROJECT_ID}/{IMAGE_REPOSITORY}/{IMAGE_NAME}:{IMAGE_TAG}"

#### Initialize client

In [None]:
# Vertex AI client
# Set the SDK-wide defaults (project, region, staging bucket) used by later calls.
# BUCKET_NAME holds only the bare bucket name, while staging_bucket is documented
# as a full gs:// URI, so the scheme is added here, as PIPELINE_ROOT does.
vertex_ai.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=f"gs://{BUCKET_NAME}",
)

#### Set folders

In [None]:
# Display the local components folder path as the cell output.
COMPONENTS_DIR

In [None]:
# Create the local folders for the compiled pipeline spec and its components.
# os.makedirs with exist_ok=True is safe to re-run and needs no shell; the
# directories get default (umask-based) permissions instead of world-writable 777.
os.makedirs(PIPELINE_DIR, exist_ok=True)
os.makedirs(COMPONENTS_DIR, exist_ok=True)

### Define the pipeline using ```kfp``` and ```google_cloud_pipeline_components```

#### Build pipeline

In [None]:
# Look up the existing managed image dataset by display name. DATASET_NAME is
# reused so the filter stays in sync with the constant defined above.
matching_datasets = vertex_ai.ImageDataset.list(
    filter=f"display_name={DATASET_NAME}",
    location=REGION,
)
# Fail with an actionable message instead of a bare IndexError when nothing matches.
if not matching_datasets:
    raise RuntimeError(
        f"No Vertex AI image dataset with display_name={DATASET_NAME!r} found in "
        f"{REGION}; create the dataset before building the pipeline."
    )
# If several datasets share the display name, the first one returned is used.
ds = matching_datasets[0]

In [None]:
@dsl.pipeline(
    pipeline_root=PIPELINE_ROOT,
    name=PIPELINE_NAME,
)
def pipeline(project_id:str = PROJECT_ID, 
             region:str = REGION, 
             bucket_name:str = f"gs://{BUCKET_NAME}",
             replica_count:int = 1,
             machine_type:str = "n1-standard-4",
            ):
    """Train an image classifier on an existing Vertex AI dataset and deploy it.

    The pipeline chains four steps: import the dataset held in the notebook
    global ``ds`` as a VertexDataset artifact, run a custom-container training
    job, create an endpoint, and deploy the trained model to that endpoint.
    JOB_NAME, MODEL_NAME, IMAGE_URI, ARGS, MODEL_SERVING_IMAGE_URI,
    ENDPOINT_NAME and ``ds`` are read from notebook globals when this function
    body executes; they are not pipeline parameters.

    Args:
        project_id: Google Cloud project passed to the training and
            endpoint-creation steps.
        region: Location passed to the training and endpoint-creation steps.
        bucket_name: gs:// URI used as both the staging bucket and the base
            output directory of the training job.
        replica_count: Replica count of the training job; also used as the
            minimum replica count of the deployed model.
        machine_type: Machine type used for the training job and for the
            deployed model's dedicated resources.
    """
    
    # Disabled alternative: create the dataset inside the pipeline instead of importing it.
    # dataset_create_op = vertex_ai_components.ImageDatasetCreateOp(project=project_id,
    #                                                    location=region,
    #                                                    display_name=DATASET_NAME,
    #                                                    import_schema_uri=vertex_ai.schema.dataset.ioformat.image.single_label_classification,
    #                                                    gcs_source=f"gs://{BUCKET_NAME}/prod/flowers.csv")
    
    # Wrap the already-existing dataset resource as a pipeline artifact so it can
    # be passed to the training step.
    importer_op = importer_node.importer(
        artifact_uri=f"https://{ds.location}-aiplatform.googleapis.com/v1/{ds.resource_name}",
        artifact_class=artifact_types.VertexDataset,
        metadata={
            "resourceName": ds.resource_name,
        },
    )
    
    # Custom training job component, run from the container image at IMAGE_URI.
    # The explicit .after(importer_op) repeats the ordering already implied by
    # passing importer_op.output as the dataset input.
    train_model_op = vertex_ai_components.CustomContainerTrainingJobRunOp(
        display_name=JOB_NAME,
        model_display_name=MODEL_NAME,
        container_uri=IMAGE_URI,
        staging_bucket=bucket_name,
        dataset= importer_op.output, #dataset_create_op.outputs['dataset'],
        annotation_schema_uri=vertex_ai.schema.dataset.annotation.image.classification,
        base_output_dir=bucket_name,
        args = ARGS,
        replica_count= replica_count,
        machine_type= machine_type,
        accelerator_type="NVIDIA_TESLA_T4",
        accelerator_count=1,
        model_serving_container_image_uri=MODEL_SERVING_IMAGE_URI,
        project=project_id,
        location=region).after(importer_op)
    
    # Disabled step: batch prediction with the trained model.
    # batch_op = ModelBatchPredictOp(
    #     project=project_id,
    #     location=region,
    #     job_display_name="batch_predict_job",
    #     model=train_model_op.outputs["model"],
    #     gcs_source_uris=[f"gs://{BUCKET_NAME}/flowers_batch.txt"],
    #     gcs_destination_output_uri_prefix=f"gs://{BUCKET_NAME}",
    #     instances_format="file-list",
    #     predictions_format="jsonl",
    #     model_parameters={},
    #     machine_type=machine_type,
    #     starting_replica_count=1,
    #     max_replica_count=1,
    # )

    
    # Create the endpoint; .after(train_model_op) orders it after the training step.
    create_endpoint_op = vertex_ai_components.EndpointCreateOp(
        display_name=ENDPOINT_NAME,
        project=project_id, 
        location=region).after(train_model_op)

    # Deploy the trained model to the new endpoint on dedicated resources.
    custom_model_deploy_op = vertex_ai_components.ModelDeployOp(
        model=train_model_op.outputs["model"],
        endpoint=create_endpoint_op.outputs["endpoint"],
        deployed_model_display_name=MODEL_NAME,
        dedicated_resources_machine_type=machine_type,
        dedicated_resources_min_replica_count=replica_count
    ).after(create_endpoint_op)
    

#### Compile and run the pipeline

In [None]:
# Compile the `pipeline` function into a job spec written to PIPELINE_PACKAGE_PATH.
pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(package_path=PIPELINE_PACKAGE_PATH, pipeline_func=pipeline)

In [None]:
#instantiate pipeline representation
pipeline_job = vertex_ai.PipelineJob(
    location=REGION,
    display_name=PIPELINE_NAME,
    template_path=PIPELINE_PACKAGE_PATH,
    pipeline_root=PIPELINE_ROOT,
    enable_caching=True)

In [None]:
# Run the pipeline job; sync=True requests a blocking call rather than returning immediately.
pipeline_job.run(sync=True)