In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# DEV302 - Goodbye, deployment headaches: Cloud Deploy and Vertex AI unite

{TODO: Update the links below.}

<table align="left">

  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/notebook_template.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/notebook_template.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/notebook_template.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      Open in Vertex AI Workbench
    </a>
  </td>                                                                                               
</table>

**_NOTE_**: This notebook has been tested in the following environment:

* Python version = 3.9

## Overview

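This notebook shows how to build and run a simple ML pipeline (scikit-learn preprocessing and XGBoost training) on Vertex AI Pipelines, store the pipeline as a template in Artifact Registry, and deploy the resulting model versions to Vertex AI endpoints.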

### Objective

In this tutorial, you learn how to build ML pipelines interactively.

This tutorial uses the following Google Cloud ML services and resources:

- Vertex AI Pipelines
- Cloud Storage

The steps performed include:

- Build a data processing component
- Build a training component
- Build a KFP ML pipeline
- Run Predictions

### Dataset

The California housing dataset contains census data of houses found in a given California district in 1990.


### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing),
and [Cloud Storage pricing](https://cloud.google.com/storage/pricing),
and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Installation

Install the following packages required to execute this notebook.

{TODO: Suggest using the latest major GA version of each package; i.e., --upgrade}

In [None]:
! pip3 install --upgrade --quiet kfp google-cloud-aiplatform google-cloud-pipeline-components

### Colab only: Uncomment the following cell to restart the kernel.

In [None]:
# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

## Before you begin

### Set up your Google Cloud project

**The following steps are required, regardless of your notebook environment.**

1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.

2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

3. [Enable APIs](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com,artifactregistry.googleapis.com).

4. If you are running this notebook locally, you need to install the [Cloud SDK](https://cloud.google.com/sdk).

#### Set your project ID

**If you don't know your project ID**, try the following:
* Run `gcloud config list`.
* Run `gcloud projects list`.
* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)

In [None]:
# Google Cloud project ID used by the gcloud/gsutil commands and SDK calls below.
# Replace the placeholder before running the cell.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

# Make this project the active one for subsequent gcloud commands.
! gcloud config set project {PROJECT_ID}

#### Region

You can also change the `REGION` variable used by Vertex AI. Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations).

In [None]:
# Region used for the bucket, the Artifact Registry repository and Vertex AI.
REGION = "us-central1"  # @param {type: "string"}

### UUID
If you are in a live tutorial session, you might be using a shared test account or project. To avoid name collisions between users on resources created, you create a uuid for each instance session, and append it onto the name of resources you create in this tutorial.

In [None]:
import random
import string


def generate_uuid(length: int = 4) -> str:
    """Return a random identifier made of lowercase letters and digits.

    Args:
        length: Number of characters to generate. Defaults to 4.

    Returns:
        A string of exactly `length` characters drawn from a-z and 0-9.
    """
    alphabet = string.ascii_lowercase + string.digits
    picked = random.choices(alphabet, k=length)
    return "".join(picked)


# Generated once per kernel session.
UUID = generate_uuid()

### Authenticate your Google Cloud account

Depending on your Jupyter environment, you may have to manually authenticate. Follow the relevant instructions below.

**1. Vertex AI Workbench**
* Do nothing as you are already authenticated.

**2. Local JupyterLab instance, uncomment and run:**

In [None]:
# ! gcloud auth login

**3. Colab, run the following cell:**

In [None]:
import sys

# The google.colab package only exists inside Colab. Guard the import so that
# running every cell in Vertex AI Workbench or a local JupyterLab does not fail
# with ModuleNotFoundError.
if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

**4. Service account or other**
* See how to grant Cloud Storage permissions to your service account at https://cloud.google.com/storage/docs/gsutil/commands/iam#ch-examples.

### Create a Cloud Storage bucket

Create a storage bucket to store intermediate artifacts such as datasets.

- *{Note to notebook author: For any user-provided strings that need to be unique (like bucket names or model ID's), append "-unique" to the end so proper testing can occur}*

In [None]:
# Cloud Storage bucket URI; the project ID is embedded in the bucket name.
BUCKET_URI = f"gs://your-bucket-name-{PROJECT_ID}-unique"  # @param {type:"string"}

**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [None]:
# Create the bucket in the chosen region and project.
! gsutil mb -l {REGION} -p {PROJECT_ID} {BUCKET_URI}

### Service Account

**If you don't know your service account**, try to get your service account using `gcloud` command by executing the second cell below.

In [None]:
# Service account email. Leave the placeholder to have the next cell look it up via gcloud.
SERVICE_ACCOUNT = "[your-service-account]"  # @param {type:"string"}

In [None]:
import os
import sys

IS_COLAB = "google.colab" in sys.modules
if (
    SERVICE_ACCOUNT == ""
    or SERVICE_ACCOUNT is None
    or SERVICE_ACCOUNT == "[your-service-account]"
):
    # Get your service account from gcloud
    if not IS_COLAB:
        shell_output = !gcloud auth list 2>/dev/null
        SERVICE_ACCOUNT = shell_output[2].replace("*", "").strip()

    if IS_COLAB:
        shell_output = ! gcloud projects describe  $PROJECT_ID
        project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
        SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"

    print("Service Account:", SERVICE_ACCOUNT)

#### Set service account access for Vertex AI Pipelines

Run the following commands to grant your service account access to read and write pipeline artifacts in the bucket that you created in the previous step, and to register pipeline templates in the Artifact Registry repository that you create in the next step -- you only need to run these once per service account.

In [None]:
# Grant the service account the storage.objectAdmin and storage.admin roles on the bucket.
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectAdmin {BUCKET_URI}

! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.admin {BUCKET_URI}

In [None]:
! gcloud projects add-iam-policy-binding {PROJECT_ID} --member=serviceAccount:{SERVICE_ACCOUNT} --role=roles/artifactregistry.admin

! gcloud projects add-iam-policy-binding {PROJECT_ID} --member=serviceAccount:{SERVICE_ACCOUNT} --role=roles/artifactregistry.repoAdmin

! gcloud projects add-iam-policy-binding {PROJECT_ID} --member=serviceAccount:{SERVICE_ACCOUNT} --role=roles/artifactregistry.reader

! gcloud projects add-iam-policy-binding {PROJECT_ID} --member=serviceAccount:{SERVICE_ACCOUNT} --role=roles/artifactregistry.reader

### Create a KFP repository in Artifact Registry

Create a repository in Artifact Registry for your pipeline templates.

In [None]:
# Name of the Artifact Registry repository that will hold the pipeline templates.
PIPELINE_TEMPLATE_REPO_NAME = (
    f"your-pipeline-repo-{PROJECT_ID}-unique"  # @param {type:"string"}
)

In [None]:
# Create a KFP-format Artifact Registry repository in the selected region.
! gcloud artifacts repositories create {PIPELINE_TEMPLATE_REPO_NAME} \
    --repository-format=kfp \
    --location={REGION} \
    --description="A repository for Vertex AI Pipelines templates"

In [None]:
# List the repositories in this project and region to confirm the new one exists.
! gcloud artifacts repositories list --project={PROJECT_ID} --location={REGION}

### Import libraries

In [None]:
import json

import google.auth
import google.auth.transport.requests
import requests
from google.cloud import aiplatform
from google_cloud_pipeline_components.types import artifact_types
from google_cloud_pipeline_components.v1.model import ModelGetOp, ModelUploadOp
from kfp import compiler, dsl
from kfp.dsl import importer_node
from kfp.registry import RegistryClient

### Set variables

In [None]:
# Pipeline name; also used as the sub-folder of the bucket for the pipeline root.
PIPELINE_NAME = "california-demo-pipeline"
PIPELINE_ROOT = f"{BUCKET_URI}/{PIPELINE_NAME}"
# Location (under the pipeline root) passed to the training component as `model_path`.
MODEL_PATH = f"{PIPELINE_ROOT}/model"
# Display name used when the pipeline uploads the model to Vertex AI.
MODEL_NAME = "california_reg_model"
# Keyword arguments forwarded to XGBRegressor for the first pipeline run.
PARAM_RUN_1 = {
    "learning_rate": 0.0001,
    "n_estimators": 4000,
    "max_depth": 20,
    "random_state": 8,
}

# Keyword arguments forwarded to XGBRegressor for the second pipeline run.
PARAM_RUN_2 = {
    "learning_rate": 0.1,
    "n_estimators": 10,
    "max_depth": 3,
    "random_state": 8,
}

# Display names given to the deployed models on their endpoints.
DEPLOYED_MODEL_NAME_1 = "california_reg_model_1"
DEPLOYED_MODEL_NAME_2 = "california_reg_model_2"

### Initialize Vertex AI SDK for Python

Initialize the Vertex AI SDK for Python for your project.

In [None]:
# Set the project, region and staging bucket used by subsequent aiplatform calls.
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)

### Create pipeline components

#### Data processing component

In [None]:
@dsl.component(
    base_image="python:3.7",
    packages_to_install=["numpy==1.18.5", "pandas==1.0.4", "scikit-learn==0.23.1"],
)
def data_preprocessing_op(processed_dataset: dsl.Output[dsl.Dataset]):
    """Fetch the California housing data, impute and standardize the features, and write a CSV.

    Args:
        processed_dataset: Output dataset artifact. A directory is created at its
            path, the CSV is written inside it, and the artifact path is then
            repointed to the CSV file.
    """
    from pathlib import Path

    import pandas as pd
    from sklearn.datasets import fetch_california_housing
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import StandardScaler

    target_column = "MedHouseVal"

    raw_frame = fetch_california_housing(as_frame=True)["frame"]
    features = raw_frame.drop(target_column, axis=1)
    target = raw_frame[[target_column]]

    # Impute first, then scale; the target column is left untouched.
    imputed = SimpleImputer().fit_transform(features)
    scaled = StandardScaler().fit_transform(imputed)

    # The transformers return plain arrays, so restore the column names before
    # re-attaching the target on the shared row index.
    scaled_features = pd.DataFrame(scaled, columns=features.columns)
    processed_frame = pd.merge(
        scaled_features, target, left_index=True, right_index=True
    )

    output_dir = Path(processed_dataset.path)
    output_dir.mkdir(exist_ok=True, parents=True)
    csv_path = str(output_dir / "processed_dataset.csv")
    processed_frame.to_csv(csv_path, index=False)
    processed_dataset.path = csv_path

#### Training component

In [None]:
@dsl.component(
    base_image="python:3.7",
    packages_to_install=[
        "numpy==1.18.5",
        "pandas==1.0.4",
        "scikit-learn==0.23.1",
        "xgboost==1.1.1",
    ],
)
def training_op(
    params: dict,
    model_path: str,
    processed_dataset: dsl.Input[dsl.Dataset],
    trained_model: dsl.Output[dsl.Model],
    metrics: dsl.Output[dsl.Metrics],
):
    """Train an XGBoost regressor on the processed dataset, log its RMSE and save the model.

    Args:
        params: Keyword arguments for XGBRegressor; when empty, defaults are used.
        model_path: Destination for the model file; a leading "gs://" is rewritten to "/gcs/".
        processed_dataset: Input dataset artifact whose path points at the processed CSV.
        trained_model: Output model artifact; its path is set to the saved model file.
        metrics: Output metrics artifact that receives the "rmse" value.
    """
    from pathlib import Path

    import numpy as np
    import pandas as pd
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import train_test_split
    from xgboost import XGBRegressor

    with open(processed_dataset.path, "r") as dataset_file:
        dataset = pd.read_csv(dataset_file)

    target_column = "MedHouseVal"
    features = dataset.drop(target_column, axis=1)
    target = dataset[target_column]

    # Hold out 25% of the rows for evaluation, using a fixed split seed.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.25, random_state=0
    )

    regressor = XGBRegressor(**params) if params else XGBRegressor()
    regressor = regressor.fit(X_train, y_train)

    predictions = regressor.predict(X_test)
    rmse = round(np.sqrt(mean_squared_error(y_test, predictions)), 3)
    metrics.log_metric("rmse", rmse)

    # Rewrite the gs:// URI into a /gcs/ filesystem path so the model can be
    # written with ordinary file APIs (presumably the Cloud Storage mount inside
    # the pipeline container -- confirm).
    model_dir = Path(model_path.replace("gs://", "/gcs/"))
    model_dir.mkdir(exist_ok=True, parents=True)
    model_file = str(model_dir / "model.bst")
    regressor.save_model(model_file)
    trained_model.path = model_file

### Build the pipeline

In [None]:
# Pipeline definition: preprocess the data, train the model, import the trained
# model artifact, then upload it to Vertex AI either as a new model ("champion"
# branch) or as a new version of an existing model ("challenger" branch).
@dsl.pipeline(
    name=PIPELINE_NAME,
)
def pipeline(
    params: dict = PARAM_RUN_1, model_path: str = MODEL_PATH, model_name: str = "None"
):

    """A demo pipeline."""

    preprocessing_data_task = data_preprocessing_op()

    # Training consumes the dataset produced by the preprocessing task.
    training_task = training_op(
        params=params,
        model_path=model_path,
        processed_dataset=preprocessing_data_task.outputs["processed_dataset"],
    ).after(preprocessing_data_task)

    # The string "None" is the sentinel default for `model_name`: when it is left
    # unchanged, no parent model is supplied and the model is uploaded on its own.
    with dsl.If(model_name == "None", name="champion"):

        # Wrap the model location and the serving container image into an
        # UnmanagedContainerModel artifact that ModelUploadOp accepts.
        model_importer_task = importer_node.importer(
            artifact_uri=model_path,
            artifact_class=artifact_types.UnmanagedContainerModel,
            metadata={
                "containerSpec": {
                    "imageUri": "us-docker.pkg.dev/vertex-ai/prediction/xgboost-cpu.1-1:latest"
                }
            },
        ).after(training_task)

        # Upload without a parent model, tagged with the "v1" alias.
        model_upload_op = ModelUploadOp(
            display_name=MODEL_NAME,
            unmanaged_container_model=model_importer_task.outputs["artifact"],
            version_aliases=["v1"],
            description="A simple version of the model",
        ).after(model_importer_task)

    # Any other `model_name` value is looked up and used as the parent model.
    with dsl.Else(name="challenger"):

        model_importer_task = importer_node.importer(
            artifact_uri=model_path,
            artifact_class=artifact_types.UnmanagedContainerModel,
            metadata={
                "containerSpec": {
                    "imageUri": "us-docker.pkg.dev/vertex-ai/prediction/xgboost-cpu.1-1:latest"
                }
            },
        ).after(training_task)

        # Fetch the existing model so it can be passed as `parent_model` below.
        get_model_task = ModelGetOp(model_name=model_name).after(model_importer_task)

        # Upload under the fetched parent model, tagged with the "v2" alias.
        model_upload_op = ModelUploadOp(
            display_name=MODEL_NAME,
            unmanaged_container_model=model_importer_task.outputs["artifact"],
            parent_model=get_model_task.outputs["model"],
            version_aliases=["v2"],
            description="A tuned version of the model",
        ).after(get_model_task)

### Compile the pipeline

In [None]:
# Compile the pipeline function into a local pipeline.yaml definition file.
compiler.Compiler().compile(pipeline_func=pipeline, package_path="pipeline.yaml")

### Upload the pipeline template

In [None]:
# Client for the KFP repository, addressed by region, project and repository name.
client = RegistryClient(
    host=f"https://{REGION}-kfp.pkg.dev/{PROJECT_ID}/{PIPELINE_TEMPLATE_REPO_NAME}"
)

In [None]:
# Upload the compiled pipeline and tag it "latest"; the run cells below reference
# the template through that tag. The call returns two values (presumably the
# template name and the version name -- confirm against the kfp registry docs).
xgb_pipeline_template, xgb_template_version = client.upload_pipeline(
    file_name="pipeline.yaml",
    tags=["latest"],
    extra_headers={
        "description": "This is a Xgboost pipeline template for housing project"
    },
)

In [None]:
# List the packages in the repository, then fetch and print the one named after the pipeline.
pipeline_templates = client.list_packages()
pipeline_template = client.get_package(package_name=PIPELINE_NAME)
print(pipeline_template)

### Run the pipeline for training the v1 of the model

In [None]:
# First run: uses the template tagged "latest" and the first parameter set.
# `model_name` is not passed, so the pipeline default ("None") applies.
job = aiplatform.PipelineJob(
    display_name="california-demo-pipeline",
    template_path=f"https://{REGION}-kfp.pkg.dev/{PROJECT_ID}/{PIPELINE_TEMPLATE_REPO_NAME}/{PIPELINE_NAME}/latest",
    parameter_values={
        "params": PARAM_RUN_1,
        "model_path": MODEL_PATH,
    },
    # Disable caching so every step is executed on this run.
    enable_caching=False,
)

job.run()

### Run the pipeline for training the v2 of the model

In [None]:
# Find the models with the pipeline's display name, ordered by creation time,
# and take the last entry (assumes ascending order, i.e. the most recently
# created model -- confirm).
model_list = aiplatform.Model.list(
    filter=f"display_name={MODEL_NAME}", order_by="create_time"
)
# Resource name is used later to list the model's versions.
model_resource_name = model_list[-1].resource_name
# Passed to the second pipeline run as its `model_name` parameter.
model_name = model_list[-1].name

In [None]:
# Second run: same template with the second parameter set. Passing `model_name`
# selects the pipeline's "challenger" branch, which uploads under that parent model.
job = aiplatform.PipelineJob(
    display_name="california-demo-pipeline",
    template_path=f"https://{REGION}-kfp.pkg.dev/{PROJECT_ID}/{PIPELINE_TEMPLATE_REPO_NAME}/{PIPELINE_NAME}/latest",
    parameter_values={
        "params": PARAM_RUN_2,
        "model_path": MODEL_PATH,
        "model_name": model_name,
    },
    # Disable caching so every step is executed on this run.
    enable_caching=False,
)

job.run()

### Generate predictions

In [None]:
# Load the default credentials and refresh them so that `credentials.token`
# is populated for the REST calls below.
credentials, _ = google.auth.default()
authentication = google.auth.transport.requests.Request()
credentials.refresh(authentication)

In [None]:
# HTTP headers for the prediction requests: bearer token plus JSON content type.
headers = {
    "Authorization": "Bearer " + credentials.token,
    "Content-Type": "application/json",
}

# One instance with eight feature values (presumably already standardized and in
# the training column order -- confirm against the preprocessing component).
prediction_data = {
    "instances": [
        [
            2.34476576,
            0.98214266,
            0.62855945,
            -0.15375759,
            -0.9744286,
            -0.04959654,
            1.05254828,
            -1.32783522,
        ]
    ]
}

# Serialize the payload once; both prediction requests reuse it.
data = json.dumps(prediction_data).encode("utf-8")

#### Model 1

In [None]:
# Despite its name, `model_registry` holds the list of versions of the model.
model_registry = aiplatform.Model(
    model_resource_name
).versioning_registry.list_versions()

In [None]:
# The version entry's `model_resource_name` does not include the version, so the
# version id must be passed explicitly; otherwise the model's default version is
# loaded. Assumes the first listed entry is the first uploaded version -- confirm.
model_1 = aiplatform.Model(
    model_name=model_registry[0].model_resource_name,
    version=model_registry[0].version_id,
)

In [None]:
# Create the endpoint that will serve the first model.
endpoint_1 = aiplatform.Endpoint.create(
    display_name="endpoint_1",
)

In [None]:
# Deploy the first model to its endpoint on an n1-standard-4 machine.
model_1.deploy(
    endpoint=endpoint_1,
    deployed_model_display_name=DEPLOYED_MODEL_NAME_1,
    machine_type="n1-standard-4",
)

In [None]:
# Call the endpoint's REST predict method. The API host must use the same region
# as the endpoint, so it is built from REGION instead of being hard-coded.
response = requests.post(
    f"https://{REGION}-aiplatform.googleapis.com/v1/projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_1.name}:predict",
    headers=headers,
    data=data,
)

In [None]:
# Show the raw response body returned by the first endpoint.
print(response.text)

#### Model 2

In [None]:
# The version entry's `model_resource_name` does not include the version, so the
# version id must be passed explicitly; otherwise the model's default version is
# loaded. Assumes the last listed entry is the latest uploaded version -- confirm.
model_2 = aiplatform.Model(
    model_name=model_registry[-1].model_resource_name,
    version=model_registry[-1].version_id,
)

In [None]:
# Create the endpoint that will serve the second model.
endpoint_2 = aiplatform.Endpoint.create(
    display_name="endpoint_2",
)

In [None]:
# Deploy the second model to its endpoint on an n1-standard-4 machine.
model_2.deploy(
    endpoint=endpoint_2,
    deployed_model_display_name=DEPLOYED_MODEL_NAME_2,
    machine_type="n1-standard-4",
)

In [None]:
# Call the endpoint's REST predict method. The API host must use the same region
# as the endpoint, so it is built from REGION instead of being hard-coded.
response = requests.post(
    f"https://{REGION}-aiplatform.googleapis.com/v1/projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_2.name}:predict",
    headers=headers,
    data=data,
)

In [None]:
# Show the raw response body returned by the second endpoint.
print(response.text)

## Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial.

In [None]:

delete_pipelines = True
if delete_pipelines or os.getenv("IS_TESTING"):
    pipelines = aiplatform.PipelineJob.list()
    for pipeline in pipelines:
        pipeline.delete()

delete_endpoints = False
if delete_endpoints or os.getenv("IS_TESTING"):
    endpoints = aiplatform.Endpoint.list()
    for endpoint in endpoints:
        endpoint.delete(force=True)

delete_models = False
if delete_models or os.getenv("IS_TESTING"):
    models = aiplatform.Model.list()
    for model in models:
        model.delete()

delete_pipeline_templates = True
if delete_pipeline_templates or os.getenv("IS_TESTING"):
    pipeline_templates = client.list_packages()
    for pipeline_template in pipeline_templates:
        _ = client.delete_package(pipeline_template["name"].split("/")[-1])

delete_artifact_repo = True
if delete_artifact_repo or os.getenv("IS_TESTING"):
    ! gcloud artifacts repositories delete {PIPELINE_TEMPLATE_REPO_NAME}

delete_bucket = True
if delete_bucket or os.getenv("IS_TESTING"):
    ! gsutil -m rm -r $BUCKET_URI