In [1]:
REGION = "us-central1"
PROJECT_ID = !(gcloud config get-value project)
PROJECT_ID = PROJECT_ID[0]

DATASET = "movielens"
FOLDER = "movielens"
PIPELINE_JSON = f"{FOLDER}/{DATASET}_kfp_pipeline.json"

ARTIFACT_STORE = f"gs://kfp-{DATASET}-artifact-store-{PROJECT_ID}"
PIPELINE_ROOT = f"{ARTIFACT_STORE}/pipeline"
DATA_ROOT = f"{ARTIFACT_STORE}/data"
JOB_DIR_ROOT = f"{ARTIFACT_STORE}/jobs"
TRAINING_FILE_PATH = f"{DATA_ROOT}/training/dataset.csv"
VALIDATION_FILE_PATH = f"{DATA_ROOT}/validation/dataset.csv"

In [2]:
# Name and tag of the custom trainer image; the full URI places it in this
# project's gcr.io registry.
IMAGE_NAME = "trainer_image_movielens"
TAG = "latest"
TRAINING_CONTAINER_IMAGE_URI = f"gcr.io/{PROJECT_ID}/{IMAGE_NAME}:{TAG}"
# Bare last expression so the notebook echoes the resolved URI.
TRAINING_CONTAINER_IMAGE_URI

'gcr.io/qwiklabs-gcp-04-853e5675f5e8/trainer_image_movielens:latest'

In [3]:
# Pre-built Vertex AI prediction container used to serve the uploaded model
# (per its name: TensorFlow 2, CPU, version 2-3). List of available images:
# https://cloud.google.com/vertex-ai/docs/predictions/pre-built-containers
SERVING_CONTAINER_IMAGE_URI = (
    "us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-3:latest"
)

In [None]:
# Export the notebook settings as environment variables: the scripts written
# below read them via os.environ / os.getenv, and `!` shell commands can use
# them as $VARS.
%env DATASET={DATASET}
%env PIPELINE_ROOT={PIPELINE_ROOT}
%env PROJECT_ID={PROJECT_ID}
%env REGION={REGION}
%env JOB_DIR_ROOT={JOB_DIR_ROOT}
%env TRAINING_FILE_PATH={TRAINING_FILE_PATH}
%env VALIDATION_FILE_PATH={VALIDATION_FILE_PATH}
%env SERVING_CONTAINER_IMAGE_URI={SERVING_CONTAINER_IMAGE_URI}
%env TRAINING_CONTAINER_IMAGE_URI={TRAINING_CONTAINER_IMAGE_URI}

# Set `PATH` to include the directory containing KFP CLI
# NOTE: not idempotent - every re-run of this cell prepends the directory again.
PATH = %env PATH
%env PATH=/home/jupyter/.local/bin:{PATH}

In [5]:
import os

# Create the local folder the %%writefile cells below write into; no error if
# it already exists.
os.makedirs(FOLDER, exist_ok=True)

# Dataset

## Import dataset

In [6]:
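# NOTE: This cell is intentionally disabled. It is left over from the Covertype
# version of this lab and loads the Covertype table, not MovieLens. The cells
# below expect a `movielens.ratings` BigQuery table to exist already.
# %%bash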

# DATASET_LOCATION=US
# DATASET_ID=covertype_dataset
# TABLE_ID=covertype
# DATA_SOURCE=gs://workshop-datasets/covertype/small/dataset.csv
# SCHEMA=Elevation:INTEGER,\
# Aspect:INTEGER,\
# Slope:INTEGER,\
# Horizontal_Distance_To_Hydrology:INTEGER,\
# Vertical_Distance_To_Hydrology:INTEGER,\
# Horizontal_Distance_To_Roadways:INTEGER,\
# Hillshade_9am:INTEGER,\
# Hillshade_Noon:INTEGER,\
# Hillshade_3pm:INTEGER,\
# Horizontal_Distance_To_Fire_Points:INTEGER,\
# Wilderness_Area:STRING,\
# Soil_Type:STRING,\
# Cover_Type:INTEGER

# bq --location=$DATASET_LOCATION --project_id=$PROJECT_ID mk --dataset $DATASET_ID

# bq --project_id=$PROJECT_ID --dataset_id=$DATASET_ID load \
# --source_format=CSV \
# --skip_leading_rows=1 \
# --replace \
# $TABLE_ID \
# $DATA_SOURCE \
# $SCHEMA

In [7]:
# Create the artifact-store bucket in REGION only when `gsutil ls` does not
# already list it.
!gsutil ls | grep ^{ARTIFACT_STORE}/$ || gsutil mb -l {REGION} {ARTIFACT_STORE}

Creating gs://kfp-movielens-artifact-store-qwiklabs-gcp-04-853e5675f5e8/...


## Train, Valid split

In [8]:
# TODO: create data prep KFP component

In [9]:
# CREATE TRAIN DATASET
!bq query \
-n 0 \
--destination_table movielens.training \
--replace \
--use_legacy_sql=false \
'SELECT * \
FROM `movielens.ratings` AS table \
WHERE \
MOD(ABS(FARM_FINGERPRINT(TO_JSON_STRING(table))), 10) IN (1, 2, 3, 4)' 

!bq extract \
--destination_format CSV \
movielens.training \
$TRAINING_FILE_PATH

Waiting on bqjob_r31f9b8f23bb02de1_0000017f963a7106_1 ... (23s) Current status: DONE   
Waiting on bqjob_r233385397fec6cca_0000017f963ad6cf_1 ... (23s) Current status: DONE   


In [10]:
# CREATE VALID DATASET
# Rows whose fingerprint bucket (0-9) equals 8 form the validation split. The
# result replaces $DATASET.validation and is then exported as CSV to
# $VALIDATION_FILE_PATH. `-n 0` prints no result rows.
!bq query \
-n 0 \
--destination_table $DATASET.validation \
--replace \
--use_legacy_sql=false \
'SELECT * \
FROM `movielens.ratings` AS table \
WHERE \
MOD(ABS(FARM_FINGERPRINT(TO_JSON_STRING(table))), 10) IN (8)' 

!bq extract \
--destination_format CSV \
$DATASET.validation \
$VALIDATION_FILE_PATH

Waiting on bqjob_r6a758eb02b9deff9_0000017f963b3bcb_1 ... (15s) Current status: DONE   
Waiting on bqjob_r43293434106e0950_0000017f963b813f_1 ... (6s) Current status: DONE   


In [11]:
import pandas as pd

# Sanity check: read both exported splits back from Cloud Storage (training
# first, then validation) and report their shapes next to the source paths.
df_train, df_validation = (
    pd.read_csv(csv_path)
    for csv_path in (TRAINING_FILE_PATH, VALIDATION_FILE_PATH)
)
print(df_train.shape, f"Training path: {TRAINING_FILE_PATH}\n")
print(df_validation.shape, f"Validation path: {VALIDATION_FILE_PATH}")

(7996867, 4) Training path: gs://kfp-movielens-artifact-store-qwiklabs-gcp-04-853e5675f5e8/data/training/dataset.csv

(2002053, 4) Validation path: gs://kfp-movielens-artifact-store-qwiklabs-gcp-04-853e5675f5e8/data/validation/dataset.csv


# Training and Tuning

## Train model image

In [12]:
%%writefile {FOLDER}/train.py

"""Trainer script."""
import os
import pickle
import subprocess
import sys

import fire
import hypertune
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Destination for the trained model, read from the environment at import time
# (raises KeyError when AIP_MODEL_DIR is unset).
AIP_MODEL_DIR = os.environ["AIP_MODEL_DIR"]
# Local file the fitted pipeline is pickled to before being copied to
# AIP_MODEL_DIR.
MODEL_FILENAME = "model.pkl"


def train_evaluate(
    training_dataset_path, validation_dataset_path, alpha, max_iter, hptune
):
    """Trains the SGD classifier pipeline and either scores or exports it.

    NOTE(review): the feature and label columns below are the Covertype
    schema inherited from the original lab. The MovieLens CSVs exported by
    this notebook do not appear to contain them; this function now fails
    fast with a clear message until the schema is adapted.

    Args:
        training_dataset_path: CSV path/URI of the training split.
        validation_dataset_path: CSV path/URI of the validation split.
        alpha: Regularization strength passed to SGDClassifier.
        max_iter: Maximum number of passes passed to SGDClassifier.
        hptune: When true, fit on the training split only and report the
            validation accuracy to hypertune. When false, fit on training +
            validation and copy the pickled pipeline to AIP_MODEL_DIR.

    Raises:
        ValueError: If either CSV is missing a required column.
    """
    label_column = "Cover_Type"

    numeric_features = [
        "Elevation",
        "Aspect",
        "Slope",
        "Horizontal_Distance_To_Hydrology",
        "Vertical_Distance_To_Hydrology",
        "Horizontal_Distance_To_Roadways",
        "Hillshade_9am",
        "Hillshade_Noon",
        "Hillshade_3pm",
        "Horizontal_Distance_To_Fire_Points",
    ]

    categorical_features = ["Wilderness_Area", "Soil_Type"]

    df_train = pd.read_csv(training_dataset_path)
    df_validation = pd.read_csv(validation_dataset_path)

    # Validate the schema up front: without this, a wrong dataset only fails
    # later inside pandas/scikit-learn with an error that hides the cause.
    required_columns = numeric_features + categorical_features + [label_column]
    for split_name, frame in (
        ("training", df_train),
        ("validation", df_validation),
    ):
        missing = [col for col in required_columns if col not in frame.columns]
        if missing:
            raise ValueError(
                f"The {split_name} dataset is missing required columns: "
                f"{missing}; found columns: {list(frame.columns)}"
            )

    if not hptune:
        # Final training run: use every labelled row available.
        df_train = pd.concat([df_train, df_validation])

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), numeric_features),
            ("cat", OneHotEncoder(), categorical_features),
        ]
    )

    pipeline = Pipeline(
        [
            ("preprocessor", preprocessor),
            ("classifier", SGDClassifier(loss="log")),
        ]
    )

    # Cast numeric features to float64 before they reach StandardScaler.
    num_features_type_map = {feature: "float64" for feature in numeric_features}
    df_train = df_train.astype(num_features_type_map)
    df_validation = df_validation.astype(num_features_type_map)

    print(f"Starting training: alpha={alpha}, max_iter={max_iter}")
    # pylint: disable-next=invalid-name
    X_train = df_train.drop(label_column, axis=1)
    y_train = df_train[label_column]

    pipeline.set_params(classifier__alpha=alpha, classifier__max_iter=max_iter)
    pipeline.fit(X_train, y_train)

    if hptune:
        # pylint: disable-next=invalid-name
        X_validation = df_validation.drop(label_column, axis=1)
        y_validation = df_validation[label_column]
        accuracy = pipeline.score(X_validation, y_validation)
        print(f"Model accuracy: {accuracy}")
        # Log it with hypertune
        hpt = hypertune.HyperTune()
        hpt.report_hyperparameter_tuning_metric(
            hyperparameter_metric_tag="accuracy", metric_value=accuracy
        )

    # Save the model
    if not hptune:
        with open(MODEL_FILENAME, "wb") as model_file:
            pickle.dump(pipeline, model_file)
        subprocess.check_call(
            ["gsutil", "cp", MODEL_FILENAME, AIP_MODEL_DIR], stderr=sys.stdout
        )
        print(f"Saved model in: {AIP_MODEL_DIR}")

if __name__ == "__main__":
    # Expose train_evaluate's parameters as command-line flags via python-fire.
    fire.Fire(train_evaluate)

Writing movielens/train.py


In [19]:
%%writefile {FOLDER}/Dockerfile
# Trainer image: CPU base image plus the libraries train.py imports.
FROM gcr.io/deeplearning-platform-release/base-cpu
# NOTE(review): scikit-learn and pandas are pinned to old releases - confirm
# they still install on the current base image.
RUN pip install -U fire cloudml-hypertune scikit-learn==0.20.4 pandas==0.24.2
WORKDIR /app
COPY train.py .

# Arguments supplied to the container are passed on to train.py.
ENTRYPOINT ["python", "train.py"]

Overwriting movielens/Dockerfile


In [None]:
# Build the trainer image from FOLDER with Cloud Build and tag it as
# TRAINING_CONTAINER_IMAGE_URI.
!gcloud builds submit --timeout 15m --tag $TRAINING_CONTAINER_IMAGE_URI $FOLDER

## Components

In [22]:
%%writefile {FOLDER}/training_lightweight_component.py

"""Lightweight component training function."""
from kfp.v2.dsl import component
import os

# Both values are read at import (pipeline compile) time and raise KeyError
# when the variable is unset.
# DATASET is used below when naming the generated component YAML files.
DATASET = os.environ["DATASET"]
# NOTE(review): not referenced by the components in this module, which take
# the serving image as a `serving_container_uri` argument instead.
SERVING_CONTAINER_IMAGE_URI = os.environ["SERVING_CONTAINER_IMAGE_URI"]

@component(
    base_image="python:3.8",
    output_component_file= f"{DATASET}_kfp_train_and_deploy.yaml",
    packages_to_install=["google-cloud-aiplatform"],
)
def train_and_deploy(
    project: str,
    location: str,
    container_uri: str,
    serving_container_uri: str,
    training_file_path: str,
    validation_file_path: str,
    staging_bucket: str,
    alpha: float,
    max_iter: int,
    display_name_prefix: str = "movielens",
):
    """Runs the final training job on Vertex AI and deploys the resulting model.

    Only this function's source is shipped to the component container, so the
    body must not reference module-level names such as DATASET (doing so
    raises NameError at run time). The model name is therefore built from
    `display_name_prefix`, whose default has to stay a plain literal because
    the signature is shipped verbatim as well.

    Args:
        project: Google Cloud project id.
        location: Vertex AI region.
        container_uri: Trainer image that contains train.py.
        serving_container_uri: Prediction image attached to the trained model.
        training_file_path: CSV URI of the training split.
        validation_file_path: CSV URI of the validation split.
        staging_bucket: GCS location used by Vertex AI for staging.
        alpha: Value forwarded to train.py as --alpha.
        max_iter: Value forwarded to train.py as --max_iter.
        display_name_prefix: Prefix of the registered model's display name
            (`<prefix>_kfp_model`).
    """
    # pylint: disable-next=import-outside-toplevel
    from google.cloud import aiplatform

    aiplatform.init(
        project=project, location=location, staging_bucket=staging_bucket
    )
    job = aiplatform.CustomContainerTrainingJob(
        display_name="model_training",
        container_uri=container_uri,
        command=[
            "python",
            "train.py",
            f"--training_dataset_path={training_file_path}",
            f"--validation_dataset_path={validation_file_path}",
            f"--alpha={alpha}",
            f"--max_iter={max_iter}",
            # Final run: train on all data and export the model.
            "--nohptune",
        ],
        staging_bucket=staging_bucket,
        model_serving_container_image_uri=serving_container_uri,
    )
    model = job.run(
        replica_count=1,
        model_display_name=f"{display_name_prefix}_kfp_model",
    )
    endpoint = model.deploy(  # pylint: disable=unused-variable
        traffic_split={"0": 100},
        machine_type="n1-standard-2",
    )
    

@component(
    base_image="python:3.8",
    # The body only imports google-cloud-aiplatform; the previously listed
    # extras were unused, and `sklearn` is a deprecated PyPI alias.
    packages_to_install=["google-cloud-aiplatform"],
    output_component_file= f"{DATASET}_kfp_deploy.yaml",
)
def deploy(
    project: str,
    location: str,
    serving_container_uri: str,
    display_name:str,
    artifact_uri:str, 
):
    """Uploads an existing model artifact to Vertex AI and deploys it.

    Args:
        project: Google Cloud project id.
        location: Vertex AI region in which the model and endpoint are created.
        serving_container_uri: Prediction container image for the model.
        display_name: Display name of the uploaded model.
        artifact_uri: GCS directory that holds the saved model artifacts.
    """
    # pylint: disable-next=import-outside-toplevel
    from google.cloud import aiplatform

    # Pass `location` through; previously it was accepted but ignored, so the
    # SDK's default region was used regardless of the argument.
    aiplatform.init(project=project, location=location)

    uploaded_model = aiplatform.Model.upload(
        display_name=display_name,
        artifact_uri=artifact_uri,
        serving_container_image_uri=serving_container_uri,
    )
    endpoint = uploaded_model.deploy(  # pylint: disable=unused-variable
        traffic_split={"0": 100},
        machine_type="n1-standard-4",
    )
    # TODO: expose the results as component outputs, e.g.
    # vertex_endpoint.uri = endpoint.resource_name
    # vertex_model.uri = uploaded_model.resource_name

Overwriting movielens/training_lightweight_component.py


In [23]:
%%writefile {FOLDER}/tuning_lightweight_component.py


"""Lightweight component tuning function."""
from typing import NamedTuple
from kfp.v2.dsl import component
import os

# Read at import (pipeline compile) time; raises KeyError when DATASET is
# unset. Used below when naming the generated component YAML file.
DATASET = os.environ["DATASET"] 

@component(
    base_image="python:3.8",
    output_component_file=f"{DATASET}_kfp_tune_hyperparameters.yaml",
    packages_to_install=["google-cloud-aiplatform"],
)
def tune_hyperparameters(
    project: str,
    location: str,
    container_uri: str,
    training_file_path: str,
    validation_file_path: str,
    staging_bucket: str,
    max_trial_count: int,
    parallel_trial_count: int,
    display_name_prefix: str = "movielens",
) -> NamedTuple(
    "Outputs",
    [("best_accuracy", float), ("best_alpha", float), ("best_max_iter", int)],
):
    """Runs a Vertex AI hyperparameter tuning job and returns the best trial.

    Only this function's source is shipped to the component container, so the
    body must not reference module-level names such as DATASET (doing so
    raises NameError at run time). Job names are therefore built from
    `display_name_prefix`, whose default has to stay a plain literal because
    the signature is shipped verbatim as well.

    Args:
        project: Google Cloud project id.
        location: Vertex AI region.
        container_uri: Trainer image run for every trial.
        training_file_path: CSV URI of the training split.
        validation_file_path: CSV URI of the validation split.
        staging_bucket: GCS location used by Vertex AI for staging.
        max_trial_count: Total number of trials to run.
        parallel_trial_count: Number of trials run concurrently.
        display_name_prefix: Prefix for the trial and tuning job display names.

    Returns:
        (best_accuracy, best_alpha, best_max_iter) of the trial with the
        highest reported accuracy.

    Raises:
        RuntimeError: If no trial reported a final metric.
    """

    # pylint: disable=import-outside-toplevel
    from google.cloud import aiplatform
    from google.cloud.aiplatform import hyperparameter_tuning as hpt

    aiplatform.init(
        project=project, location=location, staging_bucket=staging_bucket
    )

    worker_pool_specs = [
        {
            "machine_spec": {
                "machine_type": "n1-standard-4",
                # NOTE(review): the trainer image is built FROM a CPU base
                # image, so this GPU is presumably unused - confirm and drop.
                "accelerator_type": "NVIDIA_TESLA_K80",
                "accelerator_count": 1,
            },
            "replica_count": 1,
            "container_spec": {
                "image_uri": container_uri,
                "args": [
                    f"--training_dataset_path={training_file_path}",
                    f"--validation_dataset_path={validation_file_path}",
                    "--hptune",
                ],
            },
        }
    ]

    custom_job = aiplatform.CustomJob(
        display_name=f"{display_name_prefix}_kfp_trial_job",
        worker_pool_specs=worker_pool_specs,
    )

    hp_job = aiplatform.HyperparameterTuningJob(
        display_name=f"{display_name_prefix}_kfp_tuning_job",
        custom_job=custom_job,
        metric_spec={
            "accuracy": "maximize",
        },
        parameter_spec={
            "alpha": hpt.DoubleParameterSpec(
                min=1.0e-4, max=1.0e-1, scale="linear"
            ),
            "max_iter": hpt.DiscreteParameterSpec(
                values=[1, 2], scale="linear"
            ),
        },
        max_trial_count=max_trial_count,
        parallel_trial_count=parallel_trial_count,
    )

    hp_job.run()

    # Trials that failed never report a final measurement; skip them instead
    # of crashing with IndexError on `metrics[0]`.
    scored_trials = [
        trial for trial in hp_job.trials if trial.final_measurement.metrics
    ]
    if not scored_trials:
        raise RuntimeError("No hyperparameter tuning trial reported a metric.")

    # max() keeps the first trial among ties.
    best_trial = max(
        scored_trials,
        key=lambda trial: trial.final_measurement.metrics[0].value,
    )
    # Look parameters up by id rather than relying on their list position.
    best_params = {
        param.parameter_id: param.value for param in best_trial.parameters
    }

    best_accuracy = float(best_trial.final_measurement.metrics[0].value)
    best_alpha = float(best_params["alpha"])
    best_max_iter = int(best_params["max_iter"])

    return best_accuracy, best_alpha, best_max_iter

Overwriting movielens/tuning_lightweight_component.py


# Pipeline

In [24]:
%%writefile {FOLDER}/pipeline.py
import os

from kfp import dsl
# change the below imports if you change the module name
from training_lightweight_component import train_and_deploy, deploy
from tuning_lightweight_component import tune_hyperparameters


# Compile-time settings pulled from the environment. Names without a fallback
# resolve to None when the variable is unset.
_env = os.environ.get

DATASET = _env("DATASET")
PIPELINE_ROOT = _env("PIPELINE_ROOT")
PROJECT_ID = _env("PROJECT_ID")
REGION = _env("REGION")

# Container images for the training job and for serving.
TRAINING_CONTAINER_IMAGE_URI = _env("TRAINING_CONTAINER_IMAGE_URI")
SERVING_CONTAINER_IMAGE_URI = _env("SERVING_CONTAINER_IMAGE_URI")

# CSV locations of the two data splits.
TRAINING_FILE_PATH = _env("TRAINING_FILE_PATH")
VALIDATION_FILE_PATH = _env("VALIDATION_FILE_PATH")

# Tuning knobs, each with a default when the variable is not set.
MAX_TRIAL_COUNT = int(_env("MAX_TRIAL_COUNT", "5"))
PARALLEL_TRIAL_COUNT = int(_env("PARALLEL_TRIAL_COUNT", "5"))
THRESHOLD = float(_env("THRESHOLD", "0.6"))


@dsl.pipeline(
    name=f"{DATASET}-kfp-pipeline",
    description=f"The pipeline training and deploying the {DATASET} model",
    pipeline_root=PIPELINE_ROOT,
)
def pipeline(
    training_container_uri: str = TRAINING_CONTAINER_IMAGE_URI,
    serving_container_uri: str = SERVING_CONTAINER_IMAGE_URI,
    training_file_path: str = TRAINING_FILE_PATH,
    validation_file_path: str = VALIDATION_FILE_PATH,
    accuracy_deployment_threshold: float = THRESHOLD,
    max_trial_count: int = MAX_TRIAL_COUNT,
    parallel_trial_count: int = PARALLEL_TRIAL_COUNT,
    pipeline_root: str = PIPELINE_ROOT,
    model_display_name: str = "movie-recommender-keras",
    model_artifact_uri: str = f"gs://{PROJECT_ID}/kfp_tf/model/5",
):
    """Deploys a previously exported model to a Vertex AI endpoint.

    The tune -> train -> conditional-deploy stages are currently disabled
    (kept below as comments), so the training/tuning parameters are accepted
    for compatibility but unused. The model to deploy is selected through
    `model_display_name` and `model_artifact_uri`, whose defaults match the
    values that were previously hard-coded in the body.
    """
    # --- Disabled: hyperparameter tuning followed by train-and-deploy -------
    # staging_bucket = f"{pipeline_root}/staging"
    #
    # tuning_op = tune_hyperparameters(
    #     project=PROJECT_ID,
    #     location=REGION,
    #     container_uri=training_container_uri,
    #     training_file_path=training_file_path,
    #     validation_file_path=validation_file_path,
    #     staging_bucket=staging_bucket,
    #     max_trial_count=max_trial_count,
    #     parallel_trial_count=parallel_trial_count,
    # )
    #
    # accuracy = tuning_op.outputs["best_accuracy"]
    #
    # with dsl.Condition(
    #     accuracy >= accuracy_deployment_threshold, name="deploy_decision"
    # ):
    #     train_and_deploy_op = (  # pylint: disable=unused-variable
    #         train_and_deploy(
    #             project=PROJECT_ID,
    #             location=REGION,
    #             container_uri=training_container_uri,
    #             serving_container_uri=serving_container_uri,
    #             training_file_path=training_file_path,
    #             validation_file_path=validation_file_path,
    #             staging_bucket=staging_bucket,
    #             alpha=tuning_op.outputs["best_alpha"],
    #             max_iter=tuning_op.outputs["best_max_iter"],
    #         )
    #     )
    # -------------------------------------------------------------------------
    deploy(
        project=PROJECT_ID,
        location=REGION,
        serving_container_uri=serving_container_uri,
        display_name=model_display_name,
        artifact_uri=model_artifact_uri,
    )


Overwriting movielens/pipeline.py


In [25]:
# compile the pipeline

In [26]:
# Compile pipeline.py into the pipeline spec file at PIPELINE_JSON.
!dsl-compile-v2 --py {FOLDER}/pipeline.py --output $PIPELINE_JSON



In [27]:
# Show the first lines of the compiled spec to confirm the file was written.
!head {PIPELINE_JSON}

{
  "pipelineSpec": {
    "components": {
      "comp-deploy": {
        "executorLabel": "exec-deploy",
        "inputDefinitions": {
          "parameters": {
            "artifact_uri": {
              "type": "STRING"
            },


# Deploy

In [None]:
from google.cloud import aiplatform

# Point the SDK at this project and region for everything created below.
aiplatform.init(project=PROJECT_ID, location=REGION)

# Build a pipeline job from the compiled spec, with step caching turned off.
pipeline = aiplatform.PipelineJob(
    display_name=f"{DATASET}_kfp_pipeline",
    template_path=PIPELINE_JSON,
    enable_caching=False,
    project=PROJECT_ID
)

# sync=False: submit the job without blocking the notebook until it finishes.
pipeline.run(sync=False)

INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/1076138843678/locations/us-central1/pipelineJobs/movielens-kfp-pipeline-20220317073851 current state:
PipelineState.PIPELINE_STATE_RUNNING
