In [1]:
# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# Orchestrating model training and deployment with Kubeflow Pipelines (KFP) and Cloud AI Platform

In this lab, you will develop, deploy, and run a KFP pipeline that orchestrates BigQuery and Cloud AI Platform services to train a scikit-learn model.

The pipeline you develop in the lab orchestrates GCP managed services. The source data is in BigQuery. The pipeline uses:
- Pre-built components. The pipeline uses the following pre-built components that are included with the KFP distribution:
    - [BigQuery query component](https://github.com/kubeflow/pipelines/tree/0.1.36/components/gcp/bigquery/query)
    - [AI Platform Training component](https://github.com/kubeflow/pipelines/tree/0.1.36/components/gcp/ml_engine/train)
    - [AI Platform Deploy component](https://github.com/kubeflow/pipelines/tree/0.1.36/components/gcp/ml_engine/deploy)
- Custom components. The pipeline uses two custom helper components that encapsulate functionality not available in any of the pre-built components. The components are implemented using the KFP SDK's [Lightweight Python Components](https://www.kubeflow.org/docs/pipelines/sdk/lightweight-python-components/) mechanism. The code for the components is in the `helper_components.py` file:
    - **Retrieve Best Run**. This component retrieves the tuning metric and hyperparameter values for the best run of the AI Platform Training hyperparameter tuning job.
    - **Evaluate Model**. This component evaluates the *sklearn* trained model using a provided metric and a testing dataset. 

## Lab dataset
This lab uses the [Covertype Data Set](https://archive.ics.uci.edu/ml/datasets/covertype). The pipeline developed in the lab sources the dataset from BigQuery. Before proceeding with the lab, upload the dataset to BigQuery:

1. Open a new terminal in your **JupyterLab**

2. Create the BigQuery dataset and upload the Cover Type csv file.

```
export PROJECT_ID=$(gcloud config get-value core/project)

DATASET_LOCATION=US
DATASET_ID=covertype_dataset
TABLE_ID=covertype
DATA_SOURCE=gs://workshop-datasets/covertype/full/dataset.csv
SCHEMA=Elevation:INTEGER,\
Aspect:INTEGER,\
Slope:INTEGER,\
Horizontal_Distance_To_Hydrology:INTEGER,\
Vertical_Distance_To_Hydrology:INTEGER,\
Horizontal_Distance_To_Roadways:INTEGER,\
Hillshade_9am:INTEGER,\
Hillshade_Noon:INTEGER,\
Hillshade_3pm:INTEGER,\
Horizontal_Distance_To_Fire_Points:INTEGER,\
Wilderness_Area:STRING,\
Soil_Type:STRING,\
Cover_Type:INTEGER

bq --location=$DATASET_LOCATION --project_id=$PROJECT_ID mk --dataset $DATASET_ID

bq --project_id=$PROJECT_ID --dataset_id=$DATASET_ID load \
--source_format=CSV \
--skip_leading_rows=1 \
--replace \
$TABLE_ID \
$DATA_SOURCE \
$SCHEMA
```

In [1]:
import subprocess

import kfp
import kfp.gcp as gcp
import kfp.dsl as dsl
import kfp.compiler as compiler
import kfp.components as comp
from kfp.dsl import types
import datetime

import kubernetes as k8s

from jinja2 import Template

In [2]:
import logging
# Configure the root logger so that messages at INFO level and above are emitted.
logging.basicConfig(level=logging.INFO)

In [3]:
import os

# Point the Google client libraries at a service-account key file.
# `setdefault` keeps a GOOGLE_APPLICATION_CREDENTIALS value that is already
# exported in the environment, so the notebook also runs on machines where the
# fallback path below does not exist.
# NOTE(review): the fallback is an absolute path on one developer's machine;
# export GOOGLE_APPLICATION_CREDENTIALS before starting Jupyter instead of
# relying on it.
os.environ.setdefault(
    'GOOGLE_APPLICATION_CREDENTIALS',
    '/Users/luoshixin/LocalDevelop/kubeflow-pipeline/kubeflow-pipeline/kubeflow-pipeline-fantasy.json')

In [4]:
# Required Parameters
# PROJECT_ID: GCP project id; used in this notebook to build the container image
#   URIs and passed to the pipeline as `project_id`.
# GCS_BUCKET: bucket URI from which the staging path (GCS_STAGING_PATH) is derived.
PROJECT_ID='kubeflow-pipeline-fantasy'
GCS_BUCKET='gs://kubeflow-pipeline-ui'

In [5]:
# Environment-level settings derived from the required parameters above.
NAMESPACE = 'kubeflow'
AIP_REGION = 'us-central1'
AIP_ZONE = 'us-central1-a'
PREFIX = PROJECT_ID
# Root GCS location handed to the pipeline as `gcs_root`.
GCS_STAGING_PATH = GCS_BUCKET + '/staging'

## Create client

If you run this notebook **outside** of a Kubeflow cluster, run the following command:
- `host`: The URL of your Kubeflow Pipelines instance, for example "https://`<your-deployment>`.endpoints.`<your-project>`.cloud.goog/pipeline"
- `client_id`: The client ID used by Identity-Aware Proxy
- `other_client_id`: The client ID used to obtain the auth codes and refresh tokens.
- `other_client_secret`: The client secret used to obtain the auth codes and refresh tokens.

```python
client = kfp.Client(host, client_id, other_client_id, other_client_secret)
```

If you run this notebook **within** a Kubeflow cluster, run the following command:
```python
client = kfp.Client()
```

You'll need to create OAuth client ID credentials of type `Other` to get `other_client_id` and `other_client_secret`. Learn more about [creating OAuth credentials](
https://cloud.google.com/iap/docs/authentication-howto#authenticating_from_a_desktop_app)

In [6]:
import os

# Optional Parameters, but required for running outside Kubeflow cluster.
#
# Each value can be overridden through an environment variable so that
# endpoints and OAuth credentials do not have to live in the notebook.
# NOTE(security): the literal fallbacks below (in particular the OAuth client
# secret) are committed in clear text. Rotate the secret and drop the fallbacks
# once every user supplies the KFP_* environment variables.
HOST = os.environ.get(
    'KFP_HOST',
    'https://kubeflow-st-ui.endpoints.kubeflow-pipeline-fantasy.cloud.goog/pipeline')
# HOST = 'https://7c021d0340d296aa-dot-us-central2.pipelines.googleusercontent.com'
CLIENT_ID = os.environ.get(
    'KFP_CLIENT_ID',
    "493831447550-os23o55235htd9v45a9lsejv8d1plhd0.apps.googleusercontent.com")
OTHER_CLIENT_ID = os.environ.get(
    'KFP_OTHER_CLIENT_ID',
    "493831447550-iu24vv6id3ng5smhf2lboovv5qukuhbh.apps.googleusercontent.com")
OTHER_CLIENT_SECRET = os.environ.get(
    'KFP_OTHER_CLIENT_SECRET',
    "cB8Xj-rb9JWCYcCRDlpTMfhc")

In [7]:
# Create kfp client
#
# Detect whether the notebook runs inside a Kubernetes cluster by trying to
# load the in-cluster configuration. Only the configuration error is treated
# as "not in a cluster"; any other failure is allowed to surface instead of
# being silently swallowed.
try:
    k8s.config.load_incluster_config()
    in_cluster = True
except k8s.config.ConfigException:
    in_cluster = False

if in_cluster:
    client = kfp.Client()
elif HOST.endswith('.com'):
    # Hosts ending in '.com' are contacted with the host URL only, without the
    # OAuth client settings (presumably hosted pipeline endpoints like the
    # commented-out HOST example -- TODO confirm).
    client = kfp.Client(host=HOST)
else:
    client = kfp.Client(host=HOST,
                        client_id=CLIENT_ID,
                        other_client_id=OTHER_CLIENT_ID,
                        other_client_secret=OTHER_CLIENT_SECRET)

## Helper Components

In [8]:
from typing import NamedTuple


def retrieve_best_run(
    project_id: str, job_id: str
) -> NamedTuple('Outputs', [('metric_value', float), ('alpha', float),
                            ('max_iter', int)]):
    """Retrieves the parameters of the best Hypertune run.

    Fetches the job `projects/<project_id>/jobs/<job_id>` through the `ml` v1
    API and reads the first entry of `trainingOutput.trials`.

    Args:
        project_id: Project that owns the hyperparameter tuning job.
        job_id: Id of the hyperparameter tuning job.

    Returns:
        A `(metric_value, alpha, max_iter)` tuple taken from the first trial.

    Raises:
        googleapiclient.errors.HttpError: If the job lookup fails. The error is
            printed and re-raised; previously it was swallowed and the function
            then failed with an unrelated error on the unbound `response`.
    """

    # Imports stay inside the function so that it remains self-contained when
    # it is turned into a lightweight component.
    from googleapiclient import discovery
    from googleapiclient import errors

    ml = discovery.build('ml', 'v1')

    job_name = 'projects/{}/jobs/{}'.format(project_id, job_id)
    request = ml.projects().jobs().get(name=job_name)

    try:
        response = request.execute()
    except errors.HttpError as err:
        # Keep the error visible in the step log, then fail the step.
        print(err)
        raise

    print(response)

    # The first listed trial is treated as the best one (the service is
    # expected to return trials best-first -- TODO confirm).
    best_trial = response['trainingOutput']['trials'][0]

    metric_value = best_trial['finalMetric']['objectiveValue']
    alpha = float(best_trial['hyperparameters']['alpha'])
    max_iter = int(best_trial['hyperparameters']['max_iter'])

    return (metric_value, alpha, max_iter)


In [9]:
def evaluate_model(
    dataset_path: str, model_path: str, metric_name: str
) -> NamedTuple('Outputs', [('metric_name', str), ('metric_value', float),
                            ('mlpipeline_metrics', 'Metrics')]):
    """Evaluates a trained sklearn model.

    Args:
        dataset_path: CSV file with the testing split; must contain a
            `Cover_Type` label column.
        model_path: Directory that holds the pickled model written by the
            training script (`model.pkl`).
        metric_name: `'accuracy'` or `'recall'`. Any other value yields the
            metric name `'N/A'` with a value of 0.

    Returns:
        A `(metric_name, metric_value, mlpipeline_metrics)` tuple, where the
        last element is a JSON string with a single-entry `metrics` list.
    """
    # Imports stay inside the function so that it remains self-contained when
    # it is turned into a lightweight component.
    import json
    import pickle

    import pandas as pd
    from sklearn.metrics import accuracy_score, recall_score
    from tensorflow import gfile

    df_test = pd.read_csv(dataset_path)

    X_test = df_test.drop('Cover_Type', axis=1)
    y_test = df_test['Cover_Type']

    # Copy the model from GCS. The file name must match the one the training
    # script writes into its job directory (`model.pkl`).
    model_filename = 'model.pkl'
    gcs_model_filepath = '{}/{}'.format(model_path, model_filename)
    print(gcs_model_filepath)

    # Remove a stale local copy first so the copy below cannot collide with it.
    if gfile.Exists(model_filename):
        gfile.Remove(model_filename)

    gfile.Copy(gcs_model_filepath, model_filename)

    # NOTE(security): unpickling executes arbitrary code; only point this
    # component at model artifacts produced by this pipeline.
    with open(model_filename, 'rb') as model_file:
        model = pickle.load(model_file)

    y_hat = model.predict(X_test)

    if metric_name == 'accuracy':
        metric_value = accuracy_score(y_test, y_hat)
    elif metric_name == 'recall':
        # Cover_Type is a multiclass target; recall_score's default
        # average='binary' raises on it, so use the unweighted per-class mean.
        metric_value = recall_score(y_test, y_hat, average='macro')
    else:
        metric_name = 'N/A'
        metric_value = 0

    # Normalize to a plain float so the declared output type always holds.
    metric_value = float(metric_value)

    # Export the metric
    metrics = {
        'metrics': [{
            'name': metric_name,
            'numberValue': metric_value
        }]
    }

    return (metric_name, metric_value, json.dumps(metrics))

## Writing the program code

The following cell creates a file `train.py` that contains a Python script. The script uses the Covertype Data Set to develop a multi-class classification model that predicts the type of forest cover from cartographic data.

In [10]:
%%bash

# Create folders if they don't exist.
mkdir -p tmp/aip_pipeline/covertype_training

# Create the training script. The heredoc delimiter is quoted so the shell
# writes the Python source verbatim (no variable or command substitution),
# and the heredoc is explicitly terminated at the end of the script.
cat > ./tmp/aip_pipeline/covertype_training/train.py <<'HERE'
import fire
import pandas as pd
import pickle
import hypertune

from sklearn.compose import ColumnTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from tensorflow import gfile

def train_evaluate(job_dir, training_dataset_path, validation_dataset_path,
                   alpha, max_iter, hptune):
    """Trains the Covertype classifier.

    With `hptune` set, the model is fitted on the training split, scored on
    the validation split and the accuracy is reported through hypertune.
    Otherwise the model is fitted on both splits combined and pickled to
    `<job_dir>/model.pkl`.
    """
    with gfile.Open(training_dataset_path, 'r') as f:
        # The first row is the header; columns are addressed by name below.
        df_train = pd.read_csv(f)

    with gfile.Open(validation_dataset_path, 'r') as f:
        # The first row is the header; columns are addressed by name below.
        df_validation = pd.read_csv(f)

    if not hptune:
        df_train = pd.concat([df_train, df_validation])

    # Positional layout of the feature columns: ten numeric features followed
    # by two categorical ones.
    numeric_feature_indexes = slice(0, 10)
    categorical_feature_indexes = slice(10, 12)

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_feature_indexes),
            ('cat', OneHotEncoder(), categorical_feature_indexes)
        ])

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', SGDClassifier(loss='log'))
    ])

    num_features_type_map = {feature: 'float64' for feature in
                             df_train.columns[numeric_feature_indexes]}
    df_train = df_train.astype(num_features_type_map)
    df_validation = df_validation.astype(num_features_type_map)

    print('Starting training: alpha={}, max_iter={}'.format(alpha, max_iter))
    X_train = df_train.drop('Cover_Type', axis=1)
    y_train = df_train['Cover_Type']

    pipeline.set_params(classifier__alpha=alpha, classifier__max_iter=max_iter)
    pipeline.fit(X_train, y_train)

    if hptune:
        X_validation = df_validation.drop('Cover_Type', axis=1)
        y_validation = df_validation['Cover_Type']
        accuracy = pipeline.score(X_validation, y_validation)
        print('Model accuracy: {}'.format(accuracy))
        # Log it with hypertune
        hpt = hypertune.HyperTune()
        hpt.report_hyperparameter_tuning_metric(
            hyperparameter_metric_tag='accuracy',
            metric_value=accuracy
        )
    else:
        # Save the model
        model_filename = 'model.pkl'
        gcs_model_path = "{}/{}".format(job_dir, model_filename)

        if gfile.Exists(gcs_model_path):
            gfile.Remove(gcs_model_path)

        # pickle produces bytes, so open the target in binary mode.
        with gfile.Open(gcs_model_path, 'wb') as wf:
            pickle.dump(pipeline, wf)

        print("Saved model in: {}".format(gcs_model_path))

if __name__ == "__main__":
    fire.Fire(train_evaluate)
HERE

## Create a Docker container
Create your own container image that includes your program. Now create a container that runs the script. Start by creating a Dockerfile. 

In [11]:
%%bash

# Build context for the base image that the lightweight helper components
# run on: a requirements file plus a Dockerfile that installs it.
BASE_DIR=tmp/aip_pipeline/covertype_base
mkdir -p "${BASE_DIR}"

# Python dependencies installed into the image.
cat <<EOF > "./${BASE_DIR}/requirements.txt"
fire
cloudml-hypertune
gcsfs
pandas==0.24.0
google-api-python-client==1.7.8
joblib==0.13.0
scikit-learn==0.20.2
EOF

# Dockerfile: copy the build context into /app and install the requirements.
cat <<EOF > "./${BASE_DIR}/Dockerfile"
FROM tensorflow/tensorflow:1.14.0-py3
WORKDIR /app
COPY . /app
RUN pip install -r requirements.txt
EOF

In [12]:
# Name, tag and build-context folder of the helper-component base image.
IMAGE_NAME = 'base_image'
TAG = 'latest'
BASE_APP_FOLDER = './tmp/aip_pipeline/covertype_base/'

# Fully qualified image reference in the project's container registry.
BASE_IMAGE_URI = 'gcr.io/{}/{}:{}'.format(PROJECT_ID, IMAGE_NAME, TAG)

In [None]:
# Submit BASE_APP_FOLDER as a build and tag the resulting image as BASE_IMAGE_URI.
! gcloud builds submit --tag $BASE_IMAGE_URI $BASE_APP_FOLDER

Creating temporary tarball archive of 2 file(s) totalling 211 bytes before compression.
Uploading tarball of [./tmp/aip_pipeline/covertype_base/] to [gs://kubeflow-pipeline-fantasy_cloudbuild/source/1584428228.977243-fcee4428cdc647c6a807d695d53c3507.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/kubeflow-pipeline-fantasy/builds/319675bf-008f-46d9-98b4-b6e7cf640d7b].
Logs are available at [https://console.cloud.google.com/cloud-build/builds/319675bf-008f-46d9-98b4-b6e7cf640d7b?project=493831447550].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "319675bf-008f-46d9-98b4-b6e7cf640d7b"

FETCHSOURCE
Fetching storage object: gs://kubeflow-pipeline-fantasy_cloudbuild/source/1584428228.977243-fcee4428cdc647c6a807d695d53c3507.tgz#1584428230141965
Copying gs://kubeflow-pipeline-fantasy_cloudbuild/source/1584428228.977243-fcee4428cdc647c6a807d695d53c3507.tgz#1584428230141965...
/ [1 files][  341.0 B/  341.0 B]                         

In [None]:
%%bash

# Build context for the training image: the same requirements as the base
# image plus an entrypoint that launches train.py.
TRAIN_DIR=tmp/aip_pipeline/covertype_training
mkdir -p "${TRAIN_DIR}"

# Python dependencies installed into the image.
cat <<EOF > "./${TRAIN_DIR}/requirements.txt"
fire
cloudml-hypertune
gcsfs
pandas==0.24.0
google-api-python-client==1.7.8
joblib==0.13.0
scikit-learn==0.20.2
EOF

# Dockerfile: copy the build context into /app, install the requirements and
# run the training script as the container entrypoint.
cat <<EOF > "./${TRAIN_DIR}/Dockerfile"
FROM tensorflow/tensorflow:1.14.0-py3
WORKDIR /app
COPY . /app
RUN pip install -r requirements.txt

ENTRYPOINT ["python", "train.py"]
EOF

In [None]:
# Name, tag and build-context folder of the training image.
# (IMAGE_NAME and TAG are reassigned here; the base-image values are no longer
# needed once BASE_IMAGE_URI has been built.)
IMAGE_NAME = 'traing_image'
TAG = 'latest'
TRAIN_APP_FOLDER = './tmp/aip_pipeline/covertype_training/'

# Fully qualified image reference in the project's container registry.
TRAIN_IMAGE_URI = 'gcr.io/{}/{}:{}'.format(PROJECT_ID, IMAGE_NAME, TAG)

In [None]:
# Submit TRAIN_APP_FOLDER as a build and tag the resulting image as TRAIN_IMAGE_URI.
! gcloud builds submit --tag $TRAIN_IMAGE_URI $TRAIN_APP_FOLDER

## Load Components

In [None]:
# URL prefix under which the pre-built GCP component definitions are looked up;
# pinned to the 0.1.36 tag of the kubeflow/pipelines repository.
COMPONENT_URL_SEARCH_PREFIX='https://raw.githubusercontent.com/kubeflow/pipelines/0.1.36/components/gcp/'
# Alternative pin to a specific commit, kept for reference:
# COMPONENT_URL_SEARCH_PREFIX='https://raw.githubusercontent.com/kubeflow/pipelines/3f4b80127f35e40760eeb1813ce1d3f641502222/components/gcp/'

In [None]:
# Create component factories: one store that resolves component names against
# COMPONENT_URL_SEARCH_PREFIX, then one factory per pre-built component.
component_store = comp.ComponentStore(
    url_search_prefixes=[COMPONENT_URL_SEARCH_PREFIX],
    local_search_paths=None)

bigquery_query_op, mlengine_train_op, mlengine_deploy_op = (
    component_store.load_component(component_name)
    for component_name in ('bigquery/query', 'ml_engine/train',
                           'ml_engine/deploy'))

In [None]:
# Turn the two helper functions into component factories whose containers run
# on the base image referenced by BASE_IMAGE_URI.
retrieve_best_run_op = comp.func_to_container_op(retrieve_best_run, base_image=BASE_IMAGE_URI)
evaluate_model_op = comp.func_to_container_op(evaluate_model, base_image=BASE_IMAGE_URI)

## Define deployment operation on AI Platform

In [None]:
# Helper functions
def generate_sampling_query(source_table_name, num_lots, lots):
    """Prepares the data sampling query.

    Rows are assigned to one of `num_lots` lots by hashing the JSON form of
    the row; the query keeps the rows whose lot number is listed in `lots`.

    Args:
        source_table_name: Table to sample from (rendered between backticks).
        num_lots: Total number of lots the table is divided into.
        lots: Iterable of lot numbers to keep (list, tuple, ...).

    Returns:
        The rendered SQL query string.
    """

    sampling_query_template = """
       SELECT *
       FROM 
           `{{ source_table }}` AS cover
       WHERE 
       MOD(ABS(FARM_FINGERPRINT(TO_JSON_STRING(cover))), {{ num_lots }}) IN ({{ lots }})
       """
    # Join the elements explicitly instead of slicing the list's repr, so any
    # iterable works (a one-element tuple used to render as "8,"). `repr`
    # keeps the output identical to the previous list-based rendering.
    lots_csv = ', '.join(repr(lot) for lot in lots)

    query = Template(sampling_query_template).render(
        source_table=source_table_name, num_lots=num_lots, lots=lots_csv)

    return query

In [None]:
# Locations of the three data splits, relative to the pipeline's `gcs_root`.
TRAINING_FILE_PATH = 'datasets/training/data.csv'
VALIDATION_FILE_PATH = 'datasets/validation/data.csv'
TESTING_FILE_PATH = 'datasets/testing/data.csv'

# Parameter defaults
SPLITS_DATASET_ID = 'splits'
# Default value of the pipeline's `hypertune_settings` parameter, kept as a
# string: maximize the `accuracy` metric over 6 trials (3 in parallel) while
# searching `max_iter` in {500, 1000} and `alpha` in [0.0001, 0.001].
# NOTE(review): the text uses the Python literal `True` rather than JSON `true`;
# presumably the consuming component parses it with Python literal semantics --
# confirm before "fixing" it.
HYPERTUNE_SETTINGS = """
{
    "hyperparameters":  {
        "goal": "MAXIMIZE",
        "maxTrials": 6,
        "maxParallelTrials": 3,
        "hyperparameterMetricTag": "accuracy",
        "enableTrialEarlyStopping": True,
        "params": [
            {
                "parameterName": "max_iter",
                "type": "DISCRETE",
                "discreteValues": [500, 1000]
            },
            {
                "parameterName": "alpha",
                "type": "DOUBLE",
                "minValue": 0.0001,
                "maxValue": 0.001,
                "scaleType": "UNIT_LINEAR_SCALE"
            }
        ]
    }
}
"""

In [None]:
@kfp.dsl.pipeline(
    name='Covertype Classifier Training',
    description='The pipeline training and deploying the Covertype classifier'
)
def covertype_pipeline(
    project_id: types.GCPProjectID,
    region: types.GCPRegion,
    source_table_name: types.String,
    gcs_root: types.GCSPath,
    dataset_id: str,
    evaluation_metric_name: str,
    evaluation_metric_threshold: float,
    model_id: str,
    replace_existing_version: bool,
    hypertune_settings: types.Dict = HYPERTUNE_SETTINGS,
    dataset_location: str = 'US'
):
    """Orchestrates training and deployment of an sklearn model.

    Steps: sample training/validation/testing splits with BigQuery, run a
    hyperparameter tuning job, retrieve the best trial, train the final model
    with the best hyperparameters, evaluate it on the testing split and deploy
    it when the metric exceeds the threshold.

    Args:
        project_id: Project used for the BigQuery and training/deploy steps.
        region: Region for the training jobs.
        source_table_name: BigQuery table the splits are sampled from.
        gcs_root: GCS root under which datasets and job directories are written.
        dataset_id: BigQuery dataset passed to the query component.
        evaluation_metric_name: Metric computed by the evaluation component.
        evaluation_metric_threshold: Deployment happens only when the metric
            value is greater than this threshold.
        model_id: Id of the model to deploy.
        replace_existing_version: Forwarded to the deploy component.
        hypertune_settings: Training input for the tuning job.
        dataset_location: Location passed to the BigQuery query component.
    """

    # Create the training split
    query = generate_sampling_query(
        source_table_name=source_table_name, num_lots=10, lots=[1, 2, 3, 4])

    training_file_path = '{}/{}'.format(gcs_root, TRAINING_FILE_PATH)

    create_training_split = bigquery_query_op(
        query=query,
        project_id=project_id,
        dataset_id=dataset_id,
        table_id='',
        output_gcs_path=training_file_path,
        dataset_location=dataset_location)

    # Create the validation split
    query = generate_sampling_query(
        source_table_name=source_table_name, num_lots=10, lots=[8])

    validation_file_path = '{}/{}'.format(gcs_root, VALIDATION_FILE_PATH)

    create_validation_split = bigquery_query_op(
        query=query,
        project_id=project_id,
        dataset_id=dataset_id,
        table_id='',
        output_gcs_path=validation_file_path,
        dataset_location=dataset_location)

    # Create the testing split
    query = generate_sampling_query(
        source_table_name=source_table_name, num_lots=10, lots=[9])

    testing_file_path = '{}/{}'.format(gcs_root, TESTING_FILE_PATH)

    create_testing_split = bigquery_query_op(
        query=query,
        project_id=project_id,
        dataset_id=dataset_id,
        table_id='',
        output_gcs_path=testing_file_path,
        dataset_location=dataset_location)

    # Tune hyperparameters
    tune_args = [
        '--training_dataset_path',
        create_training_split.outputs['output_gcs_path'],
        '--validation_dataset_path',
        create_validation_split.outputs['output_gcs_path'], '--hptune', 'True'
    ]

    # Each run gets its own job directory via the run-id placeholder.
    job_dir = '{}/{}/{}'.format(gcs_root, 'jobdir/hypertune',
                                kfp.dsl.RUN_ID_PLACEHOLDER)

    hypertune = mlengine_train_op(
        project_id=project_id,
        region=region,
        master_image_uri=TRAIN_IMAGE_URI,
        job_dir=job_dir,
        args=tune_args,
        training_input=hypertune_settings)

    # Retrieve the best trial
    get_best_trial = retrieve_best_run_op(project_id, hypertune.outputs['job_id'])

    # Train the model on a combined training and validation datasets
    job_dir = '{}/{}/{}'.format(gcs_root, 'jobdir', kfp.dsl.RUN_ID_PLACEHOLDER)
    train_args = [
        '--training_dataset_path',
        create_training_split.outputs['output_gcs_path'],
        '--validation_dataset_path',
        create_validation_split.outputs['output_gcs_path'], '--alpha',
        get_best_trial.outputs['alpha'], '--max_iter',
        get_best_trial.outputs['max_iter'], '--hptune', 'False'
    ]

    train_model = mlengine_train_op(
        project_id=project_id,
        region=region,
        master_image_uri=TRAIN_IMAGE_URI,
        job_dir=job_dir,
        args=train_args)

    # Evaluate the model on the testing split
    eval_model = evaluate_model_op(
        dataset_path=str(create_testing_split.outputs['output_gcs_path']),
        model_path=str(train_model.outputs['job_dir']),
        metric_name=evaluation_metric_name)

    # Deploy the model if the primary metric is better than threshold
    with kfp.dsl.Condition(eval_model.outputs['metric_value'] > evaluation_metric_threshold):
        deploy_model = mlengine_deploy_op(
            model_uri=train_model.outputs['job_dir'],
            project_id=project_id,
            model_id=model_id,
            runtime_version="1.14",
            python_version="3.5",
            # Honor the pipeline parameter instead of a hardcoded True.
            replace_existing_version=replace_existing_version,
            set_default=True)

    # Apply the 'user-gcp-sa' GCP secret to every op in the pipeline.
    kfp.dsl.get_pipeline_conf().add_op_transformer(gcp.use_gcp_secret('user-gcp-sa'))

### Submit a pipeline run

In [None]:
# Pipeline function that the run-submission cell compiles and submits.
pipeline_func = covertype_pipeline

In [None]:
# Experiment and run labels under which the submission is recorded.
experiment_name = 'covertype_kubeflow'
run_name = '{} run'.format(pipeline_func.__name__)

# Values for the pipeline parameters that have no default.
arguments = dict(
    project_id=PROJECT_ID,
    gcs_root=GCS_STAGING_PATH,
    region=AIP_REGION,
    source_table_name='covertype_dataset.covertype',
    dataset_id='splits',
    evaluation_metric_name='accuracy',
    evaluation_metric_threshold=0.69,
    model_id='covertype_classifier',
    replace_existing_version=True,
)

# Submit pipeline directly from pipeline function
run_result = client.create_run_from_pipeline_func(
    pipeline_func,
    arguments=arguments,
    run_name=run_name,
    experiment_name=experiment_name)