# Orchestrating training and deployment of a scikit-learn model with Kubeflow Pipelines and Cloud AI Platform.

In this lab you develop a KFP pipeline that orchestrates BigQuery and Cloud AI Platform services to train and deploy a **scikit-learn** model. The lab uses the [Covertype Data Set](../datasets/covertype/README.md). The model is a multi-class classification model that predicts the type of forest cover from cartographic data.

The source data is in BigQuery. The pipeline uses BigQuery to prepare training and evaluation splits, AI Platform Training to run a custom container with data preprocessing and training code, and AI Platform Prediction as a deployment target. The diagram below represents the workflow orchestrated by the pipeline.

![Training pipeline](../images/kfp-caip.png)



In [5]:
import kfp
import os
import uuid
import time
import tempfile

import pandas as pd

from google.cloud import bigquery
from jinja2 import Template
from kfp.components import func_to_container_op
from typing import NamedTuple


from sklearn.metrics import accuracy_score
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

print('KFP: {}'.format(kfp.__version__))

KFP: 0.1.37


## Configure environment settings
Make sure to update the constants to reflect your environment settings.

In [6]:
# Google Cloud project, cluster and region settings.
PROJECT_ID = 'mlops-workshop'
CLUSTER_NAME = 'mlops-workshop-cluster'
CLUSTER_ZONE = 'us-central1-a'
REGION = 'us-central1'

# BigQuery dataset holding the source table and the generated splits.
DATASET_LOCATION = 'US'
DATASET_ID = 'lab_12'
SOURCE_TABLE_ID = 'covertype'
TRAINING_TABLE_ID = 'training_split'
VALIDATION_TABLE_ID = 'validation_split'
TESTING_TABLE_ID = 'testing_split'

# GCS bucket of the lab and the CSV files the splits are exported to.
LAB_GCS_BUCKET = 'gs://mlops-workshop-lab-12'
TRAINING_FILE_PATH = '{}/datasets/covertype_training.csv'.format(LAB_GCS_BUCKET)
VALIDATION_FILE_PATH = '{}/datasets/covertype_validation.csv'.format(LAB_GCS_BUCKET)
TESTING_FILE_PATH = '{}/datasets/covertype_testing.csv'.format(LAB_GCS_BUCKET)

# URL prefix of the KFP GCP components matching the installed SDK version.
COMPONENT_URL_SEARCH_PREFIX = (
    'https://raw.githubusercontent.com/kubeflow/pipelines/{}/components/gcp/'
    .format(kfp.__version__))

## Experimentation

### Explore the source dataset 

In [13]:
# BigQuery client bound to the lab project and the dataset location.
client = bigquery.Client(project=PROJECT_ID, location=DATASET_LOCATION)

In [19]:
# Preview query: the first ten rows of the source table.
query_template = """
SELECT *
FROM `{{ source_table }}`
LIMIT 10
"""

# Render the fully qualified `project.dataset.table` name into the template.
query = Template(query_template).render(
    source_table='.'.join((PROJECT_ID, DATASET_ID, SOURCE_TABLE_ID)))

# Run the query, remember the number of columns for the size report that
# follows, and display the sample.
df = client.query(query).to_dataframe()
num_of_columns = df.shape[1]
df

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type,Cover_Type
0,3094,82,65,42,3,3001,193,0,0,1315,Commanche,7202,2
1,3083,105,57,0,0,3002,228,0,0,1350,Commanche,7202,2
2,3159,60,37,150,0,3045,220,0,17,1177,Commanche,7756,2
3,3158,73,62,170,-4,3042,191,0,0,1187,Commanche,7756,2
4,3147,96,59,216,-6,3037,220,0,0,1209,Commanche,7756,2
5,2506,13,64,201,88,655,73,30,0,1470,Commanche,4703,2
6,2501,3,63,216,81,626,55,40,0,1470,Commanche,4703,2
7,3281,38,59,150,123,3012,137,42,0,1159,Commanche,7756,2
8,2500,0,62,234,83,598,54,45,67,1471,Commanche,4703,2
9,2555,3,60,190,135,684,67,53,65,1470,Commanche,4703,2


In [21]:
# Count the rows of the full source table.
query_template = """
SELECT count(*)
FROM `{{ source_table }}`
"""

query = Template(query_template).render(
    source_table='.'.join((PROJECT_ID, DATASET_ID, SOURCE_TABLE_ID)))
df = client.query(query).to_dataframe()

# The result frame has a single cell holding the count.
number_of_rows_in_full_dataset = df.iat[0, 0]
print('{} x {}'.format(number_of_rows_in_full_dataset, num_of_columns))

581012 x 13


### Create the training, validation and testing splits
#### Define the sampling query template

In [22]:
# Jinja2 template of the sampling query. Each row is serialized to JSON and
# hashed with FARM_FINGERPRINT; the absolute hash modulo `num_lots` assigns
# the row to a lot, and only rows whose lot is in `lots_to_select` are kept.
# `lots_to_select` is rendered verbatim, so it must be a parenthesized SQL
# list such as '(1, 2, 3)'.
sampling_query_template = """
SELECT *
FROM 
  `{{ source_table }}` AS cover
WHERE 
  MOD(ABS(FARM_FINGERPRINT(TO_JSON_STRING(cover))), {{ num_lots }}) in {{ lots_to_select }}
"""

#### Configure query job settings

In [23]:
# Query job settings reused by the split-creation queries: create the
# destination table if it does not exist and truncate it before writing.
job_config = bigquery.QueryJobConfig()
job_config.create_disposition = bigquery.job.CreateDisposition.CREATE_IF_NEEDED
job_config.write_disposition = bigquery.job.WriteDisposition.WRITE_TRUNCATE
# Reference to the lab dataset, used to build table references.
dataset_ref = client.dataset(DATASET_ID)

#### Create the training split

In [24]:
# Training split: lots 1, 2 and 3 out of 10.
query = Template(sampling_query_template).render(
    num_lots=10,
    lots_to_select='(1, 2, 3)',
    source_table='.'.join((PROJECT_ID, DATASET_ID, SOURCE_TABLE_ID)))

# Write the sampled rows to the training table and wait for the job to end.
job_config.destination = dataset_ref.table(TRAINING_TABLE_ID)
client.query(query, job_config=job_config).result()

<google.cloud.bigquery.table.RowIterator at 0x7f5c2f855160>

#### Extract the training split to GCS

In [25]:
# Export the training table to TRAINING_FILE_PATH and wait for the extract job.
client.extract_table(dataset_ref.table(TRAINING_TABLE_ID), TRAINING_FILE_PATH).result()

<google.cloud.bigquery.job.ExtractJob at 0x7f5c2f82ec50>

In [26]:
!gsutil cat -r 0-500 {TRAINING_FILE_PATH}

Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type,Cover_Type
3222,0,0,120,1,3207,218,237,156,1698,Rawah,7201,1
3211,0,0,418,30,5862,218,238,156,2210,Rawah,7201,2
3211,90,0,30,1,5286,219,237,155,780,Rawah,7201,1
3046,0,0,228,1,666,218,238,156,1298,Rawah,7201,1
3211,180,0,437,30,5878,219,238,157,2230,Rawah,7201,2
3283,225,0,511,

#### Create the validation split

In [27]:
# Validation split: lot 8 out of 10.
query = Template(sampling_query_template).render(
    num_lots=10,
    lots_to_select='(8)',
    source_table='.'.join((PROJECT_ID, DATASET_ID, SOURCE_TABLE_ID)))

# Write the sampled rows to the validation table and wait for the job to end.
job_config.destination = dataset_ref.table(VALIDATION_TABLE_ID)
client.query(query, job_config=job_config).result()

<google.cloud.bigquery.table.RowIterator at 0x7f5c2f85ccf8>

#### Extract the validation split to GCS

In [28]:
# Export the validation table to VALIDATION_FILE_PATH and wait for the extract job.
client.extract_table(dataset_ref.table(VALIDATION_TABLE_ID), VALIDATION_FILE_PATH).result()

<google.cloud.bigquery.job.ExtractJob at 0x7f5c2f84fcf8>

#### Create the testing split

In [29]:
# Testing split: lot 9 out of 10.
query = Template(sampling_query_template).render(
    num_lots=10,
    lots_to_select='(9)',
    source_table='.'.join((PROJECT_ID, DATASET_ID, SOURCE_TABLE_ID)))

# Write the sampled rows to the testing table and wait for the job to end.
job_config.destination = dataset_ref.table(TESTING_TABLE_ID)
client.query(query, job_config=job_config).result()

<google.cloud.bigquery.table.RowIterator at 0x7f5c2f862668>

#### Extract the testing split to GCS

In [30]:
# Export the testing table to TESTING_FILE_PATH and wait for the extract job.
client.extract_table(dataset_ref.table(TESTING_TABLE_ID), TESTING_FILE_PATH).result()

<google.cloud.bigquery.job.ExtractJob at 0x7f5c2f85c518>

### Develop the training script

#### Configure the `sklearn` training pipeline

In [35]:
# Columns routed through StandardScaler.
numeric_features = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
]
# Columns routed through OneHotEncoder.
categorical_features = ['Wilderness_Area', 'Soil_Type']

# Per-column preprocessing applied before the classifier.
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(), categorical_features),
])

# Preprocessing followed by an SGD classifier with logistic loss.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SGDClassifier(loss='log')),
])

#### Train and evaluate

In [45]:
# Load the training and validation splits exported to GCS.
df_train = pd.read_csv(TRAINING_FILE_PATH)
df_validation = pd.read_csv(VALIDATION_FILE_PATH)

# Separate the `Cover_Type` label from the feature columns.
y_train = df_train['Cover_Type']
X_train = df_train.drop(columns='Cover_Type')
y_validation = df_validation['Cover_Type']
X_validation = df_validation.drop(columns='Cover_Type')

# Fit with fixed hyperparameters and display the score on the validation split.
pipeline.set_params(classifier__max_iter=1000, classifier__alpha=0.001)
pipeline.fit(X_train, y_train)
pipeline.score(X_validation, y_validation)

0.7011582223582836

#### Prepare a hyperparameter tuning script
Since the training run on this dataset is computationally expensive, you can benefit from running a distributed hyperparameter tuning job on AI Platform Training.

In [46]:
# Local folder that receives the training script, Dockerfile and tuning
# config written by the following cells; created if it does not exist yet.
TRAINING_APP_FOLDER = 'training_app'
os.makedirs(TRAINING_APP_FOLDER, exist_ok=True)

In [48]:
%%writefile {TRAINING_APP_FOLDER}/train.py

import os
import subprocess
import sys

import fire
import joblib
import numpy as np
import pandas as pd

import hypertune

from sklearn.compose import ColumnTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


def train_evaluate(job_dir, training_dataset_path, validation_dataset_path, alpha, max_iter, dataset_location='US'):
  """Train the Covertype classifier, report its accuracy and save the model.

  Args:
    job_dir: Destination prefix the fitted model is copied to with
      `gsutil cp`; the file is stored as `<job_dir>/model.joblib`.
    training_dataset_path: CSV file holding the training split.
    validation_dataset_path: CSV file holding the validation split.
    alpha: Value for the SGD classifier's `alpha` parameter.
    max_iter: Value for the SGD classifier's `max_iter` parameter; coerced
      to `int` before use.
    dataset_location: Not used by this function; kept so existing callers
      that pass it keep working.

  Raises:
    subprocess.CalledProcessError: If copying the model with `gsutil` fails.
  """
  # Hyperparameters arrive from the command line, where a discrete value can
  # be parsed as a float (e.g. 500.0). The iteration count must be an integer.
  max_iter = int(max_iter)

  df_train = pd.read_csv(training_dataset_path)
  df_validation = pd.read_csv(validation_dataset_path)

  numeric_features = ['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
    'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points']

  categorical_features = ['Wilderness_Area', 'Soil_Type']

  # Scale the numeric columns and one-hot encode the categorical ones.
  preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

  pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', SGDClassifier(loss='log'))
  ])

  print('Starting training: alpha={}, max_iter={}'.format(alpha, max_iter))
  X_train = df_train.drop('Cover_Type', axis=1)
  y_train = df_train['Cover_Type']
  X_validation = df_validation.drop('Cover_Type', axis=1)
  y_validation = df_validation['Cover_Type']

  pipeline.set_params(classifier__alpha=alpha, classifier__max_iter=max_iter)
  pipeline.fit(X_train, y_train)
  accuracy = pipeline.score(X_validation, y_validation)
  print('Finished training. Model accuracy: {}'.format(accuracy))

  # Report the validation accuracy under the 'accuracy' metric tag.
  hpt = hypertune.HyperTune()
  hpt.report_hyperparameter_tuning_metric(
    hyperparameter_metric_tag='accuracy',
    metric_value=accuracy
    )

  # Serialize the fitted pipeline locally, then copy it under job_dir.
  model_filename = 'model.joblib'
  joblib.dump(value=pipeline, filename=model_filename)
  gcs_model_path = "{}/{}".format(job_dir, model_filename)
  # gsutil's stderr is redirected to stdout so its messages join the job output.
  subprocess.check_call(['gsutil', 'cp', model_filename, gcs_model_path], stderr=sys.stdout)
  print("Saved model in: {}".format(gcs_model_path))


if __name__ == "__main__":
  # Expose train_evaluate's parameters as command-line flags.
  fire.Fire(train_evaluate)

Overwriting training_app/train.py


#### Package the script into a docker image

In [49]:
%%writefile {TRAINING_APP_FOLDER}/Dockerfile

# Base image for the training container (no tag given, so the default tag is used).
FROM gcr.io/deeplearning-platform-release/base-cpu
# Packages backing train.py's `fire` and `hypertune` imports.
RUN pip install -U fire cloudml-hypertune
WORKDIR /app
COPY train.py .

# Arguments passed to the container are forwarded to train.py.
ENTRYPOINT ["python", "train.py"]

Overwriting training_app/Dockerfile


In [50]:
# Name, tag and full Container Registry URI of the training image.
IMAGE_NAME='covertype_trainer'
IMAGE_TAG='latest'
IMAGE_URI='gcr.io/{}/{}:{}'.format(PROJECT_ID, IMAGE_NAME, IMAGE_TAG)

!gcloud builds submit --tag $IMAGE_URI $TRAINING_APP_FOLDER

Creating temporary tarball archive of 3 file(s) totalling 2.7 KiB before compression.
Uploading tarball of [training_app] to [gs://mlops-workshop_cloudbuild/source/1576184362.86-fc356da96e2e4533a2331d2e07c2b3f8.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/mlops-workshop/builds/6a1f49ca-ce78-4da8-b2e3-af801846d370].
Logs are available at [https://console.cloud.google.com/gcr/builds/6a1f49ca-ce78-4da8-b2e3-af801846d370?project=745302968357].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "6a1f49ca-ce78-4da8-b2e3-af801846d370"

FETCHSOURCE
Fetching storage object: gs://mlops-workshop_cloudbuild/source/1576184362.86-fc356da96e2e4533a2331d2e07c2b3f8.tgz#1576184363357285
Copying gs://mlops-workshop_cloudbuild/source/1576184362.86-fc356da96e2e4533a2331d2e07c2b3f8.tgz#1576184363357285...
/ [1 files][  1.4 KiB/  1.4 KiB]                                                
Operation completed over 1 objects/1.4 KiB.                     

#### Create hyperparameter configuration file

In [51]:
%%writefile {TRAINING_APP_FOLDER}/hptuning_config.yaml

# Hyperparameter tuning settings for the AI Platform Training job.
trainingInput:
  hyperparameters:
    # Maximize the metric that train.py reports under the 'accuracy' tag.
    goal: MAXIMIZE
    # NOTE(review): the job output recorded in this notebook shows
    # maxTrials: 10, so it was captured from a run with a different value.
    maxTrials: 9
    maxParallelTrials: 3
    hyperparameterMetricTag: accuracy
    enableTrialEarlyStopping: TRUE 
    # Parameter names match the train_evaluate arguments in train.py.
    params:
    - parameterName: max_iter
      type: DISCRETE
      discreteValues: [
          500,
          1000
          ]
    - parameterName: alpha
      type: DOUBLE
      minValue:  0.00001
      maxValue:  0.01
      scaleType: UNIT_LINEAR_SCALE

Overwriting training_app/hptuning_config.yaml


#### Submit hyperparameter tuning job

In [52]:
# Training image to run; same values as in the image build cell.
IMAGE_NAME='covertype_trainer'
IMAGE_TAG='latest'
IMAGE_URI='gcr.io/{}/{}:{}'.format(PROJECT_ID, IMAGE_NAME, IMAGE_TAG)
# Job name made unique with the current local timestamp.
JOB_NAME = "JOB_{}".format(time.strftime("%Y%m%d_%H%M%S"))
# NOTE: the gcloud command below rebuilds this same path from
# $LAB_GCS_BUCKET/$JOB_NAME instead of reading JOB_DIR.
JOB_DIR = "{}/{}".format(LAB_GCS_BUCKET, JOB_NAME)
SCALE_TIER = "BASIC"

!gcloud ai-platform jobs submit training $JOB_NAME \
--region=$REGION \
--job-dir=$LAB_GCS_BUCKET/$JOB_NAME \
--master-image-uri=$IMAGE_URI \
--scale-tier=$SCALE_TIER \
--config $TRAINING_APP_FOLDER/hptuning_config.yaml \
-- \
--training_dataset_path=$TRAINING_FILE_PATH \
--validation_dataset_path=$VALIDATION_FILE_PATH 

Job [JOB_20191212_210301] submitted successfully.
Your job is still active. You may view the status of your job with the command

  $ gcloud ai-platform jobs describe JOB_20191212_210301

or continue streaming the logs with the command

  $ gcloud ai-platform jobs stream-logs JOB_20191212_210301
jobId: JOB_20191212_210301
state: QUEUED


In [53]:
!gcloud ai-platform jobs describe $JOB_NAME

createTime: '2019-12-12T21:03:04Z'
etag: 2NCS6Jj5Ckg=
jobId: JOB_20191212_210301
state: PREPARING
trainingInput:
  args:
  - --training_dataset_path=gs://mlops-workshop-lab-12/datasets/covertype_training.csv
  - --validation_dataset_path=gs://mlops-workshop-lab-12/datasets/covertype_validation.csv
  hyperparameters:
    enableTrialEarlyStopping: true
    goal: MAXIMIZE
    hyperparameterMetricTag: accuracy
    maxParallelTrials: 3
    maxTrials: 10
    params:
    - discreteValues:
      - 500.0
      - 1000.0
      parameterName: max_iter
      type: DISCRETE
    - maxValue: 0.01
      minValue: 1e-05
      parameterName: alpha
      scaleType: UNIT_LINEAR_SCALE
      type: DOUBLE
  jobDir: gs://mlops-workshop-lab-12/JOB_20191212_210301
  masterConfig:
    imageUri: gcr.io/mlops-workshop/covertype_trainer:latest
  region: us-central1
trainingOutput:
  isHyperparameterTuningJob: true

View job in the Cloud Console at:
https://console.cloud.google.com/mlengine/jobs/JOB_20191212_210301?p

In [54]:
!gcloud ai-platform jobs stream-logs $JOB_NAME

INFO	2019-12-12 21:03:03 +0000	service		Validating job requirements...
INFO	2019-12-12 21:03:04 +0000	service		Job creation request has been successfully validated.
INFO	2019-12-12 21:03:04 +0000	service		Job JOB_20191212_210301 is queued.
INFO	2019-12-12 21:03:13 +0000	service	3	Waiting for job to be provisioned.
INFO	2019-12-12 21:03:13 +0000	service	1	Waiting for job to be provisioned.
INFO	2019-12-12 21:03:13 +0000	service	2	Waiting for job to be provisioned.
INFO	2019-12-12 21:03:16 +0000	service	1	Waiting for training program to start.
INFO	2019-12-12 21:03:16 +0000	service	2	Waiting for training program to start.
INFO	2019-12-12 21:03:16 +0000	service	3	Waiting for training program to start.
INFO	2019-12-12 21:06:24 +0000	master-replica-0	3	Copying file://model.joblib [Content-Type=application/octet-stream]...
INFO	2019-12-12 21:06:24 +0000	master-replica-0	3	/ [0 files][    0.0 B/  7.2 KiB]                                                
INFO	2019-12-12 21:06:24 +0000	master-re

In [83]:
from googleapiclient import discovery
from googleapiclient import errors
import numpy as np

In [58]:
# Client for version 'v1' of the 'ml' API, built through API discovery.
ml = discovery.build('ml', 'v1')

In [67]:
# Fetch the description of the submitted job, addressed by its full resource name.
job_id = 'projects/{}/jobs/{}'.format(PROJECT_ID, JOB_NAME)
request = ml.projects().jobs().get(name=job_id)

# Reset first so a failed call cannot leave a stale `response` from an
# earlier run of this cell behind.
response = None
try:
    response = request.execute()
except errors.HttpError as err:
    print(err)
except Exception as err:
    # Still best-effort, but KeyboardInterrupt/SystemExit now propagate and
    # the cause is shown instead of being hidden.
    print("Unexpected error: {}".format(err))

In [87]:
response

{'jobId': 'JOB_20191212_210301',
 'trainingInput': {'args': ['--training_dataset_path=gs://mlops-workshop-lab-12/datasets/covertype_training.csv',
   '--validation_dataset_path=gs://mlops-workshop-lab-12/datasets/covertype_validation.csv'],
  'hyperparameters': {'goal': 'MAXIMIZE',
   'params': [{'parameterName': 'max_iter',
     'type': 'DISCRETE',
     'discreteValues': [500, 1000]},
    {'parameterName': 'alpha',
     'minValue': 1e-05,
     'maxValue': 0.01,
     'type': 'DOUBLE',
     'scaleType': 'UNIT_LINEAR_SCALE'}],
   'maxTrials': 10,
   'maxParallelTrials': 3,
   'hyperparameterMetricTag': 'accuracy',
   'enableTrialEarlyStopping': True},
  'region': 'us-central1',
  'jobDir': 'gs://mlops-workshop-lab-12/JOB_20191212_210301',
  'masterConfig': {'imageUri': 'gcr.io/mlops-workshop/covertype_trainer:latest'}},
 'createTime': '2019-12-12T21:03:04Z',
 'startTime': '2019-12-12T21:03:08Z',
 'endTime': '2019-12-12T21:35:59Z',
 'state': 'SUCCEEDED',
 'trainingOutput': {'completedTria

In [71]:
response['trainingOutput']

{'completedTrialCount': '10',
 'trials': [{'trialId': '10',
   'hyperparameters': {'alpha': '0.0018003200332565434', 'max_iter': '500'},
   'finalMetric': {'trainingStep': '1', 'objectiveValue': 0.6969913430957362},
   'startTime': '2019-12-12T21:27:33.378290700Z',
   'endTime': '2019-12-12T21:35:07Z',
   'state': 'SUCCEEDED'},
  {'trialId': '9',
   'hyperparameters': {'alpha': '0.0023256017517083918', 'max_iter': '500'},
   'finalMetric': {'trainingStep': '1', 'objectiveValue': 0.693691855026617},
   'startTime': '2019-12-12T21:19:50.113970681Z',
   'endTime': '2019-12-12T21:26:51Z',
   'state': 'SUCCEEDED'},
  {'trialId': '7',
   'hyperparameters': {'alpha': '0.0022987899974855593', 'max_iter': '1000'},
   'finalMetric': {'trainingStep': '1', 'objectiveValue': 0.693011548208242},
   'startTime': '2019-12-12T21:19:47.133913295Z',
   'endTime': '2019-12-12T21:27:08Z',
   'state': 'SUCCEEDED'},
  {'trialId': '4',
   'hyperparameters': {'max_iter': '1000', 'alpha': '0.0028408783888816836

## Create the training pipeline

In [89]:
# Position, within the job's trial list, of the trial with the highest
# objective value. Every trial is expected to carry a 'finalMetric' entry.
best_run_index = np.argmax([trial['finalMetric']['objectiveValue'] for trial in response['trainingOutput']['trials']])

In [92]:
[(trial['finalMetric'], trial['hyperparameters']) for trial in response['trainingOutput']['trials']][best_run_index]

({'trainingStep': '1', 'objectiveValue': 0.6969913430957362},
 {'alpha': '0.0018003200332565434', 'max_iter': '500'})

In [86]:
np.argmax([1, 2, 3, 5, 4, 5])

3

In [3]:
# Same sampling template as in the experimentation section, redefined here so
# the pipeline cells can run on their own: rows are assigned to lots by
# MOD(ABS(FARM_FINGERPRINT(<row as JSON>)), num_lots) and kept when their lot
# is in `lots_to_select`.
sampling_query_template = """
SELECT *
FROM 
  `{{ source_table }}` AS cover
WHERE 
  MOD(ABS(FARM_FINGERPRINT(TO_JSON_STRING(cover))), {{ num_lots }}) in {{ lots_to_select }}
"""

In [4]:
# Sampling query selecting lot 8 out of 10 from the source table.
query = Template(sampling_query_template).render(
    num_lots=10,
    lots_to_select='(8)',
    source_table='.'.join((PROJECT_ID, DATASET_ID, SOURCE_TABLE_ID)))

In [5]:
# Component store that resolves component names against the URL prefix
# configured in COMPONENT_URL_SEARCH_PREFIX (no local search paths).
component_store = kfp.components.ComponentStore(
    local_search_paths=None,
    url_search_prefixes=[COMPONENT_URL_SEARCH_PREFIX])
    
# Load the 'bigquery/query' component from the store.
bigquery_query_op = component_store.load_component('bigquery/query')

In [6]:
# GCS folder under the lab bucket, used as the pipeline's default output path.
output_gcs_path = '{}/{}'.format(LAB_GCS_BUCKET, 'sample_data')

In [15]:
from kfp.dsl.types import GCPProjectID
from kfp.gcp import use_gcp_secret


@kfp.dsl.pipeline(
    name='Covertype Classifier Training',
    description='The pipeline training and deploying the Covertype classifier'
)
def covertype_train(
    project_id: GCPProjectID = PROJECT_ID,
    query: str = query,
    table_id: str = TRAINING_TABLE_ID,
    dataset_id: str = DATASET_ID,
    dataset_location: str = DATASET_LOCATION,
    output_gcs_path: str = output_gcs_path
):
    """Define the Covertype training pipeline.

    At this stage the pipeline has a single step that runs `query` through
    the BigQuery query component.

    Args:
        project_id: Passed to the BigQuery component as `project_id`.
        query: SQL text to execute.
        table_id: Passed to the BigQuery component as `table_id`.
        dataset_id: Passed to the BigQuery component as `dataset_id`.
        dataset_location: Passed to the BigQuery component as
            `dataset_location`.
        output_gcs_path: Passed to the BigQuery component as
            `output_gcs_path`.
    """
    sample_data = bigquery_query_op(
        query=query,
        project_id=project_id,
        dataset_id=dataset_id,
        table_id=table_id,
        output_gcs_path=output_gcs_path,
        dataset_location=dataset_location
    )

    # Register the `user-gcp-sa` GCP secret transformer on the pipeline
    # configuration so it is applied to the pipeline's ops.
    kfp.dsl.get_pipeline_conf().add_op_transformer(use_gcp_secret('user-gcp-sa'))


In [16]:
# Compile the pipeline function into the package file named by pipeline_yaml.
pipeline_yaml = 'covertype_training.yaml'
kfp.compiler.Compiler().compile(covertype_train, pipeline_yaml)

In [17]:
pipeline_name = 'covertype_training_pipeline'
# NOTE: this rebinds `client`, which the BigQuery cells earlier in the
# notebook use for the BigQuery client.
client = kfp.Client()

# `pipelines` is None rather than an empty list when nothing has been
# uploaded yet, so fall back to [] before filtering by name. Only the first
# page of up to 100 pipelines is inspected.
pipelines = [
    pipeline
    for pipeline in (client.list_pipelines(page_size=100).pipelines or [])
    if pipeline.name == pipeline_name
]

if pipelines:
    # Reuse the existing pipeline instead of uploading a duplicate.
    print("Pipeline with this name already exists")
    pipeline_ref = pipelines[0]
else:
    pipeline_ref = client.upload_pipeline(pipeline_yaml, pipeline_name)