# Create and Commit Artifacts 

In [1]:
PROJECT_ID = 'demokfp'
PREFIX = PROJECT_ID
REGION = 'us-central1'

DATA_ROOT = 'gs://workshop-datasets/covertype'
TRAINING_FILE_PATH = DATA_ROOT + '/training/dataset.csv'
VALIDATION_FILE_PATH = DATA_ROOT + '/evaluation/dataset.csv'

# Job dir for AI Platform Training
JOB_DIR_ROOT='gs://{}-artifact-store/jobs'.format(PREFIX)


NAMESPACE='kubeflow'
ZONE='us-central1-a'
ARTIFACT_STORE_URI='gs://{}-artifact-store'.format(PREFIX)
GCS_STAGING_PATH='{}/staging'.format(ARTIFACT_STORE_URI)
GKE_CLUSTER_NAME='{}-cluster'.format(PREFIX)

!gcloud container clusters get-credentials $GKE_CLUSTER_NAME --zone $ZONE
HOST_TEMP=!(kubectl describe configmap inverse-proxy-config -n $NAMESPACE | grep "googleusercontent.com")
INVERSE_PROXY_HOSTNAME=HOST_TEMP[0]


Fetching cluster endpoint and auth data.
kubeconfig entry generated for demokfp-cluster.


## Imports

In [2]:
import json
import os
import numpy as np
import pandas as pd
import pickle
import uuid
import time
import tempfile

from googleapiclient import discovery
from googleapiclient import errors

from google.cloud import bigquery
from jinja2 import Template
from kfp.components import func_to_container_op
from typing import NamedTuple

from sklearn.metrics import accuracy_score
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

## Import the dataset into BigQuery

In [3]:
DATASET_LOCATION='US'
DATASET_ID='covertype_dataset'
TABLE_ID='covertype'
DATA_SOURCE='gs://workshop-datasets/covertype/full/dataset.csv'
SCHEMA='Elevation:INTEGER,\
Aspect:INTEGER,\
Slope:INTEGER,\
Horizontal_Distance_To_Hydrology:INTEGER,\
Vertical_Distance_To_Hydrology:INTEGER,\
Horizontal_Distance_To_Roadways:INTEGER,\
Hillshade_9am:INTEGER,\
Hillshade_Noon:INTEGER,\
Hillshade_3pm:INTEGER,\
Horizontal_Distance_To_Fire_Points:INTEGER,\
Wilderness_Area:STRING,\
Soil_Type:STRING,\
Cover_Type:INTEGER'

!bq --location=$DATASET_LOCATION --project_id=$PROJECT_ID mk --dataset $DATASET_ID
!bq --project_id=$PROJECT_ID --dataset_id=$DATASET_ID load \
--source_format=CSV \
--skip_leading_rows=1 \
--replace \
$TABLE_ID \
$DATA_SOURCE \
$SCHEMA

BigQuery error in mk operation: Dataset 'demokfp:covertype_dataset' already
exists.
Waiting on bqjob_r13cf51348cdc5136_0000017032996d9f_1 ... (14s) Current status: DONE   


### Prepare the hyperparameter tuning application.
Since a training run on this dataset is computationally expensive, you can benefit from running a distributed hyperparameter tuning job on AI Platform Training.

In [4]:
# Local build contexts: one folder per container image built further down.
TRAINING_APP_FOLDER = 'trainer_image'
BASE_IMAGE_FOLDER = 'base_image'

# exist_ok keeps the cell re-runnable when the folders are already there.
for build_context in (TRAINING_APP_FOLDER, BASE_IMAGE_FOLDER):
  os.makedirs(build_context, exist_ok=True)

### Write the tuning script. 

Notice the use of the `hypertune` package to report the `accuracy` optimization metric to the AI Platform hyperparameter tuning service.

In [5]:
%%writefile {TRAINING_APP_FOLDER}/train.py
"""Covertype Classifier trainer script."""

import pickle
import subprocess
import sys

import fire
import hypertune
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler


def train_evaluate(job_dir, training_dataset_path, validation_dataset_path,
                   alpha, max_iter, hptune):
  """Fits the Covertype classifier, then reports accuracy or saves the model.

  With a truthy `hptune` the model is fit on the training split only, scored
  on the validation split, and the accuracy is reported through `hypertune`.
  Otherwise both splits are concatenated, the model is fit on the combined
  data, pickled to `model.pkl` in the working directory and copied under
  `job_dir` with `gsutil`.

  Args:
    job_dir: Destination prefix the pickled model is copied to.
    training_dataset_path: CSV path of the training split.
    validation_dataset_path: CSV path of the validation split.
    alpha: Value for the classifier's `alpha` parameter.
    max_iter: Value for the classifier's `max_iter` parameter.
    hptune: Truthy to run as a tuning trial, falsy to train and save.
  """

  train_df = pd.read_csv(training_dataset_path)
  validation_df = pd.read_csv(validation_dataset_path)

  # Outside of tuning nothing is held out: train on both splits together.
  if not hptune:
    train_df = pd.concat([train_df, validation_df])

  numeric_features = [
      'Elevation',
      'Aspect',
      'Slope',
      'Horizontal_Distance_To_Hydrology',
      'Vertical_Distance_To_Hydrology',
      'Horizontal_Distance_To_Roadways',
      'Hillshade_9am',
      'Hillshade_Noon',
      'Hillshade_3pm',
      'Horizontal_Distance_To_Fire_Points',
  ]
  categorical_features = ['Wilderness_Area', 'Soil_Type']

  # Scale the numeric columns, one-hot encode the categorical ones.
  column_steps = [
      ('num', StandardScaler(), numeric_features),
      ('cat', OneHotEncoder(), categorical_features),
  ]
  preprocessor = ColumnTransformer(transformers=column_steps)
  classifier = SGDClassifier(loss='log')
  pipeline = Pipeline([('preprocessor', preprocessor),
                       ('classifier', classifier)])

  # Cast every numeric feature column to float64 in both frames.
  float_casts = dict.fromkeys(numeric_features, 'float64')
  train_df = train_df.astype(float_casts)
  validation_df = validation_df.astype(float_casts)

  print('Starting training: alpha={}, max_iter={}'.format(alpha, max_iter))
  features_train = train_df.drop('Cover_Type', axis=1)
  labels_train = train_df['Cover_Type']

  pipeline.set_params(classifier__alpha=alpha, classifier__max_iter=max_iter)
  pipeline.fit(features_train, labels_train)

  if hptune:
    # Tuning trial: score on the held-out split and report; nothing is saved.
    features_validation = validation_df.drop('Cover_Type', axis=1)
    labels_validation = validation_df['Cover_Type']
    accuracy = pipeline.score(features_validation, labels_validation)
    print('Model accuracy: {}'.format(accuracy))
    # Log it with hypertune
    reporter = hypertune.HyperTune()
    reporter.report_hyperparameter_tuning_metric(
        hyperparameter_metric_tag='accuracy', metric_value=accuracy)
    return

  # Regular training run: pickle the fitted pipeline locally, then copy it
  # under job_dir with gsutil (its stderr is redirected to stdout).
  model_filename = 'model.pkl'
  with open(model_filename, 'wb') as model_file:
    pickle.dump(pipeline, model_file)
  gcs_model_path = '{}/{}'.format(job_dir, model_filename)
  copy_command = ['gsutil', 'cp', model_filename, gcs_model_path]
  subprocess.check_call(copy_command, stderr=sys.stdout)
  print('Saved model in: {}'.format(gcs_model_path))


# When executed as a script, hand train_evaluate to python-fire so it is
# driven from the command line.
if __name__ == '__main__':
  fire.Fire(train_evaluate)

Writing trainer_image/train.py


### Package the script into a docker image.

Notice that the training image is derived from `mlops-dev:TF115-TFX015-KFP136`. This keeps the development environment (your AI Platform Notebook instance) and the AI Platform Training environment consistent: since the notebook instance is based on the `mlops-dev:TF115-TFX015-KFP136` image, we use the same image as the base for the training image.

Make sure to update the URI for the base image so that it points to your project's **Container Registry**.

In [6]:
%%writefile {TRAINING_APP_FOLDER}/Dockerfile

# Trainer image: the mlops-dev development image plus the packages train.py needs.
# The base image URI is hard-coded; point it at your own project's Container Registry.
FROM gcr.io/demokfp/mlops-dev:TF115-TFX015-KFP136
# train.py imports fire and hypertune; both are installed unpinned here.
RUN pip install -U fire cloudml-hypertune
WORKDIR /app
COPY train.py .

# The container runs train.py; arguments passed to the container are appended to this command.
ENTRYPOINT ["python", "train.py"]

Writing trainer_image/Dockerfile


In [7]:
%%writefile {BASE_IMAGE_FOLDER}/Dockerfile
# Base image: the mlops-dev development image with nothing added on top.
# The image URI is hard-coded; point it at your own project's Container Registry.
FROM gcr.io/demokfp/mlops-dev:TF115-TFX015-KFP136

Writing base_image/Dockerfile


## Build trainer image 

In [None]:
IMAGE_URI="gcr.io/{}/{}:latest".format(PROJECT_ID,TRAINING_APP_FOLDER)

!gcloud builds submit --timeout 15m --tag {IMAGE_URI} trainer_image

Creating temporary tarball archive of 2 file(s) totalling 2.7 KiB before compression.
Uploading tarball of [trainer_image] to [gs://demokfp_cloudbuild/source/1581396823.93-f5b15c222eea434399cf3cd408672b9e.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/demokfp/builds/a5d6a9dd-25e7-4d7a-a59b-c1e880464eae].
Logs are available at [https://console.cloud.google.com/gcr/builds/a5d6a9dd-25e7-4d7a-a59b-c1e880464eae?project=435903989237].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "a5d6a9dd-25e7-4d7a-a59b-c1e880464eae"

FETCHSOURCE
Fetching storage object: gs://demokfp_cloudbuild/source/1581396823.93-f5b15c222eea434399cf3cd408672b9e.tgz#1581396824207562
Copying gs://demokfp_cloudbuild/source/1581396823.93-f5b15c222eea434399cf3cd408672b9e.tgz#1581396824207562...
/ [1 files][  1.3 KiB/  1.3 KiB]                                                
Operation completed over 1 objects/1.3 KiB.                                      
BUILD
Alr

In [None]:
IMAGE_URI="gcr.io/{}/{}:latest".format(PROJECT_ID,BASE_IMAGE_FOLDER)

!gcloud builds submit --timeout 15m --tag {IMAGE_URI} base_image

## Compile the pipeline to a YAML file

In [8]:
# Compile the pipeline definition into the YAML package that is uploaded below.
# covertype_training_pipeline.py is not written by any cell shown in this
# notebook, so it must already exist in the working directory.
!dsl-compile --py covertype_training_pipeline.py --output covertype_training_pipeline.yaml

## Deploying the pipeline

In [9]:
# Name under which the compiled pipeline is registered.
PIPELINE_NAME='covertype_classifier_training'
# Example of a manual endpoint override (value taken from a hosted pipelines
# cluster): uncomment and replace with your own endpoint to bypass the
# hostname set in the first cell.
#INVERSE_PROXY_HOSTNAME = '653e8ece592a9e11-dot-us-central1.notebooks.googleusercontent.com'

# Upload the compiled YAML package to the pipelines endpoint.
!kfp --endpoint {INVERSE_PROXY_HOSTNAME} pipeline upload -p {PIPELINE_NAME} covertype_training_pipeline.yaml

Pipeline 31935dc5-2ceb-491a-8c3f-0ff92fc09e5f has been submitted

Pipeline Details
------------------
ID           31935dc5-2ceb-491a-8c3f-0ff92fc09e5f
Name         covertype_classifier_training
Description
Uploaded at  2020-02-11T04:55:53+00:00
+-----------------------------+--------------------------------------------------+
| Parameter Name              | Default Value                                    |
| project_id                  |                                                  |
+-----------------------------+--------------------------------------------------+
| region                      |                                                  |
+-----------------------------+--------------------------------------------------+
| source_table_name           |                                                  |
+-----------------------------+--------------------------------------------------+
| gcs_root                    |                                                  |
+------

In [None]:
# List the pipelines known to the endpoint, e.g. to look up the pipeline ID
# needed when submitting a run.
!kfp --endpoint {INVERSE_PROXY_HOSTNAME} pipeline list

## Run Experiment 

In [10]:
# ID of the uploaded pipeline. This literal matches the ID printed by the
# upload cell's captured output; presumably a fresh upload yields a different
# ID, so update it before submitting a run (TODO confirm / derive it).
PIPELINE_ID='31935dc5-2ceb-491a-8c3f-0ff92fc09e5f'

# Values interpolated into the `kfp run submit` command in the next cell.
EXPERIMENT_NAME='Covertype_Classifier_Training'
RUN_ID='Run_001'
SOURCE_TABLE='covertype_dataset.covertype'
# NOTE: rebinds DATASET_ID, which the BigQuery load cell set to
# 'covertype_dataset'; from here on it is the pipeline's dataset_id argument.
DATASET_ID='splits'
EVALUATION_METRIC='accuracy'
# Kept as a string: it is only ever interpolated into the command line.
EVALUATION_METRIC_THRESHOLD='0.69'
MODEL_ID='covertype_classifier'
VERSION_ID='v01'
REPLACE_EXISTING_VERSION=True

In [11]:
!kfp --endpoint {INVERSE_PROXY_HOSTNAME} run submit \
-e Covertype_Classifier_Training \
-r {RUN_ID} \
-p {PIPELINE_ID} \
project_id={PROJECT_ID} \
gcs_root={GCS_STAGING_PATH} \
region={REGION} \
source_table_name={SOURCE_TABLE} \
dataset_id={DATASET_ID} \
evaluation_metric_name={EVALUATION_METRIC} \
evaluation_metric_threshold={EVALUATION_METRIC_THRESHOLD} \
model_id={MODEL_ID} \
version_id={VERSION_ID} \
replace_existing_version={REPLACE_EXISTING_VERSION}

Run 498ce7c7-db1e-4729-bef7-3f1749397f9e is submitted
+--------------------------------------+---------+----------+---------------------------+
| run id                               | name    | status   | created at                |
| 498ce7c7-db1e-4729-bef7-3f1749397f9e | Run_001 |          | 2020-02-11T04:56:09+00:00 |
+--------------------------------------+---------+----------+---------------------------+
