# Part 1 - Experimentation



In [50]:
import os
import numpy as np
import pandas as pd
import uuid
import time
import tempfile

from googleapiclient import discovery
from googleapiclient import errors

from google.cloud import bigquery
from jinja2 import Template
from kfp.components import func_to_container_op
from typing import NamedTuple

from sklearn.metrics import accuracy_score
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

## Configure environment settings
Make sure to update the constants to reflect your environment settings.

In [51]:
# Google Cloud project, locations and cluster used by the lab.
PROJECT_ID = 'mlops-workshop'
DATASET_LOCATION = 'US'
CLUSTER_NAME = 'mlops-workshop-cluster'
CLUSTER_ZONE = 'us-central1-a'
REGION = 'us-central1'

# BigQuery dataset, source table and the tables that hold the three splits.
DATASET_ID = 'lab_12'
SOURCE_TABLE_ID = 'covertype'
TRAINING_TABLE_ID = 'training_split'
VALIDATION_TABLE_ID = 'validation_split'
TESTING_TABLE_ID = 'testing_split'

# GCS bucket of the lab and the CSV files the split tables are extracted to.
LAB_GCS_BUCKET = 'gs://mlops-workshop-lab-12'
TRAINING_FILE_PATH = '{}/datasets/training/data.csv'.format(LAB_GCS_BUCKET)
VALIDATION_FILE_PATH = '{}/datasets/validation/data.csv'.format(LAB_GCS_BUCKET)
TESTING_FILE_PATH = '{}/datasets/testing/data.csv'.format(LAB_GCS_BUCKET)

## Explore the source dataset 
Retrieve a few rows from the source dataset.

In [10]:
# This client is reused by every BigQuery call in the rest of the notebook.
client = bigquery.Client(project=PROJECT_ID, location=DATASET_LOCATION)

query_template = """
SELECT *
FROM `{{ source_table }}`
LIMIT 10
"""

# Fully qualified table name: <project>.<dataset>.<table>
source_table = '.'.join([PROJECT_ID, DATASET_ID, SOURCE_TABLE_ID])
query = Template(query_template).render(source_table=source_table)
df = client.query(query).to_dataframe()

# Keep the column count; the row-count cell prints it next to the number of rows.
num_of_columns = df.shape[1]
df

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type,Cover_Type
0,3094,82,65,42,3,3001,193,0,0,1315,Commanche,7202,2
1,3083,105,57,0,0,3002,228,0,0,1350,Commanche,7202,2
2,3159,60,37,150,0,3045,220,0,17,1177,Commanche,7756,2
3,3158,73,62,170,-4,3042,191,0,0,1187,Commanche,7756,2
4,3147,96,59,216,-6,3037,220,0,0,1209,Commanche,7756,2
5,2506,13,64,201,88,655,73,30,0,1470,Commanche,4703,2
6,2501,3,63,216,81,626,55,40,0,1470,Commanche,4703,2
7,3281,38,59,150,123,3012,137,42,0,1159,Commanche,7756,2
8,2500,0,62,234,83,598,54,45,67,1471,Commanche,4703,2
9,2555,3,60,190,135,684,67,53,65,1470,Commanche,4703,2


Count the number of rows in the source table and print it together with the number of columns found above.

In [11]:
query_template = """
SELECT count(*)
FROM `{{ source_table }}`
"""

source_table = '.'.join([PROJECT_ID, DATASET_ID, SOURCE_TABLE_ID])
query = Template(query_template).render(source_table=source_table)
df = client.query(query).to_dataframe()

# The result has a single row and a single column: the row count.
number_of_rows_in_full_dataset = df.iloc[0, 0]
# num_of_columns comes from the cell that previewed the source table.
print('{} x {}'.format(number_of_rows_in_full_dataset, num_of_columns))

581012 x 13


## Create the training, validation and testing splits
Define the sampling query template.

In [12]:
# Jinja2 template that assigns every source row to one of `num_lots` lots by
# hashing the whole row (FARM_FINGERPRINT of its JSON form) and keeps only the
# rows whose lot is listed in `lots_to_select`. The same row always lands in
# the same lot, so the splits are repeatable.
#
# MOD is applied before ABS on purpose: ABS on the raw 64-bit fingerprint
# raises an overflow error for the smallest INT64 value, while the remainder
# is always small enough to negate. The selected lot is unchanged for every
# other fingerprint.
sampling_query_template = """
SELECT *
FROM
  `{{ source_table }}` AS cover
WHERE
  ABS(MOD(FARM_FINGERPRINT(TO_JSON_STRING(cover)), {{ num_lots }})) in {{ lots_to_select }}
"""

Configure the sampling query job settings.

In [13]:
# Shared settings for the split queries: create the destination table when it
# is missing and replace its contents when it already exists.
job_config = bigquery.QueryJobConfig(
    create_disposition=bigquery.job.CreateDisposition.CREATE_IF_NEEDED,
    write_disposition=bigquery.job.WriteDisposition.WRITE_TRUNCATE,
)
# Reference to the lab dataset; the split cells derive their table references from it.
dataset_ref = client.dataset(DATASET_ID)

Create the training split table.

In [14]:
# Training split: lots 1, 2 and 3 out of 10.
source_table = '{}.{}.{}'.format(PROJECT_ID, DATASET_ID, SOURCE_TABLE_ID)
query = Template(sampling_query_template).render(
    source_table=source_table, num_lots=10, lots_to_select='(1, 2, 3)')

# The shared job_config is pointed at the training table before the query runs.
job_config.destination = dataset_ref.table(TRAINING_TABLE_ID)
client.query(query, job_config=job_config).result()

<google.cloud.bigquery.table.RowIterator at 0x7f2bfe866828>

Extract the training split table to GCS.

In [15]:
# Export the training split table to its GCS file and wait for the job to finish.
training_table_ref = dataset_ref.table(TRAINING_TABLE_ID)
extract_job = client.extract_table(training_table_ref, TRAINING_FILE_PATH)
extract_job.result()

<google.cloud.bigquery.job.ExtractJob at 0x7f2bfe8fa860>

Inspect the extracted file.

In [16]:
# Print only the first bytes (range 0-500) of the extracted training file.
!gsutil cat -r 0-500 {TRAINING_FILE_PATH}

Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type,Cover_Type
3222,0,0,120,1,3207,218,237,156,1698,Rawah,7201,1
3211,180,0,437,30,5878,219,238,157,2230,Rawah,7201,2
3283,225,0,511,25,6031,218,238,157,631,Rawah,7201,1
3211,0,0,418,30,5862,218,238,156,2210,Rawah,7201,2
3135,135,0,192,5,306,219,238,156,2790,Neota,7201,1
3068,0,0,4

Create the validation split table.

In [17]:
# Validation split: lot 8 out of 10.
source_table = '{}.{}.{}'.format(PROJECT_ID, DATASET_ID, SOURCE_TABLE_ID)
query = Template(sampling_query_template).render(
    source_table=source_table, num_lots=10, lots_to_select='(8)')

# The shared job_config is pointed at the validation table before the query runs.
job_config.destination = dataset_ref.table(VALIDATION_TABLE_ID)
client.query(query, job_config=job_config).result()

<google.cloud.bigquery.table.RowIterator at 0x7f2bfe86ffd0>

Extract the validation split table to GCS.

In [18]:
# Export the validation split table to its GCS file and wait for the job to finish.
validation_table_ref = dataset_ref.table(VALIDATION_TABLE_ID)
extract_job = client.extract_table(validation_table_ref, VALIDATION_FILE_PATH)
extract_job.result()

<google.cloud.bigquery.job.ExtractJob at 0x7f2c1a541dd8>

Create the testing split table.

In [19]:
# Testing split: lot 9 out of 10.
source_table = '{}.{}.{}'.format(PROJECT_ID, DATASET_ID, SOURCE_TABLE_ID)
query = Template(sampling_query_template).render(
    source_table=source_table, num_lots=10, lots_to_select='(9)')

# The shared job_config is pointed at the testing table before the query runs.
job_config.destination = dataset_ref.table(TESTING_TABLE_ID)
client.query(query, job_config=job_config).result()

<google.cloud.bigquery.table.RowIterator at 0x7f2bfe87add8>

Extract the testing split table to GCS.

In [20]:
# Export the testing split table to its GCS file and wait for the job to finish.
testing_table_ref = dataset_ref.table(TESTING_TABLE_ID)
extract_job = client.extract_table(testing_table_ref, TESTING_FILE_PATH)
extract_job.result()

<google.cloud.bigquery.job.ExtractJob at 0x7f2bfe883da0>

## Develop the training script

Configure the `sklearn` training pipeline.

In [21]:
# Columns passed through StandardScaler.
numeric_features = ['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
       'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
       'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
       'Horizontal_Distance_To_Fire_Points']
# Columns passed through OneHotEncoder.
categorical_features = ['Wilderness_Area', 'Soil_Type']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        # handle_unknown='ignore': a category that appears only in the data
        # being scored (and not in the training split) is encoded as all
        # zeros instead of raising an error during transform.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Preprocessing followed by a linear classifier trained with SGD and log loss.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', SGDClassifier(loss='log'))
])

Run the pipeline locally.

In [22]:
# Load the extracted training and validation splits.
df_train = pd.read_csv(TRAINING_FILE_PATH)
df_validation = pd.read_csv(VALIDATION_FILE_PATH)

# Cover_Type is the label; every other column is a feature.
y_train = df_train['Cover_Type']
X_train = df_train.drop(columns=['Cover_Type'])
y_validation = df_validation['Cover_Type']
X_validation = df_validation.drop(columns=['Cover_Type'])

# Fix the classifier hyperparameters, fit, then report the validation score.
pipeline.set_params(classifier__alpha=0.001, classifier__max_iter=1000).fit(X_train, y_train)
pipeline.score(X_validation, y_validation)

0.7036413422453527

#### Prepare the hyperparameter tuning application.
Since the training run on this dataset is computationally expensive, you can benefit from running a distributed hyperparameter tuning job on AI Platform Training.

In [52]:
# Local folder that collects train.py, the Dockerfile and the tuning config.
TRAINING_APP_FOLDER = 'training_app'
if not os.path.isdir(TRAINING_APP_FOLDER):
    os.makedirs(TRAINING_APP_FOLDER)

Write the tuning script.

In [59]:
%%writefile {TRAINING_APP_FOLDER}/train.py

import os
import subprocess
import sys

import fire
import pickle
import numpy as np
import pandas as pd

import hypertune

from sklearn.compose import ColumnTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


def train_evaluate(job_dir, training_dataset_path, validation_dataset_path, alpha, max_iter, hptune):
  """Trains the Cover_Type classifier, then reports a tuning metric or saves the model.

  Args:
    job_dir: Destination prefix for the model; used only when `hptune` is
      falsy, in which case `model.pkl` is copied to `<job_dir>/model.pkl`.
    training_dataset_path: Path of the training CSV, read with `pd.read_csv`.
    validation_dataset_path: Path of the validation CSV, read with `pd.read_csv`.
    alpha: Value set on the classifier's `alpha` parameter.
    max_iter: Value set on the classifier's `max_iter` parameter.
    hptune: If truthy, train on the training split only, score on the
      validation split and report the accuracy through `hypertune`. If falsy,
      train on both splits combined and upload the pickled pipeline.

  Raises:
    subprocess.CalledProcessError: If the `gsutil cp` upload exits with a
      non-zero status.
  """
  df_train = pd.read_csv(training_dataset_path)
  df_validation = pd.read_csv(validation_dataset_path)
  if not hptune:
    # Final (non-tuning) run: fit on training and validation data together.
    df_train = pd.concat([df_train, df_validation])

  numeric_features = ['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
    'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points']

  categorical_features = ['Wilderness_Area', 'Soil_Type']

  preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        # handle_unknown='ignore': a category present only in the data being
        # scored is encoded as all zeros instead of aborting the trial with
        # an error during transform.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

  pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', SGDClassifier(loss='log'))
  ])

  print('Starting training: alpha={}, max_iter={}'.format(alpha, max_iter))
  X_train = df_train.drop('Cover_Type', axis=1)
  y_train = df_train['Cover_Type']

  pipeline.set_params(classifier__alpha=alpha, classifier__max_iter=max_iter)
  pipeline.fit(X_train, y_train)

  if hptune:
    X_validation = df_validation.drop('Cover_Type', axis=1)
    y_validation = df_validation['Cover_Type']
    accuracy = pipeline.score(X_validation, y_validation)
    print('Model accuracy: {}'.format(accuracy))
    # Log it with hypertune; the tag must match hyperparameterMetricTag in
    # the tuning configuration.
    hpt = hypertune.HyperTune()
    hpt.report_hyperparameter_tuning_metric(
      hyperparameter_metric_tag='accuracy',
      metric_value=accuracy
    )
  else:
    # Save the model: pickle it locally, then copy the file under job_dir.
    model_filename = 'model.pkl'
    with open(model_filename, 'wb') as model_file:
        pickle.dump(pipeline, model_file)
    gcs_model_path = "{}/{}".format(job_dir, model_filename)
    # stderr is redirected to stdout so gsutil's progress lines end up in the
    # same stream as the print() output.
    subprocess.check_call(['gsutil', 'cp', model_filename, gcs_model_path], stderr=sys.stdout)
    print("Saved model in: {}".format(gcs_model_path))

if __name__ == "__main__":
  # Expose train_evaluate on the command line; flags such as
  # --training_dataset_path=... supply its arguments.
  fire.Fire(train_evaluate)

Overwriting training_app/train.py


Package the script into a Docker image.

In [60]:
%%writefile {TRAINING_APP_FOLDER}/Dockerfile

# Base image; no tag is given, so the registry's default tag is pulled.
FROM gcr.io/deeplearning-platform-release/base-cpu
# train.py imports `fire` and `hypertune`; both are installed unpinned (-U upgrades to the newest release).
# NOTE(review): consider pinning versions so rebuilds are reproducible.
RUN pip install -U fire cloudml-hypertune
# The training script lives in /app, which is also the working directory at run time.
WORKDIR /app
COPY train.py .

# Arguments passed to the container are appended to `python train.py`.
ENTRYPOINT ["python", "train.py"]

Overwriting training_app/Dockerfile


Build the Docker image.

In [61]:
IMAGE_NAME='trainer_image'
IMAGE_TAG='latest'
IMAGE_URI='gcr.io/{}/{}:{}'.format(PROJECT_ID, IMAGE_NAME, IMAGE_TAG)

!gcloud builds submit --tag $IMAGE_URI $TRAINING_APP_FOLDER

Creating temporary tarball archive of 4 file(s) totalling 5.1 KiB before compression.
Uploading tarball of [training_app] to [gs://mlops-workshop_cloudbuild/source/1576429568.5-185f0a560eb649c3b1bf1d7a7837bf23.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/mlops-workshop/builds/45134437-e6df-48ba-b5f5-38c492f099e6].
Logs are available at [https://console.cloud.google.com/gcr/builds/45134437-e6df-48ba-b5f5-38c492f099e6?project=745302968357].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "45134437-e6df-48ba-b5f5-38c492f099e6"

FETCHSOURCE
Fetching storage object: gs://mlops-workshop_cloudbuild/source/1576429568.5-185f0a560eb649c3b1bf1d7a7837bf23.tgz#1576429568970317
Copying gs://mlops-workshop_cloudbuild/source/1576429568.5-185f0a560eb649c3b1bf1d7a7837bf23.tgz#1576429568970317...
/ [1 files][  1.5 KiB/  1.5 KiB]                                                
Operation completed over 1 objects/1.5 KiB.                        

Create the hyperparameter configuration file.

In [62]:
%%writefile {TRAINING_APP_FOLDER}/hptuning_config.yaml

# Hyperparameter tuning specification passed to the training job via --config.
trainingInput:
  hyperparameters:
    # Objective: maximize the metric that train.py reports under the tag "accuracy".
    hyperparameterMetricTag: accuracy
    goal: MAXIMIZE
    # Trial budget.
    maxTrials: 6
    maxParallelTrials: 3
    enableTrialEarlyStopping: TRUE
    # Search space; each parameter name matches an argument of train_evaluate.
    params:
    - parameterName: max_iter
      type: DISCRETE
      discreteValues: [500, 1000]
    # NOTE(review): alpha spans three orders of magnitude; a log scale might
    # cover the range more evenly - confirm before changing.
    - parameterName: alpha
      type: DOUBLE
      minValue: 0.00001
      maxValue: 0.01
      scaleType: UNIT_LINEAR_SCALE

Overwriting training_app/hptuning_config.yaml


#### Submit the hyperparameter tuning job.

In [44]:
JOB_NAME = "JOB_{}".format(time.strftime("%Y%m%d_%H%M%S"))
JOB_DIR = "{}/{}".format(LAB_GCS_BUCKET, JOB_NAME)
SCALE_TIER = "BASIC"

!gcloud ai-platform jobs submit training $JOB_NAME \
--region=$REGION \
--job-dir=$LAB_GCS_BUCKET/$JOB_NAME \
--master-image-uri=$IMAGE_URI \
--scale-tier=$SCALE_TIER \
--config $TRAINING_APP_FOLDER/hptuning_config.yaml \
-- \
--training_dataset_path=$TRAINING_FILE_PATH \
--validation_dataset_path=$VALIDATION_FILE_PATH \
--hptune

Job [JOB_20191215_051415] submitted successfully.
Your job is still active. You may view the status of your job with the command

  $ gcloud ai-platform jobs describe JOB_20191215_051415

or continue streaming the logs with the command

  $ gcloud ai-platform jobs stream-logs JOB_20191215_051415
jobId: JOB_20191215_051415
state: QUEUED


#### Monitor the job.

In [64]:
# Show the current state and the training input of the job named by JOB_NAME.
!gcloud ai-platform jobs describe $JOB_NAME

createTime: '2019-12-15T17:10:20Z'
etag: a1Z-8V4yK7I=
jobId: JOB_20191215_171018
state: PREPARING
trainingInput:
  args:
  - --training_dataset_path=gs://mlops-workshop-lab-12/datasets/training/data.csv
  - --validation_dataset_path=gs://mlops-workshop-lab-12/datasets/validation/data.csv
  - --alpha
  - '0.001'
  - --max_iter
  - '200'
  - --hptune
  - 'False'
  jobDir: gs://mlops-workshop-lab-12/JOB_20191215_171018
  masterConfig:
    imageUri: gcr.io/mlops-workshop/trainer_image:latest
  region: us-central1
trainingOutput: {}

View job in the Cloud Console at:
https://console.cloud.google.com/mlengine/jobs/JOB_20191215_171018?project=mlops-workshop

View logs at:
https://console.cloud.google.com/logs?resource=ml.googleapis.com%2Fjob_id%2FJOB_20191215_171018&project=mlops-workshop


In [65]:
# Stream the log entries of the job named by JOB_NAME into the cell output.
!gcloud ai-platform jobs stream-logs $JOB_NAME

INFO	2019-12-15 17:10:20 +0000	service		Validating job requirements...
INFO	2019-12-15 17:10:20 +0000	service		Job creation request has been successfully validated.
INFO	2019-12-15 17:10:20 +0000	service		Job JOB_20191215_171018 is queued.
INFO	2019-12-15 17:10:21 +0000	service		Waiting for job to be provisioned.
INFO	2019-12-15 17:10:22 +0000	service		Waiting for training program to start.
INFO	2019-12-15 17:13:41 +0000	master-replica-0		Copying file://model.pkl [Content-Type=application/octet-stream]...
INFO	2019-12-15 17:13:41 +0000	master-replica-0		/ [0 files][    0.0 B/  6.8 KiB]                                                
INFO	2019-12-15 17:13:41 +0000	master-replica-0		/ [1 files][  6.8 KiB/  6.8 KiB]                                                
INFO	2019-12-15 17:13:41 +0000	master-replica-0		Operation completed over 1 objects/6.8 KiB.                                      
INFO	2019-12-15 17:13:41 +0000	master-replica-0		Starting training: alpha=0.001, max_iter=200
INFO

### Retrieve HP-tuning results.

Call the AI Platform Training endpoint.

In [48]:
# Client for version v1 of the 'ml' API, through which the job is looked up.
ml = discovery.build('ml', 'v1')

job_id = 'projects/{}/jobs/{}'.format(PROJECT_ID, JOB_NAME)
request = ml.projects().jobs().get(name=job_id)

# Reset first: if the call fails, `response` must not keep the value of an
# earlier run (or be undefined on a fresh kernel) when later cells read it.
response = None
try:
    response = request.execute()
except errors.HttpError as err:
    print(err)
except Exception as err:
    # `Exception` rather than a bare `except:` so that KeyboardInterrupt and
    # SystemExit still propagate; the cause is printed instead of hidden.
    print("Unexpected error: {}".format(err))

response

{'jobId': 'JOB_20191215_051415',
 'trainingInput': {'args': ['--training_dataset_path=gs://mlops-workshop-lab-12/datasets/training/data.csv',
   '--validation_dataset_path=gs://mlops-workshop-lab-12/datasets/validation/data.csv',
   '--hptune'],
  'hyperparameters': {'goal': 'MAXIMIZE',
   'params': [{'parameterName': 'max_iter',
     'type': 'DISCRETE',
     'discreteValues': [500, 1000]},
    {'parameterName': 'alpha',
     'minValue': 1e-05,
     'maxValue': 0.01,
     'type': 'DOUBLE',
     'scaleType': 'UNIT_LINEAR_SCALE'}],
   'maxTrials': 6,
   'maxParallelTrials': 3,
   'hyperparameterMetricTag': 'accuracy',
   'enableTrialEarlyStopping': True},
  'region': 'us-central1',
  'jobDir': 'gs://mlops-workshop-lab-12/JOB_20191215_051415',
  'masterConfig': {'imageUri': 'gcr.io/mlops-workshop/trainer_image:latest'}},
 'createTime': '2019-12-15T05:14:22Z',
 'startTime': '2019-12-15T05:14:26Z',
 'endTime': '2019-12-15T05:30:51Z',
 'state': 'SUCCEEDED',
 'trainingOutput': {'completedTria

Retrieve the best run.

In [49]:
# NOTE(review): taking the first entry assumes the service lists trials
# best-first - confirm against the API documentation.
trials = response['trainingOutput']['trials']
trials[0]

{'trialId': '5',
 'hyperparameters': {'max_iter': '1000', 'alpha': '0.00089571222662925725'},
 'finalMetric': {'trainingStep': '1', 'objectiveValue': 0.7016004217902274},
 'startTime': '2019-12-15T05:22:43.048295659Z',
 'endTime': '2019-12-15T05:29:43Z',
 'state': 'SUCCEEDED'}