# Create and Commit Artifacts 

In [66]:
PROJECT_ID = 'marinastestproject'
PREFIX = PROJECT_ID
REGION = 'us-central1'

DATA_ROOT = 'gs://workshop-datasets/covertype'
TRAINING_FILE_PATH = DATA_ROOT + '/training/dataset.csv'
VALIDATION_FILE_PATH = DATA_ROOT + '/evaluation/dataset.csv'

# Job dir for AI Platform Training
JOB_DIR_ROOT='gs://{}-artifact-store/jobs'.format(PREFIX)


NAMESPACE='kubeflow'
ZONE='us-central1-a'
ARTIFACT_STORE_URI='gs://{}-artifact-store'.format(PREFIX)
GCS_STAGING_PATH='{}/staging'.format(ARTIFACT_STORE_URI)
GKE_CLUSTER_NAME='{}-cluster'.format(PREFIX)

!gcloud container clusters get-credentials $GKE_CLUSTER_NAME --zone $ZONE
HOST_TEMP=!(kubectl describe configmap inverse-proxy-config -n $NAMESPACE | grep "googleusercontent.com")
INVERSE_PROXY_HOSTNAME=HOST_TEMP[0]

Fetching cluster endpoint and auth data.
kubeconfig entry generated for marinastestproject-cluster.


## Import data set to BQ

In [36]:
DATASET_LOCATION='US'
DATASET_ID='covertype_dataset'
TABLE_ID='covertype'
DATA_SOURCE='gs://workshop-datasets/covertype/full/dataset.csv'
SCHEMA='Elevation:INTEGER,\
Aspect:INTEGER,\
Slope:INTEGER,\
Horizontal_Distance_To_Hydrology:INTEGER,\
Vertical_Distance_To_Hydrology:INTEGER,\
Horizontal_Distance_To_Roadways:INTEGER,\
Hillshade_9am:INTEGER,\
Hillshade_Noon:INTEGER,\
Hillshade_3pm:INTEGER,\
Horizontal_Distance_To_Fire_Points:INTEGER,\
Wilderness_Area:STRING,\
Soil_Type:STRING,\
Cover_Type:INTEGER'

!bq --location=$DATASET_LOCATION --project_id=$PROJECT_ID mk --dataset $DATASET_ID
!bq --project_id=$PROJECT_ID --dataset_id=$DATASET_ID load \
--source_format=CSV \
--skip_leading_rows=1 \
--replace \
$TABLE_ID \
$DATA_SOURCE \
$SCHEMA

BigQuery error in mk operation: Dataset 'marinastestproject:covertype_dataset'
already exists.
Waiting on bqjob_r87f8f2d80286a96_000001702c6f17c4_1 ... (9s) Current status: DONE   


### Prepare the hyperparameter tuning application.
Since the training run on this dataset is computationally expensive, you can benefit from running a distributed hyperparameter tuning job on AI Platform Training.

In [37]:
# `os` is not imported by any earlier cell, so import it here to keep the
# cell runnable on a fresh kernel.
import os

# Local folder that collects the training application files
# (train.py, Dockerfile, build.sh) written by the cells below.
TRAINING_APP_FOLDER = 'training_app'
os.makedirs(TRAINING_APP_FOLDER, exist_ok=True)

### Write the tuning script. 

Notice the use of the `hypertune` package to report the `accuracy` optimization metric to the AI Platform hyperparameter tuning service.

In [38]:
%%writefile {TRAINING_APP_FOLDER}/train.py

import os
import subprocess
import sys

import fire
import pickle
import numpy as np
import pandas as pd

import hypertune

from sklearn.compose import ColumnTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


def train_evaluate(job_dir, training_dataset_path, validation_dataset_path, alpha, max_iter, hptune):
  """Train the SGD classification pipeline and either tune or export it.

  Args:
    job_dir: Destination prefix the pickled model is copied to with
      `gsutil cp` (only used when `hptune` is falsy).
    training_dataset_path: Path of the training CSV, read with pandas.
    validation_dataset_path: Path of the validation CSV, read with pandas.
    alpha: Value assigned to the classifier's `alpha` parameter.
    max_iter: Value assigned to the classifier's `max_iter` parameter.
    hptune: When truthy, fit on the training set only, score on the
      validation set and report the accuracy through `hypertune`.
      When falsy, fit on training + validation combined and save the
      fitted pipeline as `model.pkl` under `job_dir`.
  """
  label_column = 'Cover_Type'
  # Positional column ranges: the first ten columns are scaled,
  # the next two are one-hot encoded.
  numeric_columns = slice(0, 10)
  categorical_columns = slice(10, 12)

  train_frame = pd.read_csv(training_dataset_path)
  validation_frame = pd.read_csv(validation_dataset_path)

  # Outside of tuning runs the validation rows are folded into the training data.
  if not hptune:
    train_frame = pd.concat([train_frame, validation_frame])

  model = Pipeline([
    ('preprocessor', ColumnTransformer(
      transformers=[
        ('num', StandardScaler(), numeric_columns),
        ('cat', OneHotEncoder(), categorical_columns),
      ])),
    ('classifier', SGDClassifier(loss='log')),
  ])

  # Cast the numeric feature columns to float64 in both frames.
  float_casts = dict.fromkeys(train_frame.columns[numeric_columns], 'float64')
  train_frame = train_frame.astype(float_casts)
  validation_frame = validation_frame.astype(float_casts)

  print('Starting training: alpha={}, max_iter={}'.format(alpha, max_iter))
  train_features = train_frame.drop(label_column, axis=1)
  train_labels = train_frame[label_column]

  model.set_params(classifier__alpha=alpha, classifier__max_iter=max_iter)
  model.fit(train_features, train_labels)

  if hptune:
    # Tuning run: score on the held-out validation set and report the metric.
    accuracy = model.score(
      validation_frame.drop(label_column, axis=1),
      validation_frame[label_column])
    print('Model accuracy: {}'.format(accuracy))
    # Log it with hypertune
    reporter = hypertune.HyperTune()
    reporter.report_hyperparameter_tuning_metric(
      hyperparameter_metric_tag='accuracy',
      metric_value=accuracy
    )
  else:
    # Regular run: pickle the fitted pipeline locally, then copy it to job_dir.
    model_filename = 'model.pkl'
    with open(model_filename, 'wb') as model_file:
      pickle.dump(model, model_file)
    gcs_model_path = "{}/{}".format(job_dir, model_filename)
    subprocess.check_call(['gsutil', 'cp', model_filename, gcs_model_path], stderr=sys.stdout)
    print("Saved model in: {}".format(gcs_model_path))
    
# When run as a script, hand train_evaluate to python-fire, which exposes it
# as the command-line entry point.
if __name__ == "__main__":
  fire.Fire(train_evaluate)

Overwriting training_app/train.py


### Package the script into a docker image.

Notice that the training image is a derivative of `mlops-dev:TF115-TFX015-KFP136`. The reason is to make sure that the development environment (your AI Platform Notebook instance) and the AI Platform Training environment are consistent. Since the AI Platform Notebook instance is based on the `mlops-dev:TF115-TFX015-KFP136` image we use the same image as a base for the training image. 

Make sure to update the URI for the base image so that it points to your project's **Container Registry**.

In [39]:
%%writefile {TRAINING_APP_FOLDER}/Dockerfile

# Base image shared with the development environment; update the project in
# the URI so that it points to your own Container Registry.
FROM gcr.io/marinastestproject/mlops-dev:TF115-TFX015-KFP136
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir -U fire cloudml-hypertune
WORKDIR /app
COPY train.py .

# Container arguments are appended to this command and passed to train.py.
ENTRYPOINT ["python", "train.py"]

Overwriting training_app/Dockerfile


### Build the docker image. 

You use **Cloud Build** to build the image and push it to your project's **Container Registry**. As you use the remote cloud service to build the image, you don't need a local installation of Docker.

In [40]:
%%writefile {TRAINING_APP_FOLDER}/build.sh
#!/bin/bash
# Build the trainer image with Cloud Build and push it to Container Registry.
#
# %%writefile stores this cell verbatim, so the body must be plain shell:
# notebook syntax such as `!gcloud ...` or Python `.format()` is not valid here.
#
# Usage: [PROJECT_ID=<project>] bash build.sh
set -euo pipefail

# Use PROJECT_ID from the environment, falling back to the active gcloud project.
PROJECT_ID="${PROJECT_ID:-$(gcloud config get-value project)}"
IMAGE_NAME='trainer_image'
IMAGE_TAG='latest'
IMAGE_URI="gcr.io/${PROJECT_ID}/${IMAGE_NAME}:${IMAGE_TAG}"

# The build context is the folder holding this script (next to Dockerfile and train.py).
gcloud builds submit --tag "${IMAGE_URI}" "$(dirname "$0")"

Overwriting training_app/build.sh


## Create Yaml file of the pipeline

In [62]:
# Compile the pipeline definition in covertype_training_pipeline.py into
# covertype_training_pipeline.yaml.
!dsl-compile --py covertype_training_pipeline.py --output covertype_training_pipeline.yaml

## Deploying the pipeline

In [69]:
PIPELINE_NAME='covertype_classifier_training'
print(INVERSE_PROXY_HOSTNAME)

!kfp --endpoint ea5523c7ab1be57-dot-us-central1.notebooks.googleusercontent.com pipeline upload -p covertype_classifier_training covertype_training_pipeline.yaml

ea5523c7ab1be57-dot-us-central1.notebooks.googleusercontent.com
(500)
Reason: Internal Server Error
HTTP response headers: HTTPHeaderDict({'Date': 'Mon, 10 Feb 2020 01:05:19 GMT', 'X-Powered-By': 'Express', 'Content-Length': '1461', 'Content-Type': 'text/html; charset=utf-8', 'X-Content-Type-Options': 'nosniff', 'Set-Cookie': 'S=cloud_datalab_tunnel=BPZg3FVL7d1zQ7wgA4-a0ht_i9u2QzOo; Path=/; Max-Age=3600', 'X-Xss-Protection': '0', 'X-Frame-Options': 'SAMEORIGIN'})
HTTP response body: 
<!DOCTYPE html>
<html lang=en>
  <meta charset=utf-8>
  <meta name=viewport content="initial-scale=1, minimum-scale=1, width=device-width">
  <title>Error 500 (Internal Server Error)!!1</title>
  <style>
    *{margin:0;padding:0}html,code{font:15px/22px arial,sans-serif}html{background:#fff;color:#222;padding:15px}body{margin:7% auto 0;max-width:390px;min-height:180px;padding:30px 0 15px}* > body{background:url(//www.google.com/images/errors/robot.png) 100% 5px no-repeat;padding-right:205px}p{margin:11px 0