In [None]:
import os
import time
import pandas as pd
import json

from googleapiclient import discovery
from googleapiclient import errors

from sklearn.model_selection import train_test_split

## Configure environment settings

In [None]:
REGION = 'us-central1'

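Note: when more than one `gcloud` configuration exists, `gcloud` prints an extra informational line (`Your active configuration is: [...]`) before the project ID. In that case the first captured line is not the project ID; the project ID is on the second line (index 1).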

In [118]:
PROJECT_ID = !(gcloud config get-value core/project)
PROJECT_ID = PROJECT_ID[0]
PROJECT_ID

MODEL_DATA = f"{PROJECT_ID}-fraud-data"
MODEL_STORE = f"{PROJECT_ID}-fraud-model"

In [119]:
print(f"active project: {PROJECT_ID}\nmodel_data:\t{MODEL_DATA}\nmodel_store:\t{MODEL_STORE}")

active project: Your active configuration is: [sovaml]
model_data:	Your active configuration is: [sovaml]-fraud-data
model_store:	Your active configuration is: [sovaml]-fraud-model


In [None]:
!gsutil mb -l $REGION gs://$MODEL_DATA/
!gsutil mb -l $REGION gs://$MODEL_STORE/

In [None]:
!gsutil ls 

## Create and store the training and testing sets

In [None]:
data = pd.read_csv('../dataset/creditcard.csv')
data.head()

In [None]:
X = data.drop(columns=['Class'])
y = data['Class']

Since the data is highly imbalanced, we must use stratified sampling to make sure that both the training and the testing sets contain negative and positive samples.

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0, stratify=y)

In [None]:
pd.concat([X_train, y_train], axis=1).to_csv('../dataset/training.csv', index=False)
pd.concat([X_test, y_test], axis=1).to_csv('../dataset/testing.csv', index=False)

In [None]:
!gsutil cp ../dataset/training.csv gs://$MODEL_DATA/
!gsutil cp ../dataset/testing.csv gs://$MODEL_DATA/    

In [None]:
!gsutil ls gs://$MODEL_DATA/

## Develop a training application

In [None]:
TRAINING_APP_FOLDER = 'training_app'
os.makedirs(TRAINING_APP_FOLDER, exist_ok=True)

### Write the tuning script. 

Notice the use of the `hypertune` package to report the `roc_auc` optimization metric to AI Platform hyperparameter tuning service.

In [None]:
%%writefile {TRAINING_APP_FOLDER}/train.py
import os
import subprocess
import sys

import fire
import pickle
import numpy as np
import pandas as pd

import hypertune

from sklearn.metrics import roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier


def train_evaluate(
    job_dir, training_dataset_path, validation_dataset_path, max_depth, n_estimators, hptune,
    random_state=0,
):
    """Train a random-forest pipeline and either report its AUC or export it.

    Args:
        job_dir: Destination prefix; the pickled model is copied to
            ``<job_dir>/model.pkl`` with ``gsutil`` when ``hptune`` is false.
        training_dataset_path: Path of the training CSV (must contain a
            ``Class`` column).
        validation_dataset_path: Path of the validation CSV (same layout).
        max_depth: ``max_depth`` of the random forest.
        n_estimators: ``n_estimators`` of the random forest.
        hptune: When true, fit on the training split, score ROC AUC on the
            validation split and report it through ``hypertune``; no model is
            saved. When false, fit on training + validation combined and
            save the model.
        random_state: Seed for the random forest so that repeated runs with
            the same data and hyperparameters produce the same model.

    Raises:
        subprocess.CalledProcessError: If the ``gsutil cp`` upload fails.
    """

    df_train = pd.read_csv(training_dataset_path)
    df_validation = pd.read_csv(validation_dataset_path)

    # The final (non-tuning) model is trained on all available data.
    if not hptune:
        df_train = pd.concat([df_train, df_validation])

    # The first 30 columns are treated as the numeric features.
    numeric_feature_indexes = slice(0, 30)

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), numeric_feature_indexes),
        ]
    )

    pipeline = Pipeline(
        [
            ("preprocessor", preprocessor),
            (
                "classifier",
                RandomForestClassifier(
                    max_depth=max_depth,
                    n_estimators=n_estimators,
                    # Seeded so trials differ only by their hyperparameters.
                    random_state=random_state,
                ),
            ),
        ]
    )

    # Force a uniform float64 dtype on the feature columns of both splits.
    num_features_type_map = {
        feature: "float64" for feature in df_train.columns[numeric_feature_indexes]
    }

    df_train = df_train.astype(num_features_type_map)
    df_validation = df_validation.astype(num_features_type_map)

    X_train = df_train.drop("Class", axis=1)
    y_train = df_train["Class"]

    pipeline.fit(X_train, y_train)

    if hptune:
        X_validation = df_validation.drop("Class", axis=1)
        y_validation = df_validation["Class"]

        # Probability of the positive class (second column of predict_proba).
        y_pred_proba = pipeline.predict_proba(X_validation)[:, 1]
        auc = roc_auc_score(y_validation, y_pred_proba)

        print("Model roc_auc: {}".format(auc))

        # Log it with hypertune
        hpt = hypertune.HyperTune()
        hpt.report_hyperparameter_tuning_metric(
            hyperparameter_metric_tag="roc_auc", metric_value=auc
        )
    else:
        # Save the model locally, then copy it to the job directory.
        model_filename = "model.pkl"
        with open(model_filename, "wb") as model_file:
            pickle.dump(pipeline, model_file)
        gcs_model_path = "{}/{}".format(job_dir, model_filename)
        subprocess.check_call(
            ["gsutil", "cp", model_filename, gcs_model_path], stderr=sys.stdout
        )
        print("Saved model in: {}".format(gcs_model_path))


if __name__ == "__main__":
    fire.Fire(train_evaluate)


### Package the script into a docker image.

Notice that we are installing specific versions of `scikit-learn` and `pandas` in the training image. This is done to make sure that the training runtime is aligned with the serving runtime. Later in the notebook you will deploy the model to AI Platform Prediction, using the 1.15 version of AI Platform Prediction runtime. 

In [None]:
%%writefile {TRAINING_APP_FOLDER}/Dockerfile

# Base image for the training container.
FROM gcr.io/deeplearning-platform-release/base-cpu
# fire and cloudml-hypertune are imported by train.py; scikit-learn and pandas
# are pinned to specific versions for the training image.
RUN pip install -U fire cloudml-hypertune scikit-learn==0.20.4 pandas==0.24.2  
# train.py is copied into /app and run from there.
WORKDIR /app
COPY train.py .

# Runs train.py with python when the container starts.
ENTRYPOINT ["python", "train.py"]

### Build the docker image. 

You use **Cloud Build** to build the image and push it to your project's **Container Registry**. Because the image is built by a remote cloud service, you don't need a local installation of Docker.

In [None]:
IMAGE_NAME='trainer_image'
IMAGE_TAG='latest'
IMAGE_URI='gcr.io/{}/{}:{}'.format(PROJECT_ID, IMAGE_NAME, IMAGE_TAG)

In [None]:
!gcloud builds submit --tag $IMAGE_URI $TRAINING_APP_FOLDER

## Submit an AI Platform hyperparameter tuning job

https://cloud.google.com/ai-platform/training/docs/using-hyperparameter-tuning

https://cloud.google.com/ai-platform/training/docs/reference/rest/v1/projects.jobs#HyperparameterSpec

In [None]:
%%writefile {TRAINING_APP_FOLDER}/hptuning_config.yaml
# Hyperparameter tuning specification for the training job.
trainingInput:
  hyperparameters:
    # Maximize the metric that train.py reports under the "roc_auc" tag.
    goal: MAXIMIZE
    hyperparameterMetricTag: roc_auc
    enableTrialEarlyStopping: TRUE
    maxTrials: 100
    maxParallelTrials: 5    
    # Parameter names match the max_depth / n_estimators arguments of
    # train_evaluate in train.py.
    params:
    - parameterName: max_depth
      type: INTEGER
      minValue: 10
      maxValue: 200
    - parameterName: n_estimators
      type: INTEGER
      minValue: 200
      maxValue: 2000

### Start the hyperparameter tuning job.

Use the `gcloud` command to start the hyperparameter tuning job.

In [None]:
# Second-resolution timestamp used to build the job name.
MODEL_TIMESTAMP = time.strftime("%Y%m%d_%H%M%S")

# The job's output folder lives under the model bucket and is named after the job.
JOB_NAME = f"JOB_{MODEL_TIMESTAMP}"
JOB_DIR = f"gs://{MODEL_STORE}/{JOB_NAME}"

# The testing split uploaded earlier is used as the validation set for tuning.
TRAINING_FILE_PATH = f"gs://{MODEL_DATA}/training.csv"
VALIDATION_FILE_PATH = f"gs://{MODEL_DATA}/testing.csv"

SCALE_TIER = "BASIC"

# Arguments after the bare `--` are forwarded to train.py; `--hptune` makes it
# report roc_auc on the validation set instead of saving a model.
!gcloud ai-platform jobs submit training $JOB_NAME \
--region=$REGION \
--job-dir=$JOB_DIR \
--master-image-uri=$IMAGE_URI \
--scale-tier=$SCALE_TIER \
--config $TRAINING_APP_FOLDER/hptuning_config.yaml \
-- \
--training_dataset_path=$TRAINING_FILE_PATH \
--validation_dataset_path=$VALIDATION_FILE_PATH \
--hptune

### Monitor the job.

You can monitor the job using GCP console or from within the notebook using `gcloud` commands.

In [None]:
!gcloud ai-platform jobs describe $JOB_NAME

In [None]:
!gcloud ai-platform jobs stream-logs $JOB_NAME

### Retrieve HP-tuning results.

After the job completes, you can review the results using the GCP Console or programmatically by calling the AI Platform Training REST endpoint.

In [None]:
# Client for the AI Platform Training REST API (service 'ml', version 'v1').
ml = discovery.build('ml', 'v1')

job_id = 'projects/{}/jobs/{}'.format(PROJECT_ID, JOB_NAME)
request = ml.projects().jobs().get(name=job_id)

try:
    response = request.execute()
except errors.HttpError as err:
    # Report which job failed and stop here: the following cells index into
    # `response`, so continuing would only fail later with a NameError.
    print("Failed to retrieve {}: {}".format(job_id, err))
    raise

The returned trial results are sorted by the value of the optimization metric. The best trial is the first item in the returned list.

In [None]:
response['trainingOutput']['trials'][0]

## Retrain the model with the best hyperparameters

You can now retrain the model using the best hyperparameters and using combined training and validation splits as a training dataset.

### Configure and run the training job

In [None]:
# Hyperparameters of the first trial in the returned list.
best_hyperparameters = response['trainingOutput']['trials'][0]['hyperparameters']
max_depth = best_hyperparameters['max_depth']
n_estimators = best_hyperparameters['n_estimators']

print(f"n_estimators: {n_estimators}, max_depth: {max_depth}")

In [None]:
MODEL_TIMESTAMP = time.strftime("%Y%m%d_%H%M%S")

JOB_NAME = f"JOB_{MODEL_TIMESTAMP}"
JOB_DIR = f"gs://{MODEL_STORE}/{JOB_NAME}"

TRAINING_FILE_PATH = f"gs://{MODEL_DATA}/training.csv"
VALIDATION_FILE_PATH = f"gs://{MODEL_DATA}/testing.csv"

SCALE_TIER = "BASIC"


In [None]:
!gcloud ai-platform jobs submit training $JOB_NAME \
--region=$REGION \
--job-dir=$JOB_DIR \
--master-image-uri=$IMAGE_URI \
--scale-tier=$SCALE_TIER \
-- \
--training_dataset_path=$TRAINING_FILE_PATH \
--validation_dataset_path=$VALIDATION_FILE_PATH \
--max_depth=$max_depth \
--n_estimators=$n_estimators \
--nohptune

In [None]:
!gcloud ai-platform jobs stream-logs $JOB_NAME

### Examine the training output

The training script saved the trained model as the 'model.pkl' in the `JOB_DIR` folder on GCS.

In [None]:
!gsutil ls $JOB_DIR

## Deploy the model to AI Platform Prediction

### Create a model resource

In [None]:
model_name = 'fraud'
labels = "task=classifier,domain=fsi"
filter = 'name:{}'.format(model_name)
models = !(gcloud ai-platform models list --filter={filter} --format='value(name)')

if not models:
    !gcloud ai-platform models create  $model_name \
    --regions=$REGION \
    --labels=$labels
else:
    print("Model: {} already exists.".format(models[0]))

### Create a model version

In [None]:
model_version = 'v01'
filter = 'name:{}'.format(model_version)
versions = !(gcloud ai-platform versions list --model={model_name} --format='value(name)' --filter={filter})

if not versions:
    !gcloud ai-platform versions create {model_version} \
    --model={model_name} \
    --origin=$JOB_DIR \
    --runtime-version=1.15 \
    --framework=scikit-learn \
    --python-version=3.7
else:
    print("Model version: {} already exists.".format(versions[0]))

### Serve predictions
#### Prepare the input file with JSON-formatted instances.

In [None]:
input_file = 'serving_instances.json'

# First 100 test rows, written as one JSON array of feature values per line.
serving_rows = X_test.head(100).values.tolist()
with open(input_file, 'w') as f:
    f.writelines(json.dumps(instance) + '\n' for instance in serving_rows)

#### Query the model: gcloud

In [None]:
!gcloud ai-platform predict \
--model $model_name \
--version $model_version \
--json-instances $input_file

#### Query the model: programmatically

https://cloud.google.com/ai-platform/prediction/docs/using-pipelines

In [None]:
def predict_json(project, model, instances, version=None):
    """Request online predictions from a deployed AI Platform model.

    Args:
        project (str): Project in which the model is deployed.
        model (str): Name of the model.
        instances ([[float]]): Input instances, each one a list of floats.
        version (str): Optional model version to target; when omitted, the
            request is addressed to the model without a version suffix.

    Returns:
        The value stored under the ``predictions`` key of the response.

    Raises:
        RuntimeError: If the response contains an ``error`` entry.
    """
    # Resource path: projects/<project>/models/<model>[/versions/<version>]
    path_parts = ['projects/{}/models/{}'.format(project, model)]
    if version is not None:
        path_parts.append('versions/{}'.format(version))
    name = '/'.join(path_parts)

    service = discovery.build('ml', 'v1')
    request = service.projects().predict(name=name, body={'instances': instances})
    response = request.execute()

    if 'error' in response:
        raise RuntimeError(response['error'])
    return response['predictions']

In [None]:
# Slice before converting so only the first 1000 rows are turned into Python
# lists, instead of materializing the whole test set and discarding most of it.
instances = X_test.head(1000).values.tolist()

In [None]:
%%time 

results = predict_json(PROJECT_ID, model_name, instances, version=model_version)