## **TODO:** Set the values of
* `PROJECT`
* `BUCKET` 
* `REGION`
* `GOOGLE_APPLICATION_CREDENTIALS`

below to the values from your Google Cloud Platform environment

In [None]:
import os  # imported here so this cell runs on its own

# Placeholders only: os.environ accepts string values exclusively, so each
# assignment raises TypeError until None is replaced with a real string.
os.environ['PROJECT'] = None # REPLACE WITH YOUR PROJECT ID
os.environ['BUCKET'] = None # REPLACE WITH YOUR BUCKET NAME
os.environ['REGION'] = None # REPLACE WITH YOUR BUCKET REGION e.g. us-central1
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = None # REPLACE WITH PATH TO YOUR SERVICE ACCOUNT KEY

In [None]:
# Interactive sign-in through the Colab helper module; `google.colab` is only
# importable when the notebook runs inside Google Colab.
from google.colab import auth
auth.authenticate_user()

In [None]:
%%bash
# Authenticate gcloud with the service-account key, then set the default
# project and compute region from the environment variables defined earlier.
# Expansions are quoted so a key path containing spaces stays one argument.
gcloud auth activate-service-account --key-file "$GOOGLE_APPLICATION_CREDENTIALS"

gcloud config set project "$PROJECT"
gcloud config set compute/region "$REGION"

In [None]:
# Create the source tree: src/ and the src/trainer/ directory for the trainer code.
!mkdir -p src src/trainer

In [None]:
%%writefile src/requirements.txt
# TensorFlow is pinned to the 2.3.x series.
tensorflow>=2.3,<2.4
# Provides the `hypertune` module imported by trainer/task.py.
cloudml-hypertune
numpy
pandas
six

In [None]:
# Install the packages listed in src/requirements.txt.
!pip install -r src/requirements.txt

In [None]:
# Ensure src/trainer/__init__.py exists so `trainer` can be imported as a package
# (the later gcloud cells refer to the module as trainer.task).
!touch src/trainer/__init__.py

In [None]:
%%writefile src/trainer/task.py
import os
import json
import argparse
import hypertune
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.data as tfd
from datetime import datetime
from tensorflow.data import Dataset

# building blocks of our network
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Print the TensorFlow version as soon as the module is loaded.
print(f"TensorFlow version {tf.__version__}")

def glob_to_ds(glob, num_epochs = 1, batch_size = 1):
  """Build a batched (features, label) dataset from CSV files matching `glob`.

  Args:
    glob: file pattern forwarded to `tf.data.experimental.make_csv_dataset`.
    num_epochs: forwarded unchanged to `make_csv_dataset`.
    batch_size: forwarded unchanged to `make_csv_dataset`.

  Returns:
    The CSV dataset mapped so that every element is an `(X, y)` pair: `y` is
    column 0 of the stacked batch and `X` holds all remaining columns.
    NOTE(review): presumably the target is the first CSV column - confirm.
  """
  def split_label(columns):
    # Stack the per-column tensors side by side: one row per example.
    table = tf.stack([column for column in columns.values()], axis = 1)
    label = table[:, 0]
    features = table[:, 1:]
    return features, label

  csv_batches = tf.data.experimental.make_csv_dataset(
      glob,
      num_epochs = num_epochs,
      batch_size = batch_size)
  return csv_batches.map(split_label)

def train(hparams):
  """Build, train, checkpoint and evaluate the dense regression model.

  Args:
    hparams: dict that must provide 'JOB_DIR' (prefix for TensorBoard logs and
      model checkpoints), 'BATCH_SIZE', 'FEATURES' (only its length is used,
      as the input width), 'BUCKET' (bucket holding the train/, valid/ and
      test/ CSV shards), 'EPOCHS' and 'LEARNING_RATE'.

  Returns:
    The value returned by `model.evaluate` on the test dataset.
  """
  print(f"hparams={hparams}")

  JOB_DIR = hparams['JOB_DIR']
  BATCH_SIZE = hparams['BATCH_SIZE']
  FEATURES = hparams['FEATURES']
  BUCKET = hparams['BUCKET']
  EPOCHS = hparams['EPOCHS']
  LEARNING_RATE = hparams['LEARNING_RATE']

  # One dataset per split; each split lives under its own bucket prefix.
  split_globs = [f"gs://{BUCKET}/train/part-*.csv",
                 f"gs://{BUCKET}/valid/part-*.csv",
                 f"gs://{BUCKET}/test/part-*.csv"]
  train_ds, valid_ds, test_ds = [
      glob_to_ds(glob, batch_size = BATCH_SIZE) for glob in split_globs]

  # Three hidden ReLU layers of 13 units feeding a single linear output.
  model = Sequential()
  model.add(Dense(units=13, input_shape=[len(FEATURES)], activation='relu'))
  model.add(Dense(units=13, activation='relu'))
  model.add(Dense(units=13, activation='relu'))
  model.add(Dense(units=1, activation='linear'))

  tb_cb = tf.keras.callbacks.TensorBoard(log_dir=f"{JOB_DIR}/logs")

  class HpoCallback(tf.keras.callbacks.Callback):
    """Reports `val_loss` through cloudml-hypertune after every epoch."""

    def __init__(self):
      # Initialise the Keras base class before adding our own state.
      super().__init__()
      self.hpt = hypertune.HyperTune()

    def on_epoch_end(self, epoch, logs=None):
      # Nothing to report when Keras supplies no validation loss.
      if not logs or 'val_loss' not in logs:
        return
      print(f"*** REPORTING val_loss={logs['val_loss']}")
      self.hpt.report_hyperparameter_tuning_metric(
          hyperparameter_metric_tag='val_loss',
          metric_value=logs['val_loss'],
          global_step=epoch
      )
  hpo_cb = HpoCallback()

  # save_best_only: a checkpoint is written only when val_loss improves.
  mc_cb = tf.keras.callbacks.ModelCheckpoint(
      filepath=JOB_DIR + "/model-epoch-{epoch:02d}-val-mse-{val_loss:.4f}",
      save_best_only=True,
      monitor="val_loss",
      verbose=1,
  )

  model.compile(optimizer = Adam( learning_rate = LEARNING_RATE ),
                loss = tf.keras.losses.MSE,
                metrics = [tf.keras.metrics.RootMeanSquaredError(),
                           tf.keras.metrics.MeanAbsoluteError()])

  model.fit(train_ds,
            validation_data = valid_ds,
            epochs = EPOCHS,
            callbacks=[tb_cb, mc_cb, hpo_cb],
            verbose = 2,)

  # Final held-out evaluation; returned so callers can inspect the metrics.
  results = model.evaluate(test_ds)
  return results

if __name__ == '__main__':
  # Built-in defaults; command-line flags and TF_CONFIG override them below.
  hparams = {
      'JOB_ID': 'tmp03',
      'FEATURES': ['origin_block_latitude',
                   'origin_block_longitude',
                   'destination_block_latitude',
                   'destination_block_longitude'],
      'TARGET': ['fareamount'],
      'BUCKET': 'cfai-tmp03',
      'BATCH_SIZE': 2 ** 18,
      'LEARNING_RATE': 0.003,
      'EPOCHS': 1,
  }
  # Unless overridden, outputs go to a directory named after JOB_ID.
  hparams = {
      'JOB_DIR': hparams['JOB_ID'],
      **hparams,
  }

  parser = argparse.ArgumentParser()
  parser.add_argument('--LEARNING_RATE', type=float)
  parser.add_argument('--BATCH_SIZE', type=int)
  parser.add_argument('--EPOCHS', type=int)
  parser.add_argument('--job_dir','--job-dir', type=str)
  args = vars(parser.parse_args())

  # argparse yields None for flags that were not passed; only those (and an
  # empty --job-dir) keep the default. Testing against None instead of
  # truthiness lets an explicit 0 / 0.0 override take effect.
  for k, v in args.items():
    if v is None or v == '':
      continue
    hparams[k.upper()] = v
  print(f"hparams={hparams}")

  # Parse TF_CONFIG once; an unset variable or a missing 'job' entry simply
  # selects the local branch instead of raising KeyError.
  tf_config = json.loads(os.environ['TF_CONFIG']) if 'TF_CONFIG' in os.environ else {}
  job_config = tf_config.get('job') or {}

  #if started using gcloud ai-platform
  if 'job_dir' in job_config:
    print(json.dumps(tf_config, indent=2))

    job_dir = job_config['job_dir']
    print(f"*** job_dir: {job_dir}")

    task_type = tf_config['task']['type']
    print(f"*** task_type: {task_type}")

    task_index = tf_config['task']['index']
    print(f"*** task_index: {task_index}")

    num_workers = len(tf_config['cluster']['worker']) if 'worker' in tf_config['cluster'] else 1
    print(f"*** num_workers: {num_workers}")

    # Split the configured batch size evenly across the workers in the cluster.
    BATCH_SIZE = hparams['BATCH_SIZE'] // num_workers
    print(f"*** BATCH_SIZE: {BATCH_SIZE}")

    # Logged for traceability only; it is not passed on to train().
    job_id = os.environ.get('CLOUD_ML_JOB_ID', hparams['JOB_ID'])
    print(f"*** job_id={job_id}")

    train({
        **hparams,
        'BATCH_SIZE': BATCH_SIZE,
        'JOB_DIR': job_dir,
    })

  else:
    train(hparams)

In [None]:
# Run trainer.task locally through gcloud before submitting a cloud job.
# Everything after the bare `--` is handed to the trainer as its own arguments.
!cd src ; gcloud ai-platform local train \
--package-path trainer \
--module-name trainer.task \
-- \
--BATCH_SIZE=65536 \
--job-dir=tmp04

In [None]:
import os
from datetime import datetime
# Timestamp with microseconds (%f), so re-running this cell yields a new job name.
ts = datetime.now().strftime("%Y%m%d%H%M%S%f")

# Exported through os.environ so the later shell cells can read $JOB_NAME and $JOB_DIR.
os.environ['JOB_NAME'] = f"job{ts}"
# Requires os.environ['BUCKET'] to be set already; raises KeyError otherwise.
os.environ['JOB_DIR'] = f"gs://{os.environ['BUCKET']}/{os.environ['JOB_NAME']}/keras-job-dir"
print(os.environ['JOB_NAME'], os.environ['JOB_DIR'])

In [None]:
# Submit the trainer package as an AI Platform training job.
# Runtime 2.3 matches the tensorflow>=2.3,<2.4 pin in src/requirements.txt and
# the runtime version used by the tuning and serving cells of this notebook.
!cd src ; gcloud ai-platform jobs submit training $JOB_NAME \
--package-path trainer/ \
--module-name trainer.task \
--region $REGION \
--python-version 3.7 \
--runtime-version 2.3 \
--job-dir $JOB_DIR

In [None]:
%%writefile src/hpconfig.yaml
# Hyperparameter tuning spec, passed to the training job via `--config hpconfig.yaml`.
trainingInput:
  hyperparameters:
    goal: MINIMIZE
    # Must equal the metric tag reported by HpoCallback in trainer/task.py.
    hyperparameterMetricTag: "val_loss"
    maxTrials: 4
    maxParallelTrials: 2
    # Each parameterName matches a --<NAME> flag parsed by trainer/task.py.
    params:
      - parameterName: LEARNING_RATE
        type: DOUBLE
        minValue: 0.003
        maxValue: 0.3
        scaleType: UNIT_LOG_SCALE
      - parameterName: EPOCHS
        type: INTEGER
        minValue: 1
        maxValue: 10
        scaleType: UNIT_LINEAR_SCALE
      - parameterName: BATCH_SIZE
        type: DISCRETE
        discreteValues: [8192, 65536, 262144]

In [None]:
# Submit a hyperparameter tuning job configured by hpconfig.yaml.
# NOTE(review): this reuses $JOB_NAME / $JOB_DIR from the plain training cell; job
# names presumably must be unique, so re-run the JOB_NAME cell first - confirm.
!cd src ; gcloud ai-platform jobs submit training $JOB_NAME \
--config hpconfig.yaml \
--package-path trainer/ \
--module-name trainer.task \
--region $REGION \
--python-version 3.7 \
--runtime-version 2.3 \
--job-dir $JOB_DIR

In [None]:
# Create an AI Platform model resource named after the job; a version is added to it later.
!gcloud ai-platform models create $JOB_NAME --region $REGION

In [None]:
MODEL_LOCATION = !gsutil ls -d $JOB_DIR/model* | head -n 1
[MODEL_LOCATION] = MODEL_LOCATION
os.environ['MODEL_LOCATION'] = MODEL_LOCATION

In [None]:
# Deploy the selected checkpoint as version `latest` of the model created earlier.
!gcloud ai-platform versions create latest \
--model $JOB_NAME \
--origin $MODEL_LOCATION \
--region $REGION \
--framework tensorflow \
--runtime-version 2.3

In [None]:
%%bash
# Request online predictions from the deployed `latest` version.
# NOTE(review): test.json is not created anywhere in the visible notebook - confirm it exists.
gcloud ai-platform predict \
--model $JOB_NAME \
--json-instances test.json \
--region $REGION \
--version latest