In [None]:
# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# Train tensorflow or keras model on GCP or Kubeflow from Notebooks

This notebook introduces you to using Kubeflow Fairing to train the model to Kubeflow on Google Kubernetes Engine (GKE), and Google Cloud AI Platform training. This notebook demonstrate how to:
 
* Train an Keras model in a local notebook,
* Use Kubeflow Fairing to train an Keras model remotely on Kubeflow cluster,
* Use Kubeflow Fairing to train an Keras model remotely on AI Platform training,
* Use Kubeflow Fairing to deploy a trained model to Kubeflow, and Call the deployed endpoint for predictions.

**You need Python 3.6 to use Kubeflow Fairing.**

## Setups

* Pre-conditions
    - Deployed a kubeflow cluster through https://deploy.kubeflow.cloud/
    - Have the following environment variable ready: 
        - PROJECT_ID # project host the kubeflow cluster or for running AI platform training
        - DEPLOYMENT_NAME # kubeflow deployment name, the same the cluster name after delpoyed
        - GCP_BUCKET # google cloud storage bucket

* Create service account
```bash
export SA_NAME = [service account name]
gcloud iam service-accounts create ${SA_NAME}
gcloud projects add-iam-policy-binding ${PROJECT_ID} \
    --member serviceAccount:${SA_NAME}@${PROJECT_ID}.iam.gserviceaccount.com \
    --role 'roles/editor'
gcloud iam service-accounts keys create ~/key.json \
    --iam-account ${SA_NAME}@${PROJECT_ID}.iam.gserviceaccount.com
```

* Authorize for Source Repository
```bash
gcloud auth configure-docker
```

* Update local kubeconfig (for submiting job to kubeflow cluster)
```bash
export CLUSTER_NAME=${DEPLOYMENT_NAME} # this is the deployment name or the kubenete cluster name
export ZONE=us-central1-c
gcloud container clusters get-credentials ${CLUSTER_NAME} --region ${ZONE}
```

* Set the environmental variable: GOOGLE_APPLICATION_CREDENTIALS
```bash
export GOOGLE_APPLICATION_CREDENTIALS = ....
```
```python
os.environ['GOOGLE_APPLICATION_CREDENTIALS']=...
```

* Install the lastest version of fairing
```python
pip install git+https://github.com/kubeflow/fairing@master
```

**Please not that the above configuration is required for notebook service running outside Kubeflow environment. And the examples demonstrated in the notebook is fully tested on notebook service outside Kubeflow cluster also.**

**The environemt variables, e.g. service account, projects and etc, should have been pre-configured while setting up the cluster.**

In [1]:
import os
import logging
import tensorflow as tf
import fairing
import numpy as np
from datetime import datetime
from fairing.cloud import gcp

In [2]:
import os
import fairing

# Setting up google container repositories (GCR) for storing output containers
# You can use any docker container registry istead of GCR
# For local notebook, GCP_PROJECT should be set explicitly
GCP_PROJECT = fairing.cloud.gcp.guess_project_name()
GCP_Bucket ='gs://kubeflow-trykube/'
print(GCP_PROJECT)
# This is for local notebook instead of that in kubeflow cluster
# os.environ['GOOGLE_APPLICATION_CREDENTIALS']=

trykube-248403


## Define the model logic

In [3]:
def gcs_copy(src_path, dst_path):
    import subprocess
    print(subprocess.run(['gsutil', 'cp', src_path, dst_path], stdout=subprocess.PIPE).stdout[:-1].decode('utf-8'))
    
def gcs_download(src_path, file_name):
    import subprocess
    print(subprocess.run(['gsutil', 'cp', src_path, file_name], stdout=subprocess.PIPE).stdout[:-1].decode('utf-8'))

In [4]:
class TensorflowModel(object):
    
    def __init__(self):
        self.model_file = "mnist_model.h5"
        self.model = None    
    
    def build(self):
        self.model = tf.keras.models.Sequential([
          tf.keras.layers.Flatten(input_shape=(28, 28)),
          tf.keras.layers.Dense(512, activation=tf.nn.relu),
          tf.keras.layers.Dropout(0.2),
          tf.keras.layers.Dense(10, activation=tf.nn.softmax)
        ])
        self.model.compile(optimizer='adam',
                      loss='sparse_categorical_crossentropy',
                      metrics=['accuracy'])
        print(self.model.summary())
    
    def save_model(self):
        self.model.save(self.model_file)
        gcs_copy(self.model_file, GCP_Bucket + self.model_file)
    
    def train(self):
        self.build()
        
        mnist = tf.keras.datasets.mnist
        (x_train, y_train),(x_test, y_test) = mnist.load_data()
        x_train, x_test = x_train / 255.0, x_test / 255.0
        
        callbacks = [
          # Interrupt training if `val_loss` stops improving for over 2 epochs
          tf.keras.callbacks.EarlyStopping(patience=2, monitor='val_loss'),
          # Write TensorBoard logs to `./logs` directory
          tf.keras.callbacks.TensorBoard(log_dir=GCP_Bucket + 'logs/' 
                                         + datetime.now().date().__str__())
        ]
        self.model.fit(x_train, y_train, batch_size=32, epochs=5, callbacks=callbacks,
                  validation_data=(x_test, y_test))
        self.save_model()
        
    def predict(self, X):
        if not self.model:
            self.model = tf.keras.models.load_model(self.model_file)
        # Do any preprocessing
        prediction = self.model.predict(data=X)

## Train an Keras model in a notebook

In [5]:
TensorflowModel().train()

Instructions for updating:
Colocations handled automatically by placer.


From /opt/conda/lib/python3.6/site-packages/tensorflow/python/ops/resource_variable_ops.py:435: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


From /opt/conda/lib/python3.6/site-packages/tensorflow/python/keras/layers/core.py:143: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 784)               0         
_________________________________________________________________
dense (Dense)                (None, 512)               401920    
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                5130      
Total params: 407,050
Trainable params: 407,050
Non-trainable params: 0
_________________________________________________________________
None
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
Train on 60000 samples, validate on 10000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5



## Spicify a image registry that will hold the image built by fairing

In [None]:
# In this demo, I use gsutil, therefore i compile a special image to install GoogleCloudSDK as based image
# base_image = 'gcr.io/{}/fairing-predict-example:latest'.format(GCP_PROJECT)
# !docker build --build-arg PY_VERSION=3.6.4 . -t {base_image}
# !docker push {base_image}

In [6]:
BASE_IMAGE = 'gcr.io/{}/fairing-predict-example:latest'.format(GCP_PROJECT)
DOCKER_REGISTRY = 'gcr.io/{}/fairing-job-tf'.format(GCP_PROJECT)

## Deploy the training job to kubeflow cluster

In [8]:
from fairing import TrainJob
from fairing.backends import KubeflowGKEBackend

train_job = TrainJob(TensorflowModel, BASE_IMAGE, input_files=["requirements.txt"],
                     docker_registry=DOCKER_REGISTRY, backend=KubeflowGKEBackend())
train_job.submit()

Using preprocessor: <class 'fairing.preprocessors.function.FunctionPreProcessor'>
Using docker registry: gcr.io/trykube-248403/fairing-job-tf
Using builder: <class 'fairing.builders.cluster.cluster.ClusterBuilder'>
/opt/conda/lib/python3.6/site-packages/fairing/__init__.py already exists in Fairing context, skipping...
/opt/conda/lib/python3.6/site-packages/fairing/__init__.py already exists in Fairing context, skipping...
Waiting for fairing-builder-jggr8 to start...
Waiting for fairing-builder-jggr8 to start...
Pod started running True


[36mINFO[0m[0000] Downloading base image gcr.io/trykube-248403/fairing-predict-example:latest
[36mINFO[0m[0002] Executing 0 build triggers
[36mINFO[0m[0002] Unpacking rootfs as cmd RUN if [ -e requirements.txt ];then pip install --no-cache -r requirements.txt; fi requires it.
[36mINFO[0m[0051] Taking snapshot of full filesystem...
[36mINFO[0m[0068] Skipping paths under /dev, as it is a whitelisted directory
[36mINFO[0m[0068] Skipping paths under /etc/secrets, as it is a whitelisted directory
[36mINFO[0m[0068] Skipping paths under /kaniko, as it is a whitelisted directory
[36mINFO[0m[0068] Skipping paths under /proc, as it is a whitelisted directory
[36mINFO[0m[0068] Skipping paths under /sys, as it is a whitelisted directory
[36mINFO[0m[0068] Skipping paths under /var/run, as it is a whitelisted directory
[36mINFO[0m[0087] WORKDIR /app/
[36mINFO[0m[0087] cmd: workdir
[36mINFO[0m[0087] Changed working directory to /app
[36mINFO[0m[0087] Taking snapshot of ful

[36mINFO[0m[0123] Taking snapshot of full filesystem...
[36mINFO[0m[0123] Skipping paths under /dev, as it is a whitelisted directory
[36mINFO[0m[0123] Skipping paths under /etc/secrets, as it is a whitelisted directory
[36mINFO[0m[0123] Skipping paths under /kaniko, as it is a whitelisted directory
[36mINFO[0m[0123] Skipping paths under /proc, as it is a whitelisted directory
[36mINFO[0m[0123] Skipping paths under /sys, as it is a whitelisted directory
[36mINFO[0m[0123] Skipping paths under /var/run, as it is a whitelisted directory
[36mINFO[0m[0136] Using files from context: [/kaniko/buildcontext/app]
[36mINFO[0m[0136] COPY /app/ /app/
[36mINFO[0m[0136] Taking snapshot of files...
2019/07/31 08:00:52 existing blob: sha256:2eeb5ce9b9240a928b0a799f9f2601027e2c6b7525394ae5c371f124058489d7
2019/07/31 08:00:52 existing blob: sha256:041cd0421648e4d2475068b2a57abe52210afeddd6d9d30f18093d1db9b1a895
2019/07/31 08:00:52 existing blob: sha256:a8c5303780550b746a4781e5e4cd8931

Training job fairing-job-wbhjj launched.
Waiting for fairing-job-wbhjj-t92rl to start...
Waiting for fairing-job-wbhjj-t92rl to start...
Waiting for fairing-job-wbhjj-t92rl to start...
Pod started running True


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
W0731 08:01:10.903488 139671627749120 deprecation.py:506] From /usr/local/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to t

   32/60000 [..............................] - ETA: 12s - loss: 0.0454 - acc: 1.0000  704/60000 [..............................] - ETA: 4s - loss: 0.0631 - acc: 0.9830  1280/60000 [..............................] - ETA: 5s - loss: 0.0609 - acc: 0.9828 1984/60000 [..............................] - ETA: 4s - loss: 0.0789 - acc: 0.9768 2656/60000 [>.............................] - ETA: 4s - loss: 0.0752 - acc: 0.9778 3360/60000 [>.............................] - ETA: 4s - loss: 0.0705 - acc: 0.9789

   32/60000 [..............................] - ETA: 11s - loss: 0.0035 - acc: 1.0000  608/60000 [..............................] - ETA: 5s - loss: 0.0317 - acc: 0.9885  1184/60000 [..............................] - ETA: 5s - loss: 0.0389 - acc: 0.9848 1792/60000 [..............................] - ETA: 5s - loss: 0.0361 - acc: 0.9872 2432/60000 [>.............................] - ETA: 5s - loss: 0.0376 - acc: 0.9868 3008/60000 [>.............................] - ETA: 5s - loss: 0.0390 - acc: 0.9870

   32/60000 [..............................] - ETA: 11s - loss: 0.0224 - acc: 1.0000  704/60000 [..............................] - ETA: 4s - loss: 0.0339 - acc: 0.9858  1376/60000 [..............................] - ETA: 4s - loss: 0.0381 - acc: 0.9869 2080/60000 [>.............................] - ETA: 4s - loss: 0.0359 - acc: 0.9870 2784/60000 [>.............................] - ETA: 4s - loss: 0.0400 - acc: 0.9849 3488/60000 [>.............................] - ETA: 4s - loss: 0.0415 - acc: 0.9839

Copying file://mnist_model.h5 [Content-Type=application/octet-stream]...
AccessDeniedException: 403 Insufficient Permission                              



Cleaning up job fairing-job-wbhjj...


## Deploy distributed training job to kubeflow cluster

In [15]:
fairing.config.set_builder(name='cluster', registry=DOCKER_REGISTRY, 
                           base_image=BASE_IMAGE, push=True, 
                           pod_spec_mutators=[gcp.add_gcp_credentials_if_exists])
fairing.config.set_deployer(name='tfjob', worker_count=1, ps_count=1, 
                            pod_spec_mutators=[gcp.add_gcp_credentials_if_exists])
run_fn = fairing.config.fn(TensorflowModel)

In [16]:
run_fn()

/opt/conda/lib/python3.6/site-packages/fairing/__init__.py already exists in Fairing context, skipping...
/opt/conda/lib/python3.6/site-packages/fairing/__init__.py already exists in Fairing context, skipping...
Waiting for fairing-builder-x76qh to start...
Waiting for fairing-builder-x76qh to start...
Pod started running True


[36mINFO[0m[0000] Downloading base image gcr.io/trykube-248403/fairing-predict-example:latest
[36mINFO[0m[0002] Executing 0 build triggers
[36mINFO[0m[0002] Unpacking rootfs as cmd RUN if [ -e requirements.txt ];then pip install --no-cache -r requirements.txt; fi requires it.
[36mINFO[0m[0049] Taking snapshot of full filesystem...
[36mINFO[0m[0064] Skipping paths under /dev, as it is a whitelisted directory
[36mINFO[0m[0064] Skipping paths under /etc/secrets, as it is a whitelisted directory
[36mINFO[0m[0064] Skipping paths under /kaniko, as it is a whitelisted directory
[36mINFO[0m[0064] Skipping paths under /proc, as it is a whitelisted directory
[36mINFO[0m[0064] Skipping paths under /sys, as it is a whitelisted directory
[36mINFO[0m[0065] Skipping paths under /var/run, as it is a whitelisted directory
[36mINFO[0m[0084] WORKDIR /app/
[36mINFO[0m[0084] cmd: workdir
[36mINFO[0m[0084] Changed working directory to /app
[36mINFO[0m[0084] Taking snapshot of ful

[36mINFO[0m[0119] Taking snapshot of full filesystem...
[36mINFO[0m[0119] Skipping paths under /dev, as it is a whitelisted directory
[36mINFO[0m[0119] Skipping paths under /etc/secrets, as it is a whitelisted directory
[36mINFO[0m[0119] Skipping paths under /kaniko, as it is a whitelisted directory
[36mINFO[0m[0119] Skipping paths under /proc, as it is a whitelisted directory
[36mINFO[0m[0119] Skipping paths under /sys, as it is a whitelisted directory
[36mINFO[0m[0120] Skipping paths under /var/run, as it is a whitelisted directory
[36mINFO[0m[0132] Using files from context: [/kaniko/buildcontext/app]
[36mINFO[0m[0132] COPY /app/ /app/
[36mINFO[0m[0132] Taking snapshot of files...
2019/07/31 08:29:08 existing blob: sha256:687ed2fb2a0d7da5503478759fd00c23970b65d02b317119b3fb9025038a2594
2019/07/31 08:29:08 existing blob: sha256:0c1db95989906f161007d8ef2a6ef6e0ec64bc15bf2c993fd002edbdfc7aa7df
2019/07/31 08:29:08 existing blob: sha256:041cd0421648e4d2475068b2a57abe52

Training job fairing-tfjob-hpggb launched.


KeyboardInterrupt: 

## Deploy the training job as CMLE training job

Doesn’t support CMLE distributed training

In [22]:
from fairing import TrainJob
from fairing.backends import GCPManagedBackend
train_job = TrainJob(TensorflowModel, BASE_IMAGE, input_files=["requirements.txt"],
                     docker_registry=DOCKER_REGISTRY, backend=GCPManagedBackend())
train_job.submit()

Using preprocessor: <class 'fairing.preprocessors.function.FunctionPreProcessor'>
Using docker registry: gcr.io/gojek-kubeflow/fairing-job-tf
Using builder: <class 'fairing.builders.docker.docker.DockerBuilder'>
Building the docker image.
Building image using docker
Docker command: ['python', '/app/function_shim.py', '--serialized_fn_file', '/app/pickled_fn.p']
/Users/luoshixin/LocalSim/virtualPython36/lib/python3.6/site-packages/fairing/__init__.py already exists in Fairing context, skipping...
Creating docker context: /tmp/fairing_context_ql6o52sy
/Users/luoshixin/LocalSim/virtualPython36/lib/python3.6/site-packages/fairing/__init__.py already exists in Fairing context, skipping...
Context: /tmp/fairing_context_ql6o52sy, Adding /Users/luoshixin/LocalSim/virtualPython36/lib/python3.6/site-packages/fairing/__init__.py at /app/fairing/__init__.py
Context: /tmp/fairing_context_ql6o52sy, Adding /Users/luoshixin/LocalSim/virtualPython36/lib/python3.6/site-packages/fairing/runtime_config.py

Creating training job with the following options: {'jobId': 'fairing_job_64b00bf8', 'trainingInput': {'scaleTier': 'BASIC', 'masterConfig': {'imageUri': 'gcr.io/gojek-kubeflow/fairing-job-tf/fairing-job:258D8D01'}, 'region': 'us-central1'}}
Job submitted successfully.
Access job logs at the following URL:
https://console.cloud.google.com/mlengine/jobs/fairing_job_64b00bf8?project=gojek-kubeflow


## Inspect training process with tensorboard

In [22]:
# ! tensorboard --logdir=gs://kubeflow-demo-g/logs --host=localhost --port=8777

## Deploy the trained model to Kubeflow for predictions

In [13]:
from fairing import PredictionEndpoint
from fairing.backends import KubeflowGKEBackend
# The trained_ames_model.joblib is exported during the above local training
endpoint = PredictionEndpoint(TensorflowModel, BASE_IMAGE, input_files=['mnist_model.h5', "requirements.txt"],
                              docker_registry=DOCKER_REGISTRY, backend=KubeflowGKEBackend())
endpoint.create()

Using preprocessor: <class 'fairing.preprocessors.function.FunctionPreProcessor'>
Using docker registry: gcr.io/gojek-kubeflow/fairing-job-tf
Using builder: <class 'fairing.builders.docker.docker.DockerBuilder'>
Building the docker image.
Building image using docker
Docker command: ['python', '/app/function_shim.py', '--serialized_fn_file', '/app/pickled_fn.p']
/Users/luoshixin/LocalSim/virtualPython36/lib/python3.6/site-packages/fairing/__init__.py already exists in Fairing context, skipping...
Creating docker context: /tmp/fairing_context_ftqfzvuc
/Users/luoshixin/LocalSim/virtualPython36/lib/python3.6/site-packages/fairing/__init__.py already exists in Fairing context, skipping...
Context: /tmp/fairing_context_ftqfzvuc, Adding /Users/luoshixin/LocalSim/virtualPython36/lib/python3.6/site-packages/fairing/__init__.py at /app/fairing/__init__.py
Context: /tmp/fairing_context_ftqfzvuc, Adding /Users/luoshixin/LocalSim/virtualPython36/lib/python3.6/site-packages/fairing/runtime_config.py

Waiting for prediction endpoint to come up...


Cluster endpoint: http://35.184.251.118:5000/predict
Prediction endpoint: http://35.184.251.118:5000/predict


In [14]:
endpoint.delete()

Deleted service: kubeflow/fairing-service-vrhnq
Deleted deployment: kubeflow/fairing-deployer-fd2bz
