In [None]:
# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# Train tensorflow or keras model in .py files on GCP or Kubeflow from Notebooks

This notebook introduces Kubeflow Fairing for training a model that is developed with TensorFlow or Keras and packaged in Python files, either on Kubeflow running on Google Kubernetes Engine (GKE) or on Google Cloud AI Platform training. This notebook demonstrates how to:
 
* Use Kubeflow Fairing to train a TensorFlow model remotely on a Kubeflow cluster,
* Use Kubeflow Fairing to train a TensorFlow model remotely on AI Platform training.

**You need Python 3.6 to use Kubeflow Fairing.**

## Setups

* Pre-conditions
    - Deployed a kubeflow cluster through https://deploy.kubeflow.cloud/
    - Have the following environment variable ready: 
        - PROJECT_ID # project hosting the Kubeflow cluster or used for running AI Platform training
        - DEPLOYMENT_NAME # Kubeflow deployment name, the same as the cluster name once deployed
        - GCP_BUCKET # google cloud storage bucket

* Create service account
```bash
export SA_NAME="[service account name]"
gcloud iam service-accounts create ${SA_NAME}
gcloud projects add-iam-policy-binding ${PROJECT_ID} \
    --member serviceAccount:${SA_NAME}@${PROJECT_ID}.iam.gserviceaccount.com \
    --role 'roles/editor'
gcloud iam service-accounts keys create ~/key.json \
    --iam-account ${SA_NAME}@${PROJECT_ID}.iam.gserviceaccount.com
```

* Authorize Docker for Container Registry
```bash
gcloud auth configure-docker
```

* Update local kubeconfig (for submitting jobs to the Kubeflow cluster)
```bash
export CLUSTER_NAME=${DEPLOYMENT_NAME} # this is the deployment name, i.e. the Kubernetes cluster name
export ZONE=us-central1-c
gcloud container clusters get-credentials ${CLUSTER_NAME} --zone ${ZONE}
```

* Set the environment variable: GOOGLE_APPLICATION_CREDENTIALS
```bash
export GOOGLE_APPLICATION_CREDENTIALS=~/key.json  # path to the service account key, e.g. the one created above
```
```python
os.environ['GOOGLE_APPLICATION_CREDENTIALS']=...
```

* Install the latest version of fairing
```bash
pip install git+https://github.com/kubeflow/fairing@master
```

**Please note that the above configuration is required for a notebook service running outside the Kubeflow environment. The examples demonstrated in this notebook have also been fully tested on a notebook service outside the Kubeflow cluster.**

**The environment variables, e.g. service account, project, etc., should have been pre-configured while setting up the cluster.**

In [1]:
import os
import fairing
from fairing.cloud import gcp

In [2]:
import os
import fairing

# Set up Google Container Registry (GCR) for storing the output containers.
# You can use any Docker container registry instead of GCR.
# For a local notebook, GCP_PROJECT should be set explicitly.
GCP_PROJECT = fairing.cloud.gcp.guess_project_name()
# Cloud Storage bucket. Export GCP_BUCKET before starting the kernel to use
# your own bucket; otherwise the original hard-coded bucket is used.
GCP_BUCKET = os.environ.get('GCP_BUCKET', 'gs://kubeflow-trykube/')
# Backward-compatible alias for the original mixed-case name.
GCP_Bucket = GCP_BUCKET
print(GCP_PROJECT)
# This is for a local notebook rather than one running inside the Kubeflow cluster
# os.environ['GOOGLE_APPLICATION_CREDENTIALS']=

trykube-248403


In [3]:
# This demo uses gsutil, so I build a special base image that has the Google Cloud SDK installed
# base_image = 'gcr.io/{}/fairing-predict-example:latest'.format(GCP_PROJECT)
# !docker build --build-arg PY_VERSION=3.6.4 . -t {base_image}
# !docker push {base_image}

In [4]:
# Base image for the build; same path as the image named in the
# commented-out docker build cell.
BASE_IMAGE = 'gcr.io/{}/fairing-predict-example:latest'.format(GCP_PROJECT)
# Registry location handed to the fairing builder (registry=..., push=True).
DOCKER_REGISTRY = 'gcr.io/{}/fairing-job-tf'.format(GCP_PROJECT)

In [5]:
# Training script; passed to the fairing preprocessor as both the executable and an input file.
file_name = 'model.py'

## Deploy the training job to AI platform training

In [8]:
# Register the training script as the executable and ship it together with
# requirements.txt as input files.
fairing.config.set_preprocessor(
    'python',
    executable=file_name,
    input_files=[file_name, 'requirements.txt'],
)
# NOTE(review): the saved output of this cell reports a DockerBuilder, while
# the code requests the 'cluster' builder -- confirm which one is intended.
fairing.config.set_builder(
    name='cluster',
    registry=DOCKER_REGISTRY,
    base_image=BASE_IMAGE,
    push=True,
    pod_spec_mutators=[gcp.add_gcp_credentials_if_exists],
)
# 'gcp' deployer: per the section heading, submits to AI Platform training.
# NOTE(review): confirm that this deployer accepts pod_spec_mutators.
fairing.config.set_deployer(
    name='gcp',
    pod_spec_mutators=[gcp.add_gcp_credentials_if_exists],
)
# Last expression of the cell, so its return value is displayed as the output.
fairing.config.run()

Using preprocessor: <fairing.preprocessors.base.BasePreProcessor object at 0x11f4fe128>
Using builder: <fairing.builders.docker.docker.DockerBuilder object at 0x11f4fee48>
file_cache is unavailable when using oauth2client >= 4.0.0 or google-auth
Traceback (most recent call last):
  File "/Users/luoshixin/LocalSim/virtualPython36/lib/python3.6/site-packages/googleapiclient/discovery_cache/__init__.py", line 36, in autodetect
    from google.appengine.api import memcache
ModuleNotFoundError: No module named 'google.appengine'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/luoshixin/LocalSim/virtualPython36/lib/python3.6/site-packages/googleapiclient/discovery_cache/file_cache.py", line 33, in <module>
    from oauth2client.contrib.locked_file import LockedFile
ModuleNotFoundError: No module named 'oauth2client.contrib.locked_file'

During handling of the above exception, another exception occurred:

Traceback (most 

Creating training job with the following options: {'jobId': 'fairing_job_5310fcf1', 'trainingInput': {'masterConfig': {'imageUri': 'gcr.io/gojek-kubeflow/fairing-job-tf/fairing-job:F0D4918E'}, 'region': 'us-central1'}}
Job submitted successfully.
Access job logs at the following URL:
https://console.cloud.google.com/mlengine/jobs/fairing_job_5310fcf1?project=gojek-kubeflow


(<fairing.preprocessors.base.BasePreProcessor at 0x11f4fe128>,
 <fairing.builders.docker.docker.DockerBuilder at 0x11f4fee48>,
 <fairing.deployers.gcp.gcp.GCPJob at 0x11f4fefd0>)

## Deploy the training job to kubeflow cluster

In [6]:
# Register the training script as the executable and ship it together with
# requirements.txt as input files.
fairing.config.set_preprocessor(
    'python',
    executable=file_name,
    input_files=[file_name, 'requirements.txt'],
)
# Build with the 'cluster' builder and push the image to DOCKER_REGISTRY;
# the GCP credentials mutator is applied to the builder pod spec.
fairing.config.set_builder(
    name='cluster',
    registry=DOCKER_REGISTRY,
    base_image=BASE_IMAGE,
    push=True,
    pod_spec_mutators=[gcp.add_gcp_credentials_if_exists],
)
# 'job' deployer: per the section heading, runs the training job on the
# Kubeflow cluster, with the same credentials mutator on its pod spec.
fairing.config.set_deployer(
    name='job',
    pod_spec_mutators=[gcp.add_gcp_credentials_if_exists],
)
# Last expression of the cell, so its return value is displayed as the output.
fairing.config.run()

Waiting for fairing-builder-v69fn to start...
Waiting for fairing-builder-v69fn to start...
Pod started running True


[36mINFO[0m[0000] Downloading base image gcr.io/trykube-248403/fairing-predict-example:latest
[36mINFO[0m[0002] Executing 0 build triggers
[36mINFO[0m[0002] Unpacking rootfs as cmd RUN if [ -e requirements.txt ];then pip install --no-cache -r requirements.txt; fi requires it.
[36mINFO[0m[0050] Taking snapshot of full filesystem...
[36mINFO[0m[0067] Skipping paths under /dev, as it is a whitelisted directory
[36mINFO[0m[0067] Skipping paths under /etc/secrets, as it is a whitelisted directory
[36mINFO[0m[0067] Skipping paths under /kaniko, as it is a whitelisted directory
[36mINFO[0m[0067] Skipping paths under /proc, as it is a whitelisted directory
[36mINFO[0m[0067] Skipping paths under /sys, as it is a whitelisted directory
[36mINFO[0m[0067] Skipping paths under /var/run, as it is a whitelisted directory
[36mINFO[0m[0087] WORKDIR /app/
[36mINFO[0m[0087] cmd: workdir
[36mINFO[0m[0087] Changed working directory to /app
[36mINFO[0m[0087] Taking snapshot of ful

[36mINFO[0m[0122] Taking snapshot of full filesystem...
[36mINFO[0m[0122] Skipping paths under /dev, as it is a whitelisted directory
[36mINFO[0m[0122] Skipping paths under /etc/secrets, as it is a whitelisted directory
[36mINFO[0m[0122] Skipping paths under /kaniko, as it is a whitelisted directory
[36mINFO[0m[0122] Skipping paths under /proc, as it is a whitelisted directory
[36mINFO[0m[0122] Skipping paths under /sys, as it is a whitelisted directory
[36mINFO[0m[0123] Skipping paths under /var/run, as it is a whitelisted directory
[36mINFO[0m[0135] Using files from context: [/kaniko/buildcontext/app]
[36mINFO[0m[0135] COPY /app/ /app/
[36mINFO[0m[0135] Taking snapshot of files...
2019/07/31 08:41:37 existing blob: sha256:c60eba308238780085602c72a69337c634aba5207d54d2369ddd92e4120f808f
2019/07/31 08:41:37 existing blob: sha256:0c1db95989906f161007d8ef2a6ef6e0ec64bc15bf2c993fd002edbdfc7aa7df
2019/07/31 08:41:37 existing blob: sha256:5d71636fb824265e30ff34bf20737c9c

Training job fairing-job-bf972 launched.
Waiting for fairing-job-bf972-mjlfb to start...
Waiting for fairing-job-bf972-mjlfb to start...
Waiting for fairing-job-bf972-mjlfb to start...
Pod started running True


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
W0731 08:41:56.680246 139767480002304 deprecation_wrapper.py:119] From /app/model.py:240: The name tf.app.run is deprecated. Please use tf.compat.v1.app.run instead.

W0731 08:41:56.680960 139767480002304 deprecation_wrapper.py:119] From /app/model.py:160: The name tf.logging.set_verbosity is deprecated. Please use tf.compat.v1.logging.set_verbosity instead.

W0731 08:41:56.68113

W0731 08:41:59.993082 139767480002304 deprecation.py:323] From /app/model.py:91: max_pooling2d (from tensorflow.python.layers.pooling) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.MaxPooling2D instead.
W0731 08:42:00.184135 139767480002304 deprecation.py:323] From /app/model.py:107: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.dense instead.
W0731 08:42:00.531209 139767480002304 deprecation.py:323] From /app/model.py:111: dropout (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.dropout instead.
W0731 08:42:00.632676 139767480002304 deprecation_wrapper.py:119] From /app/model.py:131: The name tf.losses.sparse_softmax_cross_entropy is deprecated. Please use tf.compat.v1.losses.sparse_softmax_cross_entropy instead.

W0731 08:42:00.644682 139767480002304 d

I0731 08:42:18.694576 139767480002304 saver.py:1280] Restoring parameters from /tmp/tensorflow/logs/model.ckpt-200
I0731 08:42:18.718579 139767480002304 builder_impl.py:661] Assets added to graph.
I0731 08:42:18.718801 139767480002304 builder_impl.py:456] No assets to write.
I0731 08:42:18.762445 139767480002304 builder_impl.py:421] SavedModel written to: /tmp/tensorflow/model/temp-b'1564562538'/saved_model.pb
Successfully downloaded train-images-idx3-ubyte.gz 9912422 bytes.
Extracting /tmp/tensorflow/input_data/train-images-idx3-ubyte.gz
Successfully downloaded train-labels-idx1-ubyte.gz 28881 bytes.
Extracting /tmp/tensorflow/input_data/train-labels-idx1-ubyte.gz
Successfully downloaded t10k-images-idx3-ubyte.gz 1648877 bytes.
Extracting /tmp/tensorflow/input_data/t10k-images-idx3-ubyte.gz
Successfully downloaded t10k-labels-idx1-ubyte.gz 4542 bytes.
Extracting /tmp/tensorflow/input_data/t10k-labels-idx1-ubyte.gz
Train and evaluate
Training done
Export saved model
Done exporting the 

Cleaning up job fairing-job-bf972...
