In [None]:
# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# Train tensorflow or keras model in .py files on GCP or Kubeflow from Notebooks

This notebook introduces you to using Kubeflow Fairing to train the model, which is developed using tensorflow or keras and enclosed in python files, to Kubeflow on Google Kubernetes Engine (GKE), and Google Cloud AI Platform training. This notebook demonstrate how to:
 
* Use Kubeflow Fairing to train an Tensorflow model remotely on Kubeflow cluster,
* Use Kubeflow Fairing to train an Tensorflow model remotely on AI Platform training,

**You need Python 3.6 to use Kubeflow Fairing.**

## Setups

* Pre-conditions
    - Deployed a kubeflow cluster through https://deploy.kubeflow.cloud/
    - Have the following environment variable ready: 
        - PROJECT_ID # project host the kubeflow cluster or for running AI platform training
        - DEPLOYMENT_NAME # kubeflow deployment name, the same the cluster name after delpoyed
        - GCP_BUCKET # google cloud storage bucket

* Create service account
```bash
export SA_NAME = [service account name]
gcloud iam service-accounts create ${SA_NAME}
gcloud projects add-iam-policy-binding ${PROJECT_ID} \
    --member serviceAccount:${SA_NAME}@${PROJECT_ID}.iam.gserviceaccount.com \
    --role 'roles/editor'
gcloud iam service-accounts keys create ~/key.json \
    --iam-account ${SA_NAME}@${PROJECT_ID}.iam.gserviceaccount.com
```

* Authorize for Source Repository
```bash
gcloud auth configure-docker
```

* Update local kubeconfig (for submiting job to kubeflow cluster)
```bash
export CLUSTER_NAME=${DEPLOYMENT_NAME} # this is the deployment name or the kubenete cluster name
export ZONE=us-central1-c
gcloud container clusters get-credentials ${CLUSTER_NAME} --region ${ZONE}
```

* Set the environmental variable: GOOGLE_APPLICATION_CREDENTIALS
```bash
export GOOGLE_APPLICATION_CREDENTIALS = ....
```
```python
os.environ['GOOGLE_APPLICATION_CREDENTIALS']=...
```

* Install the lastest version of fairing
```python
pip install git+https://github.com/kubeflow/fairing@master
```

**Please not that the above configuration is required for notebook service running outside Kubeflow environment. And the examples demonstrated in the notebook is fully tested on notebook service outside Kubeflow cluster also.**

**The environemt variables, e.g. service account, projects and etc, should have been pre-configured while setting up the cluster.**

In [1]:
import os
import kubeflow.fairing as fairing
from kubeflow.fairing.cloud import gcp

In [2]:
# This is for local notebook instead of that in kubeflow cluster
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '../kubeflow-pipeline/config/kubeflow-pipeline-fantasy.json'
! echo $GOOGLE_APPLICATION_CREDENTIALS

../kubeflow-pipeline/config/kubeflow-pipeline-fantasy.json


In [9]:
# Setting up google container repositories (GCR) for storing output containers
# You can use any docker container registry istead of GCR
# For local notebook, GCP_PROJECT should be set explicitly

GCP_PROJECT = fairing.cloud.gcp.guess_project_name() # 'kubeflow-pipeline-fantasy'
GCS_BUCKET='gs://kubeflow-pipeline-ui'
MY_NAMESPACE = 'kubeflow-luoshixin'
print(GCP_PROJECT)

kubeflow-pipeline-fantasy


In [4]:
# In this demo, I use gsutil, therefore i compile a special image to install GoogleCloudSDK as based image
base_image = 'gcr.io/{}/kubeflow-fairing-base:latest'.format(GCP_PROJECT)
# ! gcloud builds submit --tag {base_image}

For build using docker
```bash
docker build . -t {base_image}
docker push {base_image}
```

In [5]:
DOCKER_REGISTRY = 'gcr.io/{}/kubeflow-fairing-job-tf'.format(GCP_PROJECT)
BASE_IMAGE = base_image

In [6]:
file_name = 'model.py'

## Deploy the training job to AI platform training

In [8]:
fairing.config.set_preprocessor('python', 
                                executable=file_name, 
                                input_files=[file_name, 'requirements.txt'])

fairing.config.set_builder(name='cluster', 
                           registry=DOCKER_REGISTRY, 
                           context_source=context_source,
                           base_image=BASE_IMAGE, 
                           push=True,
                           namespace=MY_NAMESPACE,
                           pod_spec_mutators=[gcp.add_gcp_credentials_if_exists])

fairing.config.set_deployer(name='gcp', 
                            namespace=MY_NAMESPACE,
                            pod_spec_mutators=[gcp.add_gcp_credentials_if_exists])

fairing.config.run()

Using preprocessor: <fairing.preprocessors.base.BasePreProcessor object at 0x11f4fe128>
Using builder: <fairing.builders.docker.docker.DockerBuilder object at 0x11f4fee48>
file_cache is unavailable when using oauth2client >= 4.0.0 or google-auth
Traceback (most recent call last):
  File "/Users/luoshixin/LocalSim/virtualPython36/lib/python3.6/site-packages/googleapiclient/discovery_cache/__init__.py", line 36, in autodetect
    from google.appengine.api import memcache
ModuleNotFoundError: No module named 'google.appengine'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/luoshixin/LocalSim/virtualPython36/lib/python3.6/site-packages/googleapiclient/discovery_cache/file_cache.py", line 33, in <module>
    from oauth2client.contrib.locked_file import LockedFile
ModuleNotFoundError: No module named 'oauth2client.contrib.locked_file'

During handling of the above exception, another exception occurred:

Traceback (most 

Creating training job with the following options: {'jobId': 'fairing_job_5310fcf1', 'trainingInput': {'masterConfig': {'imageUri': 'gcr.io/gojek-kubeflow/fairing-job-tf/fairing-job:F0D4918E'}, 'region': 'us-central1'}}
Job submitted successfully.
Access job logs at the following URL:
https://console.cloud.google.com/mlengine/jobs/fairing_job_5310fcf1?project=gojek-kubeflow


(<fairing.preprocessors.base.BasePreProcessor at 0x11f4fe128>,
 <fairing.builders.docker.docker.DockerBuilder at 0x11f4fee48>,
 <fairing.deployers.gcp.gcp.GCPJob at 0x11f4fefd0>)

## Deploy the training job to kubeflow cluster

In [12]:
from kubeflow.fairing.builders.cluster import gcs_context
context_source = gcs_context.GCSContextSource(gcp_project=GCP_PROJECT, namespace=MY_NAMESPACE)

In [14]:
fairing.config.set_preprocessor('python', 
                                executable=file_name, 
                                input_files=[file_name, 'requirements.txt'])

fairing.config.set_builder(name='cluster', 
                           registry=DOCKER_REGISTRY, 
                           context_source=context_source,
                           base_image=BASE_IMAGE, 
                           push=True,
                           namespace=MY_NAMESPACE,
                           pod_spec_mutators=[gcp.add_gcp_credentials_if_exists])

fairing.config.set_deployer(name='job', 
                            namespace=MY_NAMESPACE,
                            pod_spec_mutators=[gcp.add_gcp_credentials_if_exists])

fairing.config.run()

[I 200409 11:48:06 config:125] Using preprocessor: <kubeflow.fairing.preprocessors.base.BasePreProcessor object at 0x122d51eb8>
[I 200409 11:48:06 config:127] Using builder: <kubeflow.fairing.builders.cluster.cluster.ClusterBuilder object at 0x122d51048>
[I 200409 11:48:06 config:129] Using deployer: <kubeflow.fairing.deployers.job.job.Job object at 0x122d43630>
[I 200409 11:48:06 cluster:46] Building image using cluster builder.
[I 200409 11:48:06 base:107] Creating docker context: /tmp/fairing_context_aax8r5e3
[W 200409 11:48:10 manager:296] Waiting for fairing-builder-fj7x2-42sr7 to start...
[I 200409 11:48:12 manager:302] Pod started running True


ERROR: logging before flag.Parse: E0409 03:48:15.653275       1 metadata.go:241] Failed to unmarshal scopes: json: cannot unmarshal string into Go value of type []string
[36mINFO[0m[0004] Resolved base name gcr.io/kubeflow-pipeline-fantasy/kubeflow-fairing-base:latest to gcr.io/kubeflow-pipeline-fantasy/kubeflow-fairing-base:latest
[36mINFO[0m[0004] Resolved base name gcr.io/kubeflow-pipeline-fantasy/kubeflow-fairing-base:latest to gcr.io/kubeflow-pipeline-fantasy/kubeflow-fairing-base:latest
[36mINFO[0m[0004] Downloading base image gcr.io/kubeflow-pipeline-fantasy/kubeflow-fairing-base:latest
[36mINFO[0m[0004] Error while retrieving image from cache: getting file info: stat /cache/sha256:88c783033a077d26d0b09e8015ed6691d8568adcd46bd5abc470d0ad99d72e9a: no such file or directory
[36mINFO[0m[0004] Downloading base image gcr.io/kubeflow-pipeline-fantasy/kubeflow-fairing-base:latest
[36mINFO[0m[0005] Built cross stage deps: map[]
[36mINFO[0m[0005] Downloading base image gcr.

Installing collected packages: decorator, fsspec, gcsfs, cloudpickle
  Attempting uninstall: cloudpickle
    Found existing installation: cloudpickle 1.2.2
    Uninstalling cloudpickle-1.2.2:
      Successfully uninstalled cloudpickle-1.2.2
Successfully installed cloudpickle-1.1.1 decorator-4.4.2 fsspec-0.7.2 gcsfs-0.6.1
[36mINFO[0m[0105] Taking snapshot of full filesystem...
[36mINFO[0m[0106] Adding whiteout for /usr/local/lib/python3.7/site-packages/cloudpickle/cloudpickle_fast.py
[36mINFO[0m[0106] Adding whiteout for /usr/local/lib/python3.7/site-packages/cloudpickle-1.2.2.dist-info
[36mINFO[0m[0106] Adding whiteout for /usr/local/lib/python3.7/site-packages/cloudpickle/__pycache__/cloudpickle_fast.cpython-37.pyc
[36mINFO[0m[0111] Using files from context: [/kaniko/buildcontext/app]
[36mINFO[0m[0111] Pushing layer gcr.io/kubeflow-pipeline-fantasy/kubeflow-fairing-job-tf/fairing-job/cache:24ab49b363b09bd7815ef8451a5c56e65ee516a127c90d579b87287378046049 to cache now
[36mI

[W 200409 11:50:11 job:101] The job fairing-job-xzmhl launched.
[W 200409 11:50:11 manager:296] Waiting for fairing-job-xzmhl-mn64g to start...
[I 200409 11:50:18 manager:302] Pod started running True


2020-04-09 03:50:23.036571: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libnvinfer.so.6'; dlerror: libnvinfer.so.6: cannot open shared object file: No such file or directory
2020-04-09 03:50:23.036917: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libnvinfer_plugin.so.6'; dlerror: libnvinfer_plugin.so.6: cannot open shared object file: No such file or directory
2020-04-09 03:50:23.036948: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:30] Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
Traceback (most recent call last):
  File "/app/model.py", line 240, in <module>
    tf.app.run()
AttributeError: module 'tensorflow' has no attribute 'app'


[W 200409 11:50:29 job:173] Cleaning up job fairing-job-xzmhl...


(<kubeflow.fairing.preprocessors.base.BasePreProcessor at 0x122d51eb8>,
 <kubeflow.fairing.builders.cluster.cluster.ClusterBuilder at 0x122d51048>,
 <kubeflow.fairing.deployers.job.job.Job at 0x122d43630>)