In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import json
import sys
import shutil
import uuid
from functools import partial

import numpy as np
import tensorflow as tf

import kubeflow.fairing as fairing
from kubeflow.fairing.cloud import gcp
# from kubeflow.fairing.kubernetes.utils import get_resource_mutator

In [2]:
tf.__version__

'1.15.2'

In [3]:
# This is for local notebook instead of that in kubeflow cluster
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '../kubeflow-pipeline/config/kubeflow-pipeline-fantasy.json'
! echo $GOOGLE_APPLICATION_CREDENTIALS

../kubeflow-pipeline/config/kubeflow-pipeline-fantasy.json


In [4]:
GCP_PROJECT = "kubeflow-pipeline-fantasy"
GCP_Bucket ='gs://kubeflow-stable-test/'
BASE_IMAGE = "tensorflow/tensorflow:1.15.2-py3"
MY_NAMESPACE = 'kubeflow-luoshixin'
print(GCP_PROJECT)

kubeflow-pipeline-fantasy


In [5]:
CLUSTER_NAME = "kubeflow-fairing"
ZONE = "us-central1-c"

In [6]:
# %env CLUSTER_NAME=$CLUSTER_NAME
# %env ZONE=$ZONE
# ! gcloud container clusters get-credentials ${CLUSTER_NAME} --region ${ZONE}

In [7]:
from kubeflow.fairing import utils
utils.is_running_in_k8s()

False

In [8]:
def train_input_fn(epoch=None):
    mnist = tf.keras.datasets.mnist
    (x_train, y_train),(x_test, y_test) = mnist.load_data()
    x_train, x_test = x_train / 255.0, x_test / 255.0
    y_train, y_test = y_train.astype(np.int32), y_test.astype(np.int32)
    dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    if epoch:
        dataset = dataset.repeat(epoch)
    dataset = dataset.batch(32)
    return dataset

def test_input_fn(epoch=None):
    mnist = tf.keras.datasets.mnist
    (x_train, y_train),(x_test, y_test) = mnist.load_data()
    x_train, x_test = x_train / 255.0, x_test / 255.0
    y_train, y_test = y_train.astype(np.int32), y_test.astype(np.int32)
    dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test))
    if epoch:
        dataset = dataset.repeat(epoch)
    dataset = dataset.batch(32)
    return dataset

In [9]:
# JOB_FOLDER = GCP_Bucket + f'tfkeras_example_{uuid.uuid4().hex[:4]}/'
# print(JOB_FOLDER)

# class TensorflowKerasModel(object):
    
#     def __init__(self):
#         self.model_dir = JOB_FOLDER
#         self.model = None  
    
#     def build(self):
#         # Define a Keras Model.
#         self.model = tf.keras.models.Sequential([
#           tf.keras.layers.Flatten(input_shape=(28, 28)),
#           tf.keras.layers.Dense(512, activation=tf.nn.relu),
#           tf.keras.layers.Dropout(0.2),
#           tf.keras.layers.Dense(10, activation=tf.nn.softmax)
#         ])
#         self.model.compile(optimizer='adam',
#                       loss='sparse_categorical_crossentropy',
#                       metrics=['accuracy'])
#         print(self.model.summary())
    

#     def train(self):
#         self.build()
        
#         tf.keras.backend.set_learning_phase(True)

#         def serving_input_receiver_fn():
#             return tf.estimator.export.build_raw_serving_input_receiver_fn(
#                 {
#                     self.model.input_names[0]: self.model.input
#                 })

#         config = tf.estimator.RunConfig(
#             save_checkpoints_secs=10
#         )

#         keras_estimator = tf.keras.estimator.model_to_estimator(
#             keras_model=self.model, config=config, model_dir=self.model_dir)

#         exporter = tf.estimator.LatestExporter('mnist', serving_input_receiver_fn())
        
#         eval_spec=tf.estimator.EvalSpec(input_fn=partial(test_input_fn, 1),
#                                         exporters=[exporter],
#                                         name='mnist_eval',
#                                         throttle_secs=20,
#                                         start_delay_secs=1)
        
#         train_spec=tf.estimator.TrainSpec(input_fn=partial(train_input_fn, 5))        
        
#         # Train and evaluate the model.
#         tf.estimator.train_and_evaluate(
#             keras_estimator,
#             train_spec=train_spec,
#             eval_spec=eval_spec)
        
#         # Export model ?
#         tf_config = os.environ.get('TF_CONFIG', '{}')
#         tf_config_json = json.loads(tf_config)
#         cluster = tf_config_json.get('cluster')
#         job_name = tf_config_json.get('task', {}).get('type')

#         is_chief = False
#         if not job_name or job_name.lower() in ["chief", "master"]:
#             is_chief = True
        
#         if is_chief:
#             print("Export saved model")
#             estimator.export_savedmodel(
#                 JOB_FOLDER + 'export', serving_input_receiver_fn=serving_input_receiver_fn())
#             print("Done exporting the model")

In [10]:
JOB_FOLDER = f'tfkeras_example_{uuid.uuid4().hex[:4]}/'
print(JOB_FOLDER)

class TensorflowModel(object):
    
    def __init__(self):
        self.model_dir = JOB_FOLDER 
    
    @staticmethod
    def build_model_fn(features, labels, mode, params):
        # Define a Keras Model.
        images = features
        if isinstance(images, dict):
            images = features['image']
        
        image = tf.reshape(images, [-1, 28, 28])
        
        model = tf.keras.models.Sequential([
          tf.keras.layers.Flatten(input_shape=(28, 28)),
          tf.keras.layers.Dense(512, activation=tf.nn.relu),
          tf.keras.layers.Dropout(0.2),
          tf.keras.layers.Dense(10, activation=tf.nn.softmax)
        ])
        
        # Logits can be easily computed using Keras functional API
        logits = model(image)
        probs = tf.nn.softmax(logits)
        predicted_classes = tf.argmax(logits, 1)
        
        if mode == tf.estimator.ModeKeys.PREDICT:
            predictions = {
                'probabilities': probs,
                'logits': logits
            }
            return tf.estimator.EstimatorSpec(mode, predictions=predictions)

        # Binary cross entropy is used as loss function
        loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
        loss_avg = tf.reduce_mean(loss)

        metrics = {        
            'accuracy': tf.metrics.accuracy(labels=labels, predictions=predicted_classes)
        }

        if mode == tf.estimator.ModeKeys.EVAL:
            return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics)

        optimizer = tf.train.AdamOptimizer()
        train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
        
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
    
    def train(self):

        def serving_input_receiver_fn():
            return tf.estimator.export.build_raw_serving_input_receiver_fn(
                {
                    'image': tf.placeholder(tf.float32, [None, 28, 28])
                })

        run_config = tf.estimator.RunConfig(
            save_checkpoints_steps=2000
        )

        keras_estimator = tf.estimator.Estimator(
            model_fn=self.build_model_fn,
            model_dir=self.model_dir,
            config=run_config,
            params=dict())

        exporter = tf.estimator.LatestExporter('mnist', serving_input_receiver_fn())
        
        eval_spec=tf.estimator.EvalSpec(input_fn=partial(test_input_fn, 1),
                                        exporters=[exporter],
                                        name='mnist_eval',
                                        throttle_secs=20)
        
        train_spec=tf.estimator.TrainSpec(input_fn=partial(train_input_fn, 5))        
        
        # Train and evaluate the model.
        tf.estimator.train_and_evaluate(
            keras_estimator,
            train_spec=train_spec,
            eval_spec=eval_spec)
        
        # Export model ?
        tf_config = os.environ.get('TF_CONFIG', '{}')
        tf_config_json = json.loads(tf_config)
        cluster = tf_config_json.get('cluster')
        job_name = tf_config_json.get('task', {}).get('type')

        is_chief = False
        if not job_name or job_name.lower() in ["chief", "master"]:
            is_chief = True
        
        if is_chief:
            print("Export saved model")
            keras_estimator.export_savedmodel(
                JOB_FOLDER + 'export', serving_input_receiver_fn=serving_input_receiver_fn())
            print("Done exporting the model")

tfkeras_example_ce20/


In [11]:
# TensorflowModel().train()

# TFJob in Kubeflow

In [12]:
DOCKER_REGISTRY = 'gcr.io/{}/fairing-job-tf-estimator'.format(GCP_PROJECT)

In [13]:
from kubeflow.fairing.builders.cluster import gcs_context
context_source = gcs_context.GCSContextSource(gcp_project=GCP_PROJECT, namespace=MY_NAMESPACE)

In [14]:
tfjob_name = f'mnist-training-{uuid.uuid4().hex[:9]}'

fairing.config.set_preprocessor('function', function_obj=TensorflowModel)

fairing.config.set_builder(name='cluster', 
                           registry=DOCKER_REGISTRY, 
                           context_source=context_source,
                           base_image=BASE_IMAGE, 
                           push=True,
                           namespace=MY_NAMESPACE,
                           pod_spec_mutators=[gcp.add_gcp_credentials_if_exists])

fairing.config.set_deployer(name='tfjob', 
                            worker_count=2, ps_count=1,
                            job_name=tfjob_name,
                            namespace=MY_NAMESPACE,
                            pod_spec_mutators=[gcp.add_gcp_credentials_if_exists])

In [15]:
_, _, tf_job_ref = fairing.config.run()

[I 200409 11:23:05 config:125] Using preprocessor: <kubeflow.fairing.preprocessors.function.FunctionPreProcessor object at 0x144131c18>
[I 200409 11:23:05 config:127] Using builder: <kubeflow.fairing.builders.cluster.cluster.ClusterBuilder object at 0x140c16eb8>
[I 200409 11:23:05 config:129] Using deployer: <kubeflow.fairing.deployers.tfjob.tfjob.TfJob object at 0x143ac2e80>
[I 200409 11:23:05 cluster:46] Building image using cluster builder.
[W 200409 11:23:05 base:94] /Users/luoshixin/LocalSim/virtualPython36Fairing/lib/python3.6/site-packages/kubeflow/fairing/__init__.py already exists in Fairing context, skipping...
[I 200409 11:23:05 base:107] Creating docker context: /tmp/fairing_context_s5dhqj9b
[W 200409 11:23:05 base:94] /Users/luoshixin/LocalSim/virtualPython36Fairing/lib/python3.6/site-packages/kubeflow/fairing/__init__.py already exists in Fairing context, skipping...
[W 200409 11:23:10 manager:296] Waiting for fairing-builder-lpvj5-kc7tx to start...
[I 200409 11:23:11 man

ERROR: logging before flag.Parse: E0409 03:23:12.663408       1 metadata.go:241] Failed to unmarshal scopes: json: cannot unmarshal string into Go value of type []string
[36mINFO[0m[0002] Resolved base name tensorflow/tensorflow:1.15.2-py3 to tensorflow/tensorflow:1.15.2-py3
[36mINFO[0m[0002] Resolved base name tensorflow/tensorflow:1.15.2-py3 to tensorflow/tensorflow:1.15.2-py3
[36mINFO[0m[0002] Downloading base image tensorflow/tensorflow:1.15.2-py3
ERROR: logging before flag.Parse: E0409 03:23:12.994755       1 metadata.go:142] while reading 'google-dockercfg' metadata: http status code: 404 while fetching url http://metadata.google.internal./computeMetadata/v1/instance/attributes/google-dockercfg
ERROR: logging before flag.Parse: E0409 03:23:12.997566       1 metadata.go:159] while reading 'google-dockercfg-url' metadata: http status code: 404 while fetching url http://metadata.google.internal./computeMetadata/v1/instance/attributes/google-dockercfg-url
[36mINFO[0m[0003] Er

[W 200409 11:24:29 job:101] The tfjob mnist-training-7b6dca98a launched.
[W 200409 11:24:30 manager:296] Waiting for mnist-training-7b6dca98a-worker-0 to start...
[W 200409 11:24:30 manager:296] Waiting for mnist-training-7b6dca98a-worker-0 to start...
[W 200409 11:24:30 manager:296] Waiting for mnist-training-7b6dca98a-worker-0 to start...
[I 200409 11:24:33 manager:302] Pod started running True


Traceback (most recent call last):
  File "/app/function_shim.py", line 79, in <module>
    call(args.serialized_fn_file)
  File "/app/function_shim.py", line 61, in call
    obj = cloudpickle.load(f)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/__init__.py", line 50, in __getattr__
    module = self._load()
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/__init__.py", line 44, in _load
    module = _importlib.import_module(self.__name__)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/__init__.py", line 50, in __getattr__
    module = self._load()
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/__init__.py", line 44, in _load
    module = _importlib.import_module(self.__name__)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/__init__.py", line 50, in __getattr__
    module = self._load()
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/__init__.py", line 44, in _load
    module = _importlib.import_module(self.__name__)
  Fi

  File "/usr/local/lib/python3.6/dist-packages/tensorflow/__init__.py", line 44, in _load
    module = _importlib.import_module(self.__name__)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/__init__.py", line 50, in __getattr__
    module = self._load()
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/__init__.py", line 44, in _load
    module = _importlib.import_module(self.__name__)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/__init__.py", line 50, in __getattr__
    module = self._load()
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/__init__.py", line 44, in _load
    module = _importlib.import_module(self.__name__)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/__init__.py", line 50, in __getattr__
    module = self._load()
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/__init__.py", line 44, in _load
    module = _importlib.import_module(self.__name__)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/__in

In [13]:
tf_job_ref.job_name

In [14]:
# from kubeflow.tfjob import TFJobClient
# tfjob_client = TFJobClient()
# tfjob_client.get('mnist-training-938076ce1mxxbh', namespace=MY_NAMESPACE)

In [15]:
# tfjob_client.wait_for_job(tfjob_name, namespace=MY_NAMESPACE, watch=True)

In [16]:
# tfjob_client.is_job_running('mnist-training-938076ce1mxxbh', namespace=MY_NAMESPACE)

In [17]:
# tfjob_client.delete('mnist-training-4640d8539sgfpv', namespace=MY_NAMESPACE)

# KFServing

Only the following version is supported
"allowedImageVersions": [
   "1.11.0",
   "1.11.0-gpu",
   "1.12.0",
   "1.12.0-gpu",
   "1.13.0",
   "1.13.0-gpu",
   "1.14.0",
   "1.14.0-gpu"
]

In [18]:
from kubeflow.fairing.deployers.kfserving.kfserving import KFServing
isvc = KFServing('tensorflow', namespace=MY_NAMESPACE, stream_log=True,
                 default_storage_uri=JOB_FOLDER + 'export/mnist/1583389154')
isvc_name = isvc.deploy(isvc.generate_isvc())

NAME                 READY      DEFAULT_TRAFFIC CANARY_TRAFFIC  URL                                               
fairing-kfserving... Unknown                                                                                      
fairing-kfserving... False                                                                                        
fairing-kfserving... False                                                                                        
fairing-kfserving... False                                                                                        


KeyboardInterrupt: 

In [19]:
from kfserving import KFServingClient
kfserving_client = KFServingClient()

In [21]:
mnist_isvc = kfserving_client.get("fairing-kfserving-77dkk", namespace=MY_NAMESPACE)
mnist_isvc

{'apiVersion': 'serving.kubeflow.org/v1alpha2',
 'kind': 'InferenceService',
 'metadata': {'creationTimestamp': '2020-03-05T06:20:12Z',
  'generateName': 'fairing-kfserving-',
  'generation': 4,
  'name': 'fairing-kfserving-77dkk',
  'namespace': 'kubeflow-luoshixin',
  'resourceVersion': '1866644',
  'selfLink': '/apis/serving.kubeflow.org/v1alpha2/namespaces/kubeflow-luoshixin/inferenceservices/fairing-kfserving-77dkk',
  'uid': '5f2f7df9-5ea9-11ea-b79c-42010a800208'},
 'spec': {'default': {'predictor': {'tensorflow': {'resources': {'limits': {'cpu': '1',
       'memory': '2Gi'},
      'requests': {'cpu': '1', 'memory': '2Gi'}},
     'runtimeVersion': '1.14.0',
     'storageUri': 'tfkeras_example_f7f8/export/mnist/1583389154'}}}},
 'status': {'canary': {},
  'conditions': [{'lastTransitionTime': '2020-03-05T06:22:15Z',
    'message': 'Configuration "fairing-kfserving-77dkk-predictor-default" does not have any ready Revision.',
    'reason': 'RevisionMissing',
    'status': 'False',
 

In [44]:
kfserving_client.delete(isvc_name, namespace=MY_NAMESPACE)

{'kind': 'Status',
 'apiVersion': 'v1',
 'metadata': {},
 'status': 'Success',
 'details': {'name': 'fairing-kfserving-htbkp',
  'group': 'serving.kubeflow.org',
  'kind': 'inferenceservices',
  'uid': '146db360-5e26-11ea-b79c-42010a800208'}}