In [15]:
import os
import tensorflow as tf
import argparse
from tensorflow.python.keras.callbacks import Callback


class MyModel(object):
    """MNIST classifier whose hyperparameters are read from the command line."""

    def train(self):
        """Parse hyperparameters, build the network and fit it for one epoch.

        Command-line options (both optional):
            --learning_rate: float passed to the SGD optimizer (default 0.01).
            --dropout_rate: float used by both Dropout layers (default 0.2).

        The dataset is loaded through ``tf.keras.datasets.mnist.load_data``
        with the path ``/app/mnist.npz``. Metrics are printed to stdout by
        the ``KatibMetricLog`` callback; nothing is returned.
        """
        # Accept the hyperparameters as command-line arguments.
        parser = argparse.ArgumentParser()
        parser.add_argument('--learning_rate', required=False, type=float, default=0.01)
        parser.add_argument('--dropout_rate', required=False, type=float, default=0.2)
        args = parser.parse_args()

        (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data('/app/mnist.npz')
        # Rescale the image arrays by 1/255 before feeding them to the network.
        x_train, x_test = x_train / 255.0, x_test / 255.0

        model = tf.keras.models.Sequential([
            tf.keras.layers.Flatten(input_shape=(28, 28)),
            tf.keras.layers.Dense(256, activation='relu'),
            tf.keras.layers.Dropout(args.dropout_rate),
            tf.keras.layers.Dense(256, activation='relu'),
            tf.keras.layers.Dropout(args.dropout_rate),
            tf.keras.layers.Dense(10, activation='softmax')
        ])

        # `learning_rate` is the documented keyword; `lr` is only kept by
        # Keras as a deprecated alias for backward compatibility.
        sgd = tf.keras.optimizers.SGD(learning_rate=args.learning_rate,
                                      decay=1e-6,
                                      momentum=0.9,
                                      nesterov=True)

        # The metric is registered under the name 'acc'; KatibMetricLog reads
        # the 'acc' / 'val_acc' keys from the logs, so keep the two in sync.
        model.compile(optimizer=sgd,
                      loss='sparse_categorical_crossentropy',
                      metrics=['acc'])

        # verbose=0 leaves the metric reporting to the callback.
        model.fit(x_train, y_train,
                  verbose=0,
                  validation_data=(x_test, y_test),
                  epochs=1,
                  callbacks=[KatibMetricLog()])


class KatibMetricLog(Callback):
    """Keras callback that prints training metrics to stdout as ``name=value`` pairs.

    NOTE(review): the output is presumably scraped by Katib's stdout metrics
    collector; confirm the metric names against the experiment definition
    before renaming any of the printed keys.
    """

    def on_batch_end(self, batch, logs=None):
        """Print the batch index plus the ``acc`` and ``loss`` entries of *logs*.

        Missing entries (or a missing *logs* dict) are printed as ``None``.
        """
        # Avoid a shared mutable default and tolerate callers passing None.
        logs = logs or {}
        print("batch=" + str(batch),
              "accuracy=" + str(logs.get('acc')),
              "loss=" + str(logs.get('loss')))

    def on_epoch_begin(self, epoch, logs=None):
        """Print a header line with the index of the epoch that is starting."""
        print("epoch=" + str(epoch) + ":")

    def on_epoch_end(self, epoch, logs=None):
        """Print the ``val_acc`` and ``val_loss`` entries of *logs*.

        Missing entries (or a missing *logs* dict) are printed as ``None``.
        """
        logs = logs or {}
        print("Validation-accuracy=" + str(logs.get('val_acc')),
              "Validation-loss=" + str(logs.get('val_loss')))


if __name__ == '__main__':
    if 'FAIRING_RUNTIME' in os.environ:
        # FAIRING_RUNTIME is set: run the training directly.
        # NOTE(review): presumably this is the job container launched by
        # Fairing - confirm where the variable gets defined.
        remote_train = MyModel()
        remote_train.train()

    else:
        # FAIRING_RUNTIME is not set: package this notebook and submit it
        # as a job through Kubeflow Fairing.
        from kubeflow import fairing
        from kubeflow.fairing.kubernetes import utils as k8s_utils

        DOCKER_REGISTRY = 'kubeflow-registry.default.svc.cluster.local:30000'

        # File name -> destination path handed to the preprocessor.
        output_map = {'mnist.npz': '/app/mnist.npz'}

        fairing.config.set_builder('append',
                                   image_name='katib-job',
                                   base_image='brightfly/kubeflow-jupyter-lab:tf2.0-cpu',
                                   registry=DOCKER_REGISTRY,
                                   push=True)

        # NOTE(review): a "cpu 1, memory 1GiB" setting was apparently intended
        # here, but no pod spec mutator is passed to the deployer and
        # `k8s_utils` is otherwise unused - confirm whether limits are wanted.
        fairing.config.set_deployer('job', namespace='anonymous')

        fairing.config.set_preprocessor('notebook',
                                        notebook_file='katib-job.ipynb',
                                        output_map=output_map)

        fairing.config.run()

[I 210202 18:27:56 config:123] Using preprocessor: <kubeflow.fairing.preprocessors.converted_notebook.ConvertNotebookPreprocessor object at 0x7f3a69990320>
[I 210202 18:27:56 config:125] Using builder: <kubeflow.fairing.builders.append.append.AppendBuilder object at 0x7f3a69150fd0>
[I 210202 18:27:56 config:127] Using deployer: <kubeflow.fairing.builders.append.append.AppendBuilder object at 0x7f3a69150fd0>
[W 210202 18:27:56 append:50] Building image using Append builder...
[I 210202 18:27:56 base:105] Creating docker context: /tmp/fairing_context_0jvoxm5x
[I 210202 18:27:56 converted_notebook:127] Converting katib-job.ipynb to katib-job.py
[I 210202 18:27:56 docker_creds_:234] Loading Docker credentials for repository 'brightfly/kubeflow-jupyter-lab:tf2.0-cpu'
[W 210202 18:27:59 append:54] Image successfully built in 2.767386882002029s.
[W 210202 18:27:59 append:94] Pushing image kubeflow-registry.default.svc.cluster.local:30000/katib-job:EFB72FB8...
[I 210202 18:27:59 docker_creds_:

2021-02-02 18:28:05.114409: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2021-02-02 18:28:05.118571: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 3593295000 Hz
2021-02-02 18:28:05.119250: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x3ce52f0 executing computations on platform Host. Devices:
2021-02-02 18:28:05.119298: I tensorflow/compiler/xla/service/service.cc:175]   StreamExecutor device (0): Host, Default Version
Train on 60000 samples, validate on 10000 samples
epoch 0:
batch 0 accuracy=0.1875 loss=2.3450565
   32/60000 [..............................] - ETA: 15:20 - loss: 2.3451 - acc: 0.1875batch 1 accuracy=0.140625 loss=2.29174
batch 2 accuracy=0.104166664 loss=2.299227
batch 3 accuracy=0.125 loss=2.2110605
batch 4 accuracy=0.11875 loss=2.3025255
batch 5 accuracy=0.125 loss=2.16715
batch 6 accuracy=0.125 loss=2.2359648
batch 7 accura

[W 210202 18:28:13 job:162] Cleaning up job fairing-job-rnph6...
