# Using MLRun with MPIJob (Horovod)

In [1]:
# nuclio: ignore
import nuclio

In [8]:
from mlrun import new_function, code_to_function, get_run_db, mount_v3io, mlconf, new_model_server, v3io_cred
import os
# Keep a run-DB path that is already set in the mlrun configuration and only
# fall back to the in-cluster DB service when none is configured.
# For a local DB path use '/User/mlrun' instead.
mlconf.dbpath = mlconf.dbpath or 'http://mlrun-db:8080'

In [3]:
# Shared names and locations for this example
model_name = 'cat_vs_dog_v1'
base_dir = '/User/mlrun/examples'
# images are kept in an 'images' sub-directory of the examples directory
images_path = os.path.join(base_dir, 'images')

## Import and define ML functions for our pipeline (utils, training, serving)

In [6]:
# Data import and labeling helpers, built from the local utils.py file
utilsfn = code_to_function(
    name='file_utils',
    filename='./utils.py',
    image='mlrun/mlrun:latest',
)

In [9]:
# Distributed training function: runs the Horovod script as an MPI job with
# 4 replicas (containers), optionally with GPUs.
# Set code_dir to the directory that holds the training script
# (full path, starting with /User).
code_dir = '/User/mlrun'
HOROVOD_FILE = os.path.join(code_dir, 'horovod-training.py')

image = 'mlrun/mpijob:latest'
trainer_fn = new_function(
    name='horovod-trainer',
    command=f'mpijob://{HOROVOD_FILE}',
    image=image,
    interactive=True,
)
trainer_fn.apply(mount_v3io())
trainer_fn.spec.replicas = 4
# Uncomment to request GPUs for the training function:
# trainer_fn.gpus(1)

In [10]:
# Inference (model serving) function, built from the serving notebook and
# using the nuclio runtime
inference_function = code_to_function(
    name='tf-image-serving-pipe',
    filename='./nuclio-serving-tf-images.ipynb',
    runtime='nuclio',
)
# HTTP trigger with 2 workers, plus a volume added with the arguments ('User', '~/')
inference_function.with_http(workers=2).add_volume('User', '~/')

<mlrun.runtimes.function.RemoteRuntime at 0x7f5a5b80c4a8>

## Create and run the pipeline

In [4]:
import kfp
from kfp import dsl

In [11]:
# Target location for pipeline artifacts.
# NOTE(review): '{{workflow.uid}}' looks like a per-run placeholder filled in by
# the workflow engine — confirm. This variable is not referenced by any other
# cell visible in this notebook.
artifacts_path = 'v3io:///users/admin/mlrun/kfp/{{workflow.uid}}/'

In [14]:
@dsl.pipeline(
    name='Image classification training pipeline',
    description='Shows how to use mlrun with horovod.'
)
def hvd_pipeline(
    images_path='/User/mlrun/examples/images',
    source_dir='/User/mlrun/examples/images/cats_n_dogs'
):
    """Download, label, train and deploy an image classification model.

    Steps, in order:
      1. download - runs the ``open_archive`` handler of ``utilsfn`` on the
         sample archive URL, targeting ``images_path``.
      2. label    - runs the ``categories_map_builder`` handler of ``utilsfn``
         on ``source_dir`` (explicitly scheduled after the download step).
      3. train    - runs ``trainer_fn`` for 8 epochs, consuming the
         ``categories_map`` and ``file_categories`` outputs of the label step.
      4. deploy   - deploys ``inference_function`` with the trained model,
         registered under the notebook-level ``model_name``.

    Args:
        images_path: target directory for the downloaded archive; also used
            as ``out_path`` for every step.
        source_dir: directory passed to the label step as ``source_dir`` and
            to the training step as its ``data_path`` input.
    """
    # step 1: fetch the archive into images_path
    open_archive = utilsfn.as_step(
        name='download',
        handler='open_archive',
        out_path=images_path,
        params={'target_dir': images_path},
        inputs={'archive_url': 'http://iguazio-sample-data.s3.amazonaws.com/catsndogs.zip'},
        outputs=['content'],
    ).apply(mount_v3io())

    # step 2: build the category maps; it has no data dependency on step 1,
    # so the ordering must be stated explicitly with .after()
    label = utilsfn.as_step(
        name='label',
        handler='categories_map_builder',
        out_path=images_path,
        params={'source_dir': source_dir},
        outputs=['categories_map', 'file_categories'],
    ).apply(mount_v3io()).after(open_archive)

    # step 3: distributed training; consuming label.outputs orders it after step 2
    train = trainer_fn.as_step(
        name='train',
        params={'epochs': 8,
                'checkpoints_dir': '/User/mlrun/examples/checkpoints',
                'model_path': '/User/mlrun/examples/models/cats_n_dogs.hd5'},
        inputs={'data_path': source_dir,
                'categories_map': label.outputs['categories_map'],
                'file_categories': label.outputs['file_categories']},
        out_path=images_path,
        outputs=['model'],
    ).apply(v3io_cred())

    # step 4: deploy the model using nuclio functions; the model key comes from
    # the notebook-level model_name so the name is defined in a single place
    inference_function.deploy_step(project='horovod',
                                   models={model_name: train.outputs['model']})


In [15]:
# for debug generate the pipeline dsl
#kfp.compiler.Compiler().compile(hvd_pipeline, 'hvd_pipeline.yaml')

In [16]:
# Submit the pipeline for execution; no arguments are passed, so the
# defaults declared on hvd_pipeline are used
arguments = {}
client = kfp.Client(namespace='default-tenant')
run_result = client.create_run_from_pipeline_func(
    hvd_pipeline,
    arguments,
    experiment_name='horovod1',
)

In [17]:
# Get a run DB handle and open the connection to it
# (presumably uses the path set in mlconf.dbpath — confirm)
db = get_run_db().connect()

In [18]:
# Query the DB for the runs labeled with this workflow's ID (only this
# workflow is shown) and display them
workflow_runs = db.list_runs('', labels=f'workflow={run_result.run_id}')
workflow_runs.show()

uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...287c01,0,Nov 11 14:09:24,running,download,host=image-classification-training-pipeline-98jwn-3957617112kind=localowner=rootv3io_user=adminworkflow=d2102423-9df9-4dd8-96c1-2e49d84eb862,archive_url,target_dir=/User/mlrun/examples/images,,
