# Using MLRun with MpiJobs (Horovod)

In [1]:
# nuclio: ignore
import nuclio

In [2]:
%nuclio config spec.build.baseImage = "python:3.6-jessie"

%nuclio: setting spec.build.baseImage to 'python:3.6-jessie'


## Helper functions for downloading and labeling images

In [3]:
import os
import zipfile
import json
from tempfile import mktemp


def open_archive(context,
                 target_dir='content',
                 archive_url=''):
    """Extract a zip archive into a target directory and log it as an artifact.

    :param context:     function context; ``context.logger.info`` is used for
                        progress messages and ``context.log_artifact`` records
                        the extraction directory under the key ``content``
    :param target_dir:  directory to extract into (created if it is missing)
    :param archive_url: zip archive location, passed unchanged to
                        ``zipfile.ZipFile``
    """
    # Make sure the destination exists before extracting into it
    os.makedirs(target_dir, exist_ok=True)
    context.logger.info('Verified directories')

    # Extract dataset from zip; the context manager closes the archive handle
    # even when extraction fails part-way through
    context.logger.info('Extracting zip')
    with zipfile.ZipFile(archive_url, 'r') as zip_ref:
        zip_ref.extractall(target_dir)

    context.logger.info(f'extracted archive to {target_dir}')
    context.log_artifact('content', target_path=target_dir)
    
    
def categories_map_builder(context,
                           base_dir,
                           categories_map_filename):
    """Build a category map and a labeled file table from ``<base_dir>/data``.

    The text before the first ``.`` of every ``.jpg`` file name is taken as
    the file's category. Each distinct category gets an integer index
    (alphabetical order), the ``{category: index}`` map is written as JSON and
    a filename/category table is logged as the ``mydf.csv`` table artifact.

    :param context:                 function context providing ``logger.info``
                                    and ``log_artifact``
    :param base_dir:                directory holding the ``data`` sub-directory
                                    with the images
    :param categories_map_filename: JSON file the category map is written to;
                                    a relative name is placed under ``base_dir``
    """
    # Imported locally so the function does not rely on names that the
    # surrounding code never imports.
    import pandas as pd
    from mlrun.artifacts import TableArtifact

    data_path = os.path.join(base_dir, 'data')

    # create filenames list (jpg only); sorted so the result does not depend
    # on the arbitrary order returned by os.listdir
    filenames = sorted(name for name in os.listdir(data_path)
                       if name.endswith('.jpg'))
    labels = [name.split('.')[0] for name in filenames]

    # Map every category name to a stable integer index and persist the map
    categories_map = {label: index
                      for index, label in enumerate(sorted(set(labels)))}
    map_path = os.path.join(base_dir, categories_map_filename)
    with open(map_path, 'w') as map_file:
        json.dump(categories_map, map_file)
    context.logger.info(f'saved categories map to {map_path}')

    # Create a pandas DataFrame for the full sample.
    # NOTE(review): each index is wrapped in a one-element list, so the string
    # form is e.g. '[0]' -- confirm this is what the training code expects.
    df = pd.DataFrame({
        'filename': filenames,
        'category': [[categories_map[label]] for label in labels],
    })
    df['category'] = df['category'].astype('str')

    context.logger.info(df['category'].value_counts())
    context.log_artifact(TableArtifact('mydf.csv', df=df))

In [4]:
# nuclio: end-code

## MLRun pipeline
- Download and extract the image archive
- Create the categories map
- Train the Horovod model on the cluster

In [5]:
# MLRun client names for the pipeline cells (code_to_function and get_run_db
# are imported but not used in the cells visible here)
from mlrun import new_function, code_to_function, get_run_db, mount_v3io, NewTask, mlconf
# Address of the MLRun DB service; the run logs below report to this URL
mlconf.dbpath = 'http://mlrun-db:8080'

In [10]:
# Root working directory of the example; the other paths are derived from it
base_dir = '/User/mlrun'
# Directory the image archive is extracted into (`os` comes from the helper cell)
images_path = os.path.join(base_dir, 'images')

In [11]:
# Run the local `open_archive` handler as a task: the sample archive on s3 is
# passed as the `archive_url` input and unpacked under `images_path`.
task = NewTask(
    handler=open_archive,
    params=dict(target_dir=images_path),
    inputs=dict(archive_url='http://iguazio-sample-data.s3.amazonaws.com/catsndogs.zip'),
)
fn = new_function().run(task)

[mlrun] 2019-10-14 23:30:32,257 starting run None uid=29c22c8326194e96addcb9835c9381de  -> http://mlrun-db:8080
[mlrun] 2019-10-14 23:30:32,300 downloading http://iguazio-sample-data.s3.amazonaws.com/catsndogs.zip to local tmp
[mlrun] 2019-10-14 23:30:33,415 Verified directories
[mlrun] 2019-10-14 23:30:33,416 Extracting zip
[mlrun] 2019-10-14 23:30:49,632 extracted archive to /User/mlrun/images


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...9381de,0,Oct 14 23:30:32,completed,,kind=handlerowner=iguaziohost=jupyter-h41nyj9oi0-ztkn5-7f7b6dbb85-h44sv,archive_url,target_dir=/User/mlrun/images,,content


type result.show() to see detailed results/progress or use CLI:
!mlrun get run --uid 29c22c8326194e96addcb9835c9381de 
[mlrun] 2019-10-14 23:30:49,673 run executed, status=completed


In [12]:
# Training script (looked up under base_dir) and the directory for saved models
HOROVOD_FILE = 'horovod-training.py'
model_dir = os.path.join(base_dir, 'models')

# Parameters passed to the training run
params = dict(
    base_dir=base_dir,
    checkpoints_dir=os.path.join(base_dir, 'checkpoints'),
    data_path=os.path.join(images_path, 'cats_n_dogs'),
    model_path=os.path.join(model_dir, 'cats_n_dogs.hd5'),
    epochs=1,
    batch_size=64,
    image_width=128,
    image_height=128,
    image_channels=3,
)

# Define the MpiJob function that executes the training script
image = 'mlrun/mpijob:latest'
fn = new_function(
    name='horovod-trainer',
    command='mpijob://{}'.format(os.path.join(base_dir, HOROVOD_FILE)),
    image=image,
    interactive=True,
)
fn.apply(mount_v3io())
fn.spec.image_pull_policy = 'Always'

# Launch the run; as the last expression, the returned run object is displayed
fn.run(params=params, out_path='/User/mlrun')

[mlrun] 2019-10-14 23:31:25,981 starting run horovod-trainer uid=283c3b39b57b445cb4152a9a9c58edcb  -> http://mlrun-db:8080
[mlrun] 2019-10-14 23:31:26,014 using in-cluster config.
[mlrun] 2019-10-14 23:31:26,027 MpiJob horovod-trainer-0d15a092 created
[mlrun] 2019-10-14 23:31:29,039 MpiJob horovod-trainer-0d15a092 state=Active
...
+ POD_NAME=horovod-trainer-0d15a092-worker-0
+ shift
+ /opt/kube/kubectl exec horovod-trainer-0d15a092-worker-0 -- /bin/sh -c     PATH=/usr/local/bin:$PATH ; export PATH ; LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH ; export LD_LIBRARY_PATH ; DYLD_LIBRARY_PATH=/usr/local/lib:$DYLD_LIBRARY_PATH ; export DYLD_LIBRARY_PATH ;   /usr/local/bin/orted -mca ess "env" -mca ess_base_jobid "551419904" -mca ess_base_vpid 1 -mca ess_base_num_procs "2" -mca orte_node_regex "horovod-trainer-[1:0]d15a092-launcher-7gtj4,horovod-trainer-[1:0]d15a092-worker-0@0(2)" -mca orte_hnp_uri "551419904.0;tcp://10.233.81.210:40160" -mca plm "rsh" --tree-spawn -mca orte_parent_uri "55

uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...58edcb,0,Oct 14 23:31:37,completed,horovod-trainer,host=horovod-trainer-0d15a092-worker-0kind=mpijobmlrun/job=horovod-trainer-0d15a092owner=iguazio,,base_dir=/User/mlrunbatch_size=64checkpoints_dir=/User/mlrun/checkpointsdata_path=/User/mlrun/images/cats_n_dogsepochs=1image_channels=3image_height=128image_width=128model_path=/User/mlrun/models/cats_n_dogs.hd5,accuracy=0.5918750166893005loss=1.1245769548416138,modelsummary.html


type result.show() to see detailed results/progress or use CLI:
!mlrun get run --uid 283c3b39b57b445cb4152a9a9c58edcb 
[mlrun] 2019-10-14 23:35:04,644 run executed, status=completed


<mlrun.model.RunObject at 0x7f4818ac65f8>

In [13]:
# Delete the MpiJob created by the training run above.
# NOTE(review): the job name is copied from that run's log output; a new run
# presumably gets a different name -- confirm and update before re-running.
fn.delete_job('horovod-trainer-0d15a092')

[mlrun] 2019-10-14 23:37:15,614 del status: Success


## Deploy model server 

In [None]:
# `new_model_server` is not among the names imported from mlrun earlier in the
# notebook, so import it here; without it this cell fails with a NameError.
from mlrun import new_model_server

# Define the model-serving function for the models stored under `model_dir`.
# NOTE(review): the model key 'netops_v1' looks carried over from a different
# example -- confirm the intended model name.
srvfn = new_model_server('tf-image-server', model_class='TFModel', image='mlrun/serving-tf',
                         models={'netops_v1': model_dir}).with_v3io('User', '~/')