# Using MLRUN with MpiJobs (Horovod)

In [1]:
from mlrun import new_runner
from mlrun.platforms import mount_v3io

In [2]:
# Base directory: the training script path and the job arguments are built from it.
base_dir = "/User/horovod"

# Job name constant (not referenced by the cells shown in this notebook).
HOROVOD_JOB_NAME = "horovod-cats-n-dogs"

# Training script file names. Each starts with '/' so it can be appended
# directly to base_dir. Only the CPU variant is used below.
CPU_HOROVOD_FILE = "/horovod_train_cats_n_dogs-cpu.py"
GPU_HOROVOD_FILE = "/horovod_train_cats_n_dogs.py"

## Create an MpiJob Runner
Set the job image, command, and args, and add a v3io (Iguazio) volume mount.

In [4]:
# Container image used for the job (CPU build).
image = 'zilbermanor/horovod_cpu:0.2'

# The runner command has the form mpijob://<image>#<script path>;
# the script path is the CPU training script under base_dir.
runner = new_runner(
    command='mpijob://{}#{}'.format(image, base_dir + CPU_HOROVOD_FILE),
    mode='noctx',
)

# Add the v3io volume mount to the runner.
runner.apply(mount_v3io())

# Arguments passed to the job: the cats_and_dogs_filtered path under
# base_dir, followed by base_dir itself.
runner.args = [
    base_dir + '/cats_and_dogs_filtered',
    base_dir,
]

In [None]:
# Inspect the runner configured in the previous cell. As the last expression
# of the cell, the value returned by to_dict() is displayed as the cell output.
runner.to_dict()

## Initiate a new job

In [6]:
# Submit the job under the run name 'ml'. The recorded log below shows an
# MpiJob named 'ml-<suffix>' being created and the call finishing with
# status=created, i.e. it appears to return before the job itself completes.
# NOTE(review): HOROVOD_JOB_NAME is defined in the configuration cell but is
# not used here -- confirm whether it was meant to be the run name.
run = runner.run(name='ml')

[mlrun] 2019-09-07 22:25:26,185 starting run ml uid=c46e9bcca11a45a29a2fc7dd83c8af58
[mlrun] 2019-09-07 22:25:27,082 using in-cluster config.
[mlrun] 2019-09-07 22:25:27,095 MpiJob ml-2f03ba53 created
[mlrun] 2019-09-07 22:25:27,096 use runner.watch(ml-2f03ba53) to see logs


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...c8af58,0,,completed,ml,kind=mpijob owner=iguazio mlrun/job=ml-2f03ba53,,,,


[mlrun] 2019-09-07 22:25:27,114 run executed, status=created


## List MpiJobs and their Pods

In [7]:
# List the MpiJobs visible to the runner and keep the result in `jobs`.
# With show=True a status / name / start / end table is printed (see the
# output below); it includes finished and failed jobs, not only active ones.
jobs = runner.list_jobs(show=True)

status     name                 start                 end
Succeeded  horovod-cats-n-dogs  2019-09-04T17:06:39Z  2019-09-04T17:46:32Z
Active     ml-2f03ba53          2019-09-07T22:25:29Z  
Succeeded  ml-5f9218f6          2019-09-07T20:57:11Z  2019-09-07T21:15:08Z
Failed     ml-d7e2af6c          2019-09-07T20:44:17Z  
Succeeded  mpij-f308cbfa        2019-09-07T19:14:41Z  2019-09-07T19:32:37Z


In [8]:
# Show the pods behind the jobs: the recorded output is a dict mapping each
# pod name (launcher / worker) to its status, e.g. 'Running' or 'Succeeded'.
runner.get_pods()

{'ml-2f03ba53-launcher-2q4pm': 'Running',
 'ml-2f03ba53-worker-0': 'Running',
 'ml-5f9218f6-launcher-ph2xb': 'Succeeded',
 'mpij-f308cbfa-launcher-kl2ct': 'Succeeded'}

## Watch Job logs

In [None]:
# Follow the logs of a single job. The name is the one reported by the
# "MpiJob ... created" log line of the run cell above. The suffix is
# run-specific (compare the different 'ml-*' names in the job list), so
# replace it with the name from your own run before re-executing this cell.
runner.watch('ml-2f03ba53')

[mlrun] 2019-09-07 22:26:18,302 watching pod ml-2f03ba53-launcher-2q4pm, status = Running

+ POD_NAME=ml-2f03ba53-worker-0
+ shift
+ /opt/kube/kubectl exec ml-2f03ba53-worker-0 -- /bin/sh -c     PATH=/usr/local/bin:$PATH ; export PATH ; LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH ; export LD_LIBRARY_PATH ; DYLD_LIBRARY_PATH=/usr/local/lib:$DYLD_LIBRARY_PATH ; export DYLD_LIBRARY_PATH ;   /usr/local/bin/orted -mca ess "env" -mca ess_base_jobid "102105088" -mca ess_base_vpid 1 -mca ess_base_num_procs "2" -mca orte_node_regex "ml-[1:2]f03ba53-launcher-2q4pm,ml-[1:2]f03ba53-worker-0@0(2)" -mca orte_hnp_uri "102105088.0;tcp://10.233.92.107:51544" -mca plm "rsh" --tree-spawn -mca orte_parent_uri "102105088.0;tcp://10.233.92.107:51544" -mca plm_rsh_agent "/etc/mpi/kubexec.sh" -mca orte_default_hostfile "/etc/mpi/hostfile" -mca pmix "^s1,s2,cray,isolated"
Using TensorFlow backend.
2019-09-07 22:25:34.818933: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instruction

## Delete a Job

In [None]:
# Delete an MpiJob by name.
# NOTE(review): 'mpij-77afc965' does not appear in the list_jobs output
# recorded in this notebook -- substitute a job name taken from a current
# runner.list_jobs() listing before running this cell.
runner.delete_job('mpij-77afc965')