# Using MLRUN with MpiJobs (Horovod)

In [1]:
from mlrun import new_function
from mlrun.platforms import mount_v3io

In [2]:
# Root directory used below for the dataset, the model output and the training scripts
base_dir = '/User/horovod'

# Job name constant (not referenced by the other cells visible in this notebook)
HOROVOD_JOB_NAME = "horovod-cats-n-dogs"

# File names of the Horovod training scripts, one per hardware target
CPU_HOROVOD_FILE = 'horovod_train_cats_n_dogs-CPU.py'
GPU_HOROVOD_FILE = 'horovod_train_cats_n_dogs-GPU.py'

## Download cats & dogs dataset

In [3]:
import os
import zipfile
import json

# Define locations
DATA_PATH = os.path.join(base_dir, 'cats_and_dogs_filtered')
MODEL_PATH = os.path.join(base_dir, 'model')
!mkdir -p {MODEL_PATH}
!mkdir -p {DATA_PATH}

# Download 
!mkdir cats_and_dogs_filtered
# Download a sample stocks file from Iguazio demo bucket in S3
!curl -L "iguazio-sample-data.s3.amazonaws.com/catsndogs.zip" > {os.path.join(base_dir, 'catsndogs.zip')}

# Extract dataset
zip_ref = zipfile.ZipFile(os.path.join(base_dir, 'catsndogs.zip'), 'r')
zip_ref.extractall(DATA_PATH)
zip_ref.close()

# Build prediction map
def build_prediction_map(categories_map):
    """Return the inverse of ``categories_map``.

    Every value of the input becomes a key of the result, mapped back to the
    key it came from. If several keys share a value, the last one iterated wins.
    """
    inverted = {}
    for label, class_id in categories_map.items():
        inverted[class_id] = label
    return inverted

# create filenames list (jpg only)
filenames = [file for file in os.listdir(DATA_PATH) if file.endswith('jpg')]
categories = []

# categories & prediction classes map
categories_map = {
    'dog': 1,
    'cat': 0,
}
prediction_map = build_prediction_map(categories_map)
# Use os.path.join so the file is written inside MODEL_PATH; plain string
# concatenation dropped the separator and produced
# '<base_dir>/modelprediction_classes_map.json' next to the model directory.
with open(os.path.join(MODEL_PATH, 'prediction_classes_map.json'), 'w') as f:
    # json.dump serializes the integer class ids as string keys
    json.dump(prediction_map, f)

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 65.2M  100 65.2M    0     0  14.7M      0  0:00:04  0:00:04 --:--:-- 14.7M


## Create an Mpi Job Runner
Set the job image, command, and arguments, then add a v3io (Iguazio) volume mount.

In [4]:
# Container image the job runs in
image = 'zilbermanor/horovod_cpu:0.2'

# The command is the CPU training script path behind the mpijob:// scheme;
# the script receives the dataset directory and the base directory as arguments.
fn = new_function(
    command='mpijob://' + os.path.join(base_dir, CPU_HOROVOD_FILE),
    args=[DATA_PATH, base_dir],
    image=image,
    mode='noctx',
)

# Attach the v3io volume mount; left as the last expression so the cell displays the result
fn.apply(mount_v3io())

<mlrun.runtimes.mpijob.MpiRuntime at 0x7fe7e0935710>

## Initiate a new job

In [5]:
# Submit the function as a run named 'ml' and keep the returned object in `run`.
# The log output of this cell reports the name of the MpiJob that was created.
run = fn.run(name='ml')

[mlrun] 2019-09-18 14:42:02,805 starting run ml uid=e80ca833703e45c2ad68ca6cded8a537
[mlrun] 2019-09-18 14:42:05,422 using in-cluster config.
[mlrun] 2019-09-18 14:42:05,455 MpiJob ml-a087b526 created
[mlrun] 2019-09-18 14:42:05,458 use .watch(ml-a087b526) to see logs
[mlrun] 2019-09-18 14:42:09,485 MpiJob ml-a087b526 state=Active
[mlrun] 2019-09-18 14:42:09,504 MpiJob ml-a087b526 launcher pod ml-a087b526-launcher-rbn58 state Pending


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...d8a537,0,,created,ml,kind=mpijobowner=iguaziorepo=https://github.com/zilbermanor/mlrun.gitcommit=85a94dd75c269150159caea7973d269ed13a619cmlrun/job=ml-a087b526,,,,


type result.show() to see detailed results/progress or use CLI:
!mlrun get run --uid e80ca833703e45c2ad68ca6cded8a537 
[mlrun] 2019-09-18 14:42:09,561 run executed, status=created


## List Running MpiJobs and their Pods

In [6]:
# List the MPI jobs and keep the result in `jobs`.
# show=True presumably prints the status/name/start/end table seen in the output - TODO confirm.
jobs = fn.list_jobs(show=True)

status     name                 start                 end
Succeeded  horovod-cats-n-dogs  2019-09-18T11:59:49Z  2019-09-18T12:04:31Z
Active     ml-a087b526          2019-09-18T14:42:09Z  
Active     ml-ff5ea63e          2019-09-18T14:26:13Z  


## Watch Job logs

In [None]:
# Follow the logs of a specific MPI job.
# NOTE(review): the job name is copied by hand from the log output of the run
# cell ("MpiJob ml-a087b526 created"); it presumably differs on every run, so
# update it before re-executing this cell.
fn.watch('ml-a087b526')

[mlrun] 2019-09-18 14:42:35,317 watching pod ml-a087b526-launcher-rbn58, status = Running

+ POD_NAME=ml-a087b526-worker-0
+ shift
+ /opt/kube/kubectl exec ml-a087b526-worker-0 -- /bin/sh -c     PATH=/usr/local/bin:$PATH ; export PATH ; LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH ; export LD_LIBRARY_PATH ; DYLD_LIBRARY_PATH=/usr/local/lib:$DYLD_LIBRARY_PATH ; export DYLD_LIBRARY_PATH ;   /usr/local/bin/orted -mca ess "env" -mca ess_base_jobid "2588606464" -mca ess_base_vpid 1 -mca ess_base_num_procs "2" -mca orte_node_regex "ml-a[3:87]b526-launcher-rbn58,ml-a[3:87]b526-worker-0@0(2)" -mca orte_hnp_uri "2588606464.0;tcp://10.233.81.210:48513" -mca plm "rsh" --tree-spawn -mca orte_parent_uri "2588606464.0;tcp://10.233.81.210:48513" -mca plm_rsh_agent "/etc/mpi/kubexec.sh" -mca orte_default_hostfile "/etc/mpi/hostfile" -mca pmix "^s1,s2,cray,isolated"
Using TensorFlow backend.
2019-09-18 14:42:25.384992: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructio

## Delete a Job

In [20]:
# Delete an MPI job by name.
# NOTE(review): hard-coded job name taken from an earlier run's output;
# replace it with the name of the job you actually want to remove.
fn.delete_job('ml-a087b526')

[mlrun] 2019-09-18 14:17:38,395 del status: Success
