# Using MLRUN with Dask Distributed Jobs

In [None]:
!pip install dask distributed --upgrade

In [None]:
!pip install dask_kubernetes

In [1]:
# specify the MLRUN package used for image builds
# (here a pip-style git URL pointing at the `development` branch of the mlrun repository)
%env MLRUN_PACKAGE_PATH=git+https://github.com/mlrun/mlrun.git@development
# set the mlrun db path (can also be specified in the run_start command)
%env MLRUN_META_DBPATH=/User/mlrun

env: MLRUN_PACKAGE_PATH=git+https://github.com/mlrun/mlrun.git@development
env: MLRUN_META_DBPATH=/User/mlrun


In [2]:
import pandas as pd
import yaml

from mlrun import NewRun, new_function
from mlrun.platforms import mount_v3io

## Writing the function code

In [3]:
# define the job handler: it receives the run context plus the task parameters
import time
def handler(context, p1=1, p2='xx'):
    """Demo handler: echo the inputs, then log results, a label and an artifact.

    Args:
        context: run context object. This function reads its ``name`` and
            ``uid`` attributes and calls its ``log_result``, ``set_label``
            and ``log_artifact`` methods.
        p1: value the logged results are derived from
            (``accuracy`` = p1 * 2, ``loss`` = p1 * 3).
        p2: only echoed in the printed parameter line.

    Returns:
        The constant string ``'my resp'``.
    """
    # report which run we are part of and which parameters we received
    run_banner = f'Run: {context.name} (uid={context.uid})'
    print(run_banner)
    params_banner = f'Params: p1={p1}, p2={p2}'
    print(params_banner)

    # stand-in for one second of real work
    time.sleep(1)

    # log the run results (scalar values), each derived from p1
    for metric, factor in (('accuracy', 2), ('loss', 3)):
        context.log_result(metric, p1 * factor)

    # add a label/tag to this run
    context.set_label('category', 'tests')

    # log a simple artifact and attach a label to the artifact itself
    artifact_labels = {'framework': 'xgboost'}
    context.log_artifact('model.txt', body=b'abc is 123', labels=artifact_labels)
    return 'my resp'

In [4]:
# test our function locally: wrap the handler in a task and run it with a default function
# (NewRun must be imported from mlrun in the imports cell, otherwise this raises NameError)
task = NewRun(handler=handler)
run = new_function().run(task)

[mlrun] 2019-09-07 22:06:12,487 starting run None uid=a2e842e588dd46d2b8e25edde5a0d4db
Run:  (uid=a2e842e588dd46d2b8e25edde5a0d4db)
Params: p1=1, p2=xx



uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...a0d4db,0,Sep 07 22:06:13,completed,,kind=handlerowner=iguaziohost=jupyter-rngbsdr6ab-f02en-5dd7cd96db-xfzxrcategory=tests,,,accuracy=2loss=3return=my resp,model.txt


[mlrun] 2019-09-07 22:06:14,567 run executed, status=created


## Run the task on a local Dask client

In [15]:
# create a dask-backed function (local client) and execute the task with our handler
dask_fn = new_function(command='dask://')
run = dask_fn.run(task, handler=handler)

[mlrun] 2019-09-07 00:39:27,514 starting run demo uid=00e7693869a045faa4d3d166b0ff0795
['my resp', 'my resp', 'my resp']


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...ff0795,1,Sep 07 00:39:27,completed,demo,type=demokind=daskowner=iguazio,,p1=5,return=my resp,
...ff0795,2,Sep 07 00:39:27,completed,demo,type=demokind=daskowner=iguazio,,p1=2,return=my resp,
...ff0795,3,Sep 07 00:39:27,completed,demo,type=demokind=daskowner=iguazio,,p1=3,return=my resp,
...ff0795,0,Sep 07 00:39:27,completed,demo,type=demokind=daskowner=iguazio,,p1=5,,iteration_results.csv


[mlrun] 2019-09-07 00:39:28,842 run finished, status=created
1.33 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


# Running Dask Jobs on the Kubernetes Cluster
To use `dask` we first create a dask runner. We may also want to build a custom image that includes our extra pip packages. <br>
Before we start, we need to initialize the cluster with `runner.cluster(size)`; if we leave `size` blank it will auto-adjust.

In [8]:
# create the dask function first, then apply the v3io mount to obtain the runner
cluster_fn = new_function(command='dask://')
runner = cluster_fn.apply(mount_v3io())

In [None]:
# optional, one-time step: build a custom image with our extra pip packages
# (to use a pre-built image instead, set runner.image and skip this cell)
runner.build_image(
    image='mlrun/dask:latest',
    base_image='daskdev/dask:latest',
    commands=['pip install pandas'],
)

In [11]:
# initialize the dask cluster with size=1 before submitting jobs to it
runner.cluster(1)

VBox(children=(HTML(value='<h2>KubeCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n    .…

## Run Jobs on the cluster
Below you can see how we submit our function to run on the cluster and inject parameters.<br>
If you need access to the dask `client` object you can use `runner.client`.

In [12]:
# values of p1 to try, one run iteration per value
p1_grid = {'p1': [5, 2, 3]}
# 'max.accuracy' is passed as the selector -- presumably it picks the iteration with the
# highest `accuracy` result; confirm against the with_hyper_params documentation
task = NewRun(handler=handler).with_hyper_params(p1_grid, 'max.accuracy')
runner.run(task)

[mlrun] 2019-09-07 22:09:34,636 starting run None uid=87b10a85f0da4ca3b96562b5c3d63ec6
['my resp', 'my resp', 'my resp']


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...d63ec6,1,Sep 07 22:09:34,completed,,kind=daskowner=iguazio,,p1=5,return=my resp,
...d63ec6,2,Sep 07 22:09:34,completed,,kind=daskowner=iguazio,,p1=2,return=my resp,
...d63ec6,3,Sep 07 22:09:34,completed,,kind=daskowner=iguazio,,p1=3,return=my resp,
...d63ec6,0,Sep 07 22:09:34,completed,,kind=daskowner=iguazio,,,,iteration_results.csv


[mlrun] 2019-09-07 22:09:36,471 run executed, status=created


<mlrun.model.RunObject at 0x7f3bf977b940>

## Shutting down the cluster
If we want to delete our Kubernetes resources, we use the `close()` method.

In [7]:
# shut the cluster down; per the note above this deletes our Kubernetes resources
runner.close()