# Using MLRUN with Dask Distributed Jobs

In [None]:
# Install/upgrade Dask into the environment of the running kernel.
# %pip (unlike !pip) always targets the kernel's own Python environment.
%pip install dask distributed --upgrade

In [None]:
# Install dask_kubernetes into the environment of the running kernel.
# %pip (unlike !pip) always targets the kernel's own Python environment.
%pip install dask_kubernetes

In [1]:
# Optional: MLRun package source to use for image builds (uncomment the next line to enable)
#%env MLRUN_PACKAGE_PATH=git+https://github.com/mlrun/mlrun.git@development
# Set the MLRun DB path through the MLRUN_DBPATH environment variable
# (it can also be specified in the run_start command)
%env MLRUN_DBPATH=/User/mlrun

env: MLRUN_PACKAGE_PATH=git+https://github.com/mlrun/mlrun.git@development
env: MLRUN_DBPATH=/User/mlrun


In [2]:
from mlrun import new_function, NewTask
from mlrun.platforms import mount_v3io
import yaml
import pandas as pd

## Writing the function code

In [3]:
# define the handler function; it receives the run context plus the parameters p1 and p2
import time
def handler(context, p1=1, p2='xx'):
    """Demo job handler: echo the inputs, then record results, a label and an artifact.

    Args:
        context: run context object. This function reads its ``name`` and
            ``uid`` attributes and calls ``log_result``, ``set_label`` and
            ``log_artifact`` on it.
        p1: value that is multiplied by 2 and by 3 to produce the logged
            ``accuracy`` and ``loss`` results.
        p2: second parameter; it is only echoed in the printed output.

    Returns:
        The fixed string ``'my resp'``.
    """
    # echo the run identity and the parameters we were called with
    run_line = f'Run: {context.name} (uid={context.uid})'
    print(run_line)
    params_line = f'Params: p1={p1}, p2={p2}'
    print(params_line)

    # one-second pause before anything is logged
    time.sleep(1)

    # scalar results: each one is p1 scaled by a fixed factor
    for result_key, factor in (('accuracy', 2), ('loss', 3)):
        context.log_result(result_key, p1 * factor)

    # tag the run itself
    context.set_label('category', 'tests')

    # store a small artifact and attach a label to it
    artifact_labels = {'framework': 'xgboost'}
    context.log_artifact('model.txt', body=b'abc is 123', labels=artifact_labels)

    response = 'my resp'
    return response

In [4]:
# Test the handler locally: wrap it in a task and run that task through a
# function created by new_function() with no arguments.
# `task` is reused by the Dask run in the next section.
task = NewTask(handler=handler)
run = new_function().run(task)

[mlrun] 2019-09-16 07:19:10,509 starting run None uid=531bc44712dd419d8f06c6f66908d243
Run:  (uid=531bc44712dd419d8f06c6f66908d243)
Params: p1=1, p2=xx



uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...08d243,0,Sep 16 07:19:10,completed,,kind=handlerowner=iguaziohost=jupyter-sm67srm2t3-w15ka-8bcf9dddd-mb2hscategory=tests,,,accuracy=2loss=3return=my resp,model.txt


type result.show() to see detailed results/progress or use CLI:
!mlrun get run --uid 531bc44712dd419d8f06c6f66908d243 
[mlrun] 2019-09-16 07:19:11,770 run executed, status=completed


## Run the task on a local Dask client

In [5]:
# Run the same `task` through a function created with the 'dask://' command;
# the handler is passed explicitly to run().
run = new_function(command='dask://').run(task, handler=handler)

[mlrun] 2019-09-16 07:19:16,505 starting run None uid=65f071fdddf94fbd9256c46e33f94061
Run:  (uid=65f071fdddf94fbd9256c46e33f94061)
Params: p1=1, p2=xx


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...f94061,0,Sep 16 07:19:16,completed,,kind=daskowner=iguaziocategory=tests,,,accuracy=2loss=3return=my resp,model.txt


type result.show() to see detailed results/progress or use CLI:
!mlrun get run --uid 65f071fdddf94fbd9256c46e33f94061 
[mlrun] 2019-09-16 07:19:17,979 run executed, status=completed


# Running Dask Jobs on the Kubernetes Cluster
To use `dask`, we first create a Dask runner (the `fn` object below); we may want to build a custom image with our extra pip packages.<br>
Before we start, we need to initialize the cluster with `fn.cluster(size)`; if we leave `size` blank, it will auto-adjust.

In [6]:
# Create the Dask function object (`fn`) and apply the v3io mount to it
fn = new_function(command='dask://').apply(mount_v3io())

In [None]:
# Building is optional and only needs to be done once
# (a pre-built image can be used instead by setting the image on `fn`).
extra_build_commands = ['pip install pandas']
fn.build(
    image='mlrun/dask:latest',
    base_image='daskdev/dask:latest',
    commands=extra_build_commands,
)

In [8]:
# Initialize the Dask cluster; the argument (1 here) is the cluster size
fn.cluster(1)

  json = yaml.load(f)


VBox(children=(HTML(value='<h2>KubeCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n    .…

## Run Jobs on the cluster
Below you can see how we submit our function to run on the cluster and inject parameters.<br>
If you need access to the Dask `client` object, you can use `fn.client`.

In [10]:
# Hyper-parameter run on the cluster: one iteration per listed p1 value.
# NOTE(review): 'max.accuracy' is presumably the rule for picking the best
# iteration (highest accuracy) - confirm against the MLRun docs.
p1_candidates = [5, 2, 3]
task = NewTask(handler=handler).with_hyper_params({'p1': p1_candidates}, 'max.accuracy')
fn.run(task)

[mlrun] 2019-09-16 07:21:59,559 starting run None uid=7b30072f373b4b0796c6a795a4d1563d
['my resp', 'my resp', 'my resp']


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...d1563d,0,Sep 16 07:21:59,completed,,kind=daskowner=iguazio,,,,iteration_results.csv


type result.show() to see detailed results/progress or use CLI:
!mlrun get run --uid 7b30072f373b4b0796c6a795a4d1563d 
[mlrun] 2019-09-16 07:22:03,014 run executed, status=completed


<mlrun.model.RunObject at 0x7f1eb0440278>

## Shutting down the cluster
If we want to delete our Kubernetes resources, we use the `close()` method.

In [12]:
# Shut down: delete the Kubernetes resources held by `fn`
fn.close()