# load dask cluster with data
Load a parquet or csv dataset into an existing dask cluster.

In [1]:
# nuclio: ignore
import nuclio

In [2]:
import os
import json
import numpy as np
import pandas as pd

import dask
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster

from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem

from typing import List, Optional

def load_dask(
    context: MLClientCtx,
    src_data: str,
    dask_key: str = "dask_key",
    inc_cols: Optional[List[str]] = None,
    index_cols: Optional[List[str]] = None,
    dask_persist: bool = True,
    refresh_data: bool = True,
    scheduler_key: str = "scheduler"
) -> None:
    """Load a dataset into an existing dask cluster.

    dask jobs define the dask client parameters at the job level, this method
    will raise an error if no client is detected.

    The source is read as parquet when it is a directory or its name ends with
    "pq"/"parquet", and as csv when its name ends with "csv".

    :param context:         the function context
    :param src_data:        url of the data file or partitioned dataset as either
                            artifact DataItem, string, or path object (it is
                            converted with ``str`` before use)
    :param dask_key:        destination key of data on dask cluster and artifact store
    :param inc_cols:        include only these columns (very fast); None or an
                            empty list reads every column
    :param index_cols:      list of index column names (can be a long-running process)
                            NOTE: accepted for interface compatibility but not
                            applied by this function
    :param dask_persist:    (True) should the data be persisted (through the
                            `client.persist` op) and published under `dask_key`
    :param refresh_data:    (True) replace data already published under `dask_key`.
                            Set to False to raise an Exception instead of
                            replacing the existing cluster data.
    :param scheduler_key:   (scheduler) the dask scheduler configuration, json also
                            logged as an artifact

    :raises Exception:  if no dask client is attached to the context, or if
                        `dask_key` is already published and `refresh_data` is False
    :raises ValueError: if `src_data` is not a directory, parquet or csv source
    """
    dask_client = getattr(context, "dask_client", None)
    if dask_client is None:
        raise Exception("a dask client was not found in the execution context")

    src_path = str(src_data)
    columns = inc_cols or None
    if os.path.isdir(src_path) or src_path.endswith(("pq", "parquet")):
        df = dd.read_parquet(src_path, columns=columns)
    elif src_path.endswith("csv"):
        # only forward usecols when a selection was requested, so the default
        # call stays identical to a plain read_csv
        csv_kwargs = {"usecols": columns} if columns else {}
        df = dd.read_csv(src_path, **csv_kwargs)
    else:
        # fail here with a clear message instead of an UnboundLocalError later
        raise ValueError(
            f"unsupported data source '{src_path}': expected a directory, "
            "a parquet (pq/parquet) file or a csv file"
        )

    if dask_persist:
        key_exists = bool(dask_client.datasets) and dask_key in dask_client.datasets
        if key_exists and not refresh_data:
            raise Exception(
                f"dataset '{dask_key}' already exists on the dask cluster, "
                "set refresh_data=True to replace it"
            )
        df = dask_client.persist(df)
        if key_exists:
            # a key must be unpublished before it can be published again
            dask_client.unpublish_dataset(dask_key)
        dask_client.publish_dataset(df, name=dask_key)

    # share the scheduler, whether data is persisted or not
    filepath = os.path.join(context.artifact_path, scheduler_key + ".json")
    dask_client.write_scheduler_file(filepath)

    # we don't use log_dataset here until it can take into account
    # dask origin and apply dask describe.
    context.log_artifact(scheduler_key, local_path=scheduler_key + ".json")

In [3]:
# nuclio: end-code

### mlconfig

In [4]:
from mlrun import mlconf

In [5]:
# keep an already configured db path, otherwise fall back to the local directory
current_dbpath = mlconf.dbpath
mlconf.dbpath = current_dbpath if current_dbpath else './'
mlconf.dbpath

'http://mlrun-api:8080'

In [6]:
vcs_branch = 'development'
base_vcs = f'https://raw.githubusercontent.com/mlrun/functions/{vcs_branch}/'

# '{name}' must stay a literal placeholder in the hub url template (presumably
# filled in with the function name when a hub:// url is resolved), so this part
# is a plain string: as an f-string it raised NameError on the undefined `name`
# whenever hub_url was not already configured.
mlconf.hub_url = mlconf.hub_url or base_vcs + '{name}/function.yaml'
mlconf.hub_url

'/User/repos/functions/{name}/function.yaml'

In [7]:
import os

# keep an already configured artifact path; only when none is set is the
# V3IO_HOME environment variable read to build the default location
artifact_root = mlconf.artifact_path
if not artifact_root:
    artifact_root = f'{os.environ["V3IO_HOME"]}/artifacts'
mlconf.artifact_path = artifact_root
mlconf.artifact_path

'/User/artifacts'

### save

In [8]:
from mlrun import code_to_function
# create job function object from notebook code
# the handler is given by name (a string), matching default_handler below,
# rather than as a function object
fn = code_to_function("load_dask", kind="dask",
                      with_doc=True,
                      handler="load_dask",
                      image="mlrun/ml-models:0.4.6")

# add metadata (for templates and reuse)
fn.spec.default_handler = "load_dask"
fn.spec.description = "load dask cluster with data"
fn.metadata.categories = ["fileutils", "distributed"]
fn.metadata.labels = {"author": "yjb"}

# dask cluster settings
fn.spec.remote = True
fn.spec.replicas = 4
fn.spec.max_replicas = 8
fn.spec.service_type = "NodePort"

# store the function object and write its spec to function.yaml
fn.save()
fn.export("function.yaml")

[mlrun] 2020-04-25 22:31:59,755 saving function: load-dask, tag: latest
[mlrun] 2020-04-25 22:31:59,786 function spec saved to path: function.yaml


<mlrun.runtimes.daskjob.DaskCluster at 0x7f97649b6400>

### test

In [9]:
from mlrun import import_function

func = import_function("hub://load_dask")

# pick the storage mount by environment: v3io when V3IO_HOME is defined,
# otherwise a persistent volume claim
if "V3IO_HOME" in os.environ:
    from mlrun import mount_v3io
    func.apply(mount_v3io())
else:
    # if you set up mlrun using the instructions at https://github.com/mlrun/mlrun/blob/master/hack/local/README.md
    from mlrun.platforms import mount_pvc
    func.apply(mount_pvc('nfsvol', 'nfsvol', '/home/joyan/data'))

In [10]:
from mlrun import NewTask, run_local

# the keys under "params" must match load_dask argument names; the persist
# flag is named `dask_persist` in the handler signature
task_params = {
    "name":        "tasks load dask cluster with data",
    "params" : {
        "dask_persist" : True,
        "refresh_data" : True,
        "dask_key"     : "dask_key"}}

In [11]:
# build the task from the parameters above, then run the handler on the
# dask function with the csv file as its `src_data` input
load_task = NewTask(**task_params)
run = func.run(
    load_task,
    handler=load_dask,
    inputs={"src_data" : "/User/artifacts/classifier-data.csv" },
    artifact_path=mlconf.artifact_path,
)

[mlrun] 2020-04-25 22:31:59,840 starting run tasks load dask cluster with data uid=c80650ba615b406eb98453938d24ba4e  -> http://mlrun-api:8080
[mlrun] 2020-04-25 22:31:59,926 saving function: load-dask, tag: latest
[mlrun] 2020-04-25 22:32:06,066 trying dask client at: tcp://mlrun-load-dask-a8c2d9a8-e.default-tenant:8786
[mlrun] 2020-04-25 22:32:06,087 using remote dask scheduler (mlrun-load-dask-a8c2d9a8-e) at: tcp://mlrun-load-dask-a8c2d9a8-e.default-tenant:8786


[mlrun] 2020-04-25 22:32:06,167 log artifact scheduler at /User/artifacts/scheduler.json, size: None, db: Y

[mlrun] 2020-04-25 22:32:06,191 run ended with state 


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
,...8d24ba4e,0,Apr 25 22:31:59,completed,tasks load dask cluster with data,v3io_user=adminkind=daskowner=adminhost=jupyter-5859859b4f-hwhxd,src_data,persist=Truerefresh_data=Truedask_key=dask_key,,scheduler


to track results use .show() or .logs() or in CLI: 
!mlrun get run c80650ba615b406eb98453938d24ba4e  , !mlrun logs c80650ba615b406eb98453938d24ba4e 
[mlrun] 2020-04-25 22:32:06,241 run executed, status=completed


In [12]:
# show the function status as a dict (scheduler address, cluster name, node ports)
func.status.to_dict()

{'scheduler_address': 'tcp://mlrun-load-dask-a8c2d9a8-e.default-tenant:8786',
 'cluster_name': 'mlrun-load-dask-a8c2d9a8-e',
 'node_ports': {'dashboard': 30590, 'scheduler': 30490}}

In [13]:
import dask
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster

In [14]:
# connect a client to the scheduler address reported in the function status
scheduler_address = func.status.to_dict()['scheduler_address']
client = Client(scheduler_address)
client

0,1
Client  Scheduler: tcp://mlrun-load-dask-a8c2d9a8-e.default-tenant:8786  Dashboard: http://mlrun-load-dask-a8c2d9a8-e.default-tenant:8787/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [15]:
# names of the datasets currently published on the cluster
list(client.list_datasets())

['dask_key']

In [16]:
# fetch the dataset published under the `dask_key` used in the run above
client.datasets['dask_key']

Unnamed: 0_level_0,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,feat_10,feat_11,feat_12,feat_13,feat_14,feat_15,feat_16,feat_17,feat_18,feat_19,feat_20,feat_21,feat_22,feat_23,feat_24,feat_25,feat_26,feat_27,feat_28,feat_29,feat_30,feat_31,feat_32,feat_33,feat_34,feat_35,feat_36,feat_37,feat_38,feat_39,feat_40,feat_41,feat_42,feat_43,feat_44,feat_45,feat_46,feat_47,feat_48,feat_49,feat_50,feat_51,feat_52,feat_53,feat_54,feat_55,feat_56,feat_57,feat_58,feat_59,feat_60,feat_61,feat_62,feat_63,labels
npartitions=2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1
,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,int64
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
