# load dask cluster with data
Load a parquet or csv dataset into an existing dask cluster.

In [1]:
# nuclio: ignore
import nuclio

In [2]:
# Copyright 2018 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import json
import numpy as np
import pandas as pd

import dask
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster

from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem

from typing import List, Optional

def load_dask(
    context: MLClientCtx,
    src_data: str,
    dask_key: str = "dask_key",
    inc_cols: Optional[List[str]] = None,
    index_cols: Optional[List[str]] = None,
    dask_persist: bool = True,
    refresh_data: bool = True,
    scheduler_key: str = "scheduler"
) -> None:
    """Load dataset into an existing dask cluster

    dask jobs define the dask client parameters at the job level, this method
    will raise an error if no client is detected.

    :param context:         the function context, expected to carry a `dask_client`
    :param src_data:        url of the data file or partitioned dataset; it is
                            converted with `str()` and read as parquet when it is a
                            local directory or ends with "pq"/"parquet", or as csv
                            when it ends with "csv"
    :param dask_key:        destination key of the published dataset on the dask cluster
    :param inc_cols:        include only these columns (applied at read time)
    :param index_cols:      list of index column names; accepted but currently
                            NOT applied by this function
    :param dask_persist:    (True) should the data be persisted (through the
                            `client.persist` op); publishing and scheduler-file
                            logging only happen when this is True
    :param refresh_data:    (True) replace an existing dataset published under
                            `dask_key`.  Set to False to raise an error instead
                            when the key already exists on the cluster.
    :param scheduler_key:   (scheduler) name of the scheduler json file, also
                            logged as an artifact under this key

    :raises Exception:      when the context has no `dask_client`
    :raises ValueError:     when `src_data` is not a recognised parquet/csv source,
                            or when `dask_key` already exists and `refresh_data`
                            is False
    """
    if not hasattr(context, "dask_client"):
        raise Exception("a dask client was not found in the execution context")
    dask_client = context.dask_client
    print(dask_client)

    src_data = str(src_data)

    # decide on the reader first so an unsupported source fails with a clear
    # message instead of an unbound `df` further down
    if os.path.isdir(src_data) or src_data.endswith(("pq", "parquet")):
        # only pass a column selection when one was requested
        read_kwargs = {"columns": inc_cols} if inc_cols else {}
        df = dd.read_parquet(src_data, **read_kwargs)
    elif src_data.endswith("csv"):
        read_kwargs = {"usecols": inc_cols} if inc_cols else {}
        df = dd.read_csv(src_data, **read_kwargs)
    else:
        raise ValueError(
            f"unsupported source data format: {src_data} "
            "(expected a parquet directory/file or a csv file)"
        )

    if dask_persist:
        key_exists = dask_key in dask_client.list_datasets()
        if key_exists and not refresh_data:
            # fail before loading anything onto the cluster
            raise ValueError(
                f"dataset '{dask_key}' already exists on the dask cluster, "
                "set refresh_data=True to replace it"
            )

        df = dask_client.persist(df)
        if key_exists:
            dask_client.unpublish_dataset(dask_key)
        # publish under the *value* of dask_key; `publish_dataset(dask_key=df)`
        # would always use the literal name "dask_key"
        dask_client.publish_dataset(**{dask_key: df})
        # keep the context pointing at the client used for publishing
        context.dask_client = dask_client

        # share the scheduler
        filepath = os.path.join(context.artifact_path, scheduler_key+".json")
        dask_client.write_scheduler_file(filepath)
        # NOTE(review): local_path is relative while the file is written under
        # context.artifact_path - confirm mlrun resolves it against that path
        context.log_artifact(scheduler_key, local_path=scheduler_key+".json")

  import pandas.util.testing as tm


In [3]:
# nuclio: end-code

### save

In [4]:
from mlrun import code_to_function

# turn this notebook's code into a dask job function object
fn = code_to_function(
    "load_dask",
    kind="dask",
    with_doc=True,
    handler=load_dask,
)

# metadata (for templates and reuse)
fn.metadata.labels = {"author": "yjb"}
fn.metadata.categories = ["fileutils", "distributed"]
fn.spec.description = "load dask cluster with data"
fn.spec.default_handler = "load_dask"

# runtime spec
fn.spec.image = "mlrun/ml-models"
fn.spec.remote = True
fn.spec.service_type = "NodePort"
fn.spec.replicas = 4
fn.spec.max_replicas = 4

# save the function, then write its spec to function.yaml
fn.save()
fn.export("function.yaml")

[mlrun] 2020-03-31 17:15:42,837 saving function: load-dask, tag: latest
[mlrun] 2020-03-31 17:15:42,866 function spec saved to path: function.yaml


<mlrun.runtimes.daskjob.DaskCluster at 0x7fd14913fd30>

### test

In [5]:
from mlrun import import_function, mount_v3io, NewTask

func = import_function("hub://load_dask").apply(mount_v3io())

# parameter names must match the keyword arguments of `load_dask`
# (the persist flag is `dask_persist`, not `persist`)
task_params = {
    "name":        "tasks load dask cluster with data",
    "params" : {
        "src_data"     : "/User/artifacts/breast_cancer.parquet",
        "dask_persist" : True,
        "refresh_data" : True,
        "dask_key"     : "dask_key"}}

run = func.run(NewTask(**task_params), handler=load_dask, artifact_path="/User/artifacts")

[mlrun] 2020-03-31 17:15:42,906 starting run tasks load dask cluster with data uid=be860cd26677443497d8217d84797b57  -> http://mlrun-api:8080
[mlrun] 2020-03-31 17:15:42,983 saving function: load-dask, tag: latest
[mlrun] 2020-03-31 17:15:50,009 trying dask client at: tcp://mlrun-load-dask-b0872515-9.default-tenant:8786
[mlrun] 2020-03-31 17:15:50,026 using remote dask scheduler (mlrun-load-dask-b0872515-9) at: tcp://mlrun-load-dask-b0872515-9.default-tenant:8786


<Client: 'tcp://10.200.0.69:8786' processes=0 threads=0, memory=0 B>
[mlrun] 2020-03-31 17:15:50,132 log artifact scheduler at /User/artifacts/scheduler.json, size: None, db: Y

[mlrun] 2020-03-31 17:15:50,170 run ended with state 




uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...797b57,0,Mar 31 17:15:42,completed,tasks load dask cluster with data,v3io_user=adminkind=daskowner=adminhost=jupyter-57954c4b4c-xln99,,src_data=/User/artifacts/breast_cancer.parquetpersist=Truerefresh_data=Truedask_key=dask_key,,scheduler


to track results use .show() or .logs() or in CLI: 
!mlrun get run be860cd26677443497d8217d84797b57  , !mlrun logs be860cd26677443497d8217d84797b57 
[mlrun] 2020-03-31 17:15:50,195 run executed, status=completed


In [6]:
# show the function status as a dict (scheduler address, cluster name, node ports)
func.status.to_dict()

{'scheduler_address': 'tcp://mlrun-load-dask-b0872515-9.default-tenant:8786',
 'cluster_name': 'mlrun-load-dask-b0872515-9',
 'node_ports': {'dashboard': 31805, 'scheduler': 30903}}

### our cluster

Let's connect a client in this notebook to the cluster, using the scheduler address from the function status:

In [7]:
import dask
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster

In [8]:
# connect to the scheduler address reported in the function status
scheduler_address = func.status.to_dict()['scheduler_address']
client = Client(scheduler_address)

In [9]:
# display the client summary (scheduler, dashboard, workers)
client

0,1
Client  Scheduler: tcp://mlrun-load-dask-b0872515-9.default-tenant:8786  Dashboard: http://mlrun-load-dask-b0872515-9.default-tenant:8787/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [10]:
# names of the datasets currently published on the cluster
list(client.list_datasets())

['dask_key']

In [11]:
# fetch the dataset published under 'dask_key'
client.datasets['dask_key']

Unnamed: 0_level_0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,radius error,texture error,perimeter error,area error,smoothness error,compactness error,concavity error,concave points error,symmetry error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,labels
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [12]:
# list the dask packages installed in the conda `base` environment
!conda list -n base dask

# packages in environment at /conda:
#
# Name                    Version                   Build  Channel
dask                      2.12.0                     py_0    anaconda
dask-core                 2.12.0                     py_0    anaconda
dask-glm                  0.2.0                    py36_0    anaconda
dask-ml                   1.2.0                      py_0    anaconda


In [13]:
# list the pip-installed packages and their versions
!pip list

Package                  Version            
------------------------ -------------------
absl-py                  0.9.0              
adal                     1.2.2              
aiohttp                  3.6.2              
argo-models              2.2.1a0            
asn1crypto               1.3.0              
astor                    0.8.0              
async-timeout            3.0.1              
attrs                    19.3.0             
backcall                 0.1.0              
beautifulsoup4           4.8.2              
bleach                   3.1.0              
blinker                  1.4                
blosc                    1.7.0              
bokeh                    1.4.0              
boto3                    1.12.30            
botocore                 1.15.30            
cachetools               3.1.1              
certifi                  2019.11.28         
cffi                     1.14.0             
chardet                  3.0.4              
Click     