# load dask cluster with data
Load a parquet or CSV dataset into an existing dask cluster.

In [1]:
# nuclio: ignore
import nuclio

In [3]:
# Copyright 2018 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import json
import numpy as np
import pandas as pd

import dask
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster

from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem

from typing import List, Optional

def load_dask(
    context: MLClientCtx,
    src_data: str,
    dask_key: str = "daskdf",
    inc_cols: Optional[List[str]] = None,
    index_cols: Optional[List[str]] = None,
    dask_persist: bool = True
) -> None:
    """Load a dataset into an existing dask cluster.

    dask jobs define the dask client parameters at the job level, so this
    function does not start a cluster: it raises an error if no client is
    attached to the execution context.

    :param context:         the function context; must expose ``dask_client``
    :param src_data:        url of the data file or partitioned dataset as either
                            artifact DataItem, string, or path object; it is
                            converted with ``str()``. A directory or a name ending
                            in "pq"/"parquet" is read as parquet, a name ending in
                            "csv" is read as csv
    :param dask_key:        name under which the dataframe is published on the
                            dask cluster
    :param inc_cols:        include only these columns (very fast); ``None`` or an
                            empty list loads every column
    :param index_cols:      list of index column names. Accepted for interface
                            compatibility but currently NOT applied to the
                            dataframe
    :param dask_persist:    when True, persist the dataframe on the cluster,
                            publish it under ``dask_key`` and log the scheduler
                            file as the "scheduler" artifact

    :raises Exception:      if the context has no ``dask_client`` attribute
    :raises ValueError:     if ``src_data`` is neither a directory/parquet nor csv
    """
    if not hasattr(context, "dask_client"):
        raise Exception("a dask client was not found in the execution context")
    dask_client = context.dask_client
    context.logger.info(dask_client)

    # need utility: many functions use this if pq or csv, but could be dir with
    # pq partitioned dataset
    src_data = str(src_data)
    # treat an empty column list the same as "no restriction"
    columns = inc_cols or None
    if os.path.isdir(src_data) or src_data.endswith(("pq", "parquet")):
        df = dd.read_parquet(src_data, columns=columns)
    elif src_data.endswith("csv"):
        # only forward usecols when a subset was actually requested
        csv_kwargs = {"usecols": columns} if columns else {}
        df = dd.read_csv(src_data, **csv_kwargs)
    else:
        # previously fell through and failed later with an unbound `df`
        raise ValueError(
            f"unsupported data source (expected a parquet dataset or csv file): {src_data}"
        )

    if dask_persist:
        df = dask_client.persist(df)
        # expand the key so the dataset is published under the *value* of
        # dask_key, not under the literal name "dask_key"
        dask_client.publish_dataset(**{dask_key: df})

        # share the scheduler: write the connection file next to the run's
        # artifacts so other clients can attach to the same cluster
        artifact_dir = getattr(context, "artifact_path", None) or ""
        filepath = os.path.join(artifact_dir, "scheduler.json")
        dask_client.write_scheduler_file(filepath)
        context.log_artifact("scheduler", target_path=filepath)

        print(df.head())


In [4]:
# nuclio: end-code

### save

In [11]:
from mlrun import code_to_function
# create job function object from notebook code
# (the handler is given by name: code_to_function expects a string, not the
# function object itself)
fn = code_to_function("load_dask", kind="job", with_doc=True,
                      handler="load_dask", image="mlrun/ml-dask:0.4.6")

# add metadata (for templates and reuse)
fn.spec.default_handler = "load_dask"
fn.spec.description = "load dask cluster with data"
fn.metadata.categories = ["fileutils", "distributed"]
fn.spec.image_pull_policy = "Always"
fn.metadata.labels = {"author": "yjb"}
# NOTE(review): remote / replicas / max_replicas / service_type look like
# dask-cluster settings while this function is created with kind="job" --
# confirm the job runtime actually honours them
fn.spec.remote = True
fn.spec.replicas = 4
fn.spec.max_replicas = 4
fn.spec.service_type = 'NodePort'
fn.spec.build.image = "mlrun/ml-dask:0.4.6"

# store the function object, then write its spec to function.yaml
fn.save()
fn.export("function.yaml")

[mlrun] 2020-03-28 15:12:24,588 saving function: load-dask, tag: latest
[mlrun] 2020-03-28 15:12:24,637 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7ffa41303f90>

### test

In [12]:
from mlrun import import_function, mount_v3io, NewTask

# NOTE(review): this imports the published hub version of load_dask, not the
# `fn` object built in the save cell -- confirm that is the version under test
func = import_function("hub://load_dask").apply(mount_v3io())

# param keys must match the handler's argument names; the persist flag is
# called `dask_persist` in load_dask's signature
task_params = {
    "name":        "tasks load dask cluster with data",
    "params" : {
        "src_data"    : "/User/artifacts/higgs.parquet",
        "dask_persist": True,
        "dask_key"    : "dask_df"}}

run = func.run(NewTask(**task_params), artifact_path="/User/artifacts")

[mlrun] 2020-03-28 15:12:24,691 starting run tasks load dask cluster with data uid=ef958038bf384b3bb95e13603f76833d  -> http://mlrun-api:8080
[mlrun] 2020-03-28 15:12:24,814 Job is running in the background, pod: tasks-load-dask-cluster-with-data-454lw
[mlrun] 2020-03-28 15:14:33,847 Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/mlrun/runtimes/local.py", line 183, in exec_from_params
    val = handler(*args_list)
  File "main.py", line 41, in load_dask
    raise Exception("a dask client was not found in the execution context")
Exception: a dask client was not found in the execution context


[mlrun] 2020-03-28 15:14:33,856 exec error - a dask client was not found in the execution context
[mlrun] 2020-03-28 15:14:33,882 run executed, status=error
a dask client was not found in the execution context
runtime error: a dask client was not found in the execution context
final state: failed


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...76833d,0,Mar 28 15:14:33,error,tasks load dask cluster with data,host=tasks-load-dask-cluster-with-data-454lwkind=jobowner=admin,,dask_key=dask_dfpersist=Truesrc_data=/User/artifacts/higgs.parquet,,


to track results use .show() or .logs() or in CLI: 
!mlrun get run ef958038bf384b3bb95e13603f76833d  , !mlrun logs ef958038bf384b3bb95e13603f76833d 
[mlrun] 2020-03-28 15:14:35,055 run executed, status=error
runtime error: a dask client was not found in the execution context


RunError: a dask client was not found in the execution context

### our cluster

Let's connect to the cluster from this notebook using the scheduler file:

In [None]:
import dask
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster

In [None]:
%%time 
# attach to the running cluster through the scheduler file under /User/artifacts
client = Client(scheduler_file="/User/artifacts/scheduler.json")

In [None]:
%%time 
# fetch the dataset published on the cluster under the name "dask_df"
# (the same value passed as dask_key in the test run above)
df = client.get_dataset("dask_df")

In [None]:
%%time 
# the row count is lazy here, so compute() is needed to evaluate it
df.shape[0].compute()

In [None]:
# report the bytes held on the cluster
# NOTE(review): summary=False presumably lists sizes per key rather than grouped -- confirm
client.nbytes(summary=False)