# parquet to dask
Load a parquet dataset into a Dask cluster.

In [1]:
import os

import mlrun

# Point the mlrun client at the API service, and set the host that is
# reported for the remote dask scheduler and dashboard in the run log.
mlrun.mlconf.dbpath = 'http://mlrun-api:8080'
mlrun.mlconf.remote_host = '3.20.134.26'

## parameters


In [2]:
# Function identity: FUNCTION is both the sub-directory under CODE_BASE that
# holds function.py / function.yaml and the handler name given to the task.
FUNCTION           = 'parquet_to_dask'
DESCRIPTION        = 'load parquet dataset into a dask cluster'

# Image and runtime kind used when creating the function; name of the task.
BASE_IMAGE         = 'yjbds/mlrun-daskboost:dev'
JOB_KIND           = 'dask'
TASK_NAME          = 'user-task-parq-to-dask'

# Root directory containing the function sources.
CODE_BASE          = '/User/repos/functions/fileutils'

# Parquet dataset to load (passed to the task as 'parquet_url').
SRC_PATH           = '/User/mlrun/airlines/dataset-small/partitions'

# Columns passed to the task as 'index_cols'. None is the value in effect;
# use ['Year', 'Month'] instead to pass the partition columns.
PARTITION_COLS     = None

# Passed to the task as 'shards' and 'threads_per'.
DASK_SHARDS        = 4
DASK_THREADS_PER   = 4

## load and configure function

In [3]:
# Paths of the function's source file and its exported spec, both located
# under <CODE_BASE>/<FUNCTION>/.
func_py, func_yaml = (
    os.path.join(CODE_BASE, FUNCTION, filename)
    for filename in ('function.py', 'function.yaml')
)

**If running for the first time, create the function:**

In [4]:
# Build the function object from the local Python source file.
parq2dask = mlrun.new_function(kind=JOB_KIND, command=func_py)

# Image settings.
parq2dask.spec.build.image = BASE_IMAGE
parq2dask.spec.image_pull_policy = 'Always'

# Cluster settings: remote flag, service type and replica counts.
parq2dask.spec.remote = True
parq2dask.spec.service_type = 'NodePort'
parq2dask.spec.replicas = 4
parq2dask.spec.max_replicas = 4

In [5]:
# Save the function spec to the YAML file at func_yaml.
parq2dask.export(func_yaml)

[mlrun] 2020-02-03 12:21:40,341 function spec saved to path: /User/repos/functions/fileutils/parquet_to_dask/function.yaml


**Otherwise, load it:**

In [6]:
# Alternative to creating the function from source: uncomment to import the
# previously exported spec instead.
# parq2dask = mlrun.import_function(func_yaml)

# Apply the v3io mount to the function.
parq2dask.apply(mlrun.mount_v3io())
# Optional image override, currently disabled:
# parq2dask.spec.image = 'yjbds/mlrun-daskboost:dev'

# Deploy the function; as the last expression of the cell, the value returned
# by deploy() is displayed ('ready' in the recorded run).
parq2dask.deploy(skip_deployed=True, with_mlrun=False)

'ready'

In [8]:
# Describe the task: the handler to invoke and the parameters handed to it.
parq_to_dask_task = mlrun.NewTask(
    TASK_NAME,
    handler=FUNCTION,
    params={
        'parquet_url': SRC_PATH,
        'index_cols': PARTITION_COLS,
        'shards': DASK_SHARDS,
        'threads_per': DASK_THREADS_PER,
        'persist': True,
        'dask_key': 'testdf1',
        'target_path': '/User/mlrun/models',
    },
)

# Execute the task on the dask function and keep the resulting run object.
rn = parq2dask.run(parq_to_dask_task)

[mlrun] 2020-02-03 12:21:44,208 starting run user-task-parq-to-dask uid=7f492c3993e142c8ab659a02bd79d843  -> http://mlrun-api:8080
[mlrun] 2020-02-03 12:21:45,121 using remote dask scheduler (mlrun-function-a1926d9a-b) at: 3.20.134.26:31014
[mlrun] 2020-02-03 12:21:45,122 remote dashboard (node) port: 3.20.134.26:30360



blosc
+--------------------------+---------+
|                          | version |
+--------------------------+---------+
| client                   | None    |
| scheduler                | 1.8.3   |
| tcp://10.233.64.52:37679 | 1.8.3   |
| tcp://10.233.64.53:43466 | 1.8.3   |
| tcp://10.233.64.54:33816 | 1.8.3   |
| tcp://10.233.64.55:36283 | 1.8.3   |
+--------------------------+---------+

cloudpickle
+--------------------------+---------+
|                          | version |
+--------------------------+---------+
| client                   | 1.1.1   |
| scheduler                | 1.2.2   |
| tcp://10.233.64.52:37679 | 1.2.2   |
| tcp://10.233.64.53:43466 | 1.2.2   |
| tcp://10.233.64.54:33816 | 1.2.2   |
| tcp://10.233.64.55:36283 | 1.2.2   |
+--------------------------+---------+

lz4
+--------------------------+---------+
|                          | version |
+--------------------------+---------+
| client                   | None    |
| scheduler                | 3.0.2   |


[mlrun] 2020-02-03 12:21:45,272 exec error - 'Dataset dask_key already exists'


'Dataset dask_key already exists'


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...79d843,0,Feb 03 12:21:45,error,user-task-parq-to-dask,host=jupyter-1-7db95b7786-v8fcfkind=daskowner=admin,,dask_key=testdf1index_cols=Noneparquet_url=/User/mlrun/airlines/dataset-small/partitionspersist=Trueshards=4target_path=/User/mlrun/modelsthreads_per=4,,


to track results use .show() or .logs() or in CLI: 
!mlrun get run 7f492c3993e142c8ab659a02bd79d843  , !mlrun logs 7f492c3993e142c8ab659a02bd79d843 
[mlrun] 2020-02-03 12:21:45,331 run executed, status=error


RunError: 'Dataset dask_key already exists'

In [9]:
# Look up the 'scheduler' entry of the run outputs. This needs `rn` from the
# run cell: if that run raised, `rn` was never assigned and this cell fails
# with NameError.
rn.outputs['scheduler']

NameError: name 'rn' is not defined

#### What's the scheduler address?

In [None]:
import json

# rn.outputs['scheduler'] is opened as a file and parsed as JSON. The context
# manager closes the handle as soon as parsing is done instead of leaving it
# open until garbage collection.
with open(rn.outputs['scheduler']) as scheduler_file:
    scheduler_info = json.load(scheduler_file)

# Last expression of the cell, so the parsed content is displayed.
scheduler_info

### our cluster

Let's use the scheduler file to connect a client in this notebook to the cluster:

In [11]:
import dask
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster

In [12]:
%%time
# Connect to the cluster through the scheduler file at a fixed path.
# Alternative that uses the run output instead of the hard-coded path:
# Client(scheduler_file=rn.outputs['scheduler'])
client = Client(scheduler_file='/User/mlrun/models/scheduler.json')

CPU times: user 6.65 ms, sys: 2.11 ms, total: 8.76 ms
Wall time: 21.8 ms



blosc
+--------------------------+---------+
|                          | version |
+--------------------------+---------+
| client                   | None    |
| scheduler                | 1.8.3   |
| tcp://10.233.64.52:37679 | 1.8.3   |
| tcp://10.233.64.53:43466 | 1.8.3   |
| tcp://10.233.64.54:33816 | 1.8.3   |
| tcp://10.233.64.55:36283 | 1.8.3   |
+--------------------------+---------+

cloudpickle
+--------------------------+---------+
|                          | version |
+--------------------------+---------+
| client                   | 1.1.1   |
| scheduler                | 1.2.2   |
| tcp://10.233.64.52:37679 | 1.2.2   |
| tcp://10.233.64.53:43466 | 1.2.2   |
| tcp://10.233.64.54:33816 | 1.2.2   |
| tcp://10.233.64.55:36283 | 1.2.2   |
+--------------------------+---------+

lz4
+--------------------------+---------+
|                          | version |
+--------------------------+---------+
| client                   | None    |
| scheduler                | 3.0.2   |


In [13]:
%%time
# Fetch the dataset registered under the literal name 'dask_key'.
# NOTE(review): the load task passes dask_key='testdf1', yet its run log
# reports "Dataset dask_key already exists" - the dataset appears to be
# published under the parameter name rather than its value; confirm in
# function.py.
df = client.get_dataset('dask_key')

CPU times: user 8.44 ms, sys: 2.39 ms, total: 10.8 ms
Wall time: 20.4 ms


In [14]:
%%time
# Compute and display the number of rows in the dataset.
df.shape[0].compute()

CPU times: user 7.61 ms, sys: 131 µs, total: 7.75 ms
Wall time: 33.9 ms


43978

In [15]:
# Display byte sizes per key; with summary=False the recorded output has one
# entry per 'read-parquet' partition key.
client.nbytes(summary=False)

{"('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 1)": 2912128,
 "('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 2)": 2912128,
 "('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 0)": 2912128,
 "('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 3)": 1159726,
 "('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 4)": 2912128}

### create a component 'on the fly' to summarise the table

The nice thing about having a dask cluster loaded with all your data is that you can write _quick and dirty_ jobs in your notebook, in a local file, or in a GitHub repo.

In [17]:
# Create a 'job' function from a source file in the local directory.
summ = mlrun.new_function(
    kind='job',
    command='/User/repos/functions/tests/describe.py',
)

# Build from the shared base image.
summ.spec.build.image = BASE_IMAGE

# Optionally save the function spec as YAML.
summ.export('/User/repos/functions/tests/describe.yaml')

# Apply the v3io mount (Iguazio data fabric); as the last expression of the
# cell, the returned runtime object is displayed.
summ.apply(mlrun.mount_v3io())

[mlrun] 2020-02-03 12:23:41,371 function spec saved to path: /User/repos/functions/tests/describe.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f351afd7e48>

In [18]:
# Deploy the function; the value returned by deploy() is displayed
# (True in the recorded run, after a remote build was started).
summ.deploy(skip_deployed=True, with_mlrun=False)

[mlrun] 2020-02-03 12:23:43,952 starting remote build, image: yjbds/mlrun-daskboost:dev


True

In [20]:
# Describe the summary task: handler name plus the parameters handed to it.
summ_task = mlrun.NewTask(
    'user-task-my-sum',
    handler='table_summary',
    params={
        'dask_key': 'testdf1',
        'dask_client': '/User/mlrun/models/scheduler.json',
        'target_path': '/User/mlrun/models',
        'name': 'table-summary.csv',
        'key': 'table-summary',
    },
)

# Execute the task and keep the run object.
rn2 = summ.run(summ_task)

# Display the outputs recorded for the run.
rn2.outputs

[mlrun] 2020-02-03 12:24:06,154 starting run user-task-my-sum uid=b0ed0394cb9842adb039d0abd75a5681  -> http://mlrun-api:8080
[mlrun] 2020-02-03 12:24:06,243 Job is running in the background, pod: user-task-my-sum-tnzrx
{'_uid': 'b0ed0394cb9842adb039d0abd75a5681', 'name': 'describe', '_iteration': 0, '_project': '', '_tag': '', '_secrets_manager': <mlrun.secrets.SecretsStore object at 0x7efdbe8867d0>, '_rundb': HTTPRunDB('http://10.233.35.61:8080'), '_tmpfile': '/tmp/tmp6b8ae_21.json', '_logger': <Logger mlrun (INFO)>, '_log_level': 'info', '_matrics_db': None, '_autocommit': False, '_labels': {'kind': 'job', 'owner': 'admin', 'host': 'user-task-my-sum-tnzrx'}, '_annotations': {}, '_function': 'default/describe:6edf099494c35f7f800b6c1082256aed4d6d177a', '_parameters': {'dask_key': 'testdf1', 'dask_client': '/User/mlrun/models/scheduler.json', 'target_path': '/User/mlrun/models', 'name': 'table-summary.csv', 'key': 'table-summary'}, '_in_path': '', '_out_path': '', '_inputs': {}, '_outpu

uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...5a5681,0,Feb 03 12:24:14,completed,describe,host=user-task-my-sum-tnzrxkind=jobowner=admin,,dask_client=/User/mlrun/models/scheduler.jsondask_key=testdf1key=table-summaryname=table-summary.csvtarget_path=/User/mlrun/models,,table-summary


to track results use .show() or .logs() or in CLI: 
!mlrun get run b0ed0394cb9842adb039d0abd75a5681  , !mlrun logs b0ed0394cb9842adb039d0abd75a5681 
[mlrun] 2020-02-03 12:24:25,420 run executed, status=completed


{'table-summary': '/User/mlrun/models/table-summary.csv'}

distributed.client - ERROR - Failed to reconnect to scheduler after 3.00 seconds, closing client
distributed.utils - ERROR - 
Traceback (most recent call last):
  File "/conda/lib/python3.6/site-packages/distributed/utils.py", line 662, in log_errors
    yield
  File "/conda/lib/python3.6/site-packages/distributed/client.py", line 1306, in _close
    await asyncio.wait_for(asyncio.gather(*coroutines), 2)
  File "/conda/lib/python3.6/asyncio/tasks.py", line 351, in wait_for
    yield from waiter
  File "/conda/lib/python3.6/asyncio/futures.py", line 327, in __iter__
    yield self  # This tells Task to wait for completion.
  File "/conda/lib/python3.6/asyncio/tasks.py", line 250, in _wakeup
    future.result()
  File "/conda/lib/python3.6/asyncio/futures.py", line 238, in result
    raise CancelledError
concurrent.futures._base.CancelledError
distributed.utils - ERROR - 
Traceback (most recent call last):
  File "/conda/lib/python3.6/site-packages/distributed/utils.py", line 662, in log

____

# tests

In [None]:
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [None]:
import dask
import dask.dataframe as dd

In [None]:
# Read the dataset at SRC_PATH through pyarrow and convert it to pandas.
dataset = pq.ParquetDataset(SRC_PATH)
df = dataset.read().to_pandas()

# Read the same path as a dask DataFrame.
ddf = dd.read_parquet(SRC_PATH)  # alternative source: SRC_PATH + '/*.parquet'

In [None]:
# Rebind ddf to the result of persist().
ddf = ddf.persist()

In [None]:
# Display the first rows of the dask DataFrame.
ddf.head()

In [None]:
# Compute and display the number of rows in the dask DataFrame.
ddf.shape[0].compute()