# parquet to dask
load a parquet dataset into a dask cluster

In [1]:
import mlrun
import os
# mlrun API endpoint; the run logs in this notebook show runs being submitted to this address.
mlrun.mlconf.dbpath = 'http://mlrun-api:8080'
# Externally reachable host: the load-task log reports the remote dask scheduler
# and dashboard ports on this address.
# NOTE(review): hard-coded IP — consider reading it from the environment instead.
mlrun.mlconf.remote_host = '3.133.8.252' 

## parameters


In [2]:
# --- function identity ---
FUNCTION = 'parquet_to_dask'   # also used as the handler name and the code sub-folder
DESCRIPTION = 'load parquet dataset into a dask cluster'

# --- runtime ---
BASE_IMAGE = 'yjbds/mlrun-daskboost:dev'
JOB_KIND = 'dask'
TASK_NAME = 'user-task-parq-to-dask'

# --- code and data locations ---
CODE_BASE = 'https://raw.githubusercontent.com/yjb-ds/functions/lgbm-serving/fileutils'
SRC_PATH = '/User/mlrun/airlines/dataset-small/partitions'

# --- dataset layout: passed to the task as 'index_cols' ---
PARTITION_COLS = ['Year', 'Month']

# --- cluster sizing: passed to the task as 'shards' / 'threads_per' ---
DASK_SHARDS = 4
DASK_THREADS_PER = 4

## load and configure function

In [3]:
# CODE_BASE may be a URL, so join the parts with explicit forward slashes.
# os.path.join uses the platform separator and would produce backslashes
# (a broken URL) on Windows. A trailing '/' on CODE_BASE is tolerated.
func_py = '/'.join((CODE_BASE.rstrip('/'), FUNCTION, 'function.py'))
func_yaml = '/'.join((CODE_BASE.rstrip('/'), FUNCTION, 'function.yaml'))

**If running for the first time, create the function:**

In [4]:
# Create a new function object of kind JOB_KIND from the source at func_py
# (func_py is built from CODE_BASE, so it may be a URL rather than a local file).
parq2dask = mlrun.new_function(kind=JOB_KIND, command=func_py)

# Configure the cluster through the function spec.
dask_spec = parq2dask.spec
dask_spec.remote = True
dask_spec.replicas = 4
dask_spec.max_replicas = 4
dask_spec.service_type = 'NodePort'
dask_spec.build.base_image = BASE_IMAGE

In [5]:
# Save the function spec to func_yaml so it can be re-imported later.
# NOTE(review): the captured log shows a local target path, while CODE_BASE above
# is a URL — confirm func_yaml points to a writable location before running.
parq2dask.export(func_yaml)

[mlrun] 2020-01-30 09:38:16,699 function spec saved to path: /User/repos/functions/fileutils/parquet_to_dask/function.yaml


**otherwise load it:**

In [6]:
# Recreate the function object from the exported spec file.
parq2dask = mlrun.import_function(func_yaml)

# Apply the v3io (Iguazio data fabric) mount to the function.
parq2dask.apply(mlrun.mount_v3io())

# Deploy the function; the captured log shows this starting a remote image build.
parq2dask.deploy() # optional kwargs, as used by the summary job below: skip_deployed=True, with_mlrun=False

[mlrun] 2020-01-30 09:38:18,145 starting remote build, image: .mlrun/func-default-function-latest


True

In [7]:
# Parameters handed to the handler named by FUNCTION.
parq_to_dask_params = {
    'parquet_url': SRC_PATH,
    'index_cols': PARTITION_COLS,
    'shards': DASK_SHARDS,
    'threads_per': DASK_THREADS_PER,
    'persist': True,
    'dask_key': 'testdf1',
    'target_path': '/User/mlrun/models',
}

# Describe the task ...
parq_to_dask_task = mlrun.NewTask(
    TASK_NAME,
    handler=FUNCTION,
    params=parq_to_dask_params,
)

# ... and execute it on the dask function, keeping the run object for its outputs.
rn = parq2dask.run(parq_to_dask_task)

[mlrun] 2020-01-30 09:38:20,399 starting run user-task-parq-to-dask uid=8d780f8755984477975fb16927110af1  -> http://mlrun-api:8080
[mlrun] 2020-01-30 09:38:21,436 saving function: function, tag: latest
[mlrun] 2020-01-30 09:38:27,297 using remote dask scheduler (mlrun-function-90bd99ce-2) at: 3.133.8.252:30417
[mlrun] 2020-01-30 09:38:27,298 remote dashboard (node) port: 3.133.8.252:30164



dask
+-----------+---------+
|           | version |
+-----------+---------+
| client    | 2.9.2   |
| scheduler | 2.10.0  |
+-----------+---------+

distributed
+-----------+---------+
|           | version |
+-----------+---------+
| client    | 2.9.3   |
| scheduler | 2.10.0  |
+-----------+---------+

msgpack
+-----------+---------+
|           | version |
+-----------+---------+
| client    | 0.6.2   |
| scheduler | 0.6.1   |
+-----------+---------+


[mlrun] 2020-01-30 09:38:27,301 found cluster...
[mlrun] 2020-01-30 09:38:27,301 <Client: 'tcp://10.233.64.55:8786' processes=0 threads=0, memory=0 B>
[mlrun] 2020-01-30 09:38:27,636 log artifact scheduler at /User/mlrun/models/scheduler.json, size: None, db: Y
   Year  Month  DayofMonth  DayOfWeek  DepTime  CRSDepTime  ArrTime  \
0  1997      1           7          2   1020.0        1020   1123.0   
1  1997      1           8          3   1107.0        1020   1205.0   
2  1997      1           9          4   1020.0        1020   1130.0   
3  1997      1          10          5   1020.0        1020   1123.0   
4  1997      1          12          7   1020.0        1020   1134.0   

   CRSArrTime UniqueCarrier  FlightNum  ...  Dest  Distance TaxiIn TaxiOut  \
0        1130            WN       1293  ...   PHX     328.0    2.0     5.0   
1        1130            WN       1293  ...   PHX     328.0    3.0     9.0   
2        1130            WN       1293  ...   PHX     328.0    3.0     8.0   

uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...110af1,0,Jan 30 09:38:21,completed,user-task-parq-to-dask,kind=daskowner=adminhost=jupyter-1-6ccccd5fdf-mz2ld,,"parquet_url=/User/mlrun/airlines/dataset-small/partitionsindex_cols=['Year', 'Month']shards=4threads_per=4persist=Truedask_key=testdf1target_path=/User/mlrun/models",,scheduler


to track results use .show() or .logs() or in CLI: 
!mlrun get run 8d780f8755984477975fb16927110af1  , !mlrun logs 8d780f8755984477975fb16927110af1 
[mlrun] 2020-01-30 09:38:35,085 run executed, status=completed


In [9]:
# Path of the 'scheduler' artifact logged by the load run.
rn.outputs['scheduler']

'/User/mlrun/models/scheduler.json'

#### What's the scheduler address?

In [10]:
import json

# Read the scheduler description written by the load run. The context manager
# closes the file handle, which a bare json.load(open(...)) leaves open.
with open(rn.outputs['scheduler']) as scheduler_file:
    scheduler_info = json.load(scheduler_file)

# Display the parsed content as the cell output.
scheduler_info

{'type': 'Scheduler',
 'id': 'Scheduler-e216939d-7eaf-4946-98dc-29a0b571b1e2',
 'address': 'tcp://10.233.64.55:8786',
 'services': {},
 'workers': {}}

### create a component 'on the fly' to summarise the table

The nice thing about having a dask cluster loaded with all your data is that you can write _quick and dirty_ jobs in your notebook, in a local file, or in a GitHub repo.

In [11]:
# Create a job-kind function from the describe.py source hosted on GitHub.
summ = mlrun.new_function(
    kind='job',
    command='https://raw.githubusercontent.com/yjb-ds/functions/lgbm-serving/tests/describe.py',
)

# Build on the same base image as the dask function.
summ.spec.build.base_image = BASE_IMAGE

# (optional) keep a yaml copy of the function spec
summ.export('/User/repos/functions/tests/describe.yaml')

# Mount the Iguazio data fabric (v3io), then deploy the function.
summ.apply(mlrun.mount_v3io())
summ.deploy(with_mlrun=False, skip_deployed=True)

# Parameters for the 'table_summary' handler: which published dataset to
# summarise, how to reach the cluster, and where to write the result.
summ_params = {
    'dask_key': 'testdf1',
    'dask_client': rn.outputs['scheduler'],
    'target_path': '/User/mlrun/models',
    'name': 'table-summary.csv',
    'key': 'table-summary',
}
summ_task = mlrun.NewTask(
    'user-task-my-sum',
    handler='table_summary',
    params=summ_params,
)

# Run the task and show the artifacts it produced.
rn2 = summ.run(summ_task)

rn2.outputs

[mlrun] 2020-01-30 09:38:53,769 function spec saved to path: /User/repos/functions/tests/describe.yaml
[mlrun] 2020-01-30 09:38:53,822 starting run user-task-my-sum uid=5a52e1a6009647848d71dd211b741ee8  -> http://mlrun-api:8080
[mlrun] 2020-01-30 09:38:53,905 Job is running in the background, pod: user-task-my-sum-k5nsk
[mlrun] 2020-01-30 09:39:04,332 log artifact table-summary at /User/mlrun/models/table-summary.csv, size: None, db: Y

[mlrun] 2020-01-30 09:39:04,347 run executed, status=completed
final state: succeeded


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...741ee8,0,Jan 30 09:39:02,completed,describe,host=user-task-my-sum-k5nskkind=jobowner=admin,,dask_client=/User/mlrun/models/scheduler.jsondask_key=testdf1key=table-summaryname=table-summary.csvtarget_path=/User/mlrun/models,,table-summary


to track results use .show() or .logs() or in CLI: 
!mlrun get run 5a52e1a6009647848d71dd211b741ee8  , !mlrun logs 5a52e1a6009647848d71dd211b741ee8 
[mlrun] 2020-01-30 09:39:13,120 run executed, status=completed


{'table-summary': '/User/mlrun/models/table-summary.csv'}

## our cluster

In [17]:
from dask.distributed import Client, LocalCluster

# Connect to the running cluster through the scheduler file logged by the load run.
# The literal path matches the value of rn.outputs['scheduler'] displayed earlier,
# so this cell does not need `rn` to be in scope.
client = Client(scheduler_file='/User/mlrun/models/scheduler.json') # equivalent: Client(scheduler_file=rn.outputs['scheduler'])

In [18]:
# Fetch a dataset published on the cluster, looked up by name.
# NOTE(review): the load task was given dask_key='testdf1', but this looks up the
# literal name 'dask_key' — confirm which name the handler actually publishes under.
df = client.get_dataset('dask_key')

In [19]:
# Number of rows in the cluster-held dataframe (forces a computation).
df.shape[0].compute()

175912

In [23]:
# Bytes held on the cluster, broken down per task key (summary=False).
client.nbytes(summary=False)

{"('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 7)": 2912128,
 "('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 6)": 1159726,
 "('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 19)": 2912128,
 "('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 14)": 2912128,
 "('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 16)": 2912128,
 "('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 8)": 2912128,
 "('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 9)": 2912128,
 "('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 11)": 2912128,
 "('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 3)": 2912128,
 "('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 5)": 2912128,
 "('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 15)": 1159726,
 "('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 0)": 2912128,
 "('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 1)": 1159726,
 "('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 12)": 2912128,
 "('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 4)": 2912128,
 "('

In [24]:
# Core count per worker address; the captured output is identical to nthreads() below.
client.ncores()

{'tcp://10.233.64.56:38718': 1,
 'tcp://10.233.64.57:36325': 1,
 'tcp://10.233.64.58:38383': 1,
 'tcp://10.233.64.59:44139': 1}

In [26]:
# Thread count per worker address.
# NOTE(review): the load task requested threads_per=4, yet each worker reports 1 thread here.
client.nthreads()

{'tcp://10.233.64.56:38718': 1,
 'tcp://10.233.64.57:36325': 1,
 'tcp://10.233.64.58:38383': 1,
 'tcp://10.233.64.59:44139': 1}

In [27]:
# Tasks currently being processed on each worker; empty tuples mean nothing is running.
client.processing()

{'tcp://10.233.64.56:38718': (),
 'tcp://10.233.64.57:36325': (),
 'tcp://10.233.64.58:38383': (),
 'tcp://10.233.64.59:44139': ()}

distributed.client - ERROR - Failed to reconnect to scheduler after 3.00 seconds, closing client
distributed.client - ERROR - Failed to reconnect to scheduler after 3.00 seconds, closing client
distributed.utils - ERROR - 
Traceback (most recent call last):
  File "/conda/lib/python3.6/site-packages/distributed/utils.py", line 662, in log_errors
    yield
  File "/conda/lib/python3.6/site-packages/distributed/client.py", line 1311, in _close
    await gen.with_timeout(timedelta(seconds=2), list(coroutines))
  File "/conda/lib/python3.6/asyncio/tasks.py", line 250, in _wakeup
    future.result()
concurrent.futures._base.CancelledError
distributed.utils - ERROR - 
Traceback (most recent call last):
  File "/conda/lib/python3.6/site-packages/distributed/utils.py", line 662, in log_errors
    yield
  File "/conda/lib/python3.6/site-packages/distributed/client.py", line 1025, in _reconnect
    await self._close()
  File "/conda/lib/python3.6/site-packages/distributed/client.py", line 1311, 

____

# tests

In [20]:
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [None]:
import dask
import dask.dataframe as dd

In [None]:
# Read the partitioned dataset eagerly with pyarrow into a pandas frame.
# NOTE: this rebinds `df`, which earlier held the dataset fetched from the cluster.
dataset = pq.ParquetDataset(SRC_PATH)
df = dataset.read().to_pandas()

# Read the same location lazily as a dask dataframe.
ddf = dd.read_parquet(SRC_PATH)

In [None]:
# Rebind ddf to its persisted version.
ddf = ddf.persist()

In [None]:
# Preview the first rows of the dask dataframe.
ddf.head()

In [None]:
# Row count of the dask dataframe (forces a computation).
ddf.shape[0].compute()