# parquet to dask
load a parquet dataset into a dask cluster

In [1]:
import mlrun
import os
# Client configuration for this notebook.
# - dbpath: URL of the MLRun API service the runs are reported to.
# - remote_host: the address the run logs below report for the remote dask
#   scheduler and dashboard ports.
# Environment variables take precedence so the notebook can be pointed at a
# different cluster without editing this cell; when they are unset (or empty)
# the values originally hard-coded here are used.
mlrun.mlconf.dbpath = os.environ.get('MLRUN_DBPATH') or 'http://mlrun-api:8080'
mlrun.mlconf.remote_host = os.environ.get('MLRUN_REMOTE_HOST') or '3.133.8.252'

## parameters


In [2]:
# --- function identity ---
# FUNCTION is used both as the sub-directory under CODE_BASE and as the
# handler name given to the task.
FUNCTION = "parquet_to_dask"
DESCRIPTION = "load parquet dataset into a dask cluster"

# --- runtime ---
BASE_IMAGE = "yjbds/mlrun_dev-dask-ds:latest"  # assigned to spec.build.base_image
JOB_KIND = "dask"                              # `kind` passed to mlrun.new_function
TASK_NAME = "user-task-parq-to-dask"           # first argument of mlrun.NewTask

# --- locations ---
CODE_BASE = "/User/repos/functions/fileutils"
SRC_PATH = "/User/mlrun/airlines/dataset/partitions"

# --- dataset layout and cluster sizing ---
PARTITION_COLS = ["Year", "Month"]  # sent to the handler as `index_cols`
DASK_SHARDS = 4                     # sent to the handler as `shards`
DASK_THREADS_PER = 4                # sent to the handler as `threads_per`

## load and configure function

In [3]:
# The handler source and its exported spec sit side by side in
# CODE_BASE/FUNCTION; build both paths from the same pieces.
func_py, func_yaml = (
    os.path.join(CODE_BASE, FUNCTION, file_name)
    for file_name in ('function.py', 'function.yaml')
)

**If running for the first time, create the function:**

In [4]:
# Build a new function object of kind JOB_KIND from the local handler source.
parq2dask = mlrun.new_function(
    kind=JOB_KIND,
    command=func_py,
)

# Cluster settings. The recorded run log reports a remote dask scheduler and a
# node-port dashboard address, which matches `remote` and `service_type` here.
parq2dask.spec.remote = True
parq2dask.spec.replicas = 4
parq2dask.spec.max_replicas = 4
parq2dask.spec.service_type = 'NodePort'

# Image the function's own image is built on top of.
parq2dask.spec.build.base_image = BASE_IMAGE

In [5]:
# Save the function spec to func_yaml; the load cell further down imports
# the function back from this same path.
parq2dask.export(func_yaml)

[mlrun] 2020-01-29 01:44:34,435 function spec saved to path: /User/repos/functions/fileutils/parquet_to_dask/function.yaml


**Otherwise, load it from the exported YAML:**

In [6]:
# Rebind parq2dask to the function described by the exported YAML spec,
# replacing whatever object an earlier cell assigned to this name.
parq2dask = mlrun.import_function(func_yaml)

# Apply the modifier returned by mlrun.mount_v3io() to the function.
# NOTE(review): presumably this is what makes the /User/... paths visible
# inside the function's containers -- confirm.
parq2dask.apply(mlrun.mount_v3io())

# In the recorded run this logged "starting remote build" and returned True.
parq2dask.deploy(skip_deployed=True, with_mlrun=False)

[mlrun] 2020-01-29 01:44:47,617 starting remote build, image: .mlrun/func-default-function-latest


True

In [7]:
# Describe the run: task name, the handler to invoke, and the parameters
# forwarded to that handler.
parq_to_dask_task = mlrun.NewTask(
    TASK_NAME,
    handler=FUNCTION,
    params=dict(
        parquet_url=SRC_PATH,
        index_cols=PARTITION_COLS,
        shards=DASK_SHARDS,
        threads_per=DASK_THREADS_PER,
        persist=True,
    ),
)

# Execute the task on the dask function and keep the run object for inspection.
rn = parq2dask.run(parq_to_dask_task)

[mlrun] 2020-01-29 01:44:50,780 starting run user-task-parq-to-dask uid=7f501215960146a89447aa29cb21c2a1  -> http://mlrun-api:8080
[mlrun] 2020-01-29 01:44:51,667 saving function: function, tag: latest
[mlrun] 2020-01-29 01:44:57,827 using remote dask scheduler (mlrun-function-e71085fe-8) at: 3.133.8.252:30113
[mlrun] 2020-01-29 01:44:57,828 remote dashboard (node) port: 3.133.8.252:31064



blosc
+-----------+---------+
|           | version |
+-----------+---------+
| client    | None    |
| scheduler | 1.7.0   |
+-----------+---------+

lz4
+-----------+---------+
|           | version |
+-----------+---------+
| client    | None    |
| scheduler | 2.2.1   |
+-----------+---------+

msgpack
+-----------+---------+
|           | version |
+-----------+---------+
| client    | 0.6.2   |
| scheduler | 0.6.1   |
+-----------+---------+

tornado
+-----------+---------+
|           | version |
+-----------+---------+
| client    | 5.1.1   |
| scheduler | 6.0.3   |
+-----------+---------+


   Year  Month  DayofMonth  DayOfWeek  DepTime  CRSDepTime  ArrTime  \
0  1996     12          10          2    932.0         935   1112.0   
1  1996     12          11          3    945.0         935   1145.0   
2  1996     12           7          6    730.0         730    940.0   
3  1996     12           1          7   2357.0        2005    212.0   
4  1996     12           2          1   2006.0        2005   2206.0   

   CRSArrTime UniqueCarrier  FlightNum  ...  Dest  Distance TaxiIn TaxiOut  \
0        1140            CO        661  ...   CAE     602.0    4.0    10.0   
1        1140            CO        661  ...   CAE     602.0    5.0    18.0   
2         937            CO        678  ...   CAE     602.0    5.0    20.0   
3        2210            CO        695  ...   CAE     602.0    4.0    25.0   
4        2210            CO        695  ...   CAE     602.0    3.0    26.0   

   Cancelled  CarrierDelay  WeatherDelay  NASDelay  SecurityDelay  \
0          0           NaN         

uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...21c2a1,0,Jan 29 01:44:51,completed,user-task-parq-to-dask,kind=daskowner=adminhost=jupyter-1-6ccccd5fdf-mz2ld,,"parquet_url=/User/mlrun/airlines/dataset/partitionsindex_cols=['Year', 'Month']shards=4threads_per=4persist=True",,


to track results use .show() or .logs() or in CLI: 
!mlrun get run 7f501215960146a89447aa29cb21c2a1  , !mlrun logs 7f501215960146a89447aa29cb21c2a1 
[mlrun] 2020-01-29 01:47:11,004 run executed, status=completed


In [9]:
# Outputs recorded on the run object; the saved output of this cell is an
# empty dict.
rn.outputs

{}

In [10]:
# Second function of the same kind, built from the describe.py source in the
# tests directory.
summ = mlrun.new_function(
    kind=JOB_KIND,
    command='/User/repos/functions/tests/describe.py',
)

In [11]:
# Same cluster settings as the ones assigned to parq2dask.spec earlier in the
# notebook: remote scheduler, 4 replicas (also the maximum), NodePort service,
# and the shared BASE_IMAGE as the build base.
summ.spec.remote = True
summ.spec.replicas = 4
summ.spec.max_replicas = 4
summ.spec.service_type = 'NodePort'
summ.spec.build.base_image = BASE_IMAGE

In [12]:
# Save the function spec next to its source file (the recorded log confirms
# the spec was written to this path).
summ.export('/User/repos/functions/tests/describe.yaml')

[mlrun] 2020-01-29 01:47:51,451 function spec saved to path: /User/repos/functions/tests/describe.yaml


In [13]:
# Apply the modifier returned by mlrun.mount_v3io() to the function.
# NOTE(review): presumably needed so the /User/... paths resolve inside the
# function's containers -- confirm.
summ.apply(mlrun.mount_v3io())

# In the recorded run this logged "starting remote build" and returned True.
summ.deploy(skip_deployed=True, with_mlrun=False)

[mlrun] 2020-01-29 01:47:56,228 starting remote build, image: .mlrun/func-default-describe-latest


True

In [14]:
# Describe the summary run: task name, handler, and handler parameters.
#
# NOTE(review): the saved output of this cell ends with
#   RunError: "Dataset 'my_dask_dataframe' not found"
# The run logs show this function using scheduler "mlrun-describe-..." while
# the parquet_to_dask run used "mlrun-function-..."; confirm that `data_key`
# names a dataset that exists on the cluster this function connects to.
summ_task = mlrun.NewTask(
    'user-task-my-sum',
    handler='table_summary',
    params=dict(
        data_key='my_dask_dataframe',
        target_path='/User/mlrun/models',
        name='table-summary.csv',
        key='table-summary',
    ),
)

# Execute the task and keep the run object for inspection.
rn2 = summ.run(summ_task)

[mlrun] 2020-01-29 01:47:58,445 starting run user-task-my-sum uid=f46e35077e114018836ac247b025e8ae  -> http://mlrun-api:8080
[mlrun] 2020-01-29 01:47:58,559 saving function: describe, tag: latest
[mlrun] 2020-01-29 01:48:07,794 using remote dask scheduler (mlrun-describe-82c5acd0-e) at: 3.133.8.252:30018
[mlrun] 2020-01-29 01:48:07,795 remote dashboard (node) port: 3.133.8.252:32596
[mlrun] 2020-01-29 01:48:07,862 exec error - "Dataset 'my_dask_dataframe' not found"



blosc
+-----------+---------+
|           | version |
+-----------+---------+
| client    | None    |
| scheduler | 1.7.0   |
+-----------+---------+

lz4
+-----------+---------+
|           | version |
+-----------+---------+
| client    | None    |
| scheduler | 2.2.1   |
+-----------+---------+

msgpack
+-----------+---------+
|           | version |
+-----------+---------+
| client    | 0.6.2   |
| scheduler | 0.6.1   |
+-----------+---------+

tornado
+-----------+---------+
|           | version |
+-----------+---------+
| client    | 5.1.1   |
| scheduler | 6.0.3   |
+-----------+---------+
"Dataset 'my_dask_dataframe' not found"


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...25e8ae,0,Jan 29 01:47:58,error,user-task-my-sum,host=jupyter-1-6ccccd5fdf-mz2ldkind=daskowner=admin,,data_key=my_dask_dataframekey=table-summaryname=table-summary.csvtarget_path=/User/mlrun/models,,


to track results use .show() or .logs() or in CLI: 
!mlrun get run f46e35077e114018836ac247b025e8ae  , !mlrun logs f46e35077e114018836ac247b025e8ae 
[mlrun] 2020-01-29 01:48:07,933 run executed, status=error


RunError: "Dataset 'my_dask_dataframe' not found"

In [None]:
# NOTE(review): the saved output of the cell that assigns rn2 ends in a
# RunError; when that happens rn2 is never bound and this cell fails with a
# NameError on a fresh kernel.
rn2.outputs

____

# tests

In [None]:
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [None]:
import dask
import dask.dataframe as dd

In [None]:
# Eager path: read the partitioned parquet directory with pyarrow and convert
# the result to a single pandas DataFrame held in this kernel's memory.
dataset = pq.ParquetDataset(SRC_PATH)
df = dataset.read().to_pandas()

# Dask path: build a dask dataframe over the same directory.
ddf = dd.read_parquet(SRC_PATH)

In [None]:
# Rebind ddf to the collection returned by persist().
ddf = ddf.persist()

In [None]:
# Show the first rows as a quick sanity check of the loaded data.
ddf.head()