# parquet to dask
load a parquet dataset into a dask cluster

In [22]:
import mlrun
import os
mlrun.mlconf.dbpath = 'http://mlrun-api:8080'
mlrun.mlconf.remote_host = '3.133.8.252' 

## parameters


In [23]:
FUNCTION           = 'parquet_to_dask'
DESCRIPTION        = 'load parquet dataset into a dask cluster'

BASE_IMAGE         = 'yjbds/mlrun-daskboost:dev'
JOB_KIND           = 'job'
TASK_NAME          = 'user-task-parq-to-dask'

CODE_BASE          = '/User/repos/functions/fileutils'

SRC_PATH           = '/User/mlrun/airlines/dataset-small/partitions'

PARTITION_COLS     = ['Year', 'Month']

DASK_SHARDS = 4
DASK_THREADS_PER = 4

## load and configure function

In [24]:
func_py = os.path.join(CODE_BASE, FUNCTION, 'function.py')
func_yaml = os.path.join(CODE_BASE, FUNCTION, 'function.yaml')

**If run the first time, create the function:**

In [25]:
# load function from a local Python file
parq2dask = mlrun.new_function(command=func_py, kind=JOB_KIND)

parq2dask.spec.remote = True
parq2dask.spec.replicas = 4 
parq2dask.spec.max_replicas = 4
parq2dask.spec.service_type = 'NodePort'
parq2dask.spec.build.base_image = BASE_IMAGE

In [26]:
parq2dask.export(func_yaml)

[mlrun] 2020-01-30 00:27:38,400 function spec saved to path: /User/repos/functions/fileutils/parquet_to_dask/function.yaml


**otherwise load it:**

In [27]:
parq2dask = mlrun.import_function(func_yaml)

parq2dask.apply(mlrun.mount_v3io())

parq2dask.deploy(skip_deployed=True, with_mlrun=False)

[mlrun] 2020-01-30 00:27:38,522 starting remote build, image: .mlrun/func-default-function-latest


True

In [28]:
# create and run the task
parq_to_dask_task = mlrun.NewTask(
    TASK_NAME, 
    handler=FUNCTION,  
    params={
        'parquet_url': SRC_PATH,
        'index_cols' : PARTITION_COLS,
        'shards'     : DASK_SHARDS,
        'threads_per': DASK_THREADS_PER,
        'persist'    : True,
        'dask_key'   : 'testdf1',
        'target_path': '/User/mlrun/models'})
# run
rn = parq2dask.run(parq_to_dask_task)

[mlrun] 2020-01-30 00:27:38,588 starting run user-task-parq-to-dask uid=56bcb9b52e1f4a1a81fc51f8d4e15d8e  -> http://mlrun-api:8080
[mlrun] 2020-01-30 00:27:38,693 Job is running in the background, pod: user-task-parq-to-dask-c9fmj
  json = yaml.load(f)
[mlrun] 2020-01-30 00:27:46,724 starting new cluster...
[mlrun] 2020-01-30 00:27:48,599 <Client: 'tcp://127.0.0.1:38329' processes=4 threads=16, memory=66.45 GB>
[mlrun] 2020-01-30 00:27:48,875 log artifact scheduler at /User/mlrun/models/scheduler.json, size: None, db: Y
   Year  Month  DayofMonth  ...  NASDelay  SecurityDelay  LateAircraftDelay
0  2007      1           1  ...       0.0            0.0               67.0
1  2007      1           1  ...       0.0            0.0               17.0
2  2007      1           1  ...       0.0            0.0                0.0
3  2007      1           1  ...       0.0            0.0                0.0
4  2007      1           1  ...       0.0            0.0                0.0

[5 rows x 23 colu

uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...e15d8e,0,Jan 30 00:27:46,completed,function,host=user-task-parq-to-dask-c9fmjkind=jobowner=admin,,"dask_key=testdf1index_cols=['Year', 'Month']parquet_url=/User/mlrun/airlines/dataset-small/partitionspersist=Trueshards=4target_path=/User/mlrun/modelsthreads_per=4",,scheduler


to track results use .show() or .logs() or in CLI: 
!mlrun get run 56bcb9b52e1f4a1a81fc51f8d4e15d8e  , !mlrun logs 56bcb9b52e1f4a1a81fc51f8d4e15d8e 
[mlrun] 2020-01-30 00:27:53,903 run executed, status=completed


In [29]:
rn.outputs['scheduler']

'/User/mlrun/models/scheduler.json'

#### What's the scheduler address?

In [30]:
import json
json.load(open(rn.outputs['scheduler']))

{'type': 'Scheduler',
 'id': 'Scheduler-0a0672a0-1532-42cb-a151-3aa3da211991',
 'address': 'tcp://127.0.0.1:38329',
 'services': {},
 'workers': {'tcp://127.0.0.1:32950': {'type': 'Worker',
   'id': 3,
   'host': '127.0.0.1',
   'resources': {},
   'local_directory': '/dask-worker-space/worker-db2wzpjl',
   'name': 3,
   'nthreads': 4,
   'memory_limit': 16612705280,
   'last_seen': 1580344068.5361557,
   'services': {},
   'metrics': {'cpu': 0.0,
    'memory': 154923008,
    'time': 1580344068.530411,
    'read_bytes': 0.0,
    'write_bytes': 0.0,
    'num_fds': 22,
    'executing': 0,
    'in_memory': 0,
    'ready': 0,
    'in_flight': 0,
    'bandwidth': {'total': 100000000, 'workers': {}, 'types': {}}},
   'nanny': 'tcp://127.0.0.1:34755'},
  'tcp://127.0.0.1:33754': {'type': 'Worker',
   'id': 2,
   'host': '127.0.0.1',
   'resources': {},
   'local_directory': '/dask-worker-space/worker-3hgbwwcw',
   'name': 2,
   'nthreads': 4,
   'memory_limit': 16612705280,
   'last_seen': 15

### create a component 'on the fly' to summarise the table

The nice thing about having a dask clkuster loaded with all you rdata is that you can write _quick and dirty_ jobs either in your notebook, a local file, or a gihub repo.

In [43]:
summ = mlrun.new_function(
    command='/User/repos/functions/tests/describe.py', 
    kind='job')

In [44]:
summ.spec.build.base_image = BASE_IMAGE

In [45]:
summ.export('/User/repos/functions/tests/describe.yaml')

[mlrun] 2020-01-30 00:35:45,128 function spec saved to path: /User/repos/functions/tests/describe.yaml


In [46]:
summ.apply(mlrun.mount_v3io())

summ.deploy(skip_deployed=True, with_mlrun=False)

'ready'

In [47]:
# create and run the task
summ_task = mlrun.NewTask(
    'user-task-my-sum', 
    handler='table_summary',  
    params={
        'dask_key'   : 'testdf',
        'dask_client': '3.133.8.252:8786',
        'target_path': '/User/mlrun/models',
        'name'       : 'table-summary.csv',
        'key'        : 'table-summary'})
# run
rn2 = summ.run(summ_task)

[mlrun] 2020-01-30 00:35:45,992 starting run user-task-my-sum uid=a90a033391c9471db638448f497fa9ef  -> http://mlrun-api:8080
[mlrun] 2020-01-30 00:35:46,062 Job is running in the background, pod: user-task-my-sum-fdq27
3.133.8.252:8786
[mlrun] 2020-01-30 00:36:04,279 Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/distributed/comm/core.py", line 218, in connect
    _raise(error)
  File "/opt/conda/lib/python3.7/site-packages/distributed/comm/core.py", line 203, in _raise
    raise IOError(msg)
OSError: Timed out trying to connect to 'tcp://127.0.0.1:38329' after 10 s: in <distributed.comm.tcp.TCPConnector object at 0x7f6863d0e750>: ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/mlrun/runtimes/local.py", line 199, in exec_from_params
    val = handler(*args_list)
  File "/User/repos/functions/tes

uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...7fa9ef,0,Jan 30 00:35:54,error,describe,host=user-task-my-sum-fdq27kind=jobowner=admin,,dask_client=3.133.8.252:8786dask_key=testdfkey=table-summaryname=table-summary.csvtarget_path=/User/mlrun/models,,


to track results use .show() or .logs() or in CLI: 
!mlrun get run a90a033391c9471db638448f497fa9ef  , !mlrun logs a90a033391c9471db638448f497fa9ef 
[mlrun] 2020-01-30 00:36:08,251 run executed, status=error
runtime error: Timed out trying to connect to 'tcp://127.0.0.1:38329' after 10 s: Timed out trying to connect to 'tcp://127.0.0.1:38329' after 10 s: in <distributed.comm.tcp.TCPConnector object at 0x7f6863d0e750>: ConnectionRefusedError: [Errno 111] Connection refused


RunError: Timed out trying to connect to 'tcp://127.0.0.1:38329' after 10 s: Timed out trying to connect to 'tcp://127.0.0.1:38329' after 10 s: in <distributed.comm.tcp.TCPConnector object at 0x7f6863d0e750>: ConnectionRefusedError: [Errno 111] Connection refused

In [None]:
rn2.outputs

## our cluster

In [None]:
from dask.distributed import Client, LocalCluster

client = Client(scheduler)

In [None]:
client.datasets['testdf']

____

# tests

In [None]:
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [None]:
import dask
import dask.dataframe as dd

In [None]:
dataset = pq.ParquetDataset(os.path.join(SRC_PATH))
df = dataset.read().to_pandas()


ddf = dd.read_parquet(SRC_PATH) #+'/*.parquet')

In [None]:
ddf = ddf.persist()

In [None]:
ddf.head()

In [None]:
ddf.shape[0].compute()