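# parquet to dask

Load parquet files into a Dask cluster using the `parq_to_dask` function. The input is assumed to be the partitioned parquet dataset produced by the *archive to parquet* step, which converts a remote archive or csv file (or a local `file://` path) to parquet format.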

In [1]:
import mlrun
import os
# Keep an API endpoint that is already configured (e.g. through the
# MLRUN_DBPATH environment variable) and only fall back to the in-cluster
# service address when none is set.
mlrun.mlconf.dbpath = mlrun.mlconf.dbpath or 'http://mlrun-api:8080'

In [2]:
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

## parameters
Dataset URLs from **[h2oai](https://github.com/h2oai/h2o-2/wiki/Hacking-Airline-DataSet-with-H2O)**:

In [3]:
# Candidate source archives: airline dataset CSV files hosted by H2O.
# The names suggest three different sizes (big / regular / small) -- TODO confirm.
ARCHIVE_BIG = "https://s3.amazonaws.com/h2o-airlines-unpacked/allyears_10.csv"
ARCHIVE = "https://s3.amazonaws.com/h2o-airlines-unpacked/allyears.csv"
ARCHIVE_SMALL = "https://s3.amazonaws.com/h2o-airlines-unpacked/allyears2k.csv"

In [4]:
# Archive selected for this run (not referenced again in the cells shown here).
USE_ARCHIVE = ARCHIVE_SMALL
# Glob of parquet files handed to the function as `parquet_url`.
SRC_PATH = '/User/mlrun/airlines/dataset-small/partitions/*.parquet'

# Partition folder name, and the partition columns passed to the function as `index_cols`.
PARTITIONS_DEST = 'partitions'
PARTITION_COLS = ['Year', 'Month']

In [5]:
# Base container image passed to build_config() when the function is created.
BASE_IMAGE = 'yjbds/mlrun-ds:latest'

In [6]:
# Root of the functions repository. Remote alternative:
# 'https://raw.githubusercontent.com/yjb-ds/functions/lgbm-serving/'
CODE_BASE = '/User/repos/functions/'
# Function folder, relative to CODE_BASE.
FUNCTION = 'fileutils/parq_to_dask'
# Runtime kind used when the function object is created.
JOB_KIND = 'dask'

In [7]:
# Presumably the target column of the airline dataset -- TODO confirm.
# Not referenced by any of the cells shown in this notebook.
LABEL_COLUMN = "IsArrDelayed"

## load and configure function

In [8]:
# Location of the function spec file: <CODE_BASE>/<FUNCTION>/function.yaml
yaml_name = os.path.join(CODE_BASE, FUNCTION, 'function.yaml')

**If running for the first time, create the function:**

In [9]:
# Create the function object from the local Python file
parq2dask = mlrun.new_function(
    command=os.path.join(CODE_BASE, FUNCTION, 'function.py'),
    kind=JOB_KIND)

# Cluster settings: 4 replicas (which is also the maximum), exposed as a
# NodePort service, always pulling the image.
parq2dask.spec.remote = True
parq2dask.spec.replicas = 4
parq2dask.spec.max_replicas = 4
parq2dask.spec.service_type = 'NodePort'
parq2dask.spec.image_pull_policy = 'Always'
parq2dask.build_config(base_image=BASE_IMAGE, commands=[])

# Export before mounting so the saved yaml carries no mount configuration;
# the "otherwise load it" path applies the mount itself after import_function().
parq2dask.export(yaml_name)

# Apply the same v3io mount the "load" path uses, so both paths yield an
# equivalent function. SRC_PATH and the function code live under /User,
# presumably the v3io mount point -- TODO confirm for your environment.
parq2dask.apply(mlrun.mount_v3io())

[mlrun] 2020-01-28 00:08:03,363 function spec saved to path: /User/repos/functions/fileutils/parq_to_dask/function.yaml


**Otherwise, load it:**

In [10]:
# parq2dask = mlrun.import_function(yaml_name).apply(mlrun.mount_v3io())

## deploy / build

When run for the first time, the following cell triggers a build using the spec saved in the yaml file above.

In [11]:
# Deploy the function; the call returns its state ('ready' in the recorded run).
# NOTE(review): skip_deployed=True presumably skips work when the function is
# already deployed, and with_mlrun=False presumably leaves mlrun out of the
# image build -- confirm both against the mlrun documentation.
parq2dask.deploy(skip_deployed=True, with_mlrun=False)

'ready'

In [12]:
# Task definition for the `parquet_to_dask` handler
parq_to_dask_task = mlrun.NewTask(
    'parq-to-dask',
    handler='parquet_to_dask',
    params={
        'parquet_url': SRC_PATH,
        'index_cols': PARTITION_COLS,
        'shards': 4,
        'persist': True,
    },
)

# Submit the task to the function and keep the resulting run object
run = parq2dask.run(parq_to_dask_task)

[mlrun] 2020-01-28 00:08:03,437 starting run parq-to-dask uid=687a2c492be3405abcfa85d5430fd42a  -> http://mlrun-api:8080
[mlrun] 2020-01-28 00:08:04,283 saving function: function, tag: latest


RunDBError: POST http://mlrun-api:8080/api/start/function, error: HTTPConnectionPool(host='mlrun-api', port=8080): Read timed out. (read timeout=20)