In [12]:
# nuclio: ignore
import nuclio

In [13]:
import os
import json
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa

from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem

from typing import List, Optional

def _chunk_readwrite(
    archive_url,
    dest_path,
    chunksize, 
    header, 
    encoding, 
    dtype
):
    """stream read and write archives
    
    pandas reads and parquet writes
    
    notes
    -----
    * dest_path can be either a file.parquet, or in hte case of partitioned parquet
      it will be only the destination folder of the parquet partition files
    """
    pqwriter = None
    header = []
    for i, df in enumerate(pd.read_csv(archive_url, chunksize=chunksize, 
                                       names=header, encoding=encoding, 
                                       dtype=dtype)):
        table = pa.Table.from_pandas(df)
        if i == 0:
            if dataset:
                header = copy(table.schema)
            else:
                pqwriter = pq.ParquetWriter(dest_path, table.schema)
        if dataset:
            pq.write_to_dataset(table, root_path=dest_path, partition_cols=partition_cols)
        else:
            pqwriter.write_table(table)
    if pqwriter:
        pqwriter.close()
    
    return header

def arc_to_parquet(
    context: MLClientCtx,
    archive_url: DataItem,
    header: List[str] = [None],
    chunksize: int = 0,
    dtype=None,
    encoding: str = "latin-1",
    key: str = "data",
    dataset: str = "None",
    part_cols = [],
    file_ext: str = "parquet",
    index: bool= False,
    refresh_data: bool = False,
    stats: bool = False
) -> None:
    """Open a file/object archive and save as a parquet file or dataset

    Notes
    -----
    * this function is typically for large files, please be sure to check all settings
    * partitioning requires precise specification of column types.
    * the archive_url can be any file readable by pandas read_csv, which includes tar files
    * if the `dataset` parameter is not empty, then a partitioned dataset will be created
    instead of a single file in the folder `dataset`
    * if a key exists already then it will not be re-acquired unless the `refresh_data` param
    is set to `True`.  This is in case the original file is corrupt, or a refresh is 
    required.

    :param context:        the function context
    :param archive_url:    MLRun data input (DataItem object)
    :param chunksize:      (0) when > 0, row size (chunk) to retrieve
                           per iteration
    :param dtype           destination data type of specified columns
    :param encoding        ("latin-8") file encoding
    :param key:            key in artifact store (when log_data=True)
    :param dataset:        (None) if not None then "target_path/dataset"
                           is folder for partitioned files
    :param part_cols:      ([]) list of partitioning columns
    :param file_ext:       (parquet) csv/parquet file extension
    :param index:          (False) pandas save index option
    :param refresh_data:   (False) overwrite existing data at that location
    :param stats:          (None) calculate table stats when logging artifact
    """
    base_path = context.artifact_path
    os.makedirs(base_path, exist_ok=True)
    
    archive_url = archive_url.local()
    
    if dataset is not None:
        dest_path = os.path.join(base_path, dataset)
        exists = os.path.isdir(dest_path)
    else:
        dest_path = os.path.join(base_path, key+f".{file_ext}")
        exists = os.path.isfile(dest_path)
        
    if not exists:
        context.logger.info("destination file does not exist, downloading")
        if chunksize > 0:
            header = _chunk_readwrite(archive_url, dest_path, chunksize,
                                      encoding, dtype)
            context.log_dataset(key=key, stats=stats, format='parquet', 
                                target_path=dest_path)
        else:
            df = pd.read_csv(archive_url)
            context.log_dataset(key, df=df, format=file_ext, index=index)
    else:
        context.logger.info("destination file already exists, nothing done")

In [14]:
# nuclio: end-code

### function setup

In [15]:
from mlutils import create_function
import os

fn_params = {
    "name"            : "arc_to_parquet",
    "project"         : "functions",
    "default_handler" : "arc_to_parquet",
    "desc"            : "retrieve remote archive, open and save as parquet",
    "categories"      : ['data-movement', 'utils'],
    "labels"          : {"author": "yjb"},
    "kind"            : "job",
    "image"           : os.environ["MLRUN_DOCKER_REPO"] + "/ml-base",
    "tag"             : os.environ["MLRUN_IMAGES_TAG"],
}

fn = create_function(fn_params, path='.')

[mlrun] 2020-05-31 14:48:22,033 function spec saved to path: ./function.yaml


### task definition

In [16]:
task_params = {
    "name" : "tasks archive to parquet",
    "params":{"key": "higgs-sample"}}

In [17]:
TEST_REPO = "https://raw.githubusercontent.com/yjb-ds/testdata/master" 
DATA_URL = "arc_to_parquet/higgs-sample.csv.gz"
# original large file "https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz"

## tests

In [18]:
from mlrun import NewTask    

task_params = {"name": "tasks archive to parquet", 
               "params":{"key": "higgs-sample"}}

In [21]:
from mlrun import run_local

run = run_local(NewTask(**task_params),
          handler=arc_to_parquet,
          inputs={"archive_url" : f"{TEST_REPO}/{DATA_URL}"})

[mlrun] 2020-05-31 14:49:39,927 starting run tasks archive to parquet uid=5f3c31fad33449e9957165170b747243  -> http://mlrun-api:8080
[mlrun] 2020-05-31 14:49:39,956 downloading https://raw.githubusercontent.com/yjb-ds/testdata/master/arc_to_parquet/higgs-sample.csv.gz to local tmp
[mlrun] 2020-05-31 14:49:40,028 destination file does not exist, downloading
[mlrun] 2020-05-31 14:49:40,186 log artifact higgs-sample at /User/artifacts/higgs-sample.parquet, size: 37645, db: Y



project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...0b747243,0,May 31 14:49:39,completed,tasks archive to parquet,v3io_user=adminkind=handlerowner=adminhost=jupyter-697c84dd-pjs2q,archive_url,key=higgs-sample,,higgs-sample


to track results use .show() or .logs() or in CLI: 
!mlrun get run 5f3c31fad33449e9957165170b747243 --project default , !mlrun logs 5f3c31fad33449e9957165170b747243 --project default
[mlrun] 2020-05-31 14:49:40,273 run executed, status=completed


#### the following will run quickly if your artifact path hasn't changed, the large file will be detected and not downloaded a second time:

In [23]:
from mlrun import NewTask, mlconf
run = fn.run(NewTask(**task_params),
             inputs={"archive_url" : f"{TEST_REPO}/{DATA_URL}"},
             artifact_path=mlconf.artifact_path)

[mlrun] 2020-05-31 14:50:05,451 starting run tasks archive to parquet uid=cb73bfeb2e6246e28f9867cbf16eba9d  -> http://mlrun-api:8080
[mlrun] 2020-05-31 14:50:05,577 Job is running in the background, pod: tasks-archive-to-parquet-msl5r
[mlrun] 2020-05-31 14:50:09,155 starting local run: main.py # arc_to_parquet
[mlrun] 2020-05-31 14:50:09,194 downloading https://raw.githubusercontent.com/yjb-ds/testdata/master/arc_to_parquet/higgs-sample.csv.gz to local tmp
[mlrun] 2020-05-31 14:50:09,264 destination file does not exist, downloading
[mlrun] 2020-05-31 14:50:09,405 log artifact higgs-sample at /User/artifacts/higgs-sample.parquet, size: 38575, db: Y

[mlrun] 2020-05-31 14:50:09,460 run executed, status=completed
final state: succeeded


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...f16eba9d,0,May 31 14:50:09,completed,tasks archive to parquet,v3io_user=adminkind=jobowner=adminhost=tasks-archive-to-parquet-msl5r,archive_url,key=higgs-sample,,higgs-sample


to track results use .show() or .logs() or in CLI: 
!mlrun get run cb73bfeb2e6246e28f9867cbf16eba9d  , !mlrun logs cb73bfeb2e6246e28f9867cbf16eba9d 
[mlrun] 2020-05-31 14:50:11,884 run executed, status=completed
