In [1]:
# nuclio: ignore
import nuclio

In [2]:
# Copyright 2018 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import ssl

# Turn off HTTPS certificate verification for the default urllib context.
# NOTE: this is process-wide, so every later HTTPS request made through the
# default context skips certificate checks.
if hasattr(ssl, "_create_unverified_context"):
    # Target environment does not support HTTPS verification: make the
    # unverified context factory the default one.
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context
# else: legacy Python that doesn't verify HTTPS certificates by default,
# so there is nothing to override.

import os
import json
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa
from cloudpickle import dump, load

from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem
from mlrun.artifacts import PlotArtifact, TableArtifact

from typing import List, Optional

def arc_to_parquet(
    context: MLClientCtx,
    archive_url: str,
    header: Optional[List[str]] = None,
    chunksize: int = 10_000,
    dtype=None,
    encoding: str = "latin-1",
    key: str = "data",
    dataset: Optional[str] = None,
    part_cols: Optional[List[str]] = None,
    file_ext: str = 'parquet',
    refresh_data: bool = False
) -> None:
    """Open a file/object archive and save as a parquet file or dataset

    Notes
    -----
    * partitioning requires precise specification of column types.
    * the archive_url can be any file readable by pandas read_csv, which includes tar files
    * if the `dataset` parameter is not empty, then a partitioned dataset will be created
    instead of a single file in the folder `dataset`
    * if the destination exists already then it will not be re-acquired unless the
    `refresh_data` param is set to `True`.  This is in case the original file is corrupt,
    or a refresh is required.

    :param context:      function context
    :param archive_url:  any valid string path consistent with the path variable
                         of pandas.read_csv, including strings as file paths, as urls,
                         pathlib.Path objects, etc...
    :param header:       (None) column names, passed to read_csv as `names`
    :param chunksize:    (10_000) row size retrieved per iteration
    :param dtype:        (None) destination data type of specified columns
    :param encoding:     ("latin-1") file encoding
    :param key:          ("data") key in artifact store, also the file name (without
                         extension) when a single file is written
    :param dataset:      (None) if not empty then "artifact_path/dataset"
                         is folder for partitioned files
    :param part_cols:    (None) list of partitioning columns, None or empty means
                         no partitioning
    :param file_ext:     (parquet) file extension of the files written
    :param refresh_data: (False) overwrite existing data at that location/key
    """
    # local import: only needed to clear an existing dataset folder on refresh
    import shutil

    base_path = context.artifact_path
    os.makedirs(base_path, exist_ok=True)

    archive_url = str(archive_url)

    # A single truthiness test decides the output mode, so an empty string
    # behaves like None everywhere in the function.
    as_dataset = bool(dataset)
    partition_cols = list(part_cols) if part_cols else None

    if as_dataset:
        dest_path = os.path.join(base_path, dataset)
        exists = os.path.isdir(dest_path)
    else:
        dest_path = os.path.join(base_path, key+f".{file_ext}")
        exists = os.path.isfile(dest_path)

    # todo: more logic for header
    if exists and not refresh_data:
        context.logger.info("destination file already exists")
    else:
        if exists:
            context.logger.info("destination file exists, refreshing data")
            if as_dataset:
                # write_to_dataset only adds files, so stale partitions must
                # be removed first; a single file is simply overwritten.
                shutil.rmtree(dest_path)
        else:
            context.logger.info("destination file does not exist, downloading")

        header_name = f"header-only.{file_ext}"
        pqwriter = None
        try:
            for i, df in enumerate(pd.read_csv(archive_url,
                                               chunksize=chunksize,
                                               names=header,
                                               encoding=encoding,
                                               dtype=dtype)):
                table = pa.Table.from_pandas(df)
                if as_dataset:
                    if i == 0:
                        # schema-only file; closing the writer finalizes it
                        pq.ParquetWriter(os.path.join(base_path, header_name),
                                         table.schema).close()
                        context.log_artifact("header", local_path=header_name)
                    pq.write_to_dataset(table, root_path=dest_path,
                                        partition_cols=partition_cols)
                else:
                    if pqwriter is None:
                        # start writing file, schema taken from the first chunk
                        pqwriter = pq.ParquetWriter(dest_path, table.schema)
                    pqwriter.write_table(table)
        finally:
            # close even when a chunk fails so the file handle is released
            if pqwriter is not None:
                pqwriter.close()

        context.logger.info(f"saved table to {dest_path}")

    # NOTE(review): in dataset mode this still points at "<key>.<file_ext>",
    # which this function does not write -- confirm how the dataset folder
    # should be registered with the artifact store.
    context.log_artifact(key, local_path=key+f".{file_ext}")


In [3]:
# nuclio: end-code

### mlconfig

In [8]:
import os

from mlrun import mlconf

# Each setting keeps a value that is already configured and only falls back
# to the notebook default when the current value is empty.
mlconf.dbpath = mlconf.dbpath or './'
mlconf.dbpath

vcs_branch = 'development'
base_vcs = f'https://raw.githubusercontent.com/mlrun/functions/{vcs_branch}/'

# '{name}' must stay a literal placeholder: no variable called `name` exists
# in this notebook, so interpolating it in an f-string raised NameError.
# Presumably mlrun fills in the function name when resolving "hub://<name>"
# URLs -- confirm against the mlrun configuration docs.
mlconf.hub_url = mlconf.hub_url or base_vcs + '{name}/function.yaml'
mlconf.hub_url

mlconf.artifact_path = mlconf.artifact_path or f'{os.environ["V3IO_HOME"]}/artifacts'
mlconf.artifact_path

# image tag used when building the function object below
TAG = os.environ['MLRUN_COMMIT']

### save

In [9]:
from mlrun import code_to_function

# Build a job function object from the notebook code above.
fn = code_to_function(
    'arc_to_parquet',
    kind='job',
    with_doc=True,
    handler=arc_to_parquet,
    image=f'mlrun/ml-base:{TAG}',
)

# Metadata used for templates and reuse.
fn.spec.default_handler = 'arc_to_parquet'
fn.spec.description = "retrieve remote archive, open and save as parquet"
fn.metadata.categories = ['fileutils', 'retrieve']
fn.metadata.labels = {"author": "yjb"}

# Save the function, then export its spec to a local yaml file.
fn.save()
fn.export('function.yaml')

[mlrun] 2020-04-30 01:55:03,003 saving function: arc-to-parquet, tag: latest
[mlrun] 2020-04-30 01:55:03,042 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f13c5013940>

## tests

In [10]:
from mlrun import import_function

# Load the function through the hub url configured earlier.
func = import_function("hub://arc_to_parquet")

# Attach storage: v3io when V3IO_HOME is set, otherwise an NFS-backed PVC.
if "V3IO_HOME" in os.environ:
    from mlrun import mount_v3io
    func.apply(mount_v3io())
else:
    # if you set up mlrun using the instructions at https://github.com/mlrun/mlrun/blob/master/hack/local/README.md
    from mlrun.platforms import mount_pvc
    func.apply(mount_pvc('nfsvol', 'nfsvol', '/home/joyan/data'))

In [11]:
from mlrun import NewTask

# Task definition shared by the local run and the job run below:
# column names for the HIGGS csv plus the artifact key to store it under.
task_params = {
    "name": "tasks archive to parquet",
    "params": {
        "header": [
            'labels',
            'lepton_pT', 'lepton_eta', 'lepton_phi',
            'missing_energy_magnitude', 'missing_energy_phi',
            'jet_1_pt', 'jet_1_eta', 'jet_1_phi', 'jet_1_b-tag',
            'jet_2_pt', 'jet_2_eta', 'jet_2_phi', 'jet_2_b-tag',
            'jet_3_pt', 'jet_3_eta', 'jet_3_phi', 'jet_3_b-tag',
            'jet_4_pt', 'jet_4_eta', 'jet_4_phi', 'jet_4_b-tag',
            'm_jj', 'm_jjj', 'm_lv', 'm_jlv', 'm_bb', 'm_wbb', 'm_wwbb',
        ],
        "key": "higgs",
    },
}

In [12]:
from mlrun import run_local

# Execute the handler in the notebook process, with the archive given as a run input.
run = run_local(
    NewTask(**task_params),
    handler=arc_to_parquet,
    inputs={"archive_url" : "https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz"},
    artifact_path=mlconf.artifact_path,
)

[mlrun] 2020-04-30 01:55:03,099 starting run tasks archive to parquet uid=668325b0c58a4dbda13e3656af138127  -> http://mlrun-api:8080
[mlrun] 2020-04-30 01:55:03,136 destination file already exists
[mlrun] 2020-04-30 01:55:03,153 log artifact higgs at /User/artifacts/higgs.parquet, size: None, db: Y



project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...af138127,0,Apr 30 01:55:03,completed,tasks archive to parquet,v3io_user=adminkind=handlerowner=adminhost=jupyter-b5bf5cfcd-cwf84,archive_url,"header=['labels', 'lepton_pT', 'lepton_eta', 'lepton_phi', 'missing_energy_magnitude', 'missing_energy_phi', 'jet_1_pt', 'jet_1_eta', 'jet_1_phi', 'jet_1_b-tag', 'jet_2_pt', 'jet_2_eta', 'jet_2_phi', 'jet_2_b-tag', 'jet_3_pt', 'jet_3_eta', 'jet_3_phi', 'jet_3_b-tag', 'jet_4_pt', 'jet_4_eta', 'jet_4_phi', 'jet_4_b-tag', 'm_jj', 'm_jjj', 'm_lv', 'm_jlv', 'm_bb', 'm_wbb', 'm_wwbb']key=higgs",,higgs


to track results use .show() or .logs() or in CLI: 
!mlrun get run 668325b0c58a4dbda13e3656af138127 --project default , !mlrun logs 668325b0c58a4dbda13e3656af138127 --project default
[mlrun] 2020-04-30 01:55:03,202 run executed, status=completed


#### the following will run quickly if your artifact path hasn't changed; the large file will be detected and not downloaded a second time:

In [13]:
from mlrun import NewTask

# Same task again, this time submitted through the imported function object.
run = func.run(
    NewTask(**task_params),
    artifact_path=mlconf.artifact_path,
)

[mlrun] 2020-04-30 01:55:03,215 starting run tasks archive to parquet uid=84a6d915e2d848de834c955ef489efab  -> http://mlrun-api:8080
[mlrun] 2020-04-30 01:55:03,331 Job is running in the background, pod: tasks-archive-to-parquet-kqxnj
[mlrun] 2020-04-30 01:55:18,439 destination file already exists
[mlrun] 2020-04-30 01:55:18,459 log artifact higgs at /User/artifacts/higgs.parquet, size: None, db: Y

[mlrun] 2020-04-30 01:55:18,470 run executed, status=completed
final state: succeeded


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...f489efab,0,Apr 30 01:55:18,completed,tasks archive to parquet,host=tasks-archive-to-parquet-kqxnjkind=jobowner=adminv3io_user=admin,,"header=['labels', 'lepton_pT', 'lepton_eta', 'lepton_phi', 'missing_energy_magnitude', 'missing_energy_phi', 'jet_1_pt', 'jet_1_eta', 'jet_1_phi', 'jet_1_b-tag', 'jet_2_pt', 'jet_2_eta', 'jet_2_phi', 'jet_2_b-tag', 'jet_3_pt', 'jet_3_eta', 'jet_3_phi', 'jet_3_b-tag', 'jet_4_pt', 'jet_4_eta', 'jet_4_phi', 'jet_4_b-tag', 'm_jj', 'm_jjj', 'm_lv', 'm_jlv', 'm_bb', 'm_wbb', 'm_wwbb']key=higgs",,higgs


to track results use .show() or .logs() or in CLI: 
!mlrun get run 84a6d915e2d848de834c955ef489efab  , !mlrun logs 84a6d915e2d848de834c955ef489efab 
[mlrun] 2020-04-30 01:55:22,531 run executed, status=completed
