In [1]:
# nuclio: ignore
import nuclio

In [2]:
# Copyright 2018 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import ssl

# Install the unverified HTTPS context as the default one, when the running
# Python exposes it.
# NOTE: this switches off HTTPS certificate verification for everything that
# relies on ssl's default HTTPS context.
if hasattr(ssl, "_create_unverified_context"):
    # Handle target environment that doesn't support HTTPS verification
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context
# otherwise: legacy Python that doesn't verify HTTPS certificates by default,
# nothing to change

import os
import json
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa
from cloudpickle import dump, load

from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem
from mlrun.artifacts import PlotArtifact, TableArtifact

from typing import IO, AnyStr, Union, List, Optional

def arc_to_parquet(
    context: MLClientCtx,
    archive_url: Union[str, DataItem],
    header: Optional[List[str]] = None,
    chunksize: int = 10_000,
    dtype=None,
    encoding: str = "latin-1",
    key: str = "data",
    dataset: Optional[str] = None,
    part_cols = [],
    file_ext: str = 'parquet'
) -> None:
    """Open a file/object archive and save as a parquet file or dataset.

    The archive is read with pandas.read_csv in chunks of ``chunksize`` rows;
    each chunk is converted to a pyarrow table and appended to the output.
    When the destination already exists nothing is read or written, only the
    output artifact is logged again.

    Partitioning requires precise specification of column types.

    :param context:      function context
    :param archive_url:  any valid string path consistent with the path variable
                         of pandas.read_csv, including strings as file paths, as urls,
                         pathlib.Path objects, etc...
    :param header:       column names
    :param chunksize:    (10_000) row size retrieved per iteration
    :param dtype:        destination data type of specified columns
    :param encoding:     ("latin-1") file encoding
    :param key:          key in artifact store, also the output file name
                         (without extension) when no dataset is given
    :param dataset:      (None) if not None/empty then "artifact_path/dataset"
                         is folder for partitioned files
    :param part_cols:    ([]) list of partitioning columns (dataset mode only)
    :param file_ext:     (parquet) csv/parquet file extension
    """
    base_path = context.artifact_path
    os.makedirs(base_path, exist_ok=True)

    # One flag decides dataset vs. single-file mode everywhere (an empty
    # dataset name is treated as "no dataset").
    use_dataset = bool(dataset)

    # Paths relative to artifact_path; they are reused for log_artifact so the
    # logged artifact always points at what was actually written.
    rel_path = dataset if use_dataset else key + f".{file_ext}"
    header_rel_path = f"header-only.{file_ext}"

    dest_path = os.path.join(base_path, rel_path)
    if use_dataset:
        exists = os.path.isdir(dest_path)
    else:
        exists = os.path.isfile(dest_path)

    # part_cols is only read, never mutated, so the shared default is safe.
    partition_cols = list(part_cols) if part_cols else None

    # todo: more logic for header
    if not exists:
        context.logger.info("destination file does not exist, downloading")
        pqwriter = None
        try:
            # NOTE(review): archive_url is handed to pandas.read_csv unchanged;
            # confirm that a DataItem is accepted there or resolve it first.
            for i, df in enumerate(pd.read_csv(archive_url,
                                               chunksize=chunksize,
                                               names=header,
                                               encoding=encoding,
                                               dtype=dtype)):
                table = pa.Table.from_pandas(df)
                if i == 0:
                    # Write (and close, so the footer is flushed) a schema-only
                    # file; it backs the "header" artifact in both modes.
                    header_writer = pq.ParquetWriter(
                        os.path.join(base_path, header_rel_path), table.schema)
                    header_writer.close()
                    if not use_dataset:
                        # start writing file
                        pqwriter = pq.ParquetWriter(dest_path, table.schema)
                    context.log_artifact("header", local_path=header_rel_path)
                if use_dataset:
                    pq.write_to_dataset(table, root_path=dest_path,
                                        partition_cols=partition_cols)
                else:
                    pqwriter.write_table(table)
        finally:
            # Close the writer even when a chunk fails, to release the file.
            if pqwriter is not None:
                pqwriter.close()

        context.logger.info(f"saved table to {dest_path}")
    else:
        context.logger.info("destination file already exists")
    context.log_artifact(key, local_path=rel_path)


In [3]:
# nuclio: end-code

In [11]:
from mlrun import code_to_function
# create job function object from notebook code
# (the handler is given by name, so the spec holds a plain string and the cell
# does not depend on the function object living in the kernel)
fn = code_to_function('arc_to_parquet', kind='job', with_doc=True,
                      handler='arc_to_parquet', image='mlrun/ml-base:0.4.6')

# add metadata (for templates and reuse)
fn.spec.default_handler = 'arc_to_parquet'
fn.spec.description = "retrieve remote archive, open and save as parquet"
fn.metadata.categories = ['fileutils', 'retrieve']
fn.spec.image_pull_policy = "Always"
fn.metadata.labels = {"author": "yjb"}

In [12]:
# save the function object (the output below reports it saved as
# "arc-to-parquet" with tag "latest"; the returned string is displayed)
fn.save()

[mlrun] 2020-03-27 21:49:22,332 saving function: arc-to-parquet, tag: latest


'94ba467ea897965e31837b330d73bf3126015cf9'

In [13]:
# write the function spec to the local file function.yaml
fn.export('function.yaml')

[mlrun] 2020-03-27 21:49:22,379 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f956460cd90>

In [14]:
# test: load the function back (from the hub/db url or from the exported yaml) and run it

In [15]:
from mlrun import import_function, mount_v3io
# load the function from its hub:// url and apply the v3io mount modifier
func = import_function("hub://arc_to_parquet").apply(mount_v3io())
# alternative: load the spec exported to function.yaml instead
# func = import_function("function.yaml").apply(mount_v3io())

In [16]:
# Task definition for the example run: the task name plus the handler
# parameters (source archive url, column names for the csv, artifact key).
task_params = dict(
    name="tasks archive to parquet",
    params=dict(
        archive_url="https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz",
        header=[
            'labels',
            'lepton_pT', 'lepton_eta', 'lepton_phi',
            'missing_energy_magnitude', 'missing_energy_phi',
            'jet_1_pt', 'jet_1_eta', 'jet_1_phi', 'jet_1_b-tag',
            'jet_2_pt', 'jet_2_eta', 'jet_2_phi', 'jet_2_b-tag',
            'jet_3_pt', 'jet_3_eta', 'jet_3_phi', 'jet_3_b-tag',
            'jet_4_pt', 'jet_4_eta', 'jet_4_phi', 'jet_4_b-tag',
            'm_jj', 'm_jjj', 'm_lv', 'm_jlv', 'm_bb', 'm_wbb', 'm_wwbb',
        ],
        key="higgs",
    ),
)

In [17]:
from mlrun import NewTask
# build a task from task_params and run the imported function with it;
# artifact_path is the folder the logged artifacts are written under
# (see the "log artifact ... at /User/artifacts/..." lines in the output)
run = func.run(NewTask(**task_params), artifact_path="/User/artifacts")

[mlrun] 2020-03-27 21:49:22,429 starting run tasks archive to parquet uid=099ed3c90b20483da62177e3b1e56832  -> http://mlrun-api:8080
[mlrun] 2020-03-27 21:49:22,572 Job is running in the background, pod: tasks-archive-to-parquet-47rgn
[mlrun] 2020-03-27 21:49:29,442 destination file does not exist, downloading
[mlrun] 2020-03-27 21:56:29,625 log artifact header at /User/artifacts/header-only.parquet, size: None, db: Y
[mlrun] 2020-03-27 21:59:48,982 saved table to /User/artifacts/higgs.parquet
[mlrun] 2020-03-27 21:59:49,009 log artifact higgs at /User/artifacts/higgs.parquet, size: None, db: Y

[mlrun] 2020-03-27 21:59:49,037 run executed, status=completed
final state: succeeded


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...e56832,0,Mar 27 21:49:29,completed,tasks archive to parquet,host=tasks-archive-to-parquet-47rgnkind=jobowner=admin,,"archive_url=https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gzheader=['labels', 'lepton_pT', 'lepton_eta', 'lepton_phi', 'missing_energy_magnitude', 'missing_energy_phi', 'jet_1_pt', 'jet_1_eta', 'jet_1_phi', 'jet_1_b-tag', 'jet_2_pt', 'jet_2_eta', 'jet_2_phi', 'jet_2_b-tag', 'jet_3_pt', 'jet_3_eta', 'jet_3_phi', 'jet_3_b-tag', 'jet_4_pt', 'jet_4_eta', 'jet_4_phi', 'jet_4_b-tag', 'm_jj', 'm_jjj', 'm_lv', 'm_jlv', 'm_bb', 'm_wbb', 'm_wwbb']key=higgs",,headerhiggs


to track results use .show() or .logs() or in CLI: 
!mlrun get run 099ed3c90b20483da62177e3b1e56832  , !mlrun logs 099ed3c90b20483da62177e3b1e56832 
[mlrun] 2020-03-27 21:59:53,603 run executed, status=completed
