In [1]:
# nuclio: ignore
import nuclio

In [2]:
%nuclio config kind = "job"
%nuclio config spec.image = "mlrun/ml-base"

%nuclio: setting kind to 'job'
%nuclio: setting spec.image to 'mlrun/ml-base'


In [3]:
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # Legacy Python that doesn"t verify HTTPS certificates by default
        pass
else:
    # Handle target environment that doesn"t support HTTPS verification
    ssl._create_default_https_context = _create_unverified_https_context

import os
import json
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa
from cloudpickle import dump, load

from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem
from mlrun.artifacts import PlotArtifact, TableArtifact

from typing import List, Optional

def arc_to_parquet(
    context: MLClientCtx,
    archive_url: DataItem,
    header: str = "",
    chunksize: int = 10_000,
    dtype=None,
    encoding: str = "latin-1",
    key: str = "data",
    dataset: str = "",
    part_cols = [],
    file_ext: str = "parquet",
    refresh_data: bool = False
) -> None:
    """Open a file/object archive and save as a parquet file or dataset

    Notes
    -----
    * partitioning requires precise specification of column types.
    * the archive_url can be any file readable by pandas read_csv, which includes tar files
    * if the `dataset` parameter is not empty, then a partitioned dataset will be created
    instead of a single file in the folder `dataset`
    * if a key exists already then it will not be re-acquired unless the `refresh_data` param
    is set to `True`.  This is in case the original file is corrupt, or a refresh is 
    required.

    :param context:      function context
    :param archive_url:  MLRun data input (DataItem object)
    :param header:       column names
    :param chunksize:    (0) row size retrieved per iteration
    :param dtypes        destination data type of columns, as dict(col, type), for example
                         {‘a’: np.float64, ‘b’: np.int32, ‘c’: ‘Int64’}  
    :param encoding      ("latin-8") file encoding
    :param key:          key in artifact store
    :param dataset:      (None) if not None then "target_path/dataset"
                         is folder for partitioned files
    :param file_ext:     (parquet) csv/parquet file extension
    :param part_cols:    ([]) list of partitioning columns
    :param refresh_data: (False) overwrite existing data at that location/kye
    """
    base_path = context.artifact_path
    os.makedirs(base_path, exist_ok=True)
    
    archive_url = archive_url.local()
    
    if dataset:
        dest_path = os.path.join(base_path, dataset)
        exists = os.path.isdir(dest_path)
    else:
        dest_path = os.path.join(base_path, key+f".{file_ext}")
        exists = os.path.isfile(dest_path)

    # todo: more logic for header
    if not exists:
        context.logger.info("destination file does not exist, downloading")
        pqwriter = None
        for i, df in enumerate(pd.read_csv(archive_url, 
                                           chunksize=chunksize, 
                                           names=header,
                                           encoding=encoding, 
                                           dtype=dtypes)):
            table = pa.Table.from_pandas(df)
            if i == 0:
                if dataset:
                    # just write header here
                    pq.ParquetWriter(os.path.join(base_path,f"header-only.{file_ext}"), table.schema)
                else:
                    # start writing file
                    pqwriter = pq.ParquetWriter(dest_path, table.schema)
                context.log_artifact("header", local_path=f"header-only.{file_ext}")
            if dataset:
                pq.write_to_dataset(table, root_path=dest_path, partition_cols=partition_cols)
            else:
                pqwriter.write_table(table)
        if pqwriter:
            pqwriter.close()

        context.logger.info(f"saved table to {dest_path}")
    else:
        context.logger.info("destination file already exists")
    context.log_artifact(key, local_path=key+f".{file_ext}")


In [4]:
# nuclio: end-code

### mlconfig

In [5]:
from mlrun import mlconf
import os
mlconf.dbpath = mlconf.dbpath or "http://mlrun-api:8080"
mlconf.artifact_path = mlconf.artifact_path or f"{os.environ['HOME']}/artifacts"

### save

In [6]:
from mlrun import code_to_function 
# create job function object from notebook code
fn = code_to_function("arc_to_parquet")

# add metadata (for templates and reuse)
fn.spec.default_handler = "arc_to_parquet"
fn.spec.description = "retrieve remote archive, open and save as parquet"
fn.metadata.categories = ['data-movement', 'utils']
fn.metadata.labels = {"author": "yjb"}

fn.export("function.yaml")

[mlrun] 2020-05-03 15:13:16,443 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f653615ff98>

## tests

In [7]:
if "V3IO_HOME" in list(os.environ):
    from mlrun import mount_v3io
    fn.apply(mount_v3io())
else:
    # is you set up mlrun using the instructions at https://github.com/mlrun/mlrun/blob/master/hack/local/README.md
    from mlrun.platforms import mount_pvc
    fn.apply(mount_pvc("nfsvol", "nfsvol", "/home/joyan/data"))

In [8]:
from mlrun import NewTask    

task_params = {
    "name" : "tasks archive to parquet",
    "params" :  {
        "header" : ["labels", "lepton_pT", "lepton_eta", "lepton_phi", "missing_energy_magnitude", 
                    "missing_energy_phi", "jet_1_pt", "jet_1_eta", "jet_1_phi", "jet_1_b-tag", 
                    "jet_2_pt", "jet_2_eta", "jet_2_phi", "jet_2_b-tag", "jet_3_pt", "jet_3_eta",
                    "jet_3_phi", "jet_3_b-tag", "jet_4_pt", "jet_4_eta", "jet_4_phi", "jet_4_b-tag",
                    "m_jj", "m_jjj", "m_lv", "m_jlv", "m_bb", "m_wbb", "m_wwbb"],
        
        "key"    : "higgs"}}

In [9]:
from mlrun import run_local

run = run_local(NewTask(**task_params),
          handler=arc_to_parquet,
          inputs={"archive_url" : "https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz"})

[mlrun] 2020-05-03 15:13:16,608 starting run tasks archive to parquet uid=9f3b17539f864831a68677e87135ef2a  -> http://mlrun-api:8080
[mlrun] 2020-05-03 15:13:16,701 downloading https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz to local tmp
[mlrun] 2020-05-03 15:15:07,427 destination file already exists
[mlrun] 2020-05-03 15:15:07,446 log artifact higgs at /User/artifacts/higgs.parquet, size: None, db: Y



project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...7135ef2a,0,May 03 15:13:16,completed,tasks archive to parquet,v3io_user=adminkind=handlerowner=adminhost=jupyter-6c5fccf844-gxlrw,archive_url,"header=['labels', 'lepton_pT', 'lepton_eta', 'lepton_phi', 'missing_energy_magnitude', 'missing_energy_phi', 'jet_1_pt', 'jet_1_eta', 'jet_1_phi', 'jet_1_b-tag', 'jet_2_pt', 'jet_2_eta', 'jet_2_phi', 'jet_2_b-tag', 'jet_3_pt', 'jet_3_eta', 'jet_3_phi', 'jet_3_b-tag', 'jet_4_pt', 'jet_4_eta', 'jet_4_phi', 'jet_4_b-tag', 'm_jj', 'm_jjj', 'm_lv', 'm_jlv', 'm_bb', 'm_wbb', 'm_wwbb']key=higgs",,higgs


to track results use .show() or .logs() or in CLI: 
!mlrun get run 9f3b17539f864831a68677e87135ef2a --project default , !mlrun logs 9f3b17539f864831a68677e87135ef2a --project default
[mlrun] 2020-05-03 15:15:07,525 run executed, status=completed


#### the following will run quickly if your artifact path hasn"t changed, the large file will be detected and not downloaded a second time:

In [None]:
from mlrun import NewTask
run = fn.run(NewTask(**task_params))

[mlrun] 2020-05-03 15:17:25,660 starting run tasks archive to parquet uid=eef123972eaf40e9beb164be1e2a358a  -> http://mlrun-api:8080
[mlrun] 2020-05-03 15:17:25,771 Job is running in the background, pod: tasks-archive-to-parquet-ngtwq
[mlrun] 2020-05-03 15:17:31,659 Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/mlrun/runtimes/local.py", line 184, in exec_from_params
    val = handler(*args_list)
  File "main.py", line 68, in arc_to_parquet
    archive_url = archive_url.local()
AttributeError: 'NoneType' object has no attribute 'local'


[mlrun] 2020-05-03 15:17:31,670 exec error - 'NoneType' object has no attribute 'local'
[mlrun] 2020-05-03 15:17:31,700 run executed, status=error
'NoneType' object has no attribute 'local'
runtime error: 'NoneType' object has no attribute 'local'
final state: failed


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...1e2a358a,0,May 03 15:17:31,error,tasks archive to parquet,host=tasks-archive-to-parquet-ngtwqkind=jobowner=adminv3io_user=admin,,"header=['labels', 'lepton_pT', 'lepton_eta', 'lepton_phi', 'missing_energy_magnitude', 'missing_energy_phi', 'jet_1_pt', 'jet_1_eta', 'jet_1_phi', 'jet_1_b-tag', 'jet_2_pt', 'jet_2_eta', 'jet_2_phi', 'jet_2_b-tag', 'jet_3_pt', 'jet_3_eta', 'jet_3_phi', 'jet_3_b-tag', 'jet_4_pt', 'jet_4_eta', 'jet_4_phi', 'jet_4_b-tag', 'm_jj', 'm_jjj', 'm_lv', 'm_jlv', 'm_bb', 'm_wbb', 'm_wwbb']key=higgs",,


to track results use .show() or .logs() or in CLI: 
!mlrun get run eef123972eaf40e9beb164be1e2a358a  , !mlrun logs eef123972eaf40e9beb164be1e2a358a 
[mlrun] 2020-05-03 15:17:34,921 run executed, status=error
runtime error: 'NoneType' object has no attribute 'local'
