# archive to parquet

In [1]:
import mlrun
mlrun.mlconf.dbpath = 'http://mlrun-api:8080'

In [2]:
# nuclio: ignore
import nuclio

In [3]:
%%nuclio cmd -c
python -m pip uninstall mlrun
python -m pip install -U -q mlrun
python -m pip install -U -q pandas
python -m pip install -U -q pyarrow

In [4]:
%nuclio config spec.build.baseImage = "python:3.6-jessie"

%nuclio: setting spec.build.baseImage to 'python:3.6-jessie'


In [5]:
import os
import json
from pathlib import Path
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa
from pickle import dump, load

from mlrun.execution import MLClientCtx
from typing import IO, AnyStr, Union, List, Optional


def arc_to_parquet(
    context: MLClientCtx,
    archive_url: Union[str, Path, IO[AnyStr]],
    header: Optional[List[str]] = None,
    target_path: str = "",
    name: str = "",
    chunksize: int = 10_000,
    log_data: bool = True,
    add_uid: bool = False,
    key: str = "raw_data",
) -> None:
    """Open a file/object archive and save as a parquet file.
    
    :param context:     function context
    :param archive_url: any valid string path consistent with the path variable
                        of pandas.read_csv, including strings as file paths, as urls, 
                        pathlib.Path objects, etc...
    :param header:      column names
    :param target_path: destination folder of table
    :param name:        name file to be saved locally, also
    :param chunksize:   (0) row size retrieved per iteration
    :param key:         key in artifact store (when log_data=True)
    """
    if not name.endswith(".parquet"):
        name += ".parquet"

    dest_path = os.path.join(target_path, name)
    os.makedirs(os.path.join(target_path), exist_ok=True)
    if not os.path.isfile(dest_path):
        context.logger.info("destination file does not exist, downloading")
        pqwriter = None
        for i, df in enumerate(pd.read_csv(archive_url, chunksize=chunksize, names=header)):
            table = pa.Table.from_pandas(df)
            if i == 0:
                pqwriter = pq.ParquetWriter(dest_path, table.schema)
            pqwriter.write_table(table)

        if pqwriter:
            pqwriter.close()

        context.logger.info(f"saved table to {dest_path}")
    else:
        context.logger.info("destination file already exists")

    context.log_artifact(key, target_path=dest_path)
    # log header
    filepath = os.path.join(target_path, 'header.pkl')
    dump(header, open(filepath, 'wb'))
    context.log_artifact('header', target_path=filepath)       

In [6]:
# nuclio: end-code

In [7]:
# create job function object from notebook code
fn = mlrun.code_to_function(
    'arc to parquet',
    runtime='job', 
    handler=arc_to_parquet)

#### load and configure function

In [8]:
# load function from a local Python file
# fn = mlrun.code_to_function('/User/repos/functions/fileutils/arc_to_parquet/arc_to_parquet.py', kind='job')

In [9]:
# export function yaml
f# n.export('/User/repos/functions/fileutils/arc_to_parquet/arc_to_parquet.yaml')

[mlrun] 2020-01-21 15:12:18,489 function spec saved to path: /User/repos/functions/fileutils/arc_to_parquet/arc_to_parquet.yaml


In [10]:
# import function yaml
# fn = mlrun.import_function('/User/repos/functions/fileutils/arc_to_parquet/arc_to_parquet.yaml')

In [11]:
# push yaml to github

In [10]:
# load function from Github
fn = mlrun.import_function(
  'https://raw.githubusercontent.com/yjb-ds/functions/lgbm-serving/fileutils/arc_to_parquet/arc_to_parquet.yaml')



In [11]:
# configure function: mount on the Iguazio data fabric, set as interactive (return stdout)
fn.apply(mlrun.mount_v3io())
fn.interactive = True

#### deploy / build

The following triggers a build when run for the first time using specs found in the yaml file above.  Unless that file changes, this only needs to be run once, even after the notebook has been restarted:

In [16]:
fn.deploy()

[mlrun] 2020-01-21 19:27:06,066 starting remote build, image: .mlrun/func-default-arc-to-parquet-latest
[36mINFO[0m[0000] Resolved base name python:3.6-jessie to python:3.6-jessie 
[36mINFO[0m[0000] Resolved base name python:3.6-jessie to python:3.6-jessie 
[36mINFO[0m[0000] Downloading base image python:3.6-jessie     
[36mINFO[0m[0000] Error while retrieving image from cache: getting file info: stat /cache/sha256:0318d80cb241983eda20b905d77fa0bfb06e29e5aabf075c7941ea687f1c125a: no such file or directory 
[36mINFO[0m[0000] Downloading base image python:3.6-jessie     
[36mINFO[0m[0000] Built cross stage deps: map[]                
[36mINFO[0m[0000] Downloading base image python:3.6-jessie     
[36mINFO[0m[0000] Error while retrieving image from cache: getting file info: stat /cache/sha256:0318d80cb241983eda20b905d77fa0bfb06e29e5aabf075c7941ea687f1c125a: no such file or directory 
[36mINFO[0m[0000] Downloading base image python:3.6-jessie     
[36mINFO[0m[0001] Unpa

True

In [12]:
# fn.with_code()

<mlrun.runtimes.kubejob.KubejobRuntime at 0x7fd3619549e8>

Also note that the build time can be reduced if you specifiy a pre-built image with all required packages pre-installed.

In [20]:
# useful constants
target_path = '/User/mlrun/models'
archive = "https://fpsignals-public.s3.amazonaws.com/higgs-small.tar.gz"
parquet_file = 'higgs.parquet' # the file extension is not necessary
parquet_file_path = target_path + "/" + parquet_file
artifact_key = 'higgs_small'

In [21]:
HIGGS_HEADER = ['labels', 'lepton_pT', 'lepton_eta', 'lepton_phi', 'missing_energy_magnitude', 'missing_energy_phi',
 'jet_1_pt', 'jet_1_eta', 'jet_1_phi', 'jet_1_b-tag', 'jet_2_pt', 'jet_2_eta', 'jet_2_phi', 'jet_2_b-tag', 'jet_3_pt',
 'jet_3_eta', 'jet_3_phi', 'jet_3_b-tag', 'jet_4_pt', 'jet_4_eta', 'jet_4_phi', 'jet_4_b-tag', 'm_jj', 'm_jjj', 'm_lv',
 'm_jlv', 'm_bb', 'm_wbb', 'm_wwbb']

In [22]:
# create and run the task
arc_to_parq_task = mlrun.NewTask(
    'arc2parq', 
    handler='arc_to_parquet',  
    params={
        'target_path': target_path,
        'name'       : parquet_file, 
        'key'        : artifact_key,
        'archive_url': archive,
        'header'     : higgs_header},
    outputs=[artifact_key])

# run
run = fn.run(arc_to_parq_task)

[mlrun] 2020-01-21 20:23:50,660 starting run arc2parq uid=e20e88ae28a545da90e7ded360b78d6d  -> http://mlrun-api:8080
[mlrun] 2020-01-21 20:23:50,882 Job is running in the background, pod: arc2parq-c65q8
[mlrun] 2020-01-21 20:24:05,984 destination file already exists
[mlrun] 2020-01-21 20:24:06,002 log artifact higgs_small at /User/mlrun/models/higgs.parquet, size: None, db: Y
[mlrun] 2020-01-21 20:24:06,017 log artifact header at /User/mlrun/models/header.pkl, size: None, db: Y

[mlrun] 2020-01-21 20:24:06,029 run executed, status=completed
final state: succeeded


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...b78d6d,0,Jan 21 20:24:05,completed,arc-to-parquet,host=arc2parq-c65q8kind=jobowner=admin,,"archive_url=https://fpsignals-public.s3.amazonaws.com/higgs-small.tar.gzheader=['labels', 'lepton pT ', 'lepton eta ', 'lepton phi ', 'missing energy magnitude ', 'missing energy phi ', 'jet 1 pt ', 'jet 1 eta ', 'jet 1 phi ', 'jet 1 b-tag ', 'jet 2 pt ', 'jet 2 eta ', 'jet 2 phi ', 'jet 2 b-tag ', 'jet 3 pt ', 'jet 3 eta ', 'jet 3 phi ', 'jet 3 b-tag ', 'jet 4 pt ', 'jet 4 eta ', 'jet 4 phi ', 'jet 4 b-tag', 'm_jj', 'm_jjj', 'm_lv ', 'm_jlv', 'm_bb ', 'm_wbb ', 'm_wwbb']key=higgs_smallname=higgs.parquettarget_path=/User/mlrun/models",,higgs_smallheader


to track results use .show() or .logs() or in CLI: 
!mlrun get run e20e88ae28a545da90e7ded360b78d6d  , !mlrun logs e20e88ae28a545da90e7ded360b78d6d 
[mlrun] 2020-01-21 20:24:10,173 run executed, status=completed


___

### tests

In [None]:
import os
import numpy as np
import pandas as pd

In [23]:
# add more context tests
# convert these to real tests

In [24]:
assert artifact_key in run.outputs.keys(), f"mlrun.functions: key {artifact_key} not found in outputs"
assert os.path.isfile(parquet_file_path),  f"mlrun.functions: artifact source not found at {parquet_file_path}"

In [25]:
original = pd.read_csv(archive).values
copied   = pd.read_parquet(parquet_file_path, engine="pyarrow").values
assert np.array_equal(original, copied),   "mlrun.functions: original and copied data not equal"

TypeError: unhashable type: 'dict'

### cleanup

In [None]:
os.remove(parquet_file_path)