# archive to parquet

In [1]:
import mlrun
# point the mlrun client at the mlrun API endpoint; this URL is
# deployment-specific -- NOTE(review): adjust when running elsewhere
mlrun.mlconf.dbpath = 'http://mlrun-api:8080'

In [2]:
# nuclio: ignore
import nuclio

In [3]:
# base image the function build starts from
%nuclio config spec.build.baseImage = "yjbds/mlrun-files:latest"

%nuclio: setting spec.build.baseImage to 'yjbds/mlrun-files:latest'


In [4]:
import os
import json
from pathlib import Path
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa
from cloudpickle import dump, load

from mlrun.execution import MLClientCtx
from typing import IO, AnyStr, Union, List, Optional


def arc_to_parquet(
    context: MLClientCtx,
    archive_url: Union[str, Path, IO[AnyStr]],
    header: Optional[List[str]] = None,
    target_path: str = "",
    name: str = "",
    chunksize: int = 10_000,
    log_data: bool = True,
    add_uid: bool = False,
    key: str = "raw_data",
) -> None:
    """Read a csv file/object archive in chunks and save it as a parquet file.

    The archive is only read when the destination file does not exist yet;
    an existing destination file is reused as-is. The column names are
    pickled to 'header.pkl' in the destination folder and logged as the
    'header' artifact.

    :param context:     function context
    :param archive_url: any valid string path consistent with the path variable
                        of pandas.read_csv, including strings as file paths, as urls,
                        pathlib.Path objects, etc...
    :param header:      column names, passed to pandas.read_csv as ``names``
    :param target_path: destination folder of table
    :param name:        file name of the saved table, '.parquet' is appended
                        when it is missing
    :param chunksize:   (10_000) number of rows retrieved per iteration
    :param log_data:    (True) log the parquet file as an artifact under ``key``
    :param add_uid:     currently unused, kept for interface compatibility
    :param key:         key in artifact store (when log_data=True)
    """
    if not name.endswith(".parquet"):
        name += ".parquet"

    dest_path = os.path.join(target_path, name)
    # os.makedirs("") raises, so only create the folder when one was given
    if target_path:
        os.makedirs(target_path, exist_ok=True)

    if not os.path.isfile(dest_path):
        context.logger.info("destination file does not exist, downloading")
        pqwriter = None
        parquet_schema = None
        completed = False
        try:
            for df in pd.read_csv(archive_url, chunksize=chunksize, names=header):
                if pqwriter is None:
                    # the first chunk fixes the schema for the whole file, so
                    # later chunks are converted to it instead of each chunk
                    # inferring its own (possibly different) schema
                    parquet_schema = pa.Table.from_pandas(df=df).schema
                    pqwriter = pq.ParquetWriter(dest_path, parquet_schema)
                pqwriter.write_table(pa.Table.from_pandas(df, parquet_schema))
            completed = True
        finally:
            # always release the writer, even when reading/converting failed
            if pqwriter is not None:
                pqwriter.close()
            # a partial file would be mistaken for a finished download on the
            # next run (see the isfile check above), so remove it on failure
            if not completed and os.path.isfile(dest_path):
                os.remove(dest_path)

        context.logger.info(f"saved table to {dest_path}")
    else:
        context.logger.info("destination file already exists")

    if log_data:
        context.log_artifact(key, target_path=dest_path)

    # log header
    filepath = os.path.join(target_path, 'header.pkl')
    with open(filepath, 'wb') as header_file:
        dump(header, header_file)
    context.log_artifact('header', target_path=filepath)

In [5]:
# nuclio: end-code

In [6]:
# create a job function object from notebook code, with arc_to_parquet as
# its handler (presumably the code above the "nuclio: end-code" marker is
# what gets packaged -- TODO confirm against the nuclio-jupyter docs)
fn = mlrun.code_to_function(
    'arc to parquet',
    runtime='job', 
    handler=arc_to_parquet)

#### load and configure function

In [7]:
# load function from a local Python file
# fn = mlrun.code_to_function('/User/repos/functions/fileutils/arc_to_parquet/arc_to_parquet.py', kind='job')

In [8]:
# export the function spec to a yaml file so it can later be loaded with
# mlrun.import_function (see the commented import cells below)
fn.export('/User/repos/functions/fileutils/arc_to_parquet/arc_to_parquet.yaml')

[mlrun] 2020-01-22 17:42:17,438 function spec saved to path: /User/repos/functions/fileutils/arc_to_parquet/arc_to_parquet.yaml


In [None]:
# import function yaml
# fn = mlrun.import_function('/User/repos/functions/fileutils/arc_to_parquet/arc_to_parquet.yaml')

In [None]:
# push yaml to github

In [None]:
# load function from Github
# fn = mlrun.import_function(
#   'https://raw.githubusercontent.com/yjb-ds/functions/lgbm-serving/fileutils/arc_to_parquet/arc_to_parquet.yaml')

In [None]:
# configure function:
# mount on the Iguazio data fabric
fn.apply(mlrun.mount_v3io())
# set as interactive (return stdout)
fn.interactive = True

#### deploy / build

The following triggers a build when run for the first time using specs found in the yaml file above.  Unless that file changes, this only needs to be run once, even after the notebook has been restarted:

In [None]:
# trigger the function build described above
fn.deploy()

In [None]:
# fn.with_code()

Also note that the build time can be reduced if you specify a pre-built image with all required packages pre-installed.

In [18]:
# run configuration shared by the task definition and the tests below
artifact_key = "higgs_large"        # key the parquet table is logged under
target_path = "/User/mlrun/models"  # destination folder
parquet_file = "higgs.parquet"      # the file extension is not necessary
parquet_file_path = "/".join([target_path, parquet_file])

# source archive; alternative source kept for reference:
#   "https://fpsignals-public.s3.amazonaws.com/higgs-small.tar.gz"
archive = "https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz"

In [19]:
# column names handed to the task as its 'header' param: the label, the
# lepton and missing-energy features, four features for each of the four
# jets, and the mass features
HIGGS_HEADER = [
    'labels',
    'lepton_pT', 'lepton_eta', 'lepton_phi',
    'missing_energy_magnitude', 'missing_energy_phi',
    *(
        f'jet_{jet}_{feature}'
        for jet in range(1, 5)
        for feature in ('pt', 'eta', 'phi', 'b-tag')
    ),
    'm_jj', 'm_jjj', 'm_lv', 'm_jlv', 'm_bb', 'm_wbb', 'm_wwbb',
]

In [None]:
# create and run the task
# the param names match the arguments of arc_to_parquet (target_path, name,
# key, archive_url, header); the table is logged under artifact_key, which
# is also declared as the task's output
arc_to_parq_task = mlrun.NewTask(
    'arc2parq', 
    handler='arc_to_parquet',  
    params={
        'target_path': target_path,
        'name'       : parquet_file, 
        'key'        : artifact_key,
        'archive_url': archive,
        'header'     : HIGGS_HEADER},
    outputs=[artifact_key])

# run the task on the configured function; `run` is inspected by the tests below
run = fn.run(arc_to_parq_task)

___

### tests

In [10]:
import os
import numpy as np
import pandas as pd

In [11]:
# add more context tests
# convert these to real tests

In [16]:
# the run must list the table under the key it was logged with ...
assert artifact_key in run.outputs.keys(), f"mlrun.functions: key {artifact_key} not found in outputs"
# ... and the parquet file must exist at the expected path
assert os.path.isfile(parquet_file_path),  f"mlrun.functions: artifact source not found at {parquet_file_path}"

In [20]:
# read the saved table back from disk to inspect what the job wrote
copied   = pd.read_parquet(parquet_file_path, engine="pyarrow")

In [21]:
# preview the first rows; the columns should carry the HIGGS_HEADER names
copied.head()

Unnamed: 0,labels,lepton_pT,lepton_eta,lepton_phi,missing_energy_magnitude,missing_energy_phi,jet_1_pt,jet_1_eta,jet_1_phi,jet_1_b-tag,...,jet_4_eta,jet_4_phi,jet_4_b-tag,m_jj,m_jjj,m_lv,m_jlv,m_bb,m_wbb,m_wwbb
0,1.0,0.869293,-0.635082,0.22569,0.32747,-0.689993,0.754202,-0.248573,-1.092064,0.0,...,-0.010455,-0.045767,3.101961,1.35376,0.979563,0.978076,0.920005,0.721657,0.988751,0.876678
1,1.0,0.907542,0.329147,0.359412,1.49797,-0.31301,1.095531,-0.557525,-1.58823,2.173076,...,-1.13893,-0.000819,0.0,0.30222,0.833048,0.9857,0.978098,0.779732,0.992356,0.798343
2,1.0,0.798835,1.470639,-1.635975,0.453773,0.425629,1.104875,1.282322,1.381664,0.0,...,1.128848,0.900461,0.0,0.909753,1.10833,0.985692,0.951331,0.803252,0.865924,0.780118
3,0.0,1.344385,-0.876626,0.935913,1.99205,0.882454,1.786066,-1.646778,-0.942383,0.0,...,-0.678379,-1.360356,0.0,0.946652,1.028704,0.998656,0.728281,0.8692,1.026736,0.957904
4,1.0,1.105009,0.321356,1.522401,0.882808,-1.205349,0.681466,-1.070464,-0.921871,0.0,...,-0.373566,0.113041,0.0,0.755856,1.361057,0.98661,0.838085,1.133295,0.872245,0.808487


In [22]:
# the column count should equal len(HIGGS_HEADER), i.e. 29
copied.shape

(11000000, 29)

### cleanup

In [7]:
# os.remove(parquet_file_path)