# archive to parquet

In [1]:
import mlrun
mlrun.mlconf.dbpath = 'http://mlrun-api:8080'

In [2]:
# nuclio: ignore
import nuclio

In [3]:
%%nuclio cmd -c
python -m pip uninstall mlrun
python -m pip install -U -q mlrun
python -m pip install -U -q pandas
python -m pip install -U -q pyarrow
python -m pip install -U -q numpy==1.17.4

In [4]:
import os
import json
from pathlib import Path
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa

from mlrun.execution import MLClientCtx
from typing import IO, AnyStr, Union, List, Optional


def arc_to_parquet(
    context: MLClientCtx,
    archive_url: Union[str, Path, IO[AnyStr]],
    header: Union[int, List[str], None] = 0,
    target_path: str = "",
    name: str = "",
    chunksize: int = 10_000,
    log_data: bool = True,
    add_uid: bool = False,
    key: str = "raw_data",
) -> None:
    """Open a file/object archive and save as a parquet file.
    
    :param context:     function context
    :param archive_url: any valid string path consistent with the path variable
                        of pandas.read_csv, including strings as file paths, as urls, 
                        pathlib.Path objects, etc...
    :param header:      column names
    :param target_path: destination folder of table
    :param name:        name file to be saved locally, also
    :param chunksize:   (0) row size retrieved per iteration
    :param log_data:    (True) if True, log the data so that it is available
                        at the next step
    :param add_uid:     (False) add the metadata uid to the target_path so that 
                        runs can be identified
    :param key:         key in artifact store (when log_data=True)
    """
    if not name.endswith(".parquet"):
        name += ".parquet"

    if not add_uid:
        uid = ""
    else:
        uid = context.uid

    dest_path = os.path.join(target_path, uid, name)
    os.makedirs(os.path.join(target_path, uid), exist_ok=True)
    if header == 0:
        header = pd.read_csv(archive_url, header=None, nrows=1).iloc[0].values
    header = [x.replace(' ', '_') for x in header]
    if not os.path.isfile(dest_path):
        context.logger.info("destination file does not exist, downloading")
        pqwriter = None
        for i, df in enumerate(pd.read_csv(archive_url, chunksize=chunksize, names=header)):
            table = pa.Table.from_pandas(df)
            if i == 0:
                pqwriter = pq.ParquetWriter(dest_path, table.schema)
            pqwriter.write_table(table)

        if pqwriter:
            pqwriter.close()

        context.logger.info(f"saved table to {dest_path}")
    else:
        context.logger.info("destination file already exists")

    if log_data:
        context.log_artifact(key, target_path=dest_path)
        # log header
        filepath = os.path.join(target_path, 'header.json')
        json.dump(header, open(filepath, 'w'))
        context.log_artifact('header', target_path=filepath)


In [5]:
# nuclio: end-code

In [6]:
# create job function object from notebook code
fn = mlrun.code_to_function(
    'arc to parquet',
    runtime='job', 
    handler=arc_to_parquet)

In [7]:
fn.export('/User/repos/functions/fileutils/arc_to_parquet/arc_to_parquet.yaml')

[mlrun] 2020-01-21 07:11:35,663 function spec saved to path: /User/repos/functions/fileutils/arc_to_parquet/arc_to_parquet.yaml


#### load and configure function

In [8]:
# load function from a local Python file
# fn = mlrun.code_to_function('/User/repos/functions/fileutils/arc_to_parquet/arc_to_parquet.py', kind='job')

In [9]:
# export function yaml
# fn.export('/User/repos/functions/fileutils/arc_to_parquet/arc_to_parquet.yaml')

In [2]:
# import function yaml
fn = mlrun.import_function('/User/repos/functions/fileutils/arc_to_parquet/arc_to_parquet.yaml')

In [11]:
# push yaml to github

In [12]:
# load function from Github
# fn = mlrun.import_function(
#   'https://raw.githubusercontent.com/mlrun/functions/master/fileutils/arc_to_parquet/arc_to_parquet.yaml')

In [3]:
# configure function: mount on the Iguazio data fabric, set as interactive (return stdout)
fn.apply(mlrun.mount_v3io())
fn.interactive = True

#### deploy / build

The following triggers a build when run for the first time using specs found in the yaml file above.  Unless that file changes, this only needs to be run once, even after the notebook has been restarted:

In [14]:
fn.deploy()

[mlrun] 2020-01-21 07:11:35,802 starting remote build, image: .mlrun/func-default-arc-to-parquet-latest
[36mINFO[0m[0000] Resolved base name python:3.6-jessie to python:3.6-jessie 
[36mINFO[0m[0000] Resolved base name python:3.6-jessie to python:3.6-jessie 
[36mINFO[0m[0000] Downloading base image python:3.6-jessie     
[36mINFO[0m[0000] Error while retrieving image from cache: getting file info: stat /cache/sha256:0318d80cb241983eda20b905d77fa0bfb06e29e5aabf075c7941ea687f1c125a: no such file or directory 
[36mINFO[0m[0000] Downloading base image python:3.6-jessie     
[36mINFO[0m[0000] Built cross stage deps: map[]                
[36mINFO[0m[0000] Downloading base image python:3.6-jessie     
[36mINFO[0m[0000] Error while retrieving image from cache: getting file info: stat /cache/sha256:0318d80cb241983eda20b905d77fa0bfb06e29e5aabf075c7941ea687f1c125a: no such file or directory 
[36mINFO[0m[0000] Downloading base image python:3.6-jessie     
[36mINFO[0m[0001] Unpa

True

In [15]:
# fn.with_code()

Also note that the build time can be reduced if you specifiy a pre-built image with all required packages pre-installed.

In [4]:
# useful constants
target_path = '/User/mlrun/functions/parquet'
archive = 'https://fpsignals-public.s3.amazonaws.com/one_csv.tar.gz'
parquet_file = 'x_test_50.parquet' # the file extension is not necessary
parquet_file_path = target_path + "/" + parquet_file
artifact_key = 'raw_data'

In [5]:
# create and run the task
arc_to_parq_task = mlrun.NewTask(
    'arc2parq', 
    handler='arc_to_parquet',  # a string since we are calling this 'remotely', outside this notebook
    params={
        'target_path': target_path,
        'name'       : parquet_file, 
        'key'        : artifact_key,
        'archive_url': archive},
    outputs=[artifact_key])

# run
run = fn.run(arc_to_parq_task)

[mlrun] 2020-01-21 07:17:44,068 starting run arc2parq uid=2a211d65872442cf85e745bde5c81392  -> http://mlrun-api:8080
[mlrun] 2020-01-21 07:17:44,153 Job is running in the background, pod: arc2parq-vjhbh
[mlrun] 2020-01-21 07:17:50,433 destination file does not exist, downloading
[mlrun] 2020-01-21 07:17:50,536 saved table to /User/mlrun/functions/parquet/x_test_50.parquet
[mlrun] 2020-01-21 07:17:50,549 log artifact raw_data at /User/mlrun/functions/parquet/x_test_50.parquet, size: None, db: Y
[mlrun] 2020-01-21 07:17:50,561 log artifact header at /User/mlrun/functions/parquet/header.json, size: None, db: Y

[mlrun] 2020-01-21 07:17:50,571 run executed, status=completed
final state: succeeded


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...c81392,0,Jan 21 07:17:50,completed,arc-to-parquet,host=arc2parq-vjhbhkind=jobowner=admin,,archive_url=https://fpsignals-public.s3.amazonaws.com/one_csv.tar.gzkey=raw_dataname=x_test_50.parquettarget_path=/User/mlrun/functions/parquet,,raw_dataheader


to track results use .show() or .logs() or in CLI: 
!mlrun get run 2a211d65872442cf85e745bde5c81392  , !mlrun logs 2a211d65872442cf85e745bde5c81392 
[mlrun] 2020-01-21 07:17:53,318 run executed, status=completed


___

### tests

In [6]:
import os
import numpy as np
import pandas as pd

In [7]:
# add more context tests
# convert these to real tests

In [8]:
assert artifact_key in run.outputs.keys(), f"mlrun.functions: key {artifact_key} not found in outputs"
assert os.path.isfile(parquet_file_path),  f"mlrun.functions: artifact source not found at {parquet_file_path}"

In [9]:
original = pd.read_csv(archive).values
copied   = pd.read_parquet(parquet_file_path, engine="pyarrow").values
assert np.array_equal(original, copied),   "mlrun.functions: original and copied data not equal"

AssertionError: mlrun.functions: original and copied data not equal

### cleanup

In [10]:
os.remove(parquet_file_path)