In [None]:
import mlrun
# Set the address the MLRun client uses (presumably the MLRun API/DB service).
# NOTE(review): the URL is hard-coded — adjust it if the service is reachable elsewhere.
mlrun.mlconf.dbpath = 'http://mlrun-api:8080'

# archive to folder

In [None]:
import urllib3
# Turn off urllib3's warning output for the rest of this session.
# NOTE(review): this presumably also hides insecure-request (TLS) warnings — confirm that is intended.
urllib3.disable_warnings()

In [None]:
# fetch the open_archive function spec (YAML) from GitHub
xfn = mlrun.import_function(
    'https://raw.githubusercontent.com/yjb-ds/functions/master/'
    'fileutils/open_archive/function.yaml'
)

# configure it: attach the Iguazio data fabric mount, then mark it interactive (return stdout)
xfn.apply(mlrun.mount_v3io())
xfn.interactive = True

# folder handed to the function as its 'target_path' parameter
images_path = '/User/mlrun/functions/images'

# describe the job: handler to call, its parameters, and the archive passed as an input
open_archive_task = mlrun.NewTask(
    'download',
    handler='open_archive',
    params=dict(target_path=images_path, key='contents'),
    inputs=dict(archive_url='http://iguazio-sample-data.s3.amazonaws.com/catsndogs.zip'),
)

# submit the task to the configured function
run = xfn.run(open_archive_task)

_________

# archive to parquet

#### load and configure function

In [None]:
# fetch the arc_to_parquet function spec (YAML) from GitHub
xfn = mlrun.import_function(
    'https://raw.githubusercontent.com/yjb-ds/functions/master/'
    'fileutils/arc_to_parquet/arc_to_parquet.yaml'
)

# set it up: attach the Iguazio data fabric mount, then mark it interactive (return stdout)
xfn.apply(mlrun.mount_v3io())
xfn.interactive = True

#### deploy / build

The following cell triggers a build the first time it is run, using the specs found in the YAML file above. Unless that file changes, it only needs to be run once, even after the notebook has been restarted:

In [None]:
# Deploy the function loaded above; see the note preceding this cell on when this needs re-running.
xfn.deploy()

Also note that the build time can be reduced if you specify a pre-built image with all required packages.

In [None]:
# shared settings for the archive-to-parquet run and the checks that follow it
target_path = '/User/mlrun/functions/parquet'   # output folder
archive = 'https://fpsignals-public.s3.amazonaws.com/x_test_50.csv.gz'   # source archive
parquet_file = 'x_test_50.parquet'   # the file extension is not necessary
parquet_file_path = f"{target_path}/{parquet_file}"   # full path of the expected output file
artifact_key = 'raw_data'   # key the resulting artifact is registered under

In [None]:
# describe the conversion job, then submit it to the configured function
arc_to_parq_task = mlrun.NewTask(
    'arc2parq',
    # the handler is given by name (a string) since it is called 'remotely', outside this notebook
    handler='arc_to_parquet',
    params=dict(
        target_path=target_path,
        name=parquet_file,
        key=artifact_key,
        archive_url=archive,
    ),
    outputs=[artifact_key],
)

# execute the task; the returned run object is inspected by the checks further down
run = xfn.run(arc_to_parq_task)

___

### tests

In [None]:
import os
import numpy as np
import pandas as pd

In [None]:
# add more context tests
# convert these to real tests

In [None]:
# The run must report the expected artifact key among its outputs ...
assert artifact_key in run.outputs.keys(), f"mlrun.functions: key {artifact_key} not found in outputs"
# ... and the parquet file must exist at the expected location on disk.
assert os.path.isfile(parquet_file_path),  f"mlrun.functions: artifact source not found at {parquet_file_path}"

In [None]:
# Round-trip check: the parquet copy must hold the same data as the source archive.
original = pd.read_csv(archive)
copied   = pd.read_parquet(parquet_file_path, engine="pyarrow")

# Compare as DataFrames rather than raw arrays: np.array_equal treats NaN != NaN
# (so any missing value fails the check) and ignores column labels and dtypes.
# assert_frame_equal handles NaNs positionally and reports where the frames differ.
pd.testing.assert_frame_equal(original, copied, obj="mlrun.functions: original and copied data")

### cleanup

In [None]:
# delete the parquet file produced by the run so the test leaves nothing behind
os.unlink(parquet_file_path)