In [1]:
import urllib3
# Turn off urllib3's warning output for the rest of this kernel session.
# NOTE(review): this presumably also hides TLS/certificate warnings — confirm that is intended.
urllib3.disable_warnings()

In [2]:
import mlrun
# Point MLRun's configured dbpath at the API service URL used by this notebook.
mlrun.mlconf.dbpath = 'http://mlrun-api:8080'

In [3]:
# Import the file-utils function definition from its yaml spec on GitHub.
xfn = mlrun.import_function(
    'https://raw.githubusercontent.com/yjb-ds/functions/master/fileutils/function.yaml'
)

# Configure it: mount on the iguazio fabric, set as interactive (return stdout).
v3io_mount = mlrun.mount_v3io()
xfn.apply(v3io_mount)
xfn.interactive = True

# Describe the task: where the archive comes from and where its contents go.
images_path = '/User/mlrun/functions/images'
open_archive_params = dict(target_path=images_path, key='contents')
open_archive_inputs = dict(
    archive_url='http://iguazio-sample-data.s3.amazonaws.com/catsndogs.zip'
)

open_archive_task = mlrun.NewTask(
    'download',
    handler='open_archive',
    params=open_archive_params,
    inputs=open_archive_inputs,
)

# Submit the task to the function and keep the resulting run object.
run = xfn.run(open_archive_task)

[mlrun] 2020-01-09 22:32:57,216 starting run download uid=e9e45fd65b924af1a403752845e2419c  -> http://mlrun-api:8080
[mlrun] 2020-01-09 22:33:01,719 downloading http://iguazio-sample-data.s3.amazonaws.com/catsndogs.zip to local tmp
[mlrun] 2020-01-09 22:33:03,034 Verified directories
[mlrun] 2020-01-09 22:33:03,034 Extracting zip
[mlrun] 2020-01-09 22:33:03,807 extracted archive to content

[mlrun] 2020-01-09 22:33:03,830 run executed, status=completed
final state: succeeded


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...e2419c,0,Jan 09 22:33:01,completed,file_utils,host=download-9pcc7kind=jobowner=admin,archive_url,key=contentstarget_path=/User/mlrun/functions/images,,content


to track results use .show() or .logs() or in CLI: 
!mlrun get run e9e45fd65b924af1a403752845e2419c  , !mlrun logs e9e45fd65b924af1a403752845e2419c 
[mlrun] 2020-01-09 22:33:06,528 run executed, status=completed


#### load and configure function

In [4]:
# Import the arc_to_parquet function definition from its yaml spec on GitHub.
xfn = mlrun.import_function(
    'https://raw.githubusercontent.com/yjb-ds/functions/arc2parq/fileutils/arc_to_parquet.yaml'
)

# Configure it: mount on the Iguazio data fabric, set as interactive (return stdout).
v3io_mount = mlrun.mount_v3io()
xfn.apply(v3io_mount)
xfn.interactive = True

#### deploy / build

The following cell triggers a build, the first time it is run, using the specs found in the yaml file above. Unless that file changes, it only needs to be run once, even after the notebook has been restarted:

In [5]:
# Uncomment to trigger the build (only needed the first time, or after the yaml spec changes):
# xfn.deploy()

In [6]:
# Shared settings for the archive-to-parquet run and the checks that follow it.
archive = 'https://fpsignals-public.s3.amazonaws.com/x_test_50.csv.gz'
artifact_key = 'raw_data'

# Destination folder and file name; the full path is derived from the two.
target_path = '/User/mlrun/functions/parquet'
parquet_file = 'x_test_50.parquet'  # the file extension is not necessary
parquet_file_path = f"{target_path}/{parquet_file}"

In [7]:
# Parameters handed to the remote handler, built from the constants defined earlier.
arc_to_parq_params = {
    'target_path': target_path,
    'name': parquet_file,
    'key': artifact_key,
    'archive_url': archive,
}

# Describe the task; the handler is given as a string because it is called
# 'remotely', outside this notebook.
arc_to_parq_task = mlrun.NewTask(
    'arc2parq',
    handler='arc_to_parquet',
    params=arc_to_parq_params,
    outputs=[artifact_key],
)

# Submit the task to the function and keep the resulting run object.
run = xfn.run(arc_to_parq_task)

[mlrun] 2020-01-09 22:33:06,658 starting run arc2parq uid=e41ed105188447fba818e0e4bbccc55e  -> http://mlrun-api:8080
[mlrun] 2020-01-09 22:33:12,145 destination file does not exist, downloading
[mlrun] 2020-01-09 22:33:12,339 saved table to /User/mlrun/functions/parquet/x_test_50.parquet
[mlrun] 2020-01-09 22:33:12,339 logging /User/mlrun/functions/parquet/x_test_50.parquet to context

[mlrun] 2020-01-09 22:33:12,363 run executed, status=completed
final state: succeeded


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...ccc55e,0,Jan 09 22:33:12,completed,arc-to-parquet,host=arc2parq-x9llxkind=jobowner=admin,,archive_url=https://fpsignals-public.s3.amazonaws.com/x_test_50.csv.gzkey=raw_dataname=x_test_50.parquettarget_path=/User/mlrun/functions/parquet,,raw_data


to track results use .show() or .logs() or in CLI: 
!mlrun get run e41ed105188447fba818e0e4bbccc55e  , !mlrun logs e41ed105188447fba818e0e4bbccc55e 
[mlrun] 2020-01-09 22:33:15,955 run executed, status=completed


___

### tests

In [8]:
import os
import numpy as np
import pandas as pd

In [9]:
# TODO: add more tests that exercise the run context
# TODO: convert these notebook assertions into real tests

In [10]:
# Sanity checks on the finished run:
#   1. the run reports an output under the expected artifact key, and
#   2. a file exists at the path the parquet file was expected to be written to.
assert artifact_key in run.outputs, f"mlrun.functions: key {artifact_key} not found in outputs"
assert os.path.isfile(parquet_file_path), f"mlrun.functions: artifact source not found at {parquet_file_path}"

In [11]:
# Round-trip check: the parquet file must hold the same table as the source archive.
original = pd.read_csv(archive)
copied = pd.read_parquet(parquet_file_path, engine="pyarrow")

# Compare with pandas' frame comparison rather than np.array_equal:
#   * missing values (NaN) in the same positions compare as equal, whereas
#     np.array_equal reports False as soon as the data contains any NaN;
#   * column labels and the index are verified, not just the raw values.
# check_dtype=False keeps this a value-level comparison, as before.
try:
    pd.testing.assert_frame_equal(original, copied, check_dtype=False)
except AssertionError as mismatch:
    # Keep the notebook's own failure message; the pandas diff is chained as the cause.
    raise AssertionError("mlrun.functions: original and copied data not equal") from mismatch

In [12]:
# Clean up: delete the parquet file at parquet_file_path now that the checks are done.
os.unlink(parquet_file_path)