In [1]:
# !python -m pip uninstall -y mlrun
# !python -m pip install mlrun

In [1]:
import mlrun
# Keep a db path that mlrun already has configured and only fall back to the
# in-cluster default API service address when none is set.
mlrun.mlconf.dbpath = mlrun.mlconf.dbpath or 'http://mlrun-api:8080'

# archive to folder

In [2]:
# nuclio: ignore
import nuclio

In [3]:
import urllib.request
# urllib.disable_warnings()

In [4]:
import json
import os
import shutil
import tarfile
import tempfile
import urllib
import urllib.parse
import urllib.request
import zipfile

from mlrun.execution import MLClientCtx

def open_archive(context: MLClientCtx,
                 target_dir: str = 'content',
                 archive_url: str = ''):
    """Open a file/object archive into a target directory

    Supports zip archives and gzip-compressed tar archives (``.tar.gz``,
    ``.tgz`` or a bare ``.gz`` suffix). ``archive_url`` may be either a
    remote URL (``http``, ``https``, ``ftp``, ``file``) or a plain local
    file path; both forms work for both archive types.

    :param context:      function execution context, used for logging and
                         for registering the extracted folder as an artifact
    :param target_dir:   directory to extract into (created if missing)
    :param archive_url:  URL or local path of the archive to open

    :raises ValueError:  if ``archive_url`` is empty or does not end with a
                         supported archive extension
    """
    if not archive_url:
        raise ValueError('archive_url must point to a zip or tar.gz archive')

    os.makedirs(target_dir, exist_ok=True)
    context.logger.info('Verified directories')

    # Only well-known schemes count as remote, so local paths (including
    # Windows drive letters such as "C:\\...") are opened from disk.
    parsed = urllib.parse.urlparse(archive_url)
    is_remote = parsed.scheme in ('http', 'https', 'ftp', 'file')

    # For URLs look at the path component only, so a query string
    # ("...archive.zip?token=...") does not hide the real extension.
    archive_name = parsed.path if is_remote else archive_url
    extension = os.path.splitext(archive_name)[1].lower()

    # NOTE(security): extractall() trusts the member paths stored in the
    # archive; only open archives that come from a trusted source.
    if extension in ('.gz', '.tgz'):
        context.logger.info('opening tar_gz')
        if is_remote:
            # Stream mode ('r|gz') extracts while downloading, no seeking.
            with urllib.request.urlopen(archive_url) as response:
                with tarfile.open(fileobj=response, mode='r|gz') as archive:
                    archive.extractall(target_dir)
        else:
            with tarfile.open(archive_url, mode='r:gz') as archive:
                archive.extractall(target_dir)
    elif extension == '.zip':
        context.logger.info('opening zip')
        if is_remote:
            # Zip needs a seekable file (its index is at the end), so spool
            # the download into a temporary file first.
            with tempfile.TemporaryFile() as buffer:
                with urllib.request.urlopen(archive_url) as response:
                    shutil.copyfileobj(response, buffer)
                buffer.seek(0)
                with zipfile.ZipFile(buffer, 'r') as archive:
                    archive.extractall(target_dir)
        else:
            with zipfile.ZipFile(archive_url, 'r') as archive:
                archive.extractall(target_dir)
    else:
        raise ValueError(
            'unsupported archive type (expected zip or tar.gz): {}'.format(
                archive_url))

    context.log_artifact('content', target_path=target_dir)


In [5]:
# nuclio: end-code

In [6]:
# Build a 'job' runtime function object from this notebook's code.
# NOTE: `fn` is assigned again further down by the `mlrun.import_function` cell.
fn = mlrun.code_to_function('open_archive', runtime='job',
                            handler=open_archive,
                            image='mlrun/mlrun:latest')

In [20]:
# export function yaml
# fn.export('/User/repos/functions/fileutils/open_archive/function.yaml')

In [21]:
# import function yaml
# fn = mlrun.import_function('/User/repos/functions/fileutils/open_archive/function.yaml')

In [2]:
# load the function definition (function.yaml) from GitHub
# NOTE: this rebinds `fn`, replacing the object built by `code_to_function` above
fn = mlrun.import_function('https://raw.githubusercontent.com/yjb-ds/functions/lgbm-serving/fileutils/open_archive/function.yaml')



In [3]:
# configure it: mount on iguazio fabric, set as interactive (return stdout)
fn.apply(mlrun.mount_v3io())
fn.interactive = True

### zip file

In [4]:
# Define the zip task: the archive is handed over as an *input*.
# NOTE(review): the `open_archive` handler defined in this notebook has no
# `key` parameter, so the 'key' entry looks unused -- confirm.
images_path = '/User/mlrun/functions/images'

open_archive_task = mlrun.NewTask(
    'download-zip',
    handler='open_archive',
    params=dict(target_dir=images_path, key='contents'),
    inputs=dict(archive_url='http://iguazio-sample-data.s3.amazonaws.com/catsndogs.zip'),
)

In [5]:
# run the zip task through `fn` and keep the returned run object
run = fn.run(open_archive_task)

[mlrun] 2020-01-21 19:19:43,612 starting run download uid=31c5db9ef8174d40ac94c6dad0258069  -> http://mlrun-api:8080
[mlrun] 2020-01-21 19:19:43,808 Job is running in the background, pod: download-tcrfc
[mlrun] 2020-01-21 19:20:04,079 downloading http://iguazio-sample-data.s3.amazonaws.com/catsndogs.zip to local tmp
[mlrun] 2020-01-21 19:20:05,501 Verified directories
/tmp/tmp4_eoapfc.zip
['/tmp/tmp4_eoapfc', 'zip']
[mlrun] 2020-01-21 19:20:05,501 opening zip
[mlrun] 2020-01-21 19:20:13,406 log artifact content at /User/mlrun/functions/images, size: None, db: Y

[mlrun] 2020-01-21 19:20:13,416 run executed, status=completed
final state: succeeded


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...258069,0,Jan 21 19:20:04,completed,open-archive,host=download-tcrfckind=jobowner=admin,archive_url,key=contentstarget_dir=/User/mlrun/functions/images,,content


to track results use .show() or .logs() or in CLI: 
!mlrun get run 31c5db9ef8174d40ac94c6dad0258069  , !mlrun logs 31c5db9ef8174d40ac94c6dad0258069 
[mlrun] 2020-01-21 19:20:16,127 run executed, status=completed


### tar.gz

In [9]:
# Define the tar.gz task: here the archive URL is passed as a plain
# *parameter*, so the handler receives the URL string itself.
images_path = '/User/mlrun/functions/images-from-tar'

open_archive_task = mlrun.NewTask(
    'download-tar',
    handler='open_archive',
    params=dict(
        target_dir=images_path,
        key='contents',
        archive_url='https://fpsignals-public.s3.amazonaws.com/catsndogs.tar.gz',
    ),
)

In [10]:
# run the tar.gz (URL passed as parameter) task through `fn` and keep the returned run object
run = fn.run(open_archive_task)

[mlrun] 2020-01-21 19:22:37,587 starting run download-tar uid=500c634fd1c546c5a58292d37f50320f  -> http://mlrun-api:8080
[mlrun] 2020-01-21 19:22:37,659 Job is running in the background, pod: download-tar-zh72r
[mlrun] 2020-01-21 19:22:42,412 Verified directories
https://fpsignals-public.s3.amazonaws.com/catsndogs.tar.gz
['https://fpsignals-public', 's3', 'amazonaws', 'com/catsndogs', 'tar', 'gz']
[mlrun] 2020-01-21 19:22:42,412 opening tar_gz
[mlrun] 2020-01-21 19:22:57,936 log artifact content at /User/mlrun/functions/images-from-tar, size: None, db: Y

[mlrun] 2020-01-21 19:22:57,948 run executed, status=completed
final state: succeeded


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...50320f,0,Jan 21 19:22:42,completed,open-archive,host=download-tar-zh72rkind=jobowner=admin,,archive_url=https://fpsignals-public.s3.amazonaws.com/catsndogs.tar.gzkey=contentstarget_dir=/User/mlrun/functions/images-from-tar,,content


to track results use .show() or .logs() or in CLI: 
!mlrun get run 500c634fd1c546c5a58292d37f50320f  , !mlrun logs 500c634fd1c546c5a58292d37f50320f 
[mlrun] 2020-01-21 19:23:06,873 run executed, status=completed


In [11]:
# Define the tar.gz task again, this time handing the archive over as an
# *input* instead of a parameter.
images_path = '/User/mlrun/functions/images-from-tar-as-inputs'

open_archive_task = mlrun.NewTask(
    'download',
    handler='open_archive',
    params=dict(target_dir=images_path, key='contents'),
    inputs=dict(archive_url='https://fpsignals-public.s3.amazonaws.com/catsndogs.tar.gz'),
)

In [12]:
# run the tar.gz (archive passed as input) task through `fn`
# NOTE: the recorded output below ends in error ("unknown url type") -- the
# handler received a local temp path and passed it to urllib.request.urlopen
run = fn.run(open_archive_task)

[mlrun] 2020-01-21 19:23:39,448 starting run download uid=c163869b83cd49cc888f5e9126301911  -> http://mlrun-api:8080
[mlrun] 2020-01-21 19:23:39,535 Job is running in the background, pod: download-7qf2w
[mlrun] 2020-01-21 19:23:44,057 downloading https://fpsignals-public.s3.amazonaws.com/catsndogs.tar.gz to local tmp
[mlrun] 2020-01-21 19:23:44,877 Verified directories
/tmp/tmptshxsk7d.gz
['/tmp/tmptshxsk7d', 'gz']
[mlrun] 2020-01-21 19:23:44,877 opening tar_gz
[mlrun] 2020-01-21 19:23:44,879 Traceback (most recent call last):
  File "/usr/local/lib/python3.6/site-packages/mlrun-0.4.3-py3.6.egg/mlrun/runtimes/local.py", line 174, in exec_from_params
    val = handler(*args_list)
  File "main.py", line 30, in open_archive
    ref = tarfile.open(fileobj=urllib.request.urlopen(archive_url), mode='r|gz')
  File "/usr/local/lib/python3.6/urllib/request.py", line 223, in urlopen
    return opener.open(url, data, timeout)
  File "/usr/local/lib/python3.6/urllib/request.py", line 511, in open


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...301911,0,Jan 21 19:23:44,error,open-archive,host=download-7qf2wkind=jobowner=admin,archive_url,key=contentstarget_dir=/User/mlrun/functions/images-from-tar-as-inputs,,


to track results use .show() or .logs() or in CLI: 
!mlrun get run c163869b83cd49cc888f5e9126301911  , !mlrun logs c163869b83cd49cc888f5e9126301911 
[mlrun] 2020-01-21 19:23:48,687 run executed, status=error
runtime error: unknown url type: '/tmp/tmptshxsk7d.gz'


RunError: unknown url type: '/tmp/tmptshxsk7d.gz'