In [1]:
# !python -m pip uninstall -y mlrun
# !python -m pip install mlrun

In [2]:
import mlrun
mlrun.mlconf.dbpath = 'http://mlrun-api:8080'

# archive to folder

In [3]:
# nuclio: ignore
import nuclio

In [4]:
import urllib.request
# urllib.disable_warnings()

In [5]:
import os
import zipfile
import urllib
import tarfile
import json

from mlrun.execution import MLClientCtx

def open_archive(context: MLClientCtx, 
                 target_dir: str = 'content',
                 archive_url: str = '',
                 archive_tar: str = '', #  fudge parameter
                 archive_TMP: str = ''  #  fudge parameter
):
    """Open a file/object archive into a target directory
    
    Currently supports zip and tar.gz
    """
    # Define locations
    os.makedirs(target_dir, exist_ok=True)
    context.logger.info('Verified directories')
    
    # performs an implicit download to /tmp at this point and MANGLES name:
    print('archive url', archive_url)
    print('archive tar', archive_tar)
    print('archive TMP', archive_TMP)
#    assert archive_tar == archive_url 

    splits = archive_url.split('.')
    print(splits)
    if (splits[-1] == 'gz'):
        # Extract dataset from tar
        context.logger.info('opening tar_gz')
        ref = tarfile.open(fileobj=urllib.request.urlopen(archive_url), mode='r|gz')
    elif splits[-1] == 'zip':
        # Extract dataset from zip
        context.logger.info('opening zip')
        ref = zipfile.ZipFile(archive_url, 'r')

    ref.extractall(target_dir)
    ref.close()

    context.log_artifact('content', target_path=target_dir)


In [6]:
# nuclio: end-code

In [7]:
# create job function object from notebook code
fn = mlrun.code_to_function(
    'open_archive', 
    runtime='job', 
    handler=open_archive, 
    image='mlrun/mlrun:latest')

In [8]:
# export function yaml
fn.export('/User/repos/functions/fileutils/open_archive/function.yaml')

[mlrun] 2020-01-21 09:34:20,162 function spec saved to path: /User/repos/functions/fileutils/open_archive/function.yaml


In [9]:
# import function yaml
fn = mlrun.import_function('/User/repos/functions/fileutils/open_archive/function.yaml')

In [10]:
# load function from Github
# fn = mlrun.import_function('https://raw.githubusercontent.com/yjb-ds/functions/arc2parq/fileutils/open_archive/function.yaml')

In [11]:
# configute it: mount on iguazio fabric, set as interactive (return stdout)
fn.apply(mlrun.mount_v3io())
fn.interactive = True

### zip file

In [12]:
# create and run the task

images_path = '/User/mlrun/functions/images'

In [13]:
open_archive_task = mlrun.NewTask(
    'download',
    handler='open_archive', 
    params={'target_dir' : images_path,
            'key'        : 'contents'},
    inputs={'archive_url': 'http://iguazio-sample-data.s3.amazonaws.com/catsndogs.zip'})

In [14]:
# run
run = fn.run(open_archive_task)

[mlrun] 2020-01-21 09:34:20,222 starting run download uid=f867d42711c346128fe5fb8abf152b75  -> http://mlrun-api:8080
[mlrun] 2020-01-21 09:34:20,303 Job is running in the background, pod: download-ffkxl
[mlrun] 2020-01-21 09:34:26,693 downloading http://iguazio-sample-data.s3.amazonaws.com/catsndogs.zip to local tmp
[mlrun] 2020-01-21 09:34:27,552 Verified directories
archive url /tmp/tmp05kj3avl.zip
archive tar 
archive TMP 
['/tmp/tmp05kj3avl', 'zip']
[mlrun] 2020-01-21 09:34:27,552 opening zip
[mlrun] 2020-01-21 09:34:34,430 log artifact content at /User/mlrun/functions/images, size: None, db: Y

[mlrun] 2020-01-21 09:34:34,443 run executed, status=completed
final state: succeeded


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...152b75,0,Jan 21 09:34:26,completed,open-archive,host=download-ffkxlkind=jobowner=admin,archive_url,key=contentstarget_dir=/User/mlrun/functions/images,,content


to track results use .show() or .logs() or in CLI: 
!mlrun get run f867d42711c346128fe5fb8abf152b75  , !mlrun logs f867d42711c346128fe5fb8abf152b75 
[mlrun] 2020-01-21 09:34:35,542 run executed, status=completed


### tar.gz

In [15]:
# create and run the task
images_path = '/User/mlrun/functions/t00'

open_archive_task = mlrun.NewTask(
    'download',
    handler='open_archive', 
    params={'target_dir' : images_path,
            'key'        : 'contents'},
    inputs={
        'archive_url': 'https://fpsignals-public.s3.amazonaws.com/catsndogs.tar.gz',
        'archive_tar': 'https://fpsignals-public.s3.amazonaws.com/catsndogs.tar.gz',
        'archive_TMP': 'https://fpsignals-public.s3.amazonaws.com/catsndogs.tar.gz'})

In [16]:
# run
run = fn.run(open_archive_task)

[mlrun] 2020-01-21 09:34:35,557 starting run download uid=053019b4cdd34f1f9aad440d3423e400  -> http://mlrun-api:8080
[mlrun] 2020-01-21 09:34:35,625 Job is running in the background, pod: download-hhzsd
[mlrun] 2020-01-21 09:34:39,966 downloading https://fpsignals-public.s3.amazonaws.com/catsndogs.tar.gz to local tmp
[mlrun] 2020-01-21 09:34:40,790 downloading https://fpsignals-public.s3.amazonaws.com/catsndogs.tar.gz to local tmp
[mlrun] 2020-01-21 09:34:41,615 downloading https://fpsignals-public.s3.amazonaws.com/catsndogs.tar.gz to local tmp
[mlrun] 2020-01-21 09:34:42,575 Verified directories
archive url /tmp/tmp_a8c0lt3.gz
archive tar /tmp/tmp7pmvyi7g.gz
archive TMP /tmp/tmpw_5blqyr.gz
['/tmp/tmp_a8c0lt3', 'gz']
[mlrun] 2020-01-21 09:34:42,575 opening tar_gz
[mlrun] 2020-01-21 09:34:42,578 Traceback (most recent call last):
  File "/usr/local/lib/python3.6/site-packages/mlrun-0.4.3-py3.6.egg/mlrun/runtimes/local.py", line 174, in exec_from_params
    val = handler(*args_list)
  Fi

uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...23e400,0,Jan 21 09:34:39,error,open-archive,host=download-hhzsdkind=jobowner=admin,archive_TMParchive_tararchive_url,key=contentstarget_dir=/User/mlrun/functions/t00,,


to track results use .show() or .logs() or in CLI: 
!mlrun get run 053019b4cdd34f1f9aad440d3423e400  , !mlrun logs 053019b4cdd34f1f9aad440d3423e400 
[mlrun] 2020-01-21 09:34:44,783 run executed, status=error
runtime error: unknown url type: '/tmp/tmp_a8c0lt3.gz'


RunError: unknown url type: '/tmp/tmp_a8c0lt3.gz'

______

### test outside mlrun

In [None]:
ref = tarfile.open(fileobj=urllib.request.urlopen('https://fpsignals-public.s3.amazonaws.com/catsndogs.tar.gz'), mode='r|gz')

In [None]:
ref.extractall('/User/test25')