In [1]:
# !python -m pip uninstall -y mlrun
# !python -m pip install mlrun

In [2]:
import mlrun
mlrun.mlconf.dbpath = 'http://mlrun-api:8080'

# archive to folder

In [3]:
# nuclio: ignore
import nuclio

In [4]:
import urllib.request
# urllib.disable_warnings()

In [5]:
import os
import zipfile
import urllib
import tarfile
import json

from mlrun.execution import MLClientCtx

def open_archive(context: MLClientCtx, 
                 target_dir: str = 'content',
                 archive_url: str = ''):
    """Open a file/object archive into a target directory
    
    Currently supports zip and tar.gz
    """
    # Define locations
    os.makedirs(target_dir, exist_ok=True)
    context.logger.info('Verified directories')
    print(archive_url)
    splits = archive_url.split('.')
    print(splits)
    if (splits[-1] == 'gz'):
        # Extract dataset from tar
        context.logger.info('opening tar_gz')
        ref = tarfile.open(fileobj=urllib.request.urlopen(archive_url), mode='r|gz')
    elif splits[-1] == 'zip':
        # Extract dataset from zip
        context.logger.info('opening zip')
        ref = zipfile.ZipFile(archive_url, 'r')

    ref.extractall(target_dir)
    ref.close()

    context.log_artifact('content', target_path=target_dir)


In [6]:
# nuclio: end-code

In [7]:
# create job function object from notebook code
fn = mlrun.code_to_function(
    'open_archive', 
    runtime='job', 
    handler=open_archive, 
    image='mlrun/mlrun:latest')

In [8]:
# export function yaml
fn.export('/User/repos/functions/fileutils/open_archive/function.yaml')

[mlrun] 2020-01-21 09:47:47,371 function spec saved to path: /User/repos/functions/fileutils/open_archive/function.yaml


In [9]:
# import function yaml
fn = mlrun.import_function('/User/repos/functions/fileutils/open_archive/function.yaml')

In [10]:
# load function from Github
# fn = mlrun.import_function('https://raw.githubusercontent.com/yjb-ds/functions/arc2parq/fileutils/open_archive/function.yaml')

In [11]:
# configute it: mount on iguazio fabric, set as interactive (return stdout)
fn.apply(mlrun.mount_v3io())
fn.interactive = True

### zip file

In [12]:
# create and run the task
images_path = '/User/mlrun/functions/images'

open_archive_task = mlrun.NewTask(
    'download',
    handler='open_archive', 
    params={'target_dir' : images_path,
            'key'        : 'contents'},
    inputs={'archive_url': 'http://iguazio-sample-data.s3.amazonaws.com/catsndogs.zip'})

In [13]:
# run
run = fn.run(open_archive_task)

[mlrun] 2020-01-21 09:47:47,427 starting run download uid=299b648c59294e9891334ded6159d8aa  -> http://mlrun-api:8080
[mlrun] 2020-01-21 09:47:47,497 Job is running in the background, pod: download-92bzm
[mlrun] 2020-01-21 09:47:51,918 downloading http://iguazio-sample-data.s3.amazonaws.com/catsndogs.zip to local tmp
[mlrun] 2020-01-21 09:47:52,812 Verified directories
/tmp/tmpxqpdi5zq.zip
['/tmp/tmpxqpdi5zq', 'zip']
[mlrun] 2020-01-21 09:47:52,812 opening zip
[mlrun] 2020-01-21 09:47:59,625 log artifact content at /User/mlrun/functions/images, size: None, db: Y

[mlrun] 2020-01-21 09:47:59,635 run executed, status=completed
final state: succeeded


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...59d8aa,0,Jan 21 09:47:51,completed,open-archive,host=download-92bzmkind=jobowner=admin,archive_url,key=contentstarget_dir=/User/mlrun/functions/images,,content


to track results use .show() or .logs() or in CLI: 
!mlrun get run 299b648c59294e9891334ded6159d8aa  , !mlrun logs 299b648c59294e9891334ded6159d8aa 
[mlrun] 2020-01-21 09:48:02,711 run executed, status=completed


### tar.gz

In [16]:
# create and run the task
images_path = '/User/mlrun/functions/t000'

open_archive_task = mlrun.NewTask(
    'download',
    handler='open_archive', 
    params={'target_dir' : images_path,
            'key'        : 'contents',
            'archive_url': 'https://fpsignals-public.s3.amazonaws.com/catsndogs.tar.gz'})

In [17]:
# run
run = fn.run(open_archive_task)

[mlrun] 2020-01-21 09:48:45,223 starting run download uid=e5df4261e94847c999df30bbe88fe6c8  -> http://mlrun-api:8080
[mlrun] 2020-01-21 09:48:45,298 Job is running in the background, pod: download-sr2pp
[mlrun] 2020-01-21 09:48:49,674 Verified directories
https://fpsignals-public.s3.amazonaws.com/catsndogs.tar.gz
['https://fpsignals-public', 's3', 'amazonaws', 'com/catsndogs', 'tar', 'gz']
[mlrun] 2020-01-21 09:48:49,674 opening tar_gz
[mlrun] 2020-01-21 09:49:03,258 log artifact content at /User/mlrun/functions/t000, size: None, db: Y

[mlrun] 2020-01-21 09:49:03,273 run executed, status=completed
final state: succeeded


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...8fe6c8,0,Jan 21 09:48:49,completed,open-archive,host=download-sr2ppkind=jobowner=admin,,archive_url=https://fpsignals-public.s3.amazonaws.com/catsndogs.tar.gzkey=contentstarget_dir=/User/mlrun/functions/t000,,content


to track results use .show() or .logs() or in CLI: 
!mlrun get run e5df4261e94847c999df30bbe88fe6c8  , !mlrun logs e5df4261e94847c999df30bbe88fe6c8 
[mlrun] 2020-01-21 09:49:04,471 run executed, status=completed


In [18]:
# create and run the task
images_path = '/User/mlrun/functions/t0000'

open_archive_task = mlrun.NewTask(
    'download',
    handler='open_archive', 
    params={'target_dir' : images_path,
            'key'        : 'contents'},
    inputs={'archive_url': 'https://fpsignals-public.s3.amazonaws.com/catsndogs.tar.gz'})

In [19]:
# run
run = fn.run(open_archive_task)

[mlrun] 2020-01-21 09:50:08,023 starting run download uid=74f0391c2da04aabb3f0735bfa977b17  -> http://mlrun-api:8080
[mlrun] 2020-01-21 09:50:08,112 Job is running in the background, pod: download-8wt5x
[mlrun] 2020-01-21 09:50:14,529 downloading https://fpsignals-public.s3.amazonaws.com/catsndogs.tar.gz to local tmp
[mlrun] 2020-01-21 09:50:15,651 Verified directories
/tmp/tmp14moqiew.gz
['/tmp/tmp14moqiew', 'gz']
[mlrun] 2020-01-21 09:50:15,651 opening tar_gz
[mlrun] 2020-01-21 09:50:15,653 Traceback (most recent call last):
  File "/usr/local/lib/python3.6/site-packages/mlrun-0.4.3-py3.6.egg/mlrun/runtimes/local.py", line 174, in exec_from_params
    val = handler(*args_list)
  File "main.py", line 30, in open_archive
    ref = tarfile.open(fileobj=urllib.request.urlopen(archive_url), mode='r|gz')
  File "/usr/local/lib/python3.6/urllib/request.py", line 223, in urlopen
    return opener.open(url, data, timeout)
  File "/usr/local/lib/python3.6/urllib/request.py", line 511, in open


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...977b17,0,Jan 21 09:50:14,error,open-archive,host=download-8wt5xkind=jobowner=admin,archive_url,key=contentstarget_dir=/User/mlrun/functions/t0000,,


to track results use .show() or .logs() or in CLI: 
!mlrun get run 74f0391c2da04aabb3f0735bfa977b17  , !mlrun logs 74f0391c2da04aabb3f0735bfa977b17 
[mlrun] 2020-01-21 09:50:17,234 run executed, status=error
runtime error: unknown url type: '/tmp/tmp14moqiew.gz'


RunError: unknown url type: '/tmp/tmp14moqiew.gz'