In [1]:
import mlrun, os
mlrun.mlconf.dbpath = 'http://mlrun-api:8080'

# archive to folder

In [3]:
# nuclio: ignore
import nuclio

In [6]:
import os
import zipfile
import urllib.request
import tarfile
import json

from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem

from typing import Union

def open_archive(
    context: MLClientCtx, 
    archive_url: Union[DataItem, str] = '',
    key: str = 'images'
):
    """Open a file/object archive into a target directory
    
    Currently supports zip and tar.gz
    
    :param context:      function execution context
    :param archive_url:  url of archive file
    :param key:          key of archive contents in artifact store
    """
    target_dir = context.artifact_path
    
    # Define locations
    os.makedirs(target_dir, exist_ok=True)
    context.logger.info('Verified directories')
    
    splits = str(archive_url).split('.')
    if ('.'.join(splits[-2:]) == 'tar.gz'):
        # Extract dataset from tar
        context.logger.info('opening tar_gz')
        ftpstream = urllib.request.urlopen(archive_url)
        with tarfile.open(fileobj=ftpstream, mode="r|gz") as ref:
            ref.extractall(target_dir)
    elif splits[-1] == 'zip':
        # Extract dataset from zip
        context.logger.info('opening zip')
        with zipfile.ZipFile(archive_url, 'r') as ref:
            ref.extractall(target_dir)
    
    context.log_artifact(key, local_path=target_dir)

In [7]:
# nuclio: end-code

### save function.yaml fo reuse

In [10]:
fn = mlrun.code_to_function('open_archive', kind='job', image='mlrun/mlrun:latest')
fn.spec.default_handler = 'open_archive'

fn.export(os.getcwd()+ '/function.yaml')

In [11]:
fn.apply(mlrun.mount_v3io())

<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f1a3bcca160>

## tests

### parameters

In [12]:
# create and run the task
images_path = os.path.join(os.getcwd(), 'images')
ZIP_ARCHIVE = 'http://iguazio-sample-data.s3.amazonaws.com/catsndogs.zip'
TAR_ARCHIVE = 'https://fpsignals-public.s3.amazonaws.com/catsndogs.tar.gz'

### zip file

In [13]:
open_archive_task = mlrun.NewTask(
    'download-zip',
    params={'key'        : 'images'},
    inputs={'archive_url': ZIP_ARCHIVE},
    artifact_path=images_path)

# run
run1 = fn.run(open_archive_task)

[mlrun] 2020-03-13 11:01:40,058 starting run download-zip uid=512ebe4799774dd5b7d03f1720104d2b  -> http://mlrun-api:8080
[mlrun] 2020-03-13 11:01:40,146 Job is running in the background, pod: download-zip-gn6qb
[mlrun] 2020-03-13 11:01:45,085 downloading http://iguazio-sample-data.s3.amazonaws.com/catsndogs.zip to local tmp
[mlrun] 2020-03-13 11:01:46,756 Verified directories
[mlrun] 2020-03-13 11:01:46,756 opening zip
[mlrun] 2020-03-13 11:01:55,023 log artifact images at /User/functions-refac/open_archive/images/User/functions-refac/open_archive/images, size: None, db: Y

[mlrun] 2020-03-13 11:01:55,034 run executed, status=completed
final state: succeeded


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...104d2b,0,Mar 13 11:01:45,completed,download-zip,host=download-zip-gn6qbkind=jobowner=admin,archive_url,key=images,,images


to track results use .show() or .logs() or in CLI: 
!mlrun get run 512ebe4799774dd5b7d03f1720104d2b  , !mlrun logs 512ebe4799774dd5b7d03f1720104d2b 
[mlrun] 2020-03-13 11:01:58,362 run executed, status=completed


### tar.gz

In [14]:
open_archive_task = mlrun.NewTask(
    'download-tar',
    params={'key'        : 'tar-data', 'archive_url': TAR_ARCHIVE},
    artifact_path=images_path)

# run
run2 = fn.run(open_archive_task)

[mlrun] 2020-03-13 11:01:58,380 starting run download-tar uid=ba62f7dff43045e899d1c7f83692e030  -> http://mlrun-api:8080
[mlrun] 2020-03-13 11:01:58,463 Job is running in the background, pod: download-tar-gv69v
[mlrun] 2020-03-13 11:02:03,461 Verified directories
[mlrun] 2020-03-13 11:02:03,461 opening tar_gz
[mlrun] 2020-03-13 11:02:20,178 log artifact tar-data at /User/functions-refac/open_archive/images/User/functions-refac/open_archive/images, size: None, db: Y

[mlrun] 2020-03-13 11:02:20,191 run executed, status=completed
final state: succeeded


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...92e030,0,Mar 13 11:02:03,completed,download-tar,host=download-tar-gv69vkind=jobowner=admin,,archive_url=https://fpsignals-public.s3.amazonaws.com/catsndogs.tar.gzkey=tar-data,,tar-data


to track results use .show() or .logs() or in CLI: 
!mlrun get run ba62f7dff43045e899d1c7f83692e030  , !mlrun logs ba62f7dff43045e899d1c7f83692e030 
[mlrun] 2020-03-13 11:02:27,658 run executed, status=completed


In [15]:
open_archive_task = mlrun.NewTask(
    'download-tar-2',
    params={'key'        : 'tar-data'}, 
    inputs={'archive_url': TAR_ARCHIVE},
    artifact_path=images_path)

# run
run3 = fn.run(open_archive_task)

[mlrun] 2020-03-13 11:02:27,663 starting run download-tar-2 uid=096a50275e0143b99d5afd49720f50d1  -> http://mlrun-api:8080
[mlrun] 2020-03-13 11:02:27,732 Job is running in the background, pod: download-tar-2-4wxfr
[mlrun] 2020-03-13 11:02:32,785 downloading https://fpsignals-public.s3.amazonaws.com/catsndogs.tar.gz to local tmp
[mlrun] 2020-03-13 11:02:33,597 Verified directories
[mlrun] 2020-03-13 11:02:33,611 log artifact tar-data at /User/functions-refac/open_archive/images/User/functions-refac/open_archive/images, size: None, db: Y

[mlrun] 2020-03-13 11:02:33,620 run executed, status=completed
final state: succeeded


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...0f50d1,0,Mar 13 11:02:32,completed,download-tar-2,host=download-tar-2-4wxfrkind=jobowner=admin,archive_url,key=tar-data,,tar-data


to track results use .show() or .logs() or in CLI: 
!mlrun get run 096a50275e0143b99d5afd49720f50d1  , !mlrun logs 096a50275e0143b99d5afd49720f50d1 
[mlrun] 2020-03-13 11:02:36,846 run executed, status=completed


## cleanup

In [20]:
import shutil
shutil.rmtree(os.path.join(os.getcwd(), 'images'))

In [21]:
!mlrun clean -p -r

[mlrun] 2020-03-13 10:58:00,329 using in-cluster config.
state      started          type     name
