# archive to folder

In [1]:
# nuclio: ignore
import nuclio

In [2]:
# Function configuration: run as a 'job' kind using the 'mlrun/mlrun' image
%nuclio config kind = "job"
%nuclio config spec.image = "mlrun/mlrun"

%nuclio: setting kind to 'job'
%nuclio: setting spec.image to 'mlrun/mlrun'


In [15]:
import os
import zipfile
import urllib.request
import tarfile
import json

from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem

from typing import Union

def _is_within_directory(directory: str, target: str) -> bool:
    """Tell whether ``target`` resolves to ``directory`` or to a path below it."""
    root = os.path.realpath(directory)
    resolved = os.path.realpath(target)
    try:
        return os.path.commonpath([root, resolved]) == root
    except ValueError:
        # commonpath cannot compare the two paths (e.g. different Windows
        # drives), so the target cannot be inside the directory
        return False


def _checked_tar_members(archive: tarfile.TarFile, directory: str) -> list:
    """Return the members of ``archive`` once each is known to stay inside ``directory``

    :param archive:    tar archive opened in a random-access mode
    :param directory:  extraction target directory

    :raises ValueError: if a member path, or the target of a link member,
                        resolves outside ``directory``
    """
    members = archive.getmembers()
    for member in members:
        destination = os.path.join(directory, member.name)
        if not _is_within_directory(directory, destination):
            raise ValueError(
                f"unsafe member path in archive: {member.name}")
        if member.issym() or member.islnk():
            # symlink targets are relative to the link's own directory,
            # hard link targets are relative to the archive root
            link_base = (
                os.path.dirname(destination) if member.issym() else directory)
            link_target = os.path.join(link_base, member.linkname)
            if not _is_within_directory(directory, link_target):
                raise ValueError(
                    f"unsafe link target in archive: "
                    f"{member.name} -> {member.linkname}")
    return members


def open_archive(
    context: MLClientCtx,
    archive_url: DataItem,
    subdir: str = "content",
    key: str = "content"
):
    """Open a file/object archive into a target directory

    Currently supports zip and tar.gz. The archive type is chosen from the
    suffix of the local file name ("gz" -> gzipped tar, "zip" -> zip).

    Tar members are validated before extraction: an archive holding a member
    that would be written outside ``subdir`` (absolute path, ``..`` component,
    or a link pointing outside) is rejected and nothing is extracted.

    :param context:      function execution context
    :param archive_url:  archive data item, its ``local()`` method provides
                         the path of a local copy of the archive
    :param subdir:       local directory (created if missing) into which the
                         files are extracted, logged as the artifact's local path
    :param key:          key of archive contents in artifact store

    :raises ValueError:  on an unsupported archive suffix or an unsafe tar member
    """
    os.makedirs(subdir, exist_ok=True)

    # keep the parameter untouched, work on the local file path
    local_path = archive_url.local()
    if local_path.endswith("gz"):
        # random-access mode ("r:gz") rather than stream mode ("r|gz"): the
        # member list has to be read and checked before anything is extracted
        with tarfile.open(local_path, mode="r:gz") as ref:
            ref.extractall(subdir, members=_checked_tar_members(ref, subdir))
    elif local_path.endswith("zip"):
        # ZipFile.extractall itself strips absolute prefixes and ".." components
        with zipfile.ZipFile(local_path, "r") as ref:
            ref.extractall(subdir)
    else:
        raise ValueError(f'unsupported archive type in {local_path}')

    context.log_artifact(key, local_path=subdir)

In [16]:
# nuclio: end-code

### mlconfig

In [17]:
from mlrun import mlconf
import os

# Only fill in defaults: values already present in mlconf are left untouched
mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
# expanduser("~") yields the same directory as os.environ["HOME"] when HOME is
# set, but does not raise KeyError when it is not
mlconf.artifact_path = mlconf.artifact_path or f'{os.path.expanduser("~")}/artifacts'

### save

In [26]:
from mlrun import code_to_function 
# build the job function object out of this notebook's code
fn = code_to_function("open_archive")

# descriptive metadata (for templates and reuse)
fn.metadata.labels = {"author": "yaronh"}
fn.metadata.categories = ["data-movement", "utils"]
fn.spec.description = "Open a file/object archive into a target directory"
fn.spec.default_handler = "open_archive"

# write the function spec to disk; kept as the last expression so the
# returned object is displayed as the cell output
fn.export("function.yaml")

[mlrun] 2020-05-01 22:51:51,873 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f6bebcfb0b8>

## tests

In [19]:
# load function from marketplace
from mlrun import import_function

# vcs_branch = 'development'
# base_vcs = f'https://raw.githubusercontent.com/mlrun/functions/{vcs_branch}/'
# mlconf.hub_url = mlconf.hub_url or base_vcs + f'{name}/function.yaml'
# fn = import_function("hub://open_archive")

In [20]:
from mlrun import run_local

# Attach a data volume to the function: v3io when the V3IO_HOME environment
# variable is present, a PVC mount otherwise.
if "V3IO_HOME" in os.environ:
    from mlrun import mount_v3io
    fn.apply(mount_v3io())
else:
    # if you set up mlrun using the instructions at https://github.com/mlrun/mlrun/blob/master/hack/local/README.md
    from mlrun.platforms import mount_pvc
    # NOTE(review): '/home/joyan/data' is a user-specific absolute path - adjust to your environment
    fn.apply(mount_pvc('nfsvol', 'nfsvol', '/home/joyan/data'))

### tar

In [23]:
# run the handler locally on a tar.gz archive
run = run_local(
    inputs=dict(
        archive_url="https://fpsignals-public.s3.amazonaws.com/catsndogs.tar.gz",
    ),
    handler=open_archive,
)

[mlrun] 2020-05-01 22:44:38,365 starting run mlrun-d6a743-open_archive uid=8074d2621a9e4068b3318a5b9d2458f3  -> http://10.196.88.27:80
[mlrun] 2020-05-01 22:44:38,441 downloading https://fpsignals-public.s3.amazonaws.com/catsndogs.tar.gz to local tmp
[mlrun] 2020-05-01 22:44:55,601 log artifact content at /User/artifacts/content/, size: None, db: Y



project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...9d2458f3,0,May 01 22:44:38,completed,mlrun-d6a743-open_archive,v3io_user=adminkind=handlerowner=adminhost=jupyter-74f9488695-mqhsp,archive_url,,,content


to track results use .show() or .logs() or in CLI: 
!mlrun get run 8074d2621a9e4068b3318a5b9d2458f3 --project default , !mlrun logs 8074d2621a9e4068b3318a5b9d2458f3 --project default
[mlrun] 2020-05-01 22:44:55,668 run executed, status=completed


### zip

In [25]:
# run the handler locally on a zip archive; left as a bare expression so the
# resulting run object is displayed as the cell output
run_local(
    inputs=dict(
        archive_url="http://iguazio-sample-data.s3.amazonaws.com/catsndogs.zip",
    ),
    handler=open_archive,
)

[mlrun] 2020-05-01 22:51:06,768 starting run mlrun-079aa7-open_archive uid=1a6a901b14aa4691a9a24d2e31875c05  -> http://10.196.88.27:80
[mlrun] 2020-05-01 22:51:06,803 downloading http://iguazio-sample-data.s3.amazonaws.com/catsndogs.zip to local tmp
[mlrun] 2020-05-01 22:51:22,748 log artifact content at /User/artifacts/content/, size: None, db: Y



project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...31875c05,0,May 01 22:51:06,completed,mlrun-079aa7-open_archive,v3io_user=adminkind=handlerowner=adminhost=jupyter-74f9488695-mqhsp,archive_url,,,content


to track results use .show() or .logs() or in CLI: 
!mlrun get run 1a6a901b14aa4691a9a24d2e31875c05 --project default , !mlrun logs 1a6a901b14aa4691a9a24d2e31875c05 --project default
[mlrun] 2020-05-01 22:51:22,806 run executed, status=completed


<mlrun.model.RunObject at 0x7f6bf57d3160>