# archive to folder

In [1]:
# nuclio: ignore
import nuclio

In [2]:
import os
import zipfile
import urllib.request
import tarfile
import json

from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem

from typing import Union

def _is_within_directory(base_dir: str, candidate: str) -> bool:
    """Return True when ``candidate`` resolves to ``base_dir`` or a path below it."""
    base_dir = os.path.realpath(base_dir)
    candidate = os.path.realpath(candidate)
    try:
        return os.path.commonpath([base_dir, candidate]) == base_dir
    except ValueError:
        # commonpath raises when the paths cannot be compared
        # (e.g. different drives on Windows) -> treat as "outside"
        return False


def _checked_tar_members(archive: tarfile.TarFile, target_dir: str) -> list:
    """Return all members of ``archive`` after checking none escapes ``target_dir``

    :param archive:     an open, seekable tar archive
    :param target_dir:  directory the archive is going to be extracted into

    :raises ValueError: if a member path, or the target of a symlink/hard link
                        member, resolves outside ``target_dir``
    """
    members = archive.getmembers()
    for member in members:
        destination = os.path.join(target_dir, member.name)
        if not _is_within_directory(target_dir, destination):
            raise ValueError(f"unsafe path in archive: {member.name}")

        if member.issym():
            # symlink targets are relative to the directory holding the link
            link_target = os.path.join(
                os.path.dirname(destination), member.linkname
            )
        elif member.islnk():
            # hard link targets are relative to the archive root
            link_target = os.path.join(target_dir, member.linkname)
        else:
            continue

        if not _is_within_directory(target_dir, link_target):
            raise ValueError(
                f"unsafe link in archive: {member.name} -> {member.linkname}"
            )
    return members


def open_archive(
    context: MLClientCtx,
    archive_url: DataItem = "",
    subdir: str = "content",
    key: str = "content"
):
    """Open a file/object archive into a target directory

    Currently supports zip and tar.gz. The archive type is selected from the
    (case-insensitive) suffix of ``archive_url``: names ending in "gz" are
    opened as gzip-compressed tar files, names ending in "zip" as zip files.

    Tar members are validated before anything is extracted, so an archive
    containing a path (or link target) that points outside the target
    directory is rejected as a whole.

    :param context:      function execution context
    :param archive_url:  url of archive file; it is converted with ``str()``
                         and opened as a file path (presumably a local path
                         by then -- confirm against the DataItem type)
    :param subdir:       path within artifact store where extracted files
                         are stored
    :param key:          key of archive contents in artifact store

    :raises ValueError:  if the archive type is not supported, or a tar
                         member would be extracted outside the target directory
    """
    target_dir = os.path.join(context.artifact_path, subdir)
    os.makedirs(target_dir, exist_ok=True)

    archive_url = str(archive_url)
    # compare suffixes case-insensitively so ".ZIP" / ".TAR.GZ" also match
    lowered_url = archive_url.lower()
    if lowered_url.endswith("gz"):
        # "r:gz" (seekable) instead of "r|gz" (stream): the member list has to
        # be read and validated up front, then the archive is read again
        with tarfile.open(archive_url, mode="r:gz") as ref:
            ref.extractall(
                target_dir, members=_checked_tar_members(ref, target_dir)
            )
    elif lowered_url.endswith("zip"):
        # ZipFile.extractall sanitizes member names itself (strips leading
        # slashes and ".." components), so no extra check is done here
        with zipfile.ZipFile(archive_url, "r") as ref:
            ref.extractall(target_dir)
    else:
        raise ValueError(f'unsupported archive type in {archive_url}')

    context.log_artifact(key, local_path=subdir)

In [3]:
# nuclio: end-code

## save

In [4]:
from mlrun import code_to_function

# Build a "job"-kind function object from this notebook's code.
fn = code_to_function(
    "open_archive",
    kind="job",
    with_doc=True,
    handler=open_archive,
    image="mlrun/ml-base:test",
)

# Descriptive metadata (for templates and reuse).
fn.metadata.labels = {"author": "yaronh"}
fn.metadata.categories = ["fileutils", "retrieve"]
fn.spec.description = "Open a file/object archive into a target directory"
fn.spec.default_handler = "open_archive"

# Save the function, then write its spec to function.yaml.
# The export call stays last so its return value is the cell's output.
fn.save()
fn.export("function.yaml")

[mlrun] 2020-04-20 19:25:45,633 saving function: open-archive, tag: latest
[mlrun] 2020-04-20 19:25:45,661 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7fc45f113c88>

## tests

In [5]:
from mlrun import import_function, mount_v3io, NewTask, run_local

In [6]:
# Load the exported function spec, then apply the v3io mount modifier to it.
# NOTE(review): `func` is not referenced by the visible test cells, which call
# run_local with the handler directly -- confirm whether it is still needed.
func = import_function("function.yaml")
func = func.apply(mount_v3io())

### tar

In [7]:
# Run the handler locally on a remote tar.gz archive.
run_local(
    handler=open_archive,
    inputs={'archive_url': "https://fpsignals-public.s3.amazonaws.com/catsndogs.tar.gz"},
)

[mlrun] 2020-04-20 19:25:46,625 artifact path is not defined or is local, artifacts will not be visible in the UI
[mlrun] 2020-04-20 19:25:46,633 starting run mlrun-edd43d-open_archive uid=79aa00d2a6984c0198e6a4a59038e243  -> http://mlrun-api:8080
[mlrun] 2020-04-20 19:25:46,666 downloading https://fpsignals-public.s3.amazonaws.com/catsndogs.tar.gz to local tmp
[mlrun] 2020-04-20 19:26:07,076 log artifact content at content, size: None, db: Y



project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...9038e243,0,Apr 20 19:25:46,completed,mlrun-edd43d-open_archive,v3io_user=iguaziokind=handlerowner=iguaziohost=jupyter-iguazio-fbcf6f67b-bcshp,archive_url,,,content


to track results use .show() or .logs() or in CLI: 
!mlrun get run 79aa00d2a6984c0198e6a4a59038e243 --project default , !mlrun logs 79aa00d2a6984c0198e6a4a59038e243 --project default
[mlrun] 2020-04-20 19:26:07,138 run executed, status=completed


<mlrun.model.RunObject at 0x7fc48c2be940>

### zip

In [8]:
# Run the handler locally on a remote zip archive.
run_local(
    handler=open_archive,
    inputs={'archive_url': 'http://iguazio-sample-data.s3.amazonaws.com/catsndogs.zip'},
)

[mlrun] 2020-04-20 19:26:07,155 artifact path is not defined or is local, artifacts will not be visible in the UI
[mlrun] 2020-04-20 19:26:07,162 starting run mlrun-26293d-open_archive uid=9efac55cbfad4e4382695b8caaec5376  -> http://mlrun-api:8080
[mlrun] 2020-04-20 19:26:07,191 downloading http://iguazio-sample-data.s3.amazonaws.com/catsndogs.zip to local tmp
[mlrun] 2020-04-20 19:26:24,609 log artifact content at content, size: None, db: Y



project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...aaec5376,0,Apr 20 19:26:07,completed,mlrun-26293d-open_archive,v3io_user=iguaziokind=handlerowner=iguaziohost=jupyter-iguazio-fbcf6f67b-bcshp,archive_url,,,content


to track results use .show() or .logs() or in CLI: 
!mlrun get run 9efac55cbfad4e4382695b8caaec5376 --project default , !mlrun logs 9efac55cbfad4e4382695b8caaec5376 --project default
[mlrun] 2020-04-20 19:26:24,660 run executed, status=completed


<mlrun.model.RunObject at 0x7fc460a11898>