# archive to folder

In [1]:
# nuclio: ignore
import nuclio

In [2]:
%nuclio config kind = "job"
%nuclio config spec.image = "yjbds/ml-base:0.4.8:"

%nuclio: setting kind to 'job'
%nuclio: setting spec.image to 'yjbds/ml-base:0.4.8:'


In [3]:
import os
import zipfile
import urllib.request
import tarfile
import json

from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem

from typing import Union

def _iter_safe_tar_members(archive, dest_root):
    """Yield tar members lazily, rejecting any that would land outside ``dest_root``.

    Members are validated one at a time while iterating the archive, so this
    also works for tar files opened in stream mode (no backward seeking).

    :param archive:    open ``tarfile.TarFile`` object
    :param dest_root:  directory the archive is being extracted into
    :raises ValueError: if a member path or a link target escapes ``dest_root``
    """
    root = os.path.realpath(dest_root)

    def _inside(path):
        # realpath collapses '..' segments and follows already-extracted symlinks
        return os.path.commonpath([root, os.path.realpath(path)]) == root

    for member in archive:
        # os.path.join discards ``root`` for absolute member names, so those
        # resolve outside the destination and are rejected below
        target = os.path.join(root, member.name)
        if not _inside(target):
            raise ValueError(f'unsafe path in archive member {member.name}')

        if member.issym():
            # symlink targets are relative to the directory holding the link
            link_target = os.path.join(os.path.dirname(target), member.linkname)
        elif member.islnk():
            # hard-link targets are relative to the archive root
            link_target = os.path.join(root, member.linkname)
        else:
            link_target = None
        if link_target is not None and not _inside(link_target):
            raise ValueError(f'unsafe link target in archive member {member.name}')

        yield member


def open_archive(
    context: MLClientCtx, 
    archive_url: DataItem,
    subdir: str = "content",
    key: str = "content"
):
    """Open a file/object archive into a target directory
    
    Currently supports zip and tar.gz. The archive type is chosen from the
    (case-insensitive) file name suffix.
    
    :param context:      function execution context
    :param archive_url:  url of archive file
    :param subdir:       path within artifact store where extracted files
                         are stored
    :param key:          key of archive contents in artifact store
    :raises ValueError:  if the archive type is not supported, or a tar member
                         would be written outside the target directory
    """
    dest_path = context.artifact_path + f'/{subdir}'
    os.makedirs(dest_path, exist_ok=True)
    
    # materialise the data item as a local file path before inspecting it
    local_path = archive_url.local()
    suffix = local_path.lower()
    if suffix.endswith("gz"):
        with tarfile.open(local_path, mode="r|gz") as ref:
            # tarfile does not sanitise member paths on its own, so validate
            # each member as it is streamed; extraction stops at the first
            # offending member
            ref.extractall(
                dest_path, members=_iter_safe_tar_members(ref, dest_path)
            )
    elif suffix.endswith("zip"):
        with zipfile.ZipFile(local_path, "r") as ref:
            # ZipFile.extractall already strips absolute prefixes and '..'
            ref.extractall(dest_path)
    else:
        raise ValueError(f'unsupported archive type in {local_path}')
    
    # log the extracted folder, addressed relative to the artifact path
    context.log_artifact(key, local_path=subdir)

In [4]:
# nuclio: end-code

### save

In [5]:
from mlrun import code_to_function, mlconf
# Build the job function object out of this notebook's code.
fn = code_to_function("open_archive")

# Metadata for templates and reuse.
fn.metadata.labels = {"author": "yaronh"}
fn.metadata.categories = ["data-movement", "utils"]
fn.spec.description = "Open a file/object archive into a target directory"
fn.spec.default_handler = "open_archive"

# Write the function spec to disk; this stays last so its return value is the cell output.
fn.export("function.yaml")

[mlrun] 2020-05-27 18:51:04,519 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f27009be668>

In [6]:
from mlutils import get_vol_mount
# Apply the modifier returned by get_vol_mount() to the function object.
# NOTE(review): presumably this attaches a shared volume mount — confirm against mlutils.
fn.apply(get_vol_mount())

<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f27009be668>

## tests

* was the archive extracted to the correct location? (inside a try/except in case it fails)
* are the top-level contents as expected?
* does each subfolder contain the expected number of files?


In [7]:
import os
from mlrun import run_local
from mlrun import mlconf

In [8]:
def test(outputs, archive_type, artifact_path):
    assert outputs==artifact_path+f'/open_archive/{archive_type}/content/'

    extracted = ['cats_n_dogs', 'train', 'validation', 'vectorize.py']
    sizes = [2000, 0, 1001]

    l = os.listdir(outputs)
    assert l==extracted

    for i,sz in enumerate(sizes):
        assert len(os.listdir(outputs+f'{l[i]}'))==sizes[i]

In [9]:
# Run the handler locally on the sample tar.gz archive; results are written
# under the 'open_archive/tar' sub-folder of the configured artifact path.
tar_run_kwargs = dict(
    handler=open_archive,
    inputs={'archive_url': "https://fpsignals-public.s3.amazonaws.com/catsndogs.tar.gz"},
    artifact_path=mlconf.artifact_path + '/open_archive/tar',
)
runtar = run_local(**tar_run_kwargs)

[mlrun] 2020-05-27 18:51:05,574 starting run mlrun-993a94-open_archive uid=a1228c301f514b2d94de999d32877a74  -> http://mlrun-api:8080
[mlrun] 2020-05-27 18:51:05,613 downloading https://fpsignals-public.s3.amazonaws.com/catsndogs.tar.gz to local tmp
[mlrun] 2020-05-27 18:51:19,935 log artifact content at /User/artifacts/open_archive/tar/content/, size: None, db: Y



project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...32877a74,0,May 27 18:51:05,completed,mlrun-993a94-open_archive,v3io_user=adminkind=handlerowner=adminhost=jupyter-f9d6597fd-ns9cj,archive_url,,,content


to track results use .show() or .logs() or in CLI: 
!mlrun get run a1228c301f514b2d94de999d32877a74 --project default , !mlrun logs a1228c301f514b2d94de999d32877a74 --project default
[mlrun] 2020-05-27 18:51:19,980 run executed, status=completed


In [10]:
# Run the handler locally on the sample zip archive; results are written
# under the 'open_archive/zip' sub-folder of the configured artifact path.
zip_run_kwargs = dict(
    handler=open_archive,
    inputs={'archive_url': 'http://iguazio-sample-data.s3.amazonaws.com/catsndogs.zip'},
    artifact_path=mlconf.artifact_path + '/open_archive/zip',
)
runzip = run_local(**zip_run_kwargs)

[mlrun] 2020-05-27 18:51:19,992 starting run mlrun-e0f145-open_archive uid=058458fe8ab84957b1a5beb536f8ac32  -> http://mlrun-api:8080
[mlrun] 2020-05-27 18:51:20,026 downloading http://iguazio-sample-data.s3.amazonaws.com/catsndogs.zip to local tmp
[mlrun] 2020-05-27 18:51:29,583 log artifact content at /User/artifacts/open_archive/zip/content/, size: None, db: Y



project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...36f8ac32,0,May 27 18:51:20,completed,mlrun-e0f145-open_archive,v3io_user=adminkind=handlerowner=adminhost=jupyter-f9d6597fd-ns9cj,archive_url,,,content


to track results use .show() or .logs() or in CLI: 
!mlrun get run 058458fe8ab84957b1a5beb536f8ac32 --project default , !mlrun logs 058458fe8ab84957b1a5beb536f8ac32 --project default
[mlrun] 2020-05-27 18:51:29,621 run executed, status=completed


In [13]:
# Validate the tar run: output location, top-level entries and per-folder entry counts.
test(runtar.outputs["content"], "tar", mlconf.artifact_path)

In [14]:
# Validate the zip run: output location, top-level entries and per-folder entry counts.
test(runzip.outputs["content"], "zip", mlconf.artifact_path)