In [1]:
# nuclio: ignore
import nuclio

In [2]:
# Copyright 2018 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import pandas as pd
from mlrun.execution import MLClientCtx


def load_dataset(
        context: MLClientCtx,
        dataset: str,
        name: str = '',
        file_ext: str = 'parquet',
        params: dict = None
) -> None:
    """Load a scikit-learn toy dataset and log it as a dataset artifact

    The following datasets are available ('name' : description):

        'boston'          : boston house-prices dataset (regression)
        'iris'            : iris dataset (classification)
        'diabetes'        : diabetes dataset (regression)
        'digits'          : digits dataset (classification)
        'linnerud'        : linnerud dataset (multivariate regression)
        'wine'            : wine dataset (classification)
        'breast_cancer'   : breast cancer wisconsin dataset (classification)

    The scikit-learn functions return a data bunch including the following items:
    - data              the features matrix
    - target            the ground truth labels
    - DESCR             a description of the dataset
    - feature_names     header for data

    The features (and their names) are stored with the target labels in a
    single DataFrame, features first and targets last:

    - a 1-d target becomes one trailing column named 'labels'
    - a 2-d (multi-output) target keeps one column per output; the columns
      are named from the bunch's 'target_names' when their count matches,
      otherwise 'labels_0', 'labels_1', ...
    - when the bunch has no 'feature_names' entry, the feature columns are
      named 'feature_0', 'feature_1', ...

    For further details see https://scikit-learn.org/stable/datasets/index.html#toy-datasets

    :param context:    function execution context
    :param dataset:    name of the dataset to load, i.e. the ``xxx`` of
                       ``sklearn.datasets.load_xxx``
    :param name:       artifact name (defaults to dataset)
    :param file_ext:   output file_ext: parquet or csv
    :param params:     params of the sklearn load_data method
                       (None means no extra params)

    :raises AttributeError: if ``sklearn.datasets`` has no ``load_<dataset>``
    """
    # local import keeps this function self-contained
    import importlib

    # reach into module and fetch the appropriate load_xxx function
    datasets_module = importlib.import_module('sklearn.datasets')
    load_data_fn = getattr(datasets_module, f'load_{dataset}')

    # `params or {}` replaces the former shared mutable `{}` default
    data = load_data_fn(**(params or {}))

    features = np.asarray(data['data'])
    target = np.asarray(data['target'])
    if target.ndim == 1:
        # single output: turn the label vector into one column
        target = target.reshape(-1, 1)
    n_targets = target.shape[1]

    # build fresh lists of column names so the object returned by the loader
    # is never mutated (the names may arrive as a list or as an array)
    if 'feature_names' in data:
        feature_columns = [str(col) for col in data['feature_names']]
    else:
        feature_columns = [f'feature_{i}' for i in range(features.shape[1])]

    if n_targets == 1:
        label_columns = ['labels']
    else:
        target_names = data['target_names'] if 'target_names' in data else ()
        if len(target_names) == n_targets:
            label_columns = [str(col) for col in target_names]
        else:
            label_columns = [f'labels_{i}' for i in range(n_targets)]

    # create the toy dataset: features first, target column(s) last
    xy = np.concatenate([features, target], axis=1)
    df = pd.DataFrame(data=xy, columns=feature_columns + label_columns)

    # log and upload the dataset
    context.log_dataset(name or dataset, df=df, format=file_ext, index=False)

In [3]:
# nuclio: end-code

### save

In [7]:
from mlrun import code_to_function

# turn the notebook code into a "job" function object
fn = code_to_function(
    "load_dataset",
    kind="job",
    with_doc=True,
    handler=load_dataset,
    image="mlrun/ml-models:0.4.6",
)

# descriptive metadata (for templates and reuse)
fn.metadata.categories = ["datagen", "filesutils"]
fn.metadata.labels = {"author": "yjb"}

# runtime spec
fn.spec.default_handler = "load_dataset"
fn.spec.description = "load a toy dataset from scikit-learn"
fn.spec.image_pull_policy = "Always"

# save the function, then export its spec to function.yaml
fn.save()
fn.export("function.yaml")

[mlrun] 2020-03-27 21:37:20,299 saving function: load-dataset, tag: latest
[mlrun] 2020-03-27 21:37:20,331 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f8956a9e050>

### test

In [16]:
from mlrun import import_function, mount_v3io, NewTask

# import the function from the hub and apply the v3io mount modifier
func = import_function("hub://load_dataset").apply(mount_v3io())
# alternative: use the spec exported by the "save" cell
# func = import_function("function.yaml").apply(mount_v3io())

# run one task per dataset; artifacts are written under /User/artifacts
for dataset in ["iris", "wine", "breast_cancer"]:
    task_params = dict(
        name=f"tasks loading {dataset}",
        params=dict(dataset=dataset),
    )
    run = func.run(NewTask(**task_params), artifact_path="/User/artifacts")

[mlrun] 2020-03-27 21:42:09,919 starting run tasks loading iris uid=43f55f0040024f4bb57b1f5812e8764b  -> http://mlrun-api:8080
[mlrun] 2020-03-27 21:42:10,038 Job is running in the background, pod: tasks-loading-iris-4wdm4
[mlrun] 2020-03-27 21:42:24,052 log artifact iris at /User/artifacts/iris.parquet, size: 5049, db: Y

[mlrun] 2020-03-27 21:42:24,072 run executed, status=completed
final state: succeeded


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...e8764b,0,Mar 27 21:42:22,completed,tasks loading iris,host=tasks-loading-iris-4wdm4kind=jobowner=admin,,dataset=iris,,iris


to track results use .show() or .logs() or in CLI: 
!mlrun get run 43f55f0040024f4bb57b1f5812e8764b  , !mlrun logs 43f55f0040024f4bb57b1f5812e8764b 
[mlrun] 2020-03-27 21:42:29,297 run executed, status=completed
[mlrun] 2020-03-27 21:42:29,298 starting run tasks loading wine uid=3e23cc2ff6eb49c69062ff7f2119cf61  -> http://mlrun-api:8080
[mlrun] 2020-03-27 21:42:29,424 Job is running in the background, pod: tasks-loading-wine-zxncp
[mlrun] 2020-03-27 21:42:42,958 log artifact wine at /User/artifacts/wine.parquet, size: 18006, db: Y

[mlrun] 2020-03-27 21:42:42,997 run executed, status=completed
final state: succeeded


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...19cf61,0,Mar 27 21:42:41,completed,tasks loading wine,host=tasks-loading-wine-zxncpkind=jobowner=admin,,dataset=wine,,wine


to track results use .show() or .logs() or in CLI: 
!mlrun get run 3e23cc2ff6eb49c69062ff7f2119cf61  , !mlrun logs 3e23cc2ff6eb49c69062ff7f2119cf61 
[mlrun] 2020-03-27 21:42:48,725 run executed, status=completed
[mlrun] 2020-03-27 21:42:48,726 starting run tasks loading breast_cancer uid=875c618594454fe8a72a97b02a82d2ee  -> http://mlrun-api:8080
[mlrun] 2020-03-27 21:42:48,847 Job is running in the background, pod: tasks-loading-breast-cancer-vqc6q
[mlrun] 2020-03-27 21:43:02,716 log artifact breast_cancer at /User/artifacts/breast_cancer.parquet, size: 146754, db: Y

[mlrun] 2020-03-27 21:43:02,786 run executed, status=completed
final state: succeeded


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...82d2ee,0,Mar 27 21:43:01,completed,tasks loading breast_cancer,host=tasks-loading-breast-cancer-vqc6qkind=jobowner=admin,,dataset=breast_cancer,,breast_cancer


to track results use .show() or .logs() or in CLI: 
!mlrun get run 875c618594454fe8a72a97b02a82d2ee  , !mlrun logs 875c618594454fe8a72a97b02a82d2ee 
[mlrun] 2020-03-27 21:43:08,264 run executed, status=completed


In [17]:
# Shell out to the mlrun CLI "clean" command; the captured output below lists
# the three job pods started by the test runs above.
# NOTE(review): the exact effect of the -p / -r flags is not visible here --
# confirm against the mlrun CLI help for the installed version.
!mlrun clean -p -r

[mlrun] 2020-03-27 21:43:17,687 using in-cluster config.
state      started          type     name
Succeeded  Mar 27 21:42:48  job      tasks-loading-breast-cancer-vqc6q
Succeeded  Mar 27 21:42:10  job      tasks-loading-iris-4wdm4
Succeeded  Mar 27 21:42:29  job      tasks-loading-wine-zxncp


In [18]:
# display the installed mlrun version, recorded alongside the outputs above
import mlrun
mlrun.__version__

'0.4.6'