In [2]:
# nuclio: ignore
import nuclio

In [3]:
# function settings recorded through the nuclio magic: the function kind
# is set to "job" and its container image to "mlrun/ml-models"
%nuclio config kind = "job"
%nuclio config spec.image = "mlrun/ml-models"

%nuclio: setting kind to 'job'
%nuclio: setting spec.image to 'mlrun/ml-models'


In [4]:
# Copyright 2018 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import pandas as pd
from mlrun.execution import MLClientCtx


def load_dataset(
        context: MLClientCtx,
        dataset: str,
        name: str = '',
        file_ext: str = 'parquet',
        params: dict = None
) -> None:
    """Load a scikit-learn toy dataset and log it as a dataset artifact.

    The following datasets are available ('name' : description):

        'boston'          : boston house-prices dataset (regression)
        'iris'            : iris dataset (classification)
        'diabetes'        : diabetes dataset (regression)
        'digits'          : digits dataset (classification)
        'linnerud'        : linnerud dataset (multivariate regression)
        'wine'            : wine dataset (classification)
        'breast_cancer'   : breast cancer wisconsin dataset (classification)

    Which loaders actually exist depends on the installed scikit-learn
    version (for example, `load_boston` was removed in scikit-learn 1.2).

    The scikit-learn functions return a data bunch including the following items:
    - data              the features matrix
    - target            the ground truth labels
    - DESCR             a description of the dataset
    - feature_names     header for data

    The features (and their names) are stored with the target labels in a
    single DataFrame:

    - a one-dimensional target becomes one column called 'labels'
    - a two-dimensional target (e.g. 'linnerud') becomes one column per
      output, named after the bunch's `target_names` when their count matches
      the number of outputs, otherwise 'labels_0', 'labels_1', ...
    - when the bunch has no `feature_names`, the feature columns are named
      'feature_0', 'feature_1', ...

    For further details see https://scikit-learn.org/stable/datasets/index.html#toy-datasets

    :param context:    function execution context
    :param dataset:    name of the dataset to load
    :param name:       artifact name (defaults to dataset)
    :param file_ext:   output file_ext: parquet or csv
    :param params:     params of the sklearn load_data method
                       (None or empty means the loader's defaults)

    :raises AttributeError: if sklearn.datasets has no `load_<dataset>` function
    """
    # the name may arrive as a non-str object, so coerce it before use
    dataset = str(dataset)

    # reach into module and import the appropriate load_xxx function
    fname = f'load_{dataset}'
    pkg_module = __import__('sklearn.datasets', fromlist=[fname])
    load_data_fn = getattr(pkg_module, fname)

    # `params or {}` replaces the former mutable `{}` default argument
    data = load_data_fn(**(params or {}))

    features = np.asarray(data['data'])
    target = np.asarray(data['target'])
    if target.ndim == 1:
        # single output: turn the vector into one column
        target = target.reshape(-1, 1)
    # a 2-D target is kept as is; flattening it would change the row count
    # and make the concatenation below fail

    # feature header: build a fresh list so the loader's object is never mutated
    if 'feature_names' in data:
        columns = [str(col) for col in data['feature_names']]
    else:
        columns = [f'feature_{i}' for i in range(features.shape[1])]

    # target header
    n_targets = target.shape[1]
    if n_targets == 1:
        label_columns = ['labels']
    else:
        target_names = data['target_names'] if 'target_names' in data else []
        if len(target_names) == n_targets:
            label_columns = [str(col) for col in target_names]
        else:
            label_columns = [f'labels_{i}' for i in range(n_targets)]

    # create the toy dataset
    xy = np.concatenate([features, target], axis=1)
    df = pd.DataFrame(data=xy, columns=columns + label_columns)

    # log and upload the dataset
    context.log_dataset(name or dataset, df=df, format=file_ext, index=False)

In [5]:
# nuclio: end-code

### mlconfig

In [6]:
from mlrun import mlconf
import os

# keep values that are already configured; only fall back to the defaults
# below when the current setting is empty/falsy
mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
# the HOME environment variable is only read when no artifact path is set
# (a missing HOME would raise KeyError in that case)
mlconf.artifact_path = mlconf.artifact_path or f'{os.environ["HOME"]}/artifacts'

### save

In [7]:
from mlrun import code_to_function 
# create job function object from notebook code
fn = code_to_function("load_dataset")

# add metadata (for templates and reuse)
# the default handler is the load_dataset function defined in this notebook
fn.spec.default_handler = "load_dataset"
fn.spec.description = "load a toy dataset from scikit-learn"
fn.metadata.categories = ["data-source", "ml"]
fn.metadata.labels = {"author": "yjb", "framework": "sklearn"}
# save the function spec to function.yaml in the working directory
fn.export("function.yaml")

[mlrun] 2020-05-02 18:33:25,588 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7efddf870400>

## tests

In [8]:
# load function from marketplace
from mlrun import import_function

# vcs_branch = 'development'
# base_vcs = f'https://raw.githubusercontent.com/mlrun/functions/{vcs_branch}/'
# mlconf.hub_url = mlconf.hub_url or base_vcs + f'{name}/function.yaml'
# fn = import_function("hub://load_dataset")

In [9]:
# attach storage to the function: a PVC when V3IO_HOME is absent from the
# environment, the v3io mount otherwise
if "V3IO_HOME" not in os.environ:
    # if you set up mlrun using the instructions at https://github.com/mlrun/mlrun/blob/master/hack/local/README.md
    from mlrun.platforms import mount_pvc
    fn.apply(mount_pvc('nfsvol', 'nfsvol', '/home/joyan/data'))
else:
    from mlrun import mount_v3io
    fn.apply(mount_v3io())

In [10]:
from mlrun import NewTask    

# task definition: the run name plus the parameters handed to the handler
task_params = dict(
    name="tasks load toy dataset",
    params=dict(dataset="wine"),
)

### run remotely

In [11]:
# build a task from task_params and run it through the function object;
# artifacts go under the configured artifact path and the result is kept in `run`
run = fn.run(NewTask(**task_params), artifact_path=mlconf.artifact_path)

[mlrun] 2020-05-02 18:33:33,454 starting run tasks load toy dataset uid=f77967ec29a348e1ac8f01d2a1da27f5  -> http://10.196.88.27:80
[mlrun] 2020-05-02 18:33:33,614 Job is running in the background, pod: tasks-load-toy-dataset-4kdkf
[mlrun] 2020-05-02 18:33:41,268 log artifact wine at /User/artifacts/wine.parquet, size: 17997, db: Y

[mlrun] 2020-05-02 18:33:41,306 run executed, status=completed
final state: succeeded


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...a1da27f5,0,May 02 18:33:40,completed,tasks load toy dataset,host=tasks-load-toy-dataset-4kdkfkind=jobowner=adminv3io_user=admin,,dataset=wine,,wine


to track results use .show() or .logs() or in CLI: 
!mlrun get run f77967ec29a348e1ac8f01d2a1da27f5  , !mlrun logs f77967ec29a348e1ac8f01d2a1da27f5 
[mlrun] 2020-05-02 18:33:42,997 run executed, status=completed


### or locally

In [12]:
from mlrun import run_local

In [13]:
# run the handler locally once per dataset name
# NOTE(review): the dataset name is passed through `inputs` rather than as a
# parameter; load_dataset coerces the received value with str() - confirm this
# is intended
for dataset in ["wine", "iris", "breast_cancer"]:
    run_local(handler=load_dataset,
              inputs={"dataset": dataset}, artifact_path=mlconf.artifact_path)

[mlrun] 2020-05-02 18:33:57,094 starting run mlrun-232bcb-load_dataset uid=dd6dbe32cf5642eaa411406c2711f1fb  -> http://10.196.88.27:80
[mlrun] 2020-05-02 18:33:57,800 log artifact wine at /User/artifacts/wine.parquet, size: 18006, db: Y



project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...2711f1fb,0,May 02 18:33:57,completed,mlrun-232bcb-load_dataset,v3io_user=adminkind=handlerowner=adminhost=jupyter-74f9488695-mqhsp,dataset,,,wine


to track results use .show() or .logs() or in CLI: 
!mlrun get run dd6dbe32cf5642eaa411406c2711f1fb --project default , !mlrun logs dd6dbe32cf5642eaa411406c2711f1fb --project default
[mlrun] 2020-05-02 18:33:57,877 run executed, status=completed
[mlrun] 2020-05-02 18:33:57,887 starting run mlrun-e1dfc3-load_dataset uid=de5fa1a48564494d9684024ad12db238  -> http://10.196.88.27:80
[mlrun] 2020-05-02 18:33:57,991 log artifact iris at /User/artifacts/iris.parquet, size: 5049, db: Y



project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...d12db238,0,May 02 18:33:57,completed,mlrun-e1dfc3-load_dataset,v3io_user=adminkind=handlerowner=adminhost=jupyter-74f9488695-mqhsp,dataset,,,iris


to track results use .show() or .logs() or in CLI: 
!mlrun get run de5fa1a48564494d9684024ad12db238 --project default , !mlrun logs de5fa1a48564494d9684024ad12db238 --project default
[mlrun] 2020-05-02 18:33:58,052 run executed, status=completed
[mlrun] 2020-05-02 18:33:58,068 starting run mlrun-5e04a9-load_dataset uid=a71153ecb6a64b108bdf63179df6737b  -> http://10.196.88.27:80
[mlrun] 2020-05-02 18:33:58,379 log artifact breast_cancer at /User/artifacts/breast_cancer.parquet, size: 146754, db: Y



project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...9df6737b,0,May 02 18:33:58,completed,mlrun-5e04a9-load_dataset,v3io_user=adminkind=handlerowner=adminhost=jupyter-74f9488695-mqhsp,dataset,,,breast_cancer


to track results use .show() or .logs() or in CLI: 
!mlrun get run a71153ecb6a64b108bdf63179df6737b --project default , !mlrun logs a71153ecb6a64b108bdf63179df6737b --project default
[mlrun] 2020-05-02 18:33:58,479 run executed, status=completed
