In [1]:
# nuclio: ignore
import nuclio

In [2]:
# Copyright 2019 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from typing import Optional, List, Any
from sklearn.datasets import make_classification

from mlrun.execution import MLClientCtx


def gen_class_data(
    context: MLClientCtx,
    n_samples: int,
    m_features: int,
    k_classes: int,
    header: Optional[List[str]] = None,
    label_column: Optional[str] = "labels",
    weight: float = 0.5,
    random_state: int = 1,
    key: str = "classifier-data",
    file_ext: str = "parquet",
    sk_params: Optional[dict] = None
):
    """Create a simulated classification dataset and log it as an artifact.

    The samples are generated with `sklearn.datasets.make_classification`,
    combined into one table (feature columns followed by the label column)
    and logged through `context.log_dataset` under `key`.

    Additional scikit-learn parameters can be set using `sk_params`, please see
    https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html
    for more details.

    :param context:       function context
    :param n_samples:     number of rows/samples
    :param m_features:    number of cols/features
    :param k_classes:     number of classes
    :param header:        header for features array, when empty or None the
                          columns are named "feat_0", "feat_1", ...
    :param label_column:  column name of ground-truth series
    :param weight:        fraction of sample negative value (ground-truth=0);
                          the remaining fraction is split evenly between the
                          other classes. A sequence of per-class fractions is
                          also accepted and is passed to scikit-learn as is
    :param random_state:  rng seed (see https://scikit-learn.org/stable/glossary.html#term-random-state)
    :param key:           key of data in artifact store
    :param file_ext:      format used when logging the dataset (e.g. "parquet", "csv")
    :param sk_params:     additional parameters for `sklearn.datasets.make_classification`
    """
    # `make_classification` expects `weights` to be a sequence with one entry
    # per class, a bare number is rejected there. Expand a scalar so that the
    # documented default (0.5) actually works.
    if isinstance(weight, (int, float)):
        if k_classes > 1:
            other_share = (1.0 - weight) / (k_classes - 1)
            weights = [weight] + [other_share] * (k_classes - 1)
        else:
            weights = [weight]
    else:
        weights = weight

    features, labels = make_classification(
        n_samples=n_samples,
        n_features=m_features,
        weights=weights,
        n_classes=k_classes,
        random_state=random_state,
        # None (the default) means "no extra parameters"
        **(sk_params or {}))

    # make dataframes, add column names, concatenate (X, y)
    X = pd.DataFrame(features)
    if not header:
        X.columns = ["feat_" + str(x) for x in range(m_features)]
    else:
        X.columns = header

    y = pd.DataFrame(labels, columns=[label_column])
    data = pd.concat([X, y], axis=1)

    context.log_dataset(key, df=data, format=file_ext, index=False)

In [3]:
# nuclio: end-code

### mlconfig

In [4]:
from mlrun import mlconf

In [5]:
# keep an already configured db path, otherwise fall back to './'
mlconf.dbpath = mlconf.dbpath or './'
# bare expression so the cell displays the effective value
mlconf.dbpath

'http://mlrun-api:8080'

In [6]:
# branch of the mlrun/functions repository used for the fallback hub url
vcs_branch = 'development'
base_vcs = f'https://raw.githubusercontent.com/mlrun/functions/{vcs_branch}/'

# "{name}" must stay a literal placeholder (the configured value displayed by
# this cell has the same form), so this part is a plain string: as an f-string
# it raised NameError on the undefined `name` whenever hub_url was not set.
mlconf.hub_url = mlconf.hub_url or base_vcs + '{name}/function.yaml'
mlconf.hub_url

'/User/repos/functions/{name}/function.yaml'

In [7]:
import os
# keep an already configured artifact path, otherwise default to an
# "artifacts" folder under $V3IO_HOME
# NOTE(review): the fallback raises KeyError when V3IO_HOME is not set
mlconf.artifact_path = mlconf.artifact_path or f'{os.environ["V3IO_HOME"]}/artifacts'
# bare expression so the cell displays the effective value
mlconf.artifact_path

'/User/artifacts'

### save

In [8]:
from mlrun import code_to_function 
# create job function object from notebook code
# (the handler is passed by name: `code_to_function` documents `handler` as a
# string, not as the function object itself)
fn = code_to_function("gen_class_data", kind="job", with_doc=True,
                      handler="gen_class_data",
                      image="mlrun/ml-models")

# add metadata (for templates and reuse)
fn.spec.default_handler = "gen_class_data"
fn.spec.description = "simulate classification data using scikit-learn"
fn.metadata.categories = ["datagen", "filesutils"]
fn.metadata.labels = {"author": "yjb"}

# store the function object, then write its spec to a local yaml file
fn.save()
fn.export("function.yaml")

[mlrun] 2020-04-25 14:31:30,182 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f7d13f35f98>

### test

In [9]:
from mlrun import import_function

# load the function from the configured hub url
func = import_function("hub://gen_class_data")

# attach storage to the function, depending on where the notebook runs
if "V3IO_HOME" not in os.environ:
    # if you set up mlrun using the instructions at
    # https://github.com/mlrun/mlrun/blob/master/hack/local/README.md
    from mlrun.platforms import mount_pvc
    func.apply(mount_pvc('nfsvol', 'nfsvol', '/home/joyan/data'))
else:
    # if you are running this on the iguazio data fabric
    from mlrun import mount_v3io
    func.apply(mount_v3io())



In [10]:
from mlrun import NewTask    

# task definition for the test run: a display name plus the handler parameters
task_params = dict(
    name="tasks generate classification data",
    params=dict(
        n_samples=100_000,
        m_features=64,
        k_classes=2,
        # one fraction per class
        weight=[0.6, 0.4],
        # extra keyword arguments for make_classification
        sk_params=dict(n_informative=12),
        file_ext="csv",
    ),
)

In [11]:
# build a task from `task_params`, run it on the imported function and keep the returned run object
run = func.run(NewTask(**task_params))

[mlrun] 2020-04-25 14:31:30,252 starting run tasks generate classification data uid=580a0667136f44258db0afde06e80221  -> http://mlrun-api:8080
[mlrun] 2020-04-25 14:31:30,369 Job is running in the background, pod: tasks-generate-classification-data-bfmjm
[mlrun] 2020-04-25 14:31:54,999 log artifact classifier-data at /User/artifacts/classifier-data.csv, size: 125371547, db: Y

[mlrun] 2020-04-25 14:31:55,125 run executed, status=completed
final state: succeeded


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...e80221,0,Apr 25 14:31:37,completed,tasks generate classification data,host=tasks-generate-classification-data-bfmjmkind=jobowner=adminv3io_user=admin,,"file_ext=csvk_classes=2m_features=64n_samples=100000sk_params={'n_informative': 12}weight=[0.6, 0.4]",,classifier-data


to track results use .show() or .logs() or in CLI: 
!mlrun get run 580a0667136f44258db0afde06e80221  , !mlrun logs 580a0667136f44258db0afde06e80221 
[mlrun] 2020-04-25 14:32:00,274 run executed, status=completed
