In [1]:
# nuclio: ignore
import nuclio

In [2]:
# Function-level settings recorded by the nuclio magic: the runtime kind ("job")
# and the container image the function is configured with.
# NOTE(review): presumably picked up later by code_to_function when the notebook is converted — confirm.
%nuclio config kind = "job"
%nuclio config spec.image = "yjbds/ml-models:0.4.8"

%nuclio: setting kind to 'job'
%nuclio: setting spec.image to 'yjbds/ml-models:0.4.8'


In [3]:
import os
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from typing import Optional, List, Any
from sklearn.datasets import make_classification

from mlrun.execution import MLClientCtx


def gen_class_data(
    context: MLClientCtx,
    n_samples: int,
    m_features: int,
    k_classes: int,
    header: Optional[List[str]] = None,
    label_column: Optional[str] = "labels",
    weight: float = 0.5,
    random_state: int = 1,
    key: str = "classifier-data",
    file_ext: str = "parquet",
    sk_params: Optional[dict] = None,
):
    """Create a simulated classification dataset and log it as a dataset artifact.

    The features and the ground-truth labels are generated with
    `sklearn.datasets.make_classification`, combined into a single table
    (feature columns first, label column last) and logged through
    `context.log_dataset` under `key` using the format given by `file_ext`.

    Additional scikit-learn parameters can be set using `sk_params`, please see
    https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html
    for more details.

    :param context:       function context
    :param n_samples:     number of rows/samples
    :param m_features:    number of cols/features
    :param k_classes:     number of classes
    :param header:        header for features array; when empty or None the
                          columns are named "feat_0" ... "feat_{m_features-1}"
    :param label_column:  column name of ground-truth series
    :param weight:        either a single number, the fraction of samples with
                          ground-truth=0 (the remainder is split evenly among
                          the other classes), or a sequence of per-class
                          weights handed to scikit-learn unchanged
    :param random_state:  rng seed (see https://scikit-learn.org/stable/glossary.html#term-random-state)
    :param key:           key of data in artifact store
    :param file_ext:      format/extension used when logging the dataset
                          (e.g. "parquet" or "csv")
    :param sk_params:     dict of additional parameters for
                          `sklearn.datasets.make_classification`
    """
    # make_classification expects a per-class sequence for `weights`; a bare
    # number (including this function's own default) has to be expanded first.
    if isinstance(weight, (int, float)):
        other_classes = k_classes - 1
        if other_classes > 0:
            class_weights = [weight] + [(1.0 - weight) / other_classes] * other_classes
        else:
            class_weights = [weight]
    else:
        class_weights = weight

    # `sk_params or {}` avoids sharing a mutable default between calls.
    features, labels = make_classification(
        n_samples=n_samples,
        n_features=m_features,
        weights=class_weights,
        n_classes=k_classes,
        random_state=random_state,
        **(sk_params or {}))

    # make dataframes, add column names, concatenate (X, y)
    X = pd.DataFrame(features)
    if not header:
        X.columns = ["feat_" + str(x) for x in range(m_features)]
    else:
        X.columns = header

    y = pd.DataFrame(labels, columns=[label_column])
    data = pd.concat([X, y], axis=1)

    context.log_dataset(key, df=data, format=file_ext, index=False)

In [4]:
# nuclio: end-code

### mlconfig

In [5]:
from mlrun import mlconf
import os

# Keep any value that is already configured; otherwise fall back to the
# mlrun-api endpoint used by this notebook.
mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
# Default artifact location is "<home>/artifacts". expanduser resolves the home
# directory from HOME when it is set and falls back to the platform lookup
# instead of raising KeyError when the variable is missing.
mlconf.artifact_path = mlconf.artifact_path or os.path.join(os.path.expanduser("~"), "artifacts")

### save

In [6]:
from mlrun import code_to_function 
# create job function object from notebook code
# (only a name is passed here; the returned object is a KubejobRuntime, see the cell output)
fn = code_to_function("gen_class_data")

# add metadata (for templates and reuse)
# handler used when a run does not name one (the test run below starts "main.py # gen_class_data")
fn.spec.default_handler = "gen_class_data"
fn.spec.description = "simulate classification data using scikit-learn"
fn.metadata.categories = ["simulators", "ml"]
fn.metadata.labels = {"author": "yjb", 'framework': 'sklearn'}
# write the function spec to function.yaml in the current directory
fn.export("function.yaml")

[mlrun] 2020-05-26 01:39:14,336 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f17d92e48d0>

In [7]:
from mlutils import get_vol_mount
# NOTE(review): get_vol_mount is defined in the external `mlutils` package; presumably it
# returns a modifier that attaches a volume mount to the function — confirm against mlutils.
fn.apply(get_vol_mount())

<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f17d92e48d0>

### test

In [8]:
from mlrun import import_function

In [19]:
from mlrun import NewTask    

# Task definition for the test run: a run name plus the parameters handed to
# the gen_class_data handler.
task_params = {
    "name":        "tasks generate classification data", 
    "params" : {
        "n_samples"   : 10_000,
        "m_features"  : 24,
        "k_classes"   : 2,
        # one weight per class; two equal weights give a balanced dataset
        "weight"      : [0.5, 0.5],
        # extra keyword arguments unpacked into sklearn's make_classification
        "sk_params"   : {"n_informative": 6},
        # log the dataset as csv instead of the handler's default "parquet"
        "file_ext"    : "csv",
        # artifact key under which the generated dataset is logged
        "key"         : "clf-k2-m24-n10k-bal"}}

In [20]:
# Execute the task with the function object; the logged dataset artifact is
# written under mlconf.artifact_path (see the "log artifact" line in the output).
run = fn.run(
    NewTask(**task_params), artifact_path=mlconf.artifact_path)

[mlrun] 2020-05-26 01:41:08,082 starting run tasks generate classification data uid=52ce4d21880340958ad90d689e298238  -> http://mlrun-api:8080
[mlrun] 2020-05-26 01:41:08,183 Job is running in the background, pod: tasks-generate-classification-data-jjl8k
[mlrun] 2020-05-26 01:41:12,035 starting local run: main.py # gen_class_data
[mlrun] 2020-05-26 01:41:13,050 log artifact clf-k2-m24-n10k-bal at /User/artifacts/clf-k2-m24-n10k-bal.csv, size: 4714023, db: Y

[mlrun] 2020-05-26 01:41:13,094 run executed, status=completed
final state: succeeded


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...9e298238,0,May 26 01:41:12,completed,tasks generate classification data,v3io_user=adminkind=jobowner=adminhost=tasks-generate-classification-data-jjl8k,,"n_samples=10000m_features=24k_classes=2weight=[0.5, 0.5]sk_params={'n_informative': 6}file_ext=csvkey=clf-k2-m24-n10k-bal",,clf-k2-m24-n10k-bal


to track results use .show() or .logs() or in CLI: 
!mlrun get run 52ce4d21880340958ad90d689e298238  , !mlrun logs 52ce4d21880340958ad90d689e298238 
[mlrun] 2020-05-26 01:41:14,377 run executed, status=completed
