In [None]:
# nuclio: ignore
import nuclio

In [None]:
# nuclio magics: set the function kind to "job" and its container image to
# "mlrun/ml-models" (presumably picked up when the notebook is converted to a function)
%nuclio config kind = "job"
%nuclio config spec.image = "mlrun/ml-models"

In [None]:
import os
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from typing import Optional, List, Any
from sklearn.datasets import make_classification

from mlrun.execution import MLClientCtx


def gen_class_data(
    context: MLClientCtx,
    n_samples: int,
    m_features: int,
    k_classes: int,
    header: Optional[List[str]] = None,
    label_column: Optional[str] = "labels",
    weight: float = 0.5,
    random_state: int = 1,
    key: str = "classifier-data",
    file_ext: str = "parquet",
    sk_params: Optional[dict] = None,
):
    """Create a simulated classification dataset and log it as an artifact.

    The features and the label column are generated with
    `sklearn.datasets.make_classification`, concatenated into a single
    table (features first, label column last) and logged through
    `context.log_dataset` under `key`.

    Additional scikit-learn parameters can be set using **sk_params, please see https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html for more details.

    :param context:       function context
    :param n_samples:     number of rows/samples
    :param m_features:    number of cols/features
    :param k_classes:     number of classes
    :param header:        header for features array, when empty the columns
                          are named "feat_0" ... "feat_{m_features-1}"
    :param label_column:  column name of ground-truth series
    :param weight:        fraction of sample negative value (ground-truth=0),
                          or a list of per-class fractions; a single number is
                          wrapped in a one-element list because scikit-learn
                          expects a sequence of class weights
    :param random_state:  rng seed (see https://scikit-learn.org/stable/glossary.html#term-random-state)
    :param key:           key of data in artifact store
    :param file_ext:      format used when logging the dataset (e.g. "parquet" or "csv")
    :param sk_params:     additional parameters for `sklearn.datasets.make_classification`
    """
    # `weights` must be a sequence; passing the bare float default used to fail
    # inside scikit-learn, so a scalar is promoted to a one-element list.
    if isinstance(weight, (int, float)):
        class_weights = [weight]
    else:
        class_weights = weight

    # never share (or alias) a mutable default between calls
    extra_params = dict(sk_params) if sk_params else {}

    features, labels = make_classification(
        n_samples=n_samples,
        n_features=m_features,
        weights=class_weights,
        n_classes=k_classes,
        random_state=random_state,
        **extra_params)

    # make dataframes, add column names, concatenate (X, y)
    X = pd.DataFrame(features)
    if header:
        X.columns = header
    else:
        X.columns = ["feat_" + str(x) for x in range(m_features)]

    y = pd.DataFrame(labels, columns=[label_column])
    data = pd.concat([X, y], axis=1)

    context.log_dataset(key, df=data, format=file_ext, index=False)

In [None]:
# nuclio: end-code

### mlconfig

In [None]:
from mlrun import mlconf
import os

# Keep any value mlrun already has configured; only fill in fallbacks.
mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
# expanduser("~") yields the same path as $HOME when it is set, but does not
# raise KeyError when the variable is missing from the environment.
mlconf.artifact_path = mlconf.artifact_path or f'{os.path.expanduser("~")}/artifacts'

### save

In [None]:
from mlrun import code_to_function

# build the job function object from this notebook's code
fn = code_to_function("gen_class_data")

# descriptive metadata (for templates and reuse)
fn.metadata.labels = dict(author="yjb", framework="sklearn")
fn.metadata.categories = ["simulators", "ml"]
fn.spec.description = "simulate classification data using scikit-learn"
fn.spec.default_handler = "gen_class_data"

# write the function definition next to the notebook
fn.export("function.yaml")

### test

In [None]:
# load function from marketplace
from mlrun import import_function

# vcs_branch = 'development'
# base_vcs = f'https://raw.githubusercontent.com/mlrun/functions/{vcs_branch}/'
# mlconf.hub_url = mlconf.hub_url or base_vcs + f'{name}/function.yaml'
# fn = import_function("hub://gen_class_data")

In [None]:
from mlrun import import_function

if "V3IO_HOME" not in os.environ:
    # if you set up mlrun using the instructions at https://github.com/mlrun/mlrun/blob/master/hack/local/README.md
    from mlrun.platforms import mount_pvc
    fn.apply(mount_pvc('nfsvol', 'nfsvol', '/home/joyan/data'))
else:
    # V3IO_HOME is defined: running on the iguazio data fabric
    from mlrun import mount_v3io
    fn.apply(mount_v3io())

In [None]:
from mlrun import NewTask    

# Task definition: run name plus the handler parameters for gen_class_data.
task_params = dict(
    name="tasks generate classification data",
    params=dict(
        n_samples=100_000,
        m_features=64,
        k_classes=2,
        weight=[0.4, 0.6],
        sk_params=dict(n_informative=12),
        file_ext="csv",
    ),
)

In [None]:
# run the function with the task above, using the configured artifact path
run = fn.run(
    NewTask(**task_params),
    artifact_path=mlconf.artifact_path,
)