In [None]:
# nuclio: ignore
import nuclio

In [None]:
# nuclio function configuration: function kind is "job", container image is "mlrun/ml-models"
%nuclio config kind = "job"
%nuclio config spec.image = "mlrun/ml-models"

In [3]:
import warnings

# Silence FutureWarning messages for the rest of the session.
warnings.simplefilter("ignore", FutureWarning)

In [4]:
import os
import pandas as pd
from mlrun.datastore import DataItem
from mlrun.artifacts import get_model

def test_classifier(
    context,
    models_path: DataItem, 
    test_set: DataItem,
    label_column: str,
    plots_dest: str = "",
) -> None:
    """Load a held-out test set and a pickled classifier model for testing

    Reads the test set into a dataframe, separates the ground-truth label
    column from the features, loads the model referenced by ``models_path``
    and prints it.

    NOTE: the evaluation and logging steps that should follow are still
    commented out below, so no predictions, metrics or plots are produced yet,
    and ``context`` and ``plots_dest`` are currently unused.

    Can be part of a kubeflow pipeline as a test step that is run post EDA and
    training/validation cycles

    :param context:         the function context
    :param models_path:     artifact models representing a file or a folder; its ``url``
                            is handed to ``get_model`` with the ``.pkl`` suffix
    :param test_set:        test features and labels
    :param label_column:    column name for ground truth labels
    :param plots_dest:      dir for test plots

    Not implemented (and not accepted as parameters): ``score_method`` for
    multiclass classification, and ``model_evaluator``, a specific method to
    generate the evaluation.
    """
    # load the test set as a dataframe; pop() removes the label column from
    # xtest, leaving only the features
    xtest = test_set.as_df()
    ytest = xtest.pop(label_column)

    # get_model returns three values; the first two are kept as the model file and model object
    model_file, model_obj, _ = get_model(models_path.url, suffix='.pkl')
    print(model_obj)

    # there could be different eval_models, type of model (xgboost, tfv1, tfv2...)
    # or how probabilities are calculated, etc...
#     if not model_evaluator:
#         # binary and multiclass
#         y_hat = eval_class_model(xtest, ytest, model_file)
    
#     elif model_evaluator == "multiclass":
#         pass
#     elif model_evaluator == "regression":
#         pass

#     # give the prediction columns titles/headers
#     if y_hat.ndim == 1 or y_hat.shape[1] == 1:
#         score_names = ["yscore"]
#     else:
#         score_names = ["yscore_" + str(x) for x in range(y_hat.shape[1])]

#     # log the test set and its predictions (should also bind model and metadata)
#     df = pd.concat([xtest, ytest, pd.DataFrame(y_hat, columns=score_names)], axis=1)
#     context.log_dataset("test_set_preds", df=df, format="parquet", index=False)

In [5]:
# nuclio: end-code

### mlconfig

In [6]:
from mlrun import mlconf
import os

# Apply a default only when mlrun has no value configured already.
mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
# expanduser("~") resolves the home directory even when $HOME is not set,
# whereas os.environ["HOME"] would raise KeyError in that case.
mlconf.artifact_path = mlconf.artifact_path or f'{os.path.expanduser("~")}/artifacts'

### save

In [7]:
from mlrun import code_to_function

# build a job function object out of this notebook's code
fn = code_to_function("test_classifier")

# descriptive metadata (for templates and reuse)
fn.metadata.labels = {"author": "yjb", "framework": "sklearn"}
fn.metadata.categories = ["ml", "test"]
fn.spec.description = "test a classifier using held-out or new data"
fn.spec.default_handler = "test_classifier"

# write the function spec to a yaml file
fn.export("function.yaml")

[mlrun] 2020-05-03 19:26:43,028 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7fa0fac3d5c0>

## tests

In [8]:
# Attach storage to the function: v3io when V3IO_HOME is set, otherwise a PVC.
# Membership is tested on os.environ directly; there is no need to copy it into a list.
if "V3IO_HOME" in os.environ:
    from mlrun import mount_v3io
    fn.apply(mount_v3io())
else:
    # if you set up mlrun using the instructions at https://github.com/mlrun/mlrun/blob/master/hack/local/README.md
    from mlrun.platforms import mount_pvc
    # NOTE(review): '/home/joyan/data' is a hard-coded, user-specific path; adjust it to your environment
    fn.apply(mount_pvc('nfsvol', 'nfsvol', '/home/joyan/data'))



In [9]:
# NOTE(review): the handler annotates models_path as a DataItem and reads its
# .url attribute; confirm that passing it under "params" (rather than as an
# input) delivers a DataItem and not a plain string.
task_params = dict(
    name="tasks - test classifier",
    params={
        # In a pipeline setting, the models_path parameter would be the output of a training step
        "models_path": mlconf.artifact_path + "/models/sklearn_classifier",
        "label_column": "labels",
        "plots_dest": mlconf.artifact_path + "/models/sklearn_classifier/plots",
    },
)

### run locally

In [10]:
from mlrun import run_local, NewTask

# run the handler locally, supplying the test set as an input
run = run_local(
    NewTask(**task_params),
    handler=test_classifier,
    inputs={"test_set": "test_set.parquet"},
    # optional overrides, currently disabled:
    # workdir="/User/artifacts/",
    # artifact_path=mlconf.artifact_path + "/test-classifier",
)

[mlrun] 2020-05-03 19:26:43,087 starting run tasks - test classifier uid=b05d3905d305462e91e618089842eb4f  -> http://mlrun-api:8080
[mlrun] 2020-05-03 19:26:43,175 Traceback (most recent call last):
  File "/User/repos/mlrun/mlrun/runtimes/local.py", line 184, in exec_from_params
    val = handler(*args_list)
  File "<ipython-input-4-48e61531480f>", line 29, in test_classifier
    xtest = test_set.as_df()
  File "/User/repos/mlrun/mlrun/datastore/base.py", line 198, in as_df
    df_module=df_module, format=format, **kwargs)
  File "/User/repos/mlrun/mlrun/datastore/base.py", line 117, in as_df
    return reader(self._join(key), **kwargs)
  File "/User/.pythonlibs/jupyter/lib/python3.6/site-packages/pandas/io/parquet.py", line 310, in read_parquet
    return impl.read(path, columns=columns, **kwargs)
  File "/User/.pythonlibs/jupyter/lib/python3.6/site-packages/pandas/io/parquet.py", line 125, in read
    path, columns=columns, **kwargs
  File "/conda/lib/python3.6/site-packages/pyarrow

Passed non-file path: /User/artifacts/test_set.parquet


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...9842eb4f,0,May 03 19:26:43,error,tasks - test classifier,host=jupyter-6c5fccf844-gxlrwkind=handlerowner=adminv3io_user=admin,test_set,label_column=labelsmodels_path=/User/artifacts/models/sklearn_classifierplots_dest=/User/artifacts/models/sklearn_classifier/plots,,


to track results use .show() or .logs() or in CLI: 
!mlrun get run b05d3905d305462e91e618089842eb4f --project default , !mlrun logs b05d3905d305462e91e618089842eb4f --project default
[mlrun] 2020-05-03 19:26:43,260 run executed, status=error


RunError: Passed non-file path: /User/artifacts/test_set.parquet

### remotely

In [None]:
from mlrun import NewTask

# `fn` is the function object built with code_to_function earlier in this
# notebook; no object named `func` is defined anywhere in it.
run = fn.run(
    NewTask(**task_params),
    inputs={"test_set": "test_set.parquet"},
    workdir="/User/artifacts",
    artifact_path="/User/artifacts/test-classifier",
)