In [None]:
# nuclio: ignore
import nuclio

In [None]:
%nuclio config kind = "job"
%nuclio config spec.image = "mlrun/ml-models"

In [None]:
import warnings
# Ignore every warning whose category is FutureWarning.
warnings.simplefilter(action="ignore", category=FutureWarning)

In [None]:
import os
import pandas as pd
from mlrun.datastore import DataItem
from mlrun.artifacts import get_model

In [None]:
def _eval_model(context, xtest, ytest, model, score_method="micro", plots_dest="plots"):
    """Evaluate one pickled classifier against a held-out test set.

    Loads the estimator from ``model``, logs a confusion-matrix plot and, when
    the estimator supports them, ROC and feature-importance artifacts, then
    logs the ``rocauc``, ``avg_precscore``, ``accuracy`` and ``f1_score``
    results on ``context``.

    :param context:      the function context used to log artifacts and results
    :param xtest:        test features (a DataFrame; ``.columns``/``.values`` are used)
    :param ytest:        ground-truth labels aligned with ``xtest``
    :param model:        path to the pickled model file
    :param score_method: ``average`` argument for the average-precision and f1 scorers
    :param plots_dest:   NOTE(review): accepted but not used here; plot paths are
                         hard-coded under "plots" - confirm the intended behaviour

    :returns: the ``predict_proba`` scores when the estimator provides them,
              otherwise the predicted labels
    """
    from sklearn import metrics
    from cloudpickle import load
    import matplotlib.pyplot as plt
    from sklearn.preprocessing import label_binarize
    from sklearn.utils.multiclass import unique_labels
    from mlrun.artifacts import PlotArtifact
    from mlrun.mlutils import plot_roc, feature_importances, gcf_clear

    # NOTE(review): `columns.names` holds the names of the column index levels,
    # not the column labels - possibly `columns.values` was intended; unchanged.
    context.header = xtest.columns.names

    # SECURITY: unpickling executes arbitrary code - only load trusted model files.
    with open(model, "rb") as model_fp:
        clf = load(model_fp)

    # Binarize against the estimator's own class order so that the columns of
    # `ytestb` line up with the columns of `predict_proba`; fall back to the
    # sorted labels found in the test set.
    classes = getattr(clf, "classes_", None)
    if classes is None:
        classes = unique_labels(ytest)
    ytestb = label_binarize(ytest, classes=classes)

    ypred = clf.predict(xtest.values, validate_features=False)
    if callable(getattr(clf, "predict_proba", None)):
        y_score = clf.predict_proba(xtest.values, validate_features=False)
        plot_roc(context, ytestb, y_score, key='roc', plots_dir="plots")
    else:
        y_score = None

    gcf_clear(plt)
    metrics.plot_confusion_matrix(clf, xtest, ytest,
                                  labels=classes, normalize='true')

    # strip only the file extension (safe for dotted directories / no extension)
    model_stem = os.path.splitext(model)[0]
    context.log_artifact(PlotArtifact("confusion", body=plt.gcf()),
                         local_path=f"{model_stem}-confusion.html")

    if hasattr(clf, "feature_importances_"):
        plot, tbl = feature_importances(clf, list(xtest.columns))
        context.log_artifact(plot, local_path="plots/feature-importances.html")
        context.log_artifact(tbl, local_path="plots/feature-importances-table.csv")

    # Score-based metrics need probabilities; skip them for estimators without
    # `predict_proba` instead of failing on `None`.
    if y_score is not None:
        # binary targets binarize to a single column -> compare against the
        # probability of the second (positive) class
        scores = y_score if ytestb.shape[1] > 1 else y_score[:, 1]
        average_precision = metrics.average_precision_score(ytestb,
                                                            scores,
                                                            average=score_method)
        context.log_result("rocauc", metrics.roc_auc_score(ytestb, scores))
        context.log_result("avg_precscore", average_precision)

    context.log_result("accuracy", float(clf.score(xtest, ytest)))
    context.log_result("f1_score", metrics.f1_score(ytest, ypred,
                                                    average=score_method))

    # prefer probabilities; fall back to hard predictions when unavailable
    return ypred if y_score is None else y_score

In [None]:

def test_classifier(
    context,
    models_path: DataItem,
    test_set: DataItem,
    label_column: str,
    score_method: str = 'micro',
    plots_dest: str = "plots",
    model_evaluator = None
) -> None:
    """Test one or more classifier models against held-out dataset

    Using held-out test features, evaluates the performance of the estimated model

    Can be part of a kubeflow pipeline as a test step that is run post EDA and
    training/validation cycles

    :param context:         the function context
    :param models_path:     artifact models representing a file or a folder
    :param test_set:        test features and labels
    :param label_column:    column name for ground truth labels
    :param score_method:    for multiclass classification
    :param plots_dest:      dir for test plots
    :param model_evaluator: NOT IMPLEMENTED: specific method to generate eval, passed in as string
                            or available in this folder

    :raises NotImplementedError: if ``model_evaluator`` is supplied
    """
    # there could be different eval_models, type of model (xgboost, tfv1, tfv2...)
    # or how probabilities are calculated, etc... Only the default evaluator
    # exists today, so fail clearly instead of hitting an unbound `y_hat` later.
    if model_evaluator:
        raise NotImplementedError(
            "custom model_evaluator is not implemented; leave it unset")

    xtest = test_set.as_df()
    ytest = xtest.pop(label_column)

    model_file, _, _ = get_model(models_path.url, suffix='.pkl')

    # binary and multiclass
    y_hat = _eval_model(context, xtest, ytest, model_file,
                        score_method,
                        plots_dest or 'plots')

    # give the prediction columns titles/headers
    if y_hat.ndim == 1 or y_hat.shape[1] == 1:
        score_names = ["yscore"]
    else:
        score_names = ["yscore_" + str(x) for x in range(y_hat.shape[1])]

    # Reuse the test-set index: concat(axis=1) aligns on index, so a default
    # RangeIndex would misalign rows whenever the test set is indexed differently.
    preds = pd.DataFrame(y_hat, columns=score_names, index=xtest.index)

    # log the test set and its predictions (should also bind model and metadata)
    df = pd.concat([xtest, ytest, preds], axis=1)
    context.log_dataset("test_set_preds", df=df, format="parquet", index=False)

In [None]:
# nuclio: end-code

### mlconfig

In [None]:
from mlrun import mlconf
import os

# Keep any values that are already configured; otherwise fall back to defaults.
mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
# expanduser("~") resolves to $HOME when it is set, and still works when it is not
mlconf.artifact_path = mlconf.artifact_path or f'{os.path.expanduser("~")}/artifacts'

### save

In [None]:
from mlrun import code_to_function 
# create job function object from notebook code
fn = code_to_function("test_classifier")

# add metadata (for templates and reuse)
# the default handler matches the name of the function defined in this notebook
fn.spec.default_handler = "test_classifier"
fn.spec.description = "test a classifier using held-out or new data"
fn.metadata.categories = ["ml", "test"]
fn.metadata.labels = {"author": "yjb", "framework": "sklearn"}
# write the function definition to function.yaml
fn.export("function.yaml")

## tests

In [None]:
# Attach storage to the function: a v3io mount when V3IO_HOME is present in
# the environment, otherwise a PVC mount.
if "V3IO_HOME" in os.environ:
    from mlrun import mount_v3io
    fn.apply(mount_v3io())
else:
    # if you set up mlrun using the instructions at https://github.com/mlrun/mlrun/blob/master/hack/local/README.md
    from mlrun.platforms import mount_pvc
    fn.apply(mount_pvc('nfsvol', 'nfsvol', '/home/jovyan/data'))

In [None]:
# Keyword arguments for NewTask, used by both the local and the remote run.
# In a pipeline setting, the models_path parameter would be the output of a
# training step.
task_params = dict(
    name="tasks - test classifier",
    params=dict(
        models_path=f"{mlconf.artifact_path}/models",
        label_column="labels",
        plots_dest=f"{mlconf.artifact_path}/plots",
    ),
)

### run locally

In [None]:
# URL of a test-set parquet file (judging by its name).
# NOTE(review): the run cells in this notebook read
# "/User/artifacts/test_set.parquet" and never reference DATA_URL - confirm
# whether this constant is meant to be the `test_set` input.
DATA_URL = "https://raw.githubusercontent.com/yjb-ds/testdata/master/data/test_set.parquet"

In [None]:
from mlrun import run_local, NewTask

# Execute the notebook's test_classifier handler through run_local, using the
# artifact path as the working directory.
local_task = NewTask(**task_params)
local_inputs = {
    "test_set": "/User/artifacts/test_set.parquet",
    "models_path": "models",
}
run = run_local(
    local_task,
    handler=test_classifier,
    inputs=local_inputs,
    workdir=mlconf.artifact_path,
)

### remotely

In [None]:
from mlrun import NewTask
# Submit the same task through the exported function object.
remote_task = NewTask(**task_params)
remote_inputs = {
    "test_set": "/User/artifacts/test_set.parquet",
    "models_path": "models",
}
run = fn.run(
    remote_task,
    inputs=remote_inputs,
    workdir="/User/artifacts",
)