In [1]:
# nuclio: ignore
import nuclio

In [2]:
import os
import json
import importlib
from cloudpickle import load

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.preprocessing import label_binarize
from sklearn.utils.multiclass import unique_labels

from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem
from mlrun.artifacts import TableArtifact, PlotArtifact

from mlrun.mlutils import plot_roc, plot_importance, gcf_clear

import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

def _gcf_clear(plt):
    plt.cla()
    plt.clf()
    plt.close()        

def test_classifier(
    context: MLClientCtx,
    models_path: str,
    test_set: str,
    label_column: str,
    score_method: str = 'micro',
    plots_dest: str = ""
) -> None:
    """Test one or more classifier models against a held-out dataset

    Using held-out test features, evaluates the performance of the estimated
    model(s) and logs plots (roc, confusion matrix) and results (rocauc,
    avg_precscore, accuracy, f1_score) to the function context.

    Can be part of a kubeflow pipeline as a test step that is run post EDA and
    training/validation cycles

    Results are logged under fixed keys, so when ``models_path`` is a folder
    holding several models, presumably the last one evaluated (in sorted
    filename order) is the one whose results remain -- verify against the
    context implementation.

    :param context:         the function context
    :param models_path:     path of a single ``.pkl`` model file, or of a
                            folder whose ``.pkl`` files are all evaluated
    :param test_set:        parquet file with test features and labels
    :param label_column:    column name for ground truth labels
    :param score_method:    averaging method, passed as ``average`` to
                            ``average_precision_score`` and ``f1_score``
    :param plots_dest:      dir for test plots
    """
    xtest = pd.read_parquet(str(test_set))
    ytest = xtest.pop(label_column)

    context.header = list(xtest.columns.values)

    def _eval_model(model):
        """Evaluate a single pickled classifier (encloses all except model)."""
        # SECURITY: unpickling executes arbitrary code -- only load model
        # files that come from a trusted source.
        with open(model, "rb") as model_file:
            clf = load(model_file)

        ypred = clf.predict(xtest.values)

        # Binarize with the estimator's own class order so that the columns of
        # `ytestb` line up with the columns returned by `predict_proba`.
        # `ytest.unique()` is in order of appearance, which can misalign (or,
        # for binary labels, invert) the score-based metrics.
        classes = getattr(clf, "classes_", None)
        if classes is None:
            classes = np.unique(ytest)
        # binary labels come back as a single column
        ytestb = label_binarize(ytest, classes=classes)

        # `getattr` needs a default: estimators without `predict_proba` must
        # fall through instead of raising AttributeError.
        y_score = None
        if callable(getattr(clf, "predict_proba", None)):
            y_score = clf.predict_proba(xtest.values)
            context.logger.info(f"y_score.shape {y_score.shape}")
            context.logger.info(f"ytestb.shape {ytestb.shape}")
            plot_roc(context, ytestb, y_score, key='roc', plots_dir=plots_dest)
        else:
            context.logger.info(
                "model has no predict_proba, skipping roc, rocauc and avg_precscore")

        gcf_clear(plt)
        # use sklearn >= v0.22 built in:
        # NOTE(review): plot_confusion_matrix is deprecated/removed in newer
        # scikit-learn releases -- confirm the version pinned in the image.
        metrics.plot_confusion_matrix(clf, xtest, ytest,
                                      labels=ytest.unique(), normalize='true')

        # os.path.join avoids the absolute "/confusion.html" that a plain
        # f-string produces when plots_dest is empty (the default).
        context.log_artifact(PlotArtifact("confusion", body=plt.gcf()),
                             local_path=os.path.join(plots_dest, "confusion.html"))

        # TODO: log feature importances (plot_importance) when the estimator
        # exposes `feature_importances_`.

        if y_score is not None:
            if ytestb.shape[1] > 1:
                # multiclass: one indicator column per class
                average_precision = metrics.average_precision_score(
                    ytestb, y_score, average=score_method)
                context.log_result("rocauc", metrics.roc_auc_score(ytestb, y_score))
            else:
                # binary: score against the probability of the second class
                average_precision = metrics.average_precision_score(
                    ytestb, y_score[:, 1], average=score_method)
                context.log_result("rocauc",
                                   metrics.roc_auc_score(ytestb, y_score[:, 1]))

            context.log_result("avg_precscore", average_precision)

        context.log_result("accuracy", float(clf.score(xtest, ytest)))
        context.log_result("f1_score", metrics.f1_score(ytest, ypred,
                                                        average=score_method))

    models_path = str(models_path)
    if models_path.endswith('.pkl'):
        _eval_model(models_path)
    else:
        # sorted: os.listdir order is arbitrary, keep evaluation deterministic
        for model in sorted(os.listdir(models_path)):
            if model.endswith('.pkl'):
                _eval_model(os.path.join(models_path, model))


In [None]:
# nuclio: end-code

### save

In [11]:
from mlrun import code_to_function

# build a job function object from this notebook's code
fn = code_to_function(
    "test_classifier",
    kind="job",
    with_doc=True,
    image="mlrun/ml-models",
)

# descriptive metadata (for templates and reuse)
fn.metadata.labels = {"author": "yjb"}
fn.metadata.categories = ["models", "testing"]
fn.spec.description = "test a classifier using held-out or new data"
fn.spec.default_handler = "test_classifier"

# save the function, then write its spec to function.yaml
fn.save()
fn.export("function.yaml")

[mlrun] 2020-03-29 21:05:30,961 saving function: test-classifier, tag: latest
[mlrun] 2020-03-29 21:05:30,992 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7fa64db53160>

### test

In [5]:
from mlrun import NewTask, import_function, mount_v3io

# load the published function from the hub and mount v3io on it
func = import_function("hub://test_classifier").apply(mount_v3io())
# alternative: use the spec exported by the save step
# func = import_function("function.yaml").apply(mount_v3io())

# In a pipeline setting, the models_path parameter would be the output of a
# training step
task_params = {
    "name": "tasks - test classifier",
    "params": {
        "models_path": "/User/artifacts/models",
        "test_set": "/User/artifacts/test_set.parquet",
        "label_column": "labels",
    },
}

run = func.run(NewTask(**task_params), artifact_path="/User/artifacts")

[mlrun] 2020-03-26 15:30:16,927 starting run tasks - test classifier uid=7cfeb84664524a3a8713d62b10dd66ef  -> http://mlrun-api:8080
[mlrun] 2020-03-26 15:30:17,027 Job is running in the background, pod: tasks---test-classifier-pdzhs
No handles with labels found to put in legend.
[mlrun] 2020-03-26 15:30:27,471 y_score.shape (57, 2)
[mlrun] 2020-03-26 15:30:27,471 ytestb.shape (57, 1)
[mlrun] 2020-03-26 15:30:27,583 log artifact roc at /User/artifacts/plots/roc.html, size: 31054, db: Y
[mlrun] 2020-03-26 15:30:27,697 log artifact confusion at /User/artifacts/plots/confusion.html, size: 20680, db: Y
[mlrun] 2020-03-26 15:30:27,698 y_score.shape (57, 2)
[mlrun] 2020-03-26 15:30:27,698 yvalidb.shape (57, 1)
[mlrun] 2020-03-26 15:30:27,713 log artifact TODAYS-MODELS-TEST-REPORT at /User/artifacts/model.pkl, size: None, db: Y
[mlrun] 2020-03-26 15:30:27,724 log artifact DEPLOY at /User/artifacts/DEPLOY, size: 4, db: Y

[mlrun] 2020-03-26 15:30:27,737 run executed, status=completed
final stat

uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...dd66ef,0,Mar 26 15:30:27,completed,tasks - test classifier,host=tasks---test-classifier-pdzhskind=jobowner=admin,,label_column=labelsmodels_dir=/User/artifacts/modelstest_set=/User/artifacts/test_set.parquet,accuracy=0.9298245614035088avg_precscore=0.9920335330006206f1_score=0.9298245614035088rocauc=0.9900744416873448,rocconfusionTODAYS-MODELS-TEST-REPORTDEPLOY


to track results use .show() or .logs() or in CLI: 
!mlrun get run 7cfeb84664524a3a8713d62b10dd66ef  , !mlrun logs 7cfeb84664524a3a8713d62b10dd66ef 
[mlrun] 2020-03-26 15:30:36,259 run executed, status=completed
