In [1]:
# nuclio: ignore
import nuclio

In [2]:
%nuclio config kind = "job"
%nuclio config spec.image = "mlrun/ml-models"

%nuclio: setting kind to 'job'
%nuclio: setting spec.image to 'mlrun/ml-models'


In [2]:
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

import json
import os

from cloudpickle import dumps, load, dump

from sklearn import metrics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split
from sklearn import metrics

from typing import List
from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem
from mlrun.artifacts import PlotArtifact
from mlrun.mlutils import (get_class_fit, create_class,
                           plot_roc, plot_importance,
                           gcf_clear)

def train_model(
    context: MLClientCtx,
    model_pkg_class: str,
    dataset: DataItem,
    label_column: str = "labels",
    sample: int = -1,
    test_size: float = 0.05,
    train_val_split: float = 0.75,
    rng: int = 1,
    model_filename: str = "model",
    models_dest: str = "",
    cmap = plt.cm.Blues,
    plots_dest: str = "",
    score_method: str = "micro",
    file_ext: str = "parquet",
    model_pkg_file: str = "",    
) -> None:
    """train a classifier.

    :param context:           the function context
    :param model_pkg_class:   the model to train, e.g, "sklearn.neural_networks.MLPClassifier", 
                              or json model config
    :param dataset:           ("data") name of raw data file
    :param label_column:      ground-truth (y) labels
    :param sample:            Selects the first n rows, or select a sample
                              starting from the first. If negative <-1, select
                              a random sample
    :param model_filename:    model file filename,
                              points to a directory
    :param test_size:         (0.05) test set size
    :param train_val_split:   (0.75) Once the test set has been removed the
                              training set gets this proportion.
    :param rng:               (1) sklearn rng seed
    :param models_dest:       models subfolder on artifact path
    :param cmap:              matplotlib `Colormap`
    :param plots_dest:        plot subfolder on artifact path
    :param score_method:      for multiclass classification
    
    :param file_ext:          format for test_set_key hold out data
    :param model_pkg_file:    json model config file                                  
    """
    table = dataset.as_df()
    
    models_dest = models_dest or 'models'
    plots_dest = plots_dest or f'plots/{context.name}'
    
    if (sample == -1) or (sample >= 1):
        # get all rows, or contiguous sample starting at row 1.
        raw = table.dropna()
        labels = raw.pop(label_column)
        raw = raw.iloc[:sample, :]
        labels = labels.iloc[:sample]
    else:
        # grab a random sample
        raw = table.dropna().sample(sample * -1)
        labels = raw.pop(label_column)

    context.header = raw.columns.values
    
    yb = label_binarize(labels, classes=labels.unique())
    
    # here we hide the binary encoded labels inside the X matrix so that when splitting we preserve order in both the encoded
    # and non-encoded labels:
    x, xtest, y, ytest = train_test_split(np.concatenate([raw, yb], axis=1), labels, test_size=test_size, random_state=rng)
    xtrain, xvalid, ytrain, yvalid = train_test_split(x, y, train_size=train_val_split, random_state=rng)
    # now extract the hot_encoded labels
    ytrainb = xtrain[:, -yb.shape[1]:].copy()
    xtrain = xtrain[:, :-yb.shape[1]].copy()
    # extract the hot_encoded labels
    yvalidb = xvalid[:, -yb.shape[1]:].copy()
    xvalid = xvalid[:, :-yb.shape[1]].copy()
    # extract the hot_encoded labels
    ytestb = xtest[:, -yb.shape[1]:].copy()
    xtest = xtest[:, :-yb.shape[1]].copy()                                      
    
    test_set = pd.concat(
        [pd.DataFrame(data=xtest, columns=context.header),
         pd.DataFrame(data=ytest.values, columns=[label_column])],
        axis=1,)
    context.log_dataset('test_set', df=test_set, format=file_ext, index=False)

    if model_pkg_file:
        model_config = json.load(open(model_pkg_file, "r"))
    elif model_pkg_class:
        model_config = get_class_fit(model_pkg_class)
    else:
        raise ValueError('model_pkg_file or model_pkg_class must be provided')
    
    for k, v in context.parameters.items():
        if k.startswith('CLASS_'):
            model_config['CLASS'][k[6:]] = v
        if k.startswith('FIT_'):
            model_config['FIT'][k[4:]] = v

    model_config["FIT"].update({"X": xtrain,"y": ytrain.values})
    
    ClassifierClass = create_class(model_config["META"]["class"])
    model = ClassifierClass(**model_config["CLASS"])
    model.fit(**model_config["FIT"])
    
    data = dumps(model)
    try:
        accuracy = float(model.score(xvalid, yvalid))
    except:
        context.logger.info('Error while calculating accuracy')

    context.log_model('model', body=data, model_dir=models_dest, 
                      model_file=f"{model_filename.split('.')[-1]}.pkl",
                      metrics={'accuracy': accuracy})

    ypred = model.predict(xvalid)
    y_score = model.predict_proba(xvalid)
    
    try:
        if yvalidb.shape[1] > 1:
            average_precision = metrics.average_precision_score(yvalidb,
                                                                y_score,
                                                                average=score_method)
            context.log_result(f"rocauc", metrics.roc_auc_score(yvalidb, y_score))
        else:
            average_precision = metrics.average_precision_score(yvalidb,
                                                                y_score[:, 1],
                                                                average=score_method)
            context.log_result(f"rocauc", metrics.roc_auc_score(yvalidb, y_score[:, 1]))
    except:
        context.logger.info('Error while calculating precision')
        
    context.log_result(f"accuracy", accuracy)
    try:
        context.log_result(f"f1_score", metrics.f1_score(yvalid, ypred,
                                                         average=score_method))
    except:
        context.logger.info('Error while calculating f1_score')

    # TODO: missing validation plots, callbacks need to reintroduced
    
    plot_roc(context, yvalidb, y_score, key="roc", plots_dir=plots_dest)
    gcf_clear(plt)
    metrics.plot_confusion_matrix(model, xvalid, yvalid, labels=labels.unique(), normalize='true', cmap=cmap) 
    context.log_artifact(PlotArtifact("confusion", body=plt.gcf()), local_path=f"{plots_dest}/confusion.html")

In [4]:
# nuclio: end-code

### mlconfig

In [17]:
from mlrun import mlconf
import os

mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
mlconf.artifact_path = mlconf.artifact_path or f'{os.environ["HOME"]}/artifacts'

### save

In [3]:
from mlrun import code_to_function 
# create job function object from notebook code
fn = code_to_function("sklearn_classifier")

# add metadata (for templates and reuse)
fn.spec.default_handler = "train_model"
fn.spec.description = "train any classifier using scikit-learn's API"
fn.metadata.categories = ["ml", "training"]
fn.metadata.labels = {"author": "yjb", "framework": "sklearn"}
fn.export("function.yaml")

[mlrun] 2020-05-01 23:55:07,482 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f4b74372dd8>

## tests

In [19]:
# load function from marketplacen
from mlrun import import_function

# vcs_branch = 'development'
# base_vcs = f'https://raw.githubusercontent.com/mlrun/functions/{vcs_branch}/'
# mlconf.hub_url = mlconf.hub_url or base_vcs + f'{name}/function.yaml'
# fn = import_function("hub://sklearn_classifier")

In [7]:
if "V3IO_HOME" in list(os.environ):
    from mlrun import mount_v3io
    func.apply(mount_v3io())
else:
    # is you set up mlrun using the instructions at 
    # https://github.com/mlrun/mlrun/blob/master/hack/local/README.md
    from mlrun.platforms import mount_pvc
    func.apply(mount_pvc('nfsvol', 'nfsvol', '/home/joyan/data'))

In [8]:
## TODO: CLASS_xxx that aren't valid for a give algo, just report and skip, not fail

from mlrun import NewTask    

task_params = {
    "name" : "",
    "params" : {
        # CHOOSE YOUR MODEL AND CHANGE SOME DEFAULT PARAMETERS
       # "model_pkg_class"    : model_class,
        "model_filename"     : "model_pkg_class",
        "CLASS_random_state" : 1,
        #"CLASS_probability"  : True,  # USE ONLY FOR SVC
        "sample"             : -1,
        "label_column"       : "labels",
        "test_size"          : 0.10,
        "train_val_split"    : 0.75,
        "rng"                : 1,
        "models_dest"        : mlconf.artifact_path}}
  

### run remotely

In [9]:
MODELS = ["sklearn.ensemble.RandomForestClassifier", 
         "sklearn.linear_model.LogisticRegression",
         "sklearn.ensemble.AdaBoostClassifier",
         "lightgbm.LGBMClassifier",
         "xgboost.XGBClassifier"]

for model in MODELS:
    task_params.update({"params":{"model_pkg_class":model, "model_filename" : model}})
    task = NewTask(**task_params)
    run = func.run(
        task,
        inputs={"dataset"  : mlconf.artifact_path + "/iris.parquet"},
        artifact_path=mlconf.artifact_path+"/sklearn_classifier")

[mlrun] 2020-04-30 20:45:31,100 starting run sklearn-classifier-train_model uid=540121eef3914e45b6de48a122854840  -> http://mlrun-api:8080
[mlrun] 2020-04-30 20:45:31,220 Job is running in the background, pod: sklearn-classifier-train-model-9qq9p
No handles with labels found to put in legend.
[mlrun] 2020-04-30 20:45:35,662 log artifact test_set at /User/artifacts/sklearn_classifier/test_set.parquet, size: 4151, db: Y
[mlrun] 2020-04-30 20:45:35,862 log artifact model at models/RandomForestClassifier.pkl/model, size: 145634, db: Y
[mlrun] 2020-04-30 20:45:36,028 log artifact roc at /User/artifacts/sklearn_classifier/plots/sklearn-classifier-train_model/roc.html, size: 30962, db: Y
[mlrun] 2020-04-30 20:45:36,198 log artifact confusion at /User/artifacts/sklearn_classifier/plots/sklearn-classifier-train_model/confusion.html, size: 19828, db: Y

[mlrun] 2020-04-30 20:45:36,218 run executed, status=completed
final state: succeeded


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...22854840,0,Apr 30 20:45:35,completed,sklearn-classifier-train_model,host=sklearn-classifier-train-model-9qq9pkind=jobowner=adminv3io_user=admin,dataset,model_filename=sklearn.ensemble.RandomForestClassifiermodel_pkg_class=sklearn.ensemble.RandomForestClassifier,accuracy=1.0f1_score=1.0rocauc=1.0,test_setmodelrocconfusion


to track results use .show() or .logs() or in CLI: 
!mlrun get run 540121eef3914e45b6de48a122854840  , !mlrun logs 540121eef3914e45b6de48a122854840 
[mlrun] 2020-04-30 20:45:37,393 run executed, status=completed
[mlrun] 2020-04-30 20:45:37,394 starting run sklearn-classifier-train_model uid=e3b83b25df75414d8a706826694d95b0  -> http://mlrun-api:8080
[mlrun] 2020-04-30 20:45:37,499 Job is running in the background, pod: sklearn-classifier-train-model-5np74
No handles with labels found to put in legend.
[mlrun] 2020-04-30 20:45:41,960 log artifact test_set at /User/artifacts/sklearn_classifier/test_set.parquet, size: 4151, db: Y
[mlrun] 2020-04-30 20:45:42,021 log artifact model at models/LogisticRegression.pkl/model, size: 780, db: Y
[mlrun] 2020-04-30 20:45:42,168 log artifact roc at /User/artifacts/sklearn_classifier/plots/sklearn-classifier-train_model/roc.html, size: 30962, db: Y
[mlrun] 2020-04-30 20:45:42,318 log artifact confusion at /User/artifacts/sklearn_classifier/plots/sklear

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...694d95b0,0,Apr 30 20:45:41,completed,sklearn-classifier-train_model,host=sklearn-classifier-train-model-5np74kind=jobowner=adminv3io_user=admin,dataset,model_filename=sklearn.linear_model.LogisticRegressionmodel_pkg_class=sklearn.linear_model.LogisticRegression,accuracy=1.0f1_score=1.0rocauc=1.0,test_setmodelrocconfusion


to track results use .show() or .logs() or in CLI: 
!mlrun get run e3b83b25df75414d8a706826694d95b0  , !mlrun logs e3b83b25df75414d8a706826694d95b0 
[mlrun] 2020-04-30 20:45:43,658 run executed, status=completed
[mlrun] 2020-04-30 20:45:43,659 starting run sklearn-classifier-train_model uid=9c7c7727b88c48a4af1db71e692f90bb  -> http://mlrun-api:8080
[mlrun] 2020-04-30 20:45:43,763 Job is running in the background, pod: sklearn-classifier-train-model-9p6p9
No handles with labels found to put in legend.
[mlrun] 2020-04-30 20:45:47,885 log artifact test_set at /User/artifacts/sklearn_classifier/test_set.parquet, size: 4151, db: Y
[mlrun] 2020-04-30 20:45:48,021 log artifact model at models/AdaBoostClassifier.pkl/model, size: 29487, db: Y
[mlrun] 2020-04-30 20:45:48,188 log artifact roc at /User/artifacts/sklearn_classifier/plots/sklearn-classifier-train_model/roc.html, size: 30962, db: Y
[mlrun] 2020-04-30 20:45:48,341 log artifact confusion at /User/artifacts/sklearn_classifier/plots/skle

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...692f90bb,0,Apr 30 20:45:47,completed,sklearn-classifier-train_model,host=sklearn-classifier-train-model-9p6p9kind=jobowner=adminv3io_user=admin,dataset,model_filename=sklearn.ensemble.AdaBoostClassifiermodel_pkg_class=sklearn.ensemble.AdaBoostClassifier,accuracy=1.0f1_score=1.0rocauc=1.0,test_setmodelrocconfusion


to track results use .show() or .logs() or in CLI: 
!mlrun get run 9c7c7727b88c48a4af1db71e692f90bb  , !mlrun logs 9c7c7727b88c48a4af1db71e692f90bb 
[mlrun] 2020-04-30 20:45:52,981 run executed, status=completed
[mlrun] 2020-04-30 20:45:52,982 starting run sklearn-classifier-train_model uid=5ab34590bbda407495ab81aae0cadd35  -> http://mlrun-api:8080
[mlrun] 2020-04-30 20:45:53,092 Job is running in the background, pod: sklearn-classifier-train-model-h4h74
No handles with labels found to put in legend.
[mlrun] 2020-04-30 20:45:57,261 log artifact test_set at /User/artifacts/sklearn_classifier/test_set.parquet, size: 4151, db: Y
[mlrun] 2020-04-30 20:45:57,755 log artifact model at models/LGBMClassifier.pkl/model, size: 162515, db: Y
[mlrun] 2020-04-30 20:45:57,965 log artifact roc at /User/artifacts/sklearn_classifier/plots/sklearn-classifier-train_model/roc.html, size: 30962, db: Y
[mlrun] 2020-04-30 20:45:58,139 log artifact confusion at /User/artifacts/sklearn_classifier/plots/sklearn

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...e0cadd35,0,Apr 30 20:45:57,completed,sklearn-classifier-train_model,host=sklearn-classifier-train-model-h4h74kind=jobowner=adminv3io_user=admin,dataset,model_filename=lightgbm.LGBMClassifiermodel_pkg_class=lightgbm.LGBMClassifier,accuracy=1.0f1_score=1.0rocauc=1.0,test_setmodelrocconfusion


to track results use .show() or .logs() or in CLI: 
!mlrun get run 5ab34590bbda407495ab81aae0cadd35  , !mlrun logs 5ab34590bbda407495ab81aae0cadd35 
[mlrun] 2020-04-30 20:46:02,350 run executed, status=completed
[mlrun] 2020-04-30 20:46:02,351 starting run sklearn-classifier-train_model uid=c2aa2c1688d2499988e548c5908e7473  -> http://mlrun-api:8080
[mlrun] 2020-04-30 20:46:02,461 Job is running in the background, pod: sklearn-classifier-train-model-98tvx
No handles with labels found to put in legend.
[mlrun] 2020-04-30 20:46:06,872 log artifact test_set at /User/artifacts/sklearn_classifier/test_set.parquet, size: 4151, db: Y
[mlrun] 2020-04-30 20:46:07,798 log artifact model at models/XGBClassifier.pkl/model, size: 86044, db: Y
[mlrun] 2020-04-30 20:46:07,987 log artifact roc at /User/artifacts/sklearn_classifier/plots/sklearn-classifier-train_model/roc.html, size: 30962, db: Y
[mlrun] 2020-04-30 20:46:08,180 log artifact confusion at /User/artifacts/sklearn_classifier/plots/sklearn-c

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...908e7473,0,Apr 30 20:46:06,completed,sklearn-classifier-train_model,host=sklearn-classifier-train-model-98tvxkind=jobowner=adminv3io_user=admin,dataset,model_filename=xgboost.XGBClassifiermodel_pkg_class=xgboost.XGBClassifier,accuracy=0.9722222222222222f1_score=0.9722222222222222rocauc=1.0,test_setmodelrocconfusion


to track results use .show() or .logs() or in CLI: 
!mlrun get run c2aa2c1688d2499988e548c5908e7473  , !mlrun logs c2aa2c1688d2499988e548c5908e7473 
[mlrun] 2020-04-30 20:46:11,696 run executed, status=completed
