In [1]:
# nuclio: ignore
import nuclio

In [2]:
%nuclio config kind = "job"
%nuclio config spec.image = "mlrun/ml-models"

%nuclio: setting kind to 'job'
%nuclio: setting spec.image to 'mlrun/ml-models'


In [3]:
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

import json
import os

from cloudpickle import dumps, load, dump

from sklearn import metrics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split
from sklearn import metrics

from typing import List
from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem
from mlrun.artifacts import PlotArtifact
from mlrun.mlutils import (get_class_fit, create_class,
                           plot_roc, feature_importances,
                           gcf_clear)

def train_model(
    context: MLClientCtx,
    model_pkg_class: str,
    dataset: DataItem,
    label_column: str = "labels",
    sample: int = -1,
    test_size: float = 0.05,
    train_val_split: float = 0.75,
    rng: int = 1,
    model_filename: str = "model",
    models_dest: str = "",
    cmap = plt.cm.Blues,
    plots_dest: str = "",
    score_method: str = "micro",
    file_ext: str = "parquet",
    model_pkg_file: str = "",    
) -> None:
    """train a classifier.

    :param context:           the function context
    :param model_pkg_class:   the model to train, e.g, "sklearn.neural_networks.MLPClassifier", 
                              or json model config
    :param dataset:           ("data") name of raw data file
    :param label_column:      ground-truth (y) labels
    :param sample:            Selects the first n rows, or select a sample
                              starting from the first. If negative <-1, select
                              a random sample
    :param model_filename:    model file filename,
                              points to a directory
    :param test_size:         (0.05) test set size
    :param train_val_split:   (0.75) Once the test set has been removed the
                              training set gets this proportion.
    :param rng:               (1) sklearn rng seed
    :param models_dest:       models subfolder on artifact path
    :param cmap:              matplotlib `Colormap`
    :param plots_dest:        plot subfolder on artifact path
    :param score_method:      for multiclass classification
    
    :param file_ext:          format for test_set_key hold out data
    :param model_pkg_file:    json model config file                                  
    """
    table = dataset.as_df()
    
    models_dest = models_dest or 'models'
    plots_dest = plots_dest or f'plots/{context.name}'
    
    if (sample == -1) or (sample >= 1):
        # get all rows, or contiguous sample starting at row 1.
        raw = table.dropna()
        labels = raw.pop(label_column)
        raw = raw.iloc[:sample, :]
        labels = labels.iloc[:sample]
    else:
        # grab a random sample
        raw = table.dropna().sample(sample * -1)
        labels = raw.pop(label_column)

    context.header = raw.columns.values
    
    yb = label_binarize(labels, classes=labels.unique())
    
    # here we hide the binary encoded labels inside the X matrix so that when splitting we preserve order in both the encoded
    # and non-encoded labels:
    x, xtest, y, ytest = train_test_split(np.concatenate([raw, yb], axis=1), labels, test_size=test_size, random_state=rng)
    xtrain, xvalid, ytrain, yvalid = train_test_split(x, y, train_size=train_val_split, random_state=rng)
    # now extract the hot_encoded labels
    ytrainb = xtrain[:, -yb.shape[1]:].copy()
    xtrain = xtrain[:, :-yb.shape[1]].copy()
    # extract the hot_encoded labels
    yvalidb = xvalid[:, -yb.shape[1]:].copy()
    xvalid = xvalid[:, :-yb.shape[1]].copy()
    # extract the hot_encoded labels
    ytestb = xtest[:, -yb.shape[1]:].copy()
    xtest = xtest[:, :-yb.shape[1]].copy()                                      
    
    test_set = pd.concat(
        [pd.DataFrame(data=xtest, columns=context.header),
         pd.DataFrame(data=ytest.values, columns=[label_column])],
        axis=1,)
    context.log_dataset('test_set', df=test_set, format=file_ext, index=False)

    if model_pkg_file:
        model_config = json.load(open(model_pkg_file, "r"))
    elif model_pkg_class:
        model_config = get_class_fit(model_pkg_class)
    else:
        raise ValueError('model_pkg_file or model_pkg_class must be provided')
    
    for k, v in context.parameters.items():
        if k.startswith('CLASS_'):
            model_config['CLASS'][k[6:]] = v
        if k.startswith('FIT_'):
            model_config['FIT'][k[4:]] = v

    model_config["FIT"].update({"X": xtrain,"y": ytrain.values})
    
    ClassifierClass = create_class(model_config["META"]["class"])
    model = ClassifierClass(**model_config["CLASS"])
    model.fit(**model_config["FIT"])
    
    data = dumps(model)
    ypred = model.predict(xvalid)
    y_score = model.predict_proba(xvalid)
    
    try:
        if yvalidb.shape[1] > 1:
            average_precision = metrics.average_precision_score(yvalidb,
                                                                y_score,
                                                                average=score_method)
            context.log_result(f"rocauc", metrics.roc_auc_score(yvalidb, y_score))
        else:
            average_precision = metrics.average_precision_score(yvalidb,
                                                                y_score[:, 1],
                                                                average=score_method)
            context.log_result(f"rocauc", metrics.roc_auc_score(yvalidb, y_score[:, 1]))
    except:
        context.logger.info('Error while calculating precision')
        
    try:
        context.log_result(f"accuracy", float(model.score(xvalid, yvalid)))
    except:
        context.logger.info('Error while calculating accuracy')
    try:
        context.log_result(f"f1_score", metrics.f1_score(yvalid, ypred,
                                                         average=score_method))
    except:
        context.logger.info('Error while calculating f1_score')

    # TODO: missing validation plots, callbacks need to reintroduced
    plot_roc(context, yvalidb, y_score, key="roc", plots_dir=plots_dest)
    gcf_clear(plt)
    metrics.plot_confusion_matrix(model, xvalid, yvalid, labels=labels.unique(), normalize='true', cmap=cmap) 
    confusion = context.log_artifact(PlotArtifact("confusion", body=plt.gcf()), local_path=f"{plots_dest}/confusion.html")
    
    context.log_model('model', body=data, model_dir=models_dest, 
                      model_file=f"{model_filename.split('.')[-1]}.pkl",
                      metrics=context.results, 
                      extra_data={'confusion': confusion.target_path})    

In [4]:
# nuclio: end-code

### mlconfig

In [5]:
from mlrun import mlconf
import os

mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
mlconf.artifact_path = mlconf.artifact_path or f'{os.environ["HOME"]}/artifacts'

### save

In [6]:
from mlrun import code_to_function 
# create job function object from notebook code
fn = code_to_function("sklearn_classifier")

# add metadata (for templates and reuse)
fn.spec.default_handler = "train_model"
fn.spec.description = "train any classifier using scikit-learn's API"
fn.metadata.categories = ["ml", "training"]
fn.metadata.labels = {"author": "yjb", "framework": "sklearn"}
fn.export("function.yaml")

[mlrun] 2020-05-14 22:27:18,030 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7fb15dc3bcf8>

## tests

In [7]:
if "V3IO_HOME" in list(os.environ):
    from mlrun import mount_v3io
    fn.apply(mount_v3io())
else:
    # is you set up mlrun using the instructions at 
    # https://github.com/mlrun/mlrun/blob/master/hack/local/README.md
    from mlrun.platforms import mount_pvc
    fn.apply(mount_pvc('nfsvol', 'nfsvol', '/home/jovyan/data'))

In [8]:
from mlrun import NewTask    

task_params = {
    "name" : "",
    "params" : {
        # CHOOSE YOUR MODEL AND CHANGE SOME DEFAULT PARAMETERS
       # "model_pkg_class"    : model_class,
        "CLASS_random_state" : 1,
        #"CLASS_probability"  : True,  # USE ONLY FOR SVC
        "sample"             : -1,
        "label_column"       : "labels",
        "test_size"          : 0.10,
        "train_val_split"    : 0.75,
        "rng"                : 1,
        "models_dest"        : 'sklearn_models'}}
  

### run remotely

In [9]:
DATA_URL = "https://raw.githubusercontent.com/yjb-ds/testdata/master/data/classifier-data.csv"

In [10]:
MODELS = ["sklearn.ensemble.RandomForestClassifier", 
         "sklearn.linear_model.LogisticRegression",
         "sklearn.ensemble.AdaBoostClassifier",
         "lightgbm.LGBMClassifier",
         "xgboost.XGBClassifier"]

for model in MODELS:
    task_params.update({"params":{"model_pkg_class":model, 
                                  "models_dest" : f"sklearn/{model}"}})
    task = NewTask(**task_params)
    run = fn.run(task, name=model.replace('.', '_'),
                 inputs={"dataset"  : DATA_URL})

[mlrun] 2020-05-14 22:27:20,940 starting run sklearn_ensemble_RandomForestClassifier uid=d768e14e11934683abe6022780d24d6b  -> http://mlrun-api:8080
[mlrun] 2020-05-14 22:27:21,082 Job is running in the background, pod: sklearn-ensemble-randomforestclassifier-gpb9t
No handles with labels found to put in legend.
[mlrun] 2020-05-14 22:27:25,858 log artifact test_set at /User/artifacts/test_set.parquet, size: 4513, db: Y
[mlrun] 2020-05-14 22:27:26,200 log artifact roc at /User/artifacts/plots/sklearn_ensemble_RandomForestClassifier/roc.html, size: 32658, db: Y
[mlrun] 2020-05-14 22:27:26,354 log artifact confusion at /User/artifacts/plots/sklearn_ensemble_RandomForestClassifier/confusion.html, size: 22072, db: Y
[mlrun] 2020-05-14 22:27:26,375 log artifact model at /User/artifacts/sklearn/sklearn.ensemble.RandomForestClassifier/, size: 306403, db: Y

[mlrun] 2020-05-14 22:27:26,399 run executed, status=completed
final state: succeeded


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...80d24d6b,0,May 14 22:27:25,completed,sklearn_ensemble_RandomForestClassifier,host=sklearn-ensemble-randomforestclassifier-gpb9tkind=jobowner=adminv3io_user=admin,dataset,model_pkg_class=sklearn.ensemble.RandomForestClassifiermodels_dest=sklearn/sklearn.ensemble.RandomForestClassifier,accuracy=0.9159663865546218f1_score=0.9159663865546218rocauc=0.963691159586682,test_setrocconfusionmodel


to track results use .show() or .logs() or in CLI: 
!mlrun get run d768e14e11934683abe6022780d24d6b  , !mlrun logs d768e14e11934683abe6022780d24d6b 
[mlrun] 2020-05-14 22:27:30,295 run executed, status=completed
[mlrun] 2020-05-14 22:27:30,296 starting run sklearn_linear_model_LogisticRegression uid=399ed4af5e214a4481c7d05a52b645a3  -> http://mlrun-api:8080
[mlrun] 2020-05-14 22:27:30,427 Job is running in the background, pod: sklearn-linear-model-logisticregression-w24rr
No handles with labels found to put in legend.
[mlrun] 2020-05-14 22:27:34,862 log artifact test_set at /User/artifacts/test_set.parquet, size: 4513, db: Y
[mlrun] 2020-05-14 22:27:35,045 log artifact roc at /User/artifacts/plots/sklearn_linear_model_LogisticRegression/roc.html, size: 31370, db: Y
[mlrun] 2020-05-14 22:27:35,176 log artifact confusion at /User/artifacts/plots/sklearn_linear_model_LogisticRegression/confusion.html, size: 18952, db: Y
[mlrun] 2020-05-14 22:27:35,196 log artifact model at /User/artifacts

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...52b645a3,0,May 14 22:27:34,completed,sklearn_linear_model_LogisticRegression,host=sklearn-linear-model-logisticregression-w24rrkind=jobowner=adminv3io_user=admin,dataset,model_pkg_class=sklearn.linear_model.LogisticRegressionmodels_dest=sklearn/sklearn.linear_model.LogisticRegression,accuracy=0.8907563025210085f1_score=0.8907563025210086rocauc=0.9701492537313433,test_setrocconfusionmodel


to track results use .show() or .logs() or in CLI: 
!mlrun get run 399ed4af5e214a4481c7d05a52b645a3  , !mlrun logs 399ed4af5e214a4481c7d05a52b645a3 
[mlrun] 2020-05-14 22:27:36,584 run executed, status=completed
[mlrun] 2020-05-14 22:27:36,585 starting run sklearn_ensemble_AdaBoostClassifier uid=b5c2b74483cd4a4480129f2e00ca7c20  -> http://mlrun-api:8080
[mlrun] 2020-05-14 22:27:36,718 Job is running in the background, pod: sklearn-ensemble-adaboostclassifier-k9bjn
No handles with labels found to put in legend.
[mlrun] 2020-05-14 22:27:41,146 log artifact test_set at /User/artifacts/test_set.parquet, size: 4513, db: Y
[mlrun] 2020-05-14 22:27:41,401 log artifact roc at /User/artifacts/plots/sklearn_ensemble_AdaBoostClassifier/roc.html, size: 31842, db: Y
[mlrun] 2020-05-14 22:27:41,564 log artifact confusion at /User/artifacts/plots/sklearn_ensemble_AdaBoostClassifier/confusion.html, size: 22000, db: Y
[mlrun] 2020-05-14 22:27:41,601 log artifact model at /User/artifacts/sklearn/sklearn

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...00ca7c20,0,May 14 22:27:40,completed,sklearn_ensemble_AdaBoostClassifier,host=sklearn-ensemble-adaboostclassifier-k9bjnkind=jobowner=adminv3io_user=admin,dataset,model_pkg_class=sklearn.ensemble.AdaBoostClassifiermodels_dest=sklearn/sklearn.ensemble.AdaBoostClassifier,accuracy=0.8991596638655462f1_score=0.8991596638655462rocauc=0.9724454649827785,test_setrocconfusionmodel


to track results use .show() or .logs() or in CLI: 
!mlrun get run b5c2b74483cd4a4480129f2e00ca7c20  , !mlrun logs b5c2b74483cd4a4480129f2e00ca7c20 
[mlrun] 2020-05-14 22:27:45,967 run executed, status=completed
[mlrun] 2020-05-14 22:27:45,968 starting run lightgbm_LGBMClassifier uid=b89db035e29b4d878595ebfab170a08c  -> http://mlrun-api:8080
[mlrun] 2020-05-14 22:27:46,099 Job is running in the background, pod: lightgbm-lgbmclassifier-f4w72
No handles with labels found to put in legend.
[mlrun] 2020-05-14 22:27:50,615 log artifact test_set at /User/artifacts/test_set.parquet, size: 4513, db: Y
[mlrun] 2020-05-14 22:27:51,280 log artifact roc at /User/artifacts/plots/lightgbm_LGBMClassifier/roc.html, size: 31794, db: Y
[mlrun] 2020-05-14 22:27:51,461 log artifact confusion at /User/artifacts/plots/lightgbm_LGBMClassifier/confusion.html, size: 22244, db: Y
[mlrun] 2020-05-14 22:27:51,486 log artifact model at /User/artifacts/sklearn/lightgbm.LGBMClassifier/, size: 165449, db: Y

[mlrun] 

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...b170a08c,0,May 14 22:27:50,completed,lightgbm_LGBMClassifier,host=lightgbm-lgbmclassifier-f4w72kind=jobowner=adminv3io_user=admin,dataset,model_pkg_class=lightgbm.LGBMClassifiermodels_dest=sklearn/lightgbm.LGBMClassifier,accuracy=0.8991596638655462f1_score=0.8991596638655462rocauc=0.9784730195177955,test_setrocconfusionmodel


to track results use .show() or .logs() or in CLI: 
!mlrun get run b89db035e29b4d878595ebfab170a08c  , !mlrun logs b89db035e29b4d878595ebfab170a08c 
[mlrun] 2020-05-14 22:27:55,320 run executed, status=completed
[mlrun] 2020-05-14 22:27:55,320 starting run xgboost_XGBClassifier uid=18ab9d3118f243018799a983a8dbdd7f  -> http://mlrun-api:8080
[mlrun] 2020-05-14 22:27:55,441 Job is running in the background, pod: xgboost-xgbclassifier-6zppp
No handles with labels found to put in legend.
[mlrun] 2020-05-14 22:27:59,849 log artifact test_set at /User/artifacts/test_set.parquet, size: 4513, db: Y
[mlrun] 2020-05-14 22:28:00,477 log artifact roc at /User/artifacts/plots/xgboost_XGBClassifier/roc.html, size: 31638, db: Y
[mlrun] 2020-05-14 22:28:00,761 log artifact confusion at /User/artifacts/plots/xgboost_XGBClassifier/confusion.html, size: 19260, db: Y
[mlrun] 2020-05-14 22:28:00,786 log artifact model at /User/artifacts/sklearn/xgboost.XGBClassifier/, size: 60946, db: Y

[mlrun] 2020-05-14 

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...a8dbdd7f,0,May 14 22:27:59,completed,xgboost_XGBClassifier,host=xgboost-xgbclassifier-6zpppkind=jobowner=adminv3io_user=admin,dataset,model_pkg_class=xgboost.XGBClassifiermodels_dest=sklearn/xgboost.XGBClassifier,accuracy=0.9327731092436975f1_score=0.9327731092436976rocauc=0.9728760045924225,test_setrocconfusionmodel


to track results use .show() or .logs() or in CLI: 
!mlrun get run 18ab9d3118f243018799a983a8dbdd7f  , !mlrun logs 18ab9d3118f243018799a983a8dbdd7f 
[mlrun] 2020-05-14 22:28:04,649 run executed, status=completed
