In [1]:
# nuclio: ignore
import nuclio

In [2]:
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

import json
import os

from cloudpickle import dumps, load, dump

from sklearn import metrics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split
from sklearn import metrics

from typing import List
from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem
from mlrun.artifacts import PlotArtifact
from mlrun.mlutils import (get_class_fit, create_class,
                           plot_roc, plot_importance,
                           gcf_clear)

def train_model(
    context: MLClientCtx,
    model_pkg_class: str,
    dataset: DataItem,
    label_column: str = "labels",
    sample: int = -1,
    test_size: float = 0.05,
    train_val_split: float = 0.75,
    rng: int = 1,
    model_filename: str = "model",
    models_dest: str = "",
    cmap = plt.cm.Blues,
    plots_dest: str = "",
    score_method: str = "micro",
    file_ext: str = "parquet",
    model_pkg_file: str = "",    
) -> None:
    """train a classifier.

    :param context:           the function context
    :param model_pkg_class:   the model to train, e.g, "sklearn.neural_networks.MLPClassifier", 
                              or json model config
    :param dataset:           ("data") name of raw data file
    :param label_column:      ground-truth (y) labels
    :param sample:            Selects the first n rows, or select a sample
                              starting from the first. If negative <-1, select
                              a random sample
    :param model_filename:    model file filename,
                              points to a directory
    :param test_size:         (0.05) test set size
    :param train_val_split:   (0.75) Once the test set has been removed the
                              training set gets this proportion.
    :param rng:               (1) sklearn rng seed
    :param models_dest:       models subfolder on artifact path
    :param cmap:              matplotlib `Colormap`
    :param plots_dest:        plot subfolder on artifact path
    :param score_method:      for multiclass classification
    
    :param file_ext:          format for test_set_key hold out data
    :param model_pkg_file:    json model config file                                  
    """
    srcfilepath = str(dataset)
    
    models_dest = models_dest or 'models'
    plots_dest = plots_dest or f'plots/{context.name}'
    
    if srcfilepath.endswith("csv"):
        reader = pd.read_csv
    elif srcfilepath.endswith("parquet") or srcfilepath.endswith("pq"):
        reader = pd.read_parquet
    else:
        raise Exception(f"file type unhandled {srcfilepath}")

    if (sample == -1) or (sample >= 1):
        # get all rows, or contiguous sample starting at row 1.
        raw = reader(srcfilepath).dropna()
        labels = raw.pop(label_column)
        raw = raw.iloc[:sample, :]
        labels = labels.iloc[:sample]
    else:
        # grab a random sample
        raw = reader(srcfilepath).dropna().sample(sample * -1)
        labels = raw.pop(label_column)

    context.header = raw.columns.values
    
    yb = label_binarize(labels, classes=labels.unique())
    
    # here we hide the binary encoded labels inside the X matrix so that when splitting we preserve order in both the encoded
    # and non-encoded labels:
    x, xtest, y, ytest = train_test_split(np.concatenate([raw, yb], axis=1), labels, test_size=test_size, random_state=rng)
    xtrain, xvalid, ytrain, yvalid = train_test_split(x, y, train_size=train_val_split, random_state=rng)
    # now extract the hot_encoded labels
    ytrainb = xtrain[:, -yb.shape[1]:].copy()
    xtrain = xtrain[:, :-yb.shape[1]].copy()
    # extract the hot_encoded labels
    yvalidb = xvalid[:, -yb.shape[1]:].copy()
    xvalid = xvalid[:, :-yb.shape[1]].copy()
    # extract the hot_encoded labels
    ytestb = xtest[:, -yb.shape[1]:].copy()
    xtest = xtest[:, :-yb.shape[1]].copy()                                      
    
    test_set = pd.concat(
        [pd.DataFrame(data=xtest, columns=context.header),
         pd.DataFrame(data=ytest.values, columns=[label_column])],
        axis=1,)
    context.log_dataset('test_set', df=test_set, format=file_ext, index=False)

    if model_pkg_file:
        model_config = json.load(open(model_pkg_file, "r"))
    elif model_pkg_class:
        model_config = get_class_fit(model_pkg_class)
    else:
        raise ValueError('model_pkg_file or model_pkg_class must be provided')
    
    for k, v in context.parameters.items():
        if k.startswith('CLASS_'):
            model_config['CLASS'][k[6:]] = v
        if k.startswith('FIT_'):
            model_config['FIT'][k[4:]] = v

    model_config["FIT"].update({"X": xtrain,"y": ytrain.values})
    
    ClassifierClass = create_class(model_config["META"]["class"])
    model = ClassifierClass(**model_config["CLASS"])
    model.fit(**model_config["FIT"])
    
    try:
        # need this for debug functions + demos repo
        dump(model, open(context.artifact_path+f"/models/{model_filename.split('.')[-1]}.pkl", "wb"))
        
        data = dumps(model)
        context.log_artifact('model', body=data, artifact_path=f"{models_dest}/{model_filename.split('.')[-1]}.pkl")
    except Exception as e:
        print("SERIALIZE MODEL ERROR:", str(e))

    ypred = model.predict(xvalid)
    y_score = model.predict_proba(xvalid)
    
    try:
        if yvalidb.shape[1] > 1:
            average_precision = metrics.average_precision_score(yvalidb,
                                                                y_score,
                                                                average=score_method)
            context.log_result(f"rocauc", metrics.roc_auc_score(yvalidb, y_score))
        else:
            average_precision = metrics.average_precision_score(yvalidb,
                                                                y_score[:, 1],
                                                                average=score_method)
            context.log_result(f"rocauc", metrics.roc_auc_score(yvalidb, y_score[:, 1]))
    except:
        context.logger.info('Error while calculating precision')
        
    try:
        context.log_result(f"accuracy", float(model.score(xvalid, yvalid)))
    except:
        context.logger.info('Error while calculating accuracy')
    try:
        context.log_result(f"f1_score", metrics.f1_score(yvalid, ypred,
                                                         average=score_method))
    except:
        context.logger.info('Error while calculating f1_score')

    # TODO: missing validation plots, callbacks need to reintroduced
    
    plot_roc(context, yvalidb, y_score, key="roc", plots_dir=plots_dest)
    gcf_clear(plt)
    metrics.plot_confusion_matrix(model, xvalid, yvalid, labels=labels.unique(), normalize='true', cmap=cmap) 
    context.log_artifact(PlotArtifact("confusion", body=plt.gcf()), local_path=f"{plots_dest}/confusion.html")

In [3]:
# nuclio: end-code

### mlconfig

In [4]:
from mlrun import mlconf

In [5]:
mlconf.dbpath = mlconf.dbpath or './'
mlconf.dbpath

'http://mlrun-api:8080'

In [6]:
vcs_branch = 'development'
base_vcs = f'https://raw.githubusercontent.com/mlrun/functions/{vcs_branch}/'

mlconf.hub_url = mlconf.hub_url or base_vcs + f'{name}/function.yaml'
mlconf.hub_url

'/User/repos/functions/{name}/function.yaml'

In [7]:
import os
mlconf.artifact_path = mlconf.artifact_path or f'{os.environ["V3IO_HOME"]}/artifacts'
mlconf.artifact_path

'/User/artifacts'

In [8]:
import os
TAG = os.environ['MLRUN_COMMIT']

### save

In [9]:
from mlrun import code_to_function 
# create job function object from notebook code
fn = code_to_function("sklearn_classifier", kind="job", with_doc=True,
                      image=f"mlrun/ml-models:{TAG}")

# add metadata (for templates and reuse)
fn.spec.default_handler = "train_model"
fn.spec.description = "train any classifier using scikit-learn's API"
fn.metadata.categories = ["models", "classifier"]
fn.metadata.labels = {"author": "yjb"}

fn.save()
fn.export("function.yaml")

[mlrun] 2020-04-30 01:23:00,191 saving function: sklearn-classifier, tag: latest
[mlrun] 2020-04-30 01:23:00,234 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f618a26b940>

## tests

In [10]:
from mlrun import import_function, mount_v3io

func = import_function("hub://sklearn_classifier")

if "V3IO_HOME" in list(os.environ):
    from mlrun import mount_v3io
    func.apply(mount_v3io())
else:
    # is you set up mlrun using the instructions at 
    # https://github.com/mlrun/mlrun/blob/master/hack/local/README.md
    from mlrun.platforms import mount_pvc
    func.apply(mount_pvc('nfsvol', 'nfsvol', '/home/joyan/data'))

In [11]:
## TODO: CLASS_xxx that aren't valid for a give algo, just report and skip, not fail

from mlrun import NewTask    

task_params = {
    "name" : "",
    "params" : {
        # CHOOSE YOUR MODEL AND CHANGE SOME DEFAULT PARAMETERS
       # "model_pkg_class"    : model_class,
        "model_filename"     : "model_pkg_class",
        "CLASS_random_state" : 1,
        #"CLASS_probability"  : True,  # USE ONLY FOR SVC
        "sample"             : -1,
        "label_column"       : "labels",
        "test_size"          : 0.10,
        "train_val_split"    : 0.75,
        "rng"                : 1,
        "models_dest"        : mlconf.artifact_path}}
  

### run remotely

In [12]:
MODELS = ["sklearn.ensemble.RandomForestClassifier", 
         "sklearn.linear_model.LogisticRegression",
         "sklearn.ensemble.AdaBoostClassifier",
         "lightgbm.LGBMClassifier",
         "xgboost.XGBClassifier"]

for model in MODELS:
    task_params.update({"params":{"model_pkg_class":model, "model_filename" : model}})
    task = NewTask(**task_params)
    run = func.run(
        task,
        inputs={"dataset"  : mlconf.artifact_path + "/iris.parquet"},
        artifact_path=mlconf.artifact_path+"/sklearn_classifier")

[mlrun] 2020-04-30 01:23:18,230 starting run sklearn-classifier-train_model uid=73a7007a84444c98b928bf1b91fa6492  -> http://mlrun-api:8080
[mlrun] 2020-04-30 01:23:18,374 Job is running in the background, pod: sklearn-classifier-train-model-lzxg2
No handles with labels found to put in legend.
[mlrun] 2020-04-30 01:23:22,741 log artifact test_set at /User/artifacts/sklearn_classifier/test_set.parquet, size: 4151, db: Y
[mlrun] 2020-04-30 01:23:22,953 log artifact model at models/RandomForestClassifier.pkl/model, size: 152366, db: Y
[mlrun] 2020-04-30 01:23:23,136 log artifact roc at /User/artifacts/sklearn_classifier/plots/sklearn-classifier-train_model/roc.html, size: 30962, db: Y
[mlrun] 2020-04-30 01:23:23,287 log artifact confusion at /User/artifacts/sklearn_classifier/plots/sklearn-classifier-train_model/confusion.html, size: 19828, db: Y

[mlrun] 2020-04-30 01:23:23,307 run executed, status=completed
final state: succeeded


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...91fa6492,0,Apr 30 01:23:22,completed,sklearn-classifier-train_model,host=sklearn-classifier-train-model-lzxg2kind=jobowner=adminv3io_user=admin,dataset,model_filename=sklearn.ensemble.RandomForestClassifiermodel_pkg_class=sklearn.ensemble.RandomForestClassifier,accuracy=1.0f1_score=1.0rocauc=1.0,test_setmodelrocconfusion


to track results use .show() or .logs() or in CLI: 
!mlrun get run 73a7007a84444c98b928bf1b91fa6492  , !mlrun logs 73a7007a84444c98b928bf1b91fa6492 
[mlrun] 2020-04-30 01:23:27,578 run executed, status=completed
[mlrun] 2020-04-30 01:23:27,579 starting run sklearn-classifier-train_model uid=e4dc2729c1e14fef954b0b7e13d2aa40  -> http://mlrun-api:8080
[mlrun] 2020-04-30 01:23:27,722 Job is running in the background, pod: sklearn-classifier-train-model-t78pz
No handles with labels found to put in legend.
[mlrun] 2020-04-30 01:23:31,836 log artifact test_set at /User/artifacts/sklearn_classifier/test_set.parquet, size: 4151, db: Y
[mlrun] 2020-04-30 01:23:31,894 log artifact model at models/LogisticRegression.pkl/model, size: 780, db: Y
[mlrun] 2020-04-30 01:23:32,030 log artifact roc at /User/artifacts/sklearn_classifier/plots/sklearn-classifier-train_model/roc.html, size: 30962, db: Y
[mlrun] 2020-04-30 01:23:32,170 log artifact confusion at /User/artifacts/sklearn_classifier/plots/sklear

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...13d2aa40,0,Apr 30 01:23:31,completed,sklearn-classifier-train_model,host=sklearn-classifier-train-model-t78pzkind=jobowner=adminv3io_user=admin,dataset,model_filename=sklearn.linear_model.LogisticRegressionmodel_pkg_class=sklearn.linear_model.LogisticRegression,accuracy=1.0f1_score=1.0rocauc=1.0,test_setmodelrocconfusion


to track results use .show() or .logs() or in CLI: 
!mlrun get run e4dc2729c1e14fef954b0b7e13d2aa40  , !mlrun logs e4dc2729c1e14fef954b0b7e13d2aa40 
[mlrun] 2020-04-30 01:23:33,877 run executed, status=completed
[mlrun] 2020-04-30 01:23:33,878 starting run sklearn-classifier-train_model uid=dbb7fb416d3d41b49c4ddff90f9810ff  -> http://mlrun-api:8080
[mlrun] 2020-04-30 01:23:34,005 Job is running in the background, pod: sklearn-classifier-train-model-xhjn7
No handles with labels found to put in legend.
[mlrun] 2020-04-30 01:23:38,126 log artifact test_set at /User/artifacts/sklearn_classifier/test_set.parquet, size: 4151, db: Y
[mlrun] 2020-04-30 01:23:38,272 log artifact model at models/AdaBoostClassifier.pkl/model, size: 29487, db: Y
[mlrun] 2020-04-30 01:23:38,438 log artifact roc at /User/artifacts/sklearn_classifier/plots/sklearn-classifier-train_model/roc.html, size: 30962, db: Y
[mlrun] 2020-04-30 01:23:38,588 log artifact confusion at /User/artifacts/sklearn_classifier/plots/skle

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...0f9810ff,0,Apr 30 01:23:38,completed,sklearn-classifier-train_model,host=sklearn-classifier-train-model-xhjn7kind=jobowner=adminv3io_user=admin,dataset,model_filename=sklearn.ensemble.AdaBoostClassifiermodel_pkg_class=sklearn.ensemble.AdaBoostClassifier,accuracy=1.0f1_score=1.0rocauc=1.0,test_setmodelrocconfusion


to track results use .show() or .logs() or in CLI: 
!mlrun get run dbb7fb416d3d41b49c4ddff90f9810ff  , !mlrun logs dbb7fb416d3d41b49c4ddff90f9810ff 
[mlrun] 2020-04-30 01:23:40,158 run executed, status=completed
[mlrun] 2020-04-30 01:23:40,159 starting run sklearn-classifier-train_model uid=fe695975591e44938f208aa3d2977b86  -> http://mlrun-api:8080
[mlrun] 2020-04-30 01:23:40,281 Job is running in the background, pod: sklearn-classifier-train-model-qkz69
No handles with labels found to put in legend.
[mlrun] 2020-04-30 01:23:44,340 log artifact test_set at /User/artifacts/sklearn_classifier/test_set.parquet, size: 4151, db: Y
[mlrun] 2020-04-30 01:23:44,834 log artifact model at models/LGBMClassifier.pkl/model, size: 162515, db: Y
[mlrun] 2020-04-30 01:23:45,036 log artifact roc at /User/artifacts/sklearn_classifier/plots/sklearn-classifier-train_model/roc.html, size: 30962, db: Y
[mlrun] 2020-04-30 01:23:45,284 log artifact confusion at /User/artifacts/sklearn_classifier/plots/sklearn

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...d2977b86,0,Apr 30 01:23:44,completed,sklearn-classifier-train_model,host=sklearn-classifier-train-model-qkz69kind=jobowner=adminv3io_user=admin,dataset,model_filename=lightgbm.LGBMClassifiermodel_pkg_class=lightgbm.LGBMClassifier,accuracy=1.0f1_score=1.0rocauc=1.0,test_setmodelrocconfusion


to track results use .show() or .logs() or in CLI: 
!mlrun get run fe695975591e44938f208aa3d2977b86  , !mlrun logs fe695975591e44938f208aa3d2977b86 
[mlrun] 2020-04-30 01:23:46,431 run executed, status=completed
[mlrun] 2020-04-30 01:23:46,432 starting run sklearn-classifier-train_model uid=e4255990ec6a4a1a9b856b77649f27fe  -> http://mlrun-api:8080
[mlrun] 2020-04-30 01:23:46,562 Job is running in the background, pod: sklearn-classifier-train-model-v8cgc
No handles with labels found to put in legend.
[mlrun] 2020-04-30 01:23:50,677 log artifact test_set at /User/artifacts/sklearn_classifier/test_set.parquet, size: 4151, db: Y
[mlrun] 2020-04-30 01:23:51,219 log artifact model at models/XGBClassifier.pkl/model, size: 86044, db: Y
[mlrun] 2020-04-30 01:23:51,495 log artifact roc at /User/artifacts/sklearn_classifier/plots/sklearn-classifier-train_model/roc.html, size: 30962, db: Y
[mlrun] 2020-04-30 01:23:51,766 log artifact confusion at /User/artifacts/sklearn_classifier/plots/sklearn-c

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...649f27fe,0,Apr 30 01:23:50,completed,sklearn-classifier-train_model,host=sklearn-classifier-train-model-v8cgckind=jobowner=adminv3io_user=admin,dataset,model_filename=xgboost.XGBClassifiermodel_pkg_class=xgboost.XGBClassifier,accuracy=0.9722222222222222f1_score=0.9722222222222222rocauc=1.0,test_setmodelrocconfusion


to track results use .show() or .logs() or in CLI: 
!mlrun get run e4255990ec6a4a1a9b856b77649f27fe  , !mlrun logs e4255990ec6a4a1a9b856b77649f27fe 
[mlrun] 2020-04-30 01:23:55,761 run executed, status=completed
