In [1]:
# nuclio: ignore
import nuclio

In [2]:
import json
import os

from cloudpickle import dumps, load

from sklearn import metrics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split
from sklearn import metrics

from typing import List
from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem
from mlrun.artifacts import PlotArtifact
from mlrun.mlutils import get_class_fit, create_class, plot_roc, plot_importance, gcf_clear

import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)

def train_model(
    context: MLClientCtx,
    model_pkg_class: str = "",
    dataset: DataItem = '',
    sample: int = -1,
    label_column: str = "labels",
    test_size: float = 0.05,
    train_val_split: float = 0.75,
    rng: int = 1,
    model_filename: str = "model",
    models_dest: str = "",
    plots_dest: str = "",
    score_method: str = "micro",
    model_pkg_file: str = "",
    file_ext: str = "parquet"
) -> None:
    """Train a classifier and log the model, a held-out test set, validation
    metrics and plots to the function context.

    The data is split twice to get train, validation and test sets. The model
    is fit on the train set, the metrics and plots are computed on the
    validation set, and the test set is only logged as the ``test_set``
    dataset artifact.

    Run parameters found in ``context.parameters`` whose names start with
    ``CLASS_`` or ``FIT_`` are copied (prefix stripped) into the model
    config's ``"CLASS"`` (constructor) and ``"FIT"`` (fit) sections.

    :param context:           the function context
    :param model_pkg_class:   the model to train, e.g.
                              "sklearn.neural_network.MLPClassifier"; used
                              when ``model_pkg_file`` is not given
    :param dataset:           raw data file; its string form must end with
                              "csv", "parquet" or "pq"
    :param sample:            -1 (default) uses all rows, n >= 1 uses the
                              first n rows, any other value selects a random
                              sample of ``-sample`` rows
    :param label_column:      ground-truth (y) labels
    :param test_size:         (0.05) test set size
    :param train_val_split:   (0.75) Once the test set has been removed the
                              training set gets this proportion.
    :param rng:               (1) sklearn rng seed used for both splits
    :param model_filename:    model file name, without the ".pkl" extension
    :param models_dest:       models subfolder on artifact path
                              (defaults to "models")
    :param plots_dest:        plot subfolder on artifact path
                              (defaults to "plots/<context.name>")
    :param score_method:      averaging method for average precision and f1
    :param model_pkg_file:    json model config file; takes precedence over
                              ``model_pkg_class``
    :param file_ext:          format of the logged ``test_set`` dataset

    :raises ValueError: if the dataset file type is not handled, or if neither
                        ``model_pkg_file`` nor ``model_pkg_class`` is given
    """
    # the string form of the DataItem is used as the file path
    srcfilepath = str(dataset)

    models_dest = models_dest or 'models'
    plots_dest = plots_dest or f'plots/{context.name}'

    if srcfilepath.endswith("csv"):
        reader = pd.read_csv
    elif srcfilepath.endswith(("parquet", "pq")):
        reader = pd.read_parquet
    else:
        raise ValueError(f"file type unhandled {srcfilepath}")

    raw = reader(srcfilepath).dropna()
    if sample >= 1:
        # contiguous sample starting at the first row
        raw = raw.iloc[:sample, :]
    elif sample != -1:
        # random sample of -sample rows
        # NOTE: this draw is not seeded by `rng`
        raw = raw.sample(sample * -1)
    # sample == -1 keeps every row; slicing with iloc[:-1] here would
    # silently drop the last one
    labels = raw.pop(label_column)

    context.header = raw.columns.values

    # Sorted class order, so the indicator columns line up with the columns
    # of predict_proba (estimators keep their classes_ sorted); first
    # appearance order would misalign the two whenever the data is not
    # already ordered by label.
    classes = np.unique(labels)
    yb = label_binarize(labels, classes=classes)
    n_encoded = yb.shape[1]

    # double split to generate 3 data sets: train, validation and test
    # with xtest, ytest set aside.
    # The binary encoded labels are hidden inside the X matrix so that the
    # split keeps the encoded and non-encoded labels in the same order.
    x, xtest, y, ytest = train_test_split(
        np.concatenate([raw, yb], axis=1), labels,
        test_size=test_size, random_state=rng)
    xtrain, xvalid, ytrain, yvalid = train_test_split(
        x, y, train_size=train_val_split, random_state=rng)

    # strip the encoded label columns back off the feature matrices; only
    # the validation encoding is needed for the metrics below
    xtrain = xtrain[:, :-n_encoded].copy()
    yvalidb = xvalid[:, -n_encoded:].copy()
    xvalid = xvalid[:, :-n_encoded].copy()
    xtest = xtest[:, :-n_encoded].copy()

    # set-aside test_set
    test_set = pd.concat(
        [pd.DataFrame(data=xtest, columns=context.header),
         pd.DataFrame(data=ytest.values, columns=[label_column])],
        axis=1,)
    context.log_dataset('test_set', df=test_set, format=file_ext, index=False)

    if model_pkg_file:
        # context manager so the config file handle is always closed
        with open(model_pkg_file, "r") as config_file:
            model_config = json.load(config_file)
    elif model_pkg_class:
        model_config = get_class_fit(model_pkg_class)
    else:
        raise ValueError('model_pkg_file or model_pkg_class must be provided')

    # run parameters override the constructor / fit sections of the config
    for k, v in context.parameters.items():
        if k.startswith('CLASS_'):
            model_config['CLASS'][k[6:]] = v
        if k.startswith('FIT_'):
            model_config['FIT'][k[4:]] = v

    model_config["FIT"].update({"X": xtrain, "y": ytrain.values})

    # create class and fit
    ClassifierClass = create_class(model_config["META"]["class"])
    model = ClassifierClass(**model_config["CLASS"])
    model.fit(**model_config["FIT"])

    # save model; best effort, a serialization failure does not stop the run
    try:
        data = dumps(model)
        context.log_artifact('model', body=data, local_path=f"{models_dest}/{model_filename}.pkl")
    except Exception as e:
        print("SERIALIZE MODEL ERROR:", str(e))

    # compute validation metrics
    ypred = model.predict(xvalid)
    y_score = model.predict_proba(xvalid)
    context.logger.info(f"y_score.shape {y_score.shape}")
    context.logger.info(f"yvalidb.shape {yvalidb.shape}")

    # with a single indicator column (two classes) the metrics are scored
    # against the second probability column only
    if yvalidb.shape[1] > 1:
        scores = y_score
    else:
        scores = y_score[:, 1]
    average_precision = metrics.average_precision_score(yvalidb,
                                                        scores,
                                                        average=score_method)
    context.log_result("rocauc", metrics.roc_auc_score(yvalidb, scores))
    context.log_result("avg_precscore", average_precision)
    context.log_result("accuracy", float(model.score(xvalid, yvalid)))
    context.log_result("f1_score", metrics.f1_score(yvalid, ypred,
                                                    average=score_method))

    # TODO: missing validation plots, callbacks need to reintroduced

    plot_roc(context, yvalidb, y_score, key="roc", plots_dir=plots_dest)
    gcf_clear(plt)
    # use sklearn >= v0.22 built in:
    metrics.plot_confusion_matrix(model, xvalid, yvalid, labels=labels.unique(), normalize='true')
    context.log_artifact(PlotArtifact("confusion", body=plt.gcf()), local_path=f"{plots_dest}/confusion.html")

In [3]:
# nuclio: end-code

### save

In [4]:
from mlrun import code_to_function

# turn this notebook's code into a job function object
fn = code_to_function(
    "sklearn_classifier",
    kind="job",
    with_doc=True,
    image="mlrun/ml-models",
)

# metadata for templates and reuse
fn.spec.default_handler = "train_model"
fn.spec.description = "train any classifier using scikit-learn's API"
fn.metadata.categories = ["models", "classifier"]
fn.metadata.labels = {"author": "yjb"}

# save the function, then export its spec to function.yaml
fn.save()
fn.export("function.yaml")

[mlrun] 2020-04-02 15:15:13,756 saving function: sklearn-classifier, tag: latest
[mlrun] 2020-04-02 15:15:13,816 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f6796504ef0>

### test

In [5]:
from mlrun import import_function, mount_v3io

# load the function from the hub, then apply the mount_v3io() modifier to it
func = import_function("hub://sklearn_classifier")
func = func.apply(mount_v3io())

In [8]:
from mlrun import NewTask

task_params = dict(
    name="tasks train a classifier",
    params={
        # choose your model and change some of its default parameters
        "model_pkg_class": "xgboost.XGBClassifier",
        "CLASS_random_state": 1,
        "CLASS_num_class": 3,
        # sampling and split settings passed to the handler
        "sample": -1,
        "label_column": "labels",
        "test_size": 0.10,
        "train_val_split": 0.75,
        "rng": 1,
    },
)

# run the task on the iris dataset
run = func.run(
    NewTask(**task_params),
    inputs={"dataset": "/User/artifacts/iris.parquet"},
    artifact_path="/User/artifacts",
)

[mlrun] 2020-04-02 15:16:25,307 starting run tasks train a classifier uid=b91863ac040a4c8cb4eabe8a2e7115a3  -> http://mlrun-api:8080
[mlrun] 2020-04-02 15:16:25,436 Job is running in the background, pod: tasks-train-a-classifier-rvtxb
No handles with labels found to put in legend.
[mlrun] 2020-04-02 15:16:35,833 log artifact test_set at /User/artifacts/test_set.parquet, size: 4284, db: Y
[mlrun] 2020-04-02 15:16:37,158 log artifact model at /User/artifacts/models/model.pkl, size: 77204, db: Y
[mlrun] 2020-04-02 15:16:37,160 y_score.shape (34, 3)
[mlrun] 2020-04-02 15:16:37,160 yvalidb.shape (34, 3)
[mlrun] 2020-04-02 15:16:37,434 log artifact roc at /User/artifacts/plots/tasks train a classifier/roc.html, size: 31102, db: Y
[mlrun] 2020-04-02 15:16:37,681 log artifact confusion at /User/artifacts/plots/tasks train a classifier/confusion.html, size: 23852, db: Y

[mlrun] 2020-04-02 15:16:37,702 run executed, status=completed
final state: succeeded


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...2e7115a3,0,Apr 02 15:16:35,completed,tasks train a classifier,host=tasks-train-a-classifier-rvtxbkind=jobowner=adminv3io_user=admin,dataset,CLASS_num_class=3CLASS_random_state=1label_column=labelsmodel_pkg_class=xgboost.XGBClassifierrng=1sample=-1test_size=0.1train_val_split=0.75,accuracy=0.9411764705882353avg_precscore=0.9878952937479977f1_score=0.9411764705882353rocauc=0.9905050505050506,test_setmodelrocconfusion


to track results use .show() or .logs() or in CLI: 
!mlrun get run b91863ac040a4c8cb4eabe8a2e7115a3  , !mlrun logs b91863ac040a4c8cb4eabe8a2e7115a3 
[mlrun] 2020-04-02 15:16:44,867 run executed, status=completed


In [10]:
from mlrun import NewTask

task_params = dict(
    name="tasks train a classifier",
    params={
        # choose your model and change some of its default parameters
        "model_pkg_class": "xgboost.XGBClassifier",
        "CLASS_random_state": 1,
        "CLASS_num_class": 3,
        # sampling and split settings passed to the handler
        "sample": -1,
        "label_column": "labels",
        "test_size": 0.10,
        "train_val_split": 0.75,
        "rng": 1,
    },
)

# run the task on the wine dataset
run = func.run(
    NewTask(**task_params),
    inputs={"dataset": "/User/artifacts/wine.parquet"},
    artifact_path="/User/artifacts",
)

[mlrun] 2020-04-02 15:26:16,000 starting run tasks train a classifier uid=25af647da88e4f7989932482f72f5a5c  -> http://mlrun-api:8080
[mlrun] 2020-04-02 15:26:16,167 Job is running in the background, pod: tasks-train-a-classifier-sdfgs
No handles with labels found to put in legend.
[mlrun] 2020-04-02 15:26:26,525 log artifact test_set at /User/artifacts/test_set.parquet, size: 11232, db: Y
[mlrun] 2020-04-02 15:26:27,796 log artifact model at /User/artifacts/models/model.pkl, size: 74877, db: Y
[mlrun] 2020-04-02 15:26:27,800 y_score.shape (40, 3)
[mlrun] 2020-04-02 15:26:27,800 yvalidb.shape (40, 3)
[mlrun] 2020-04-02 15:26:28,036 log artifact roc at /User/artifacts/plots/tasks train a classifier/roc.html, size: 31258, db: Y
[mlrun] 2020-04-02 15:26:28,311 log artifact confusion at /User/artifacts/plots/tasks train a classifier/confusion.html, size: 24016, db: Y

[mlrun] 2020-04-02 15:26:28,344 run executed, status=completed
final state: succeeded


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...f72f5a5c,0,Apr 02 15:26:26,completed,tasks train a classifier,host=tasks-train-a-classifier-sdfgskind=jobowner=adminv3io_user=admin,dataset,CLASS_num_class=3CLASS_random_state=1label_column=labelsmodel_pkg_class=xgboost.XGBClassifierrng=1sample=-1test_size=0.1train_val_split=0.75,accuracy=0.95avg_precscore=0.9946915584415585f1_score=0.9500000000000001rocauc=0.9962579745904995,test_setmodelrocconfusion


to track results use .show() or .logs() or in CLI: 
!mlrun get run 25af647da88e4f7989932482f72f5a5c  , !mlrun logs 25af647da88e4f7989932482f72f5a5c 
[mlrun] 2020-04-02 15:26:35,436 run executed, status=completed


In [13]:
task_params = dict(
    name="tasks train a classifier",
    params={
        # choose your model and change some of its default parameters
        "model_pkg_class": "sklearn.linear_model.LogisticRegression",
        "CLASS_random_state": 1,
        "CLASS_solver": "liblinear",
        # sampling and split settings passed to the handler
        "sample": -1,
        "label_column": "labels",
        "test_size": 0.10,
        "train_val_split": 0.75,
        "rng": 1,
    },
)

# run the task on the breast_cancer dataset
run = func.run(
    NewTask(**task_params),
    inputs={"dataset": "/User/artifacts/breast_cancer.parquet"},
    artifact_path="/User/artifacts",
)

[mlrun] 2020-04-02 15:28:00,227 starting run tasks train a classifier uid=d529fa66c384488199e744f5dbddd394  -> http://mlrun-api:8080
[mlrun] 2020-04-02 15:28:00,349 Job is running in the background, pod: tasks-train-a-classifier-6d252
No handles with labels found to put in legend.
[mlrun] 2020-04-02 15:28:10,837 log artifact test_set at /User/artifacts/test_set.parquet, size: 35544, db: Y
[mlrun] 2020-04-02 15:28:10,920 log artifact model at /User/artifacts/models/model.pkl, size: 905, db: Y
[mlrun] 2020-04-02 15:28:10,921 y_score.shape (128, 2)
[mlrun] 2020-04-02 15:28:10,921 yvalidb.shape (128, 1)
[mlrun] 2020-04-02 15:28:11,068 log artifact roc at /User/artifacts/plots/tasks train a classifier/roc.html, size: 31058, db: Y
[mlrun] 2020-04-02 15:28:11,210 log artifact confusion at /User/artifacts/plots/tasks train a classifier/confusion.html, size: 20784, db: Y

[mlrun] 2020-04-02 15:28:11,267 run executed, status=completed
final state: succeeded


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...dbddd394,0,Apr 02 15:28:10,completed,tasks train a classifier,host=tasks-train-a-classifier-6d252kind=jobowner=adminv3io_user=admin,dataset,CLASS_random_state=1CLASS_solver=liblinearlabel_column=labelsmodel_pkg_class=sklearn.linear_model.LogisticRegressionrng=1sample=-1test_size=0.1train_val_split=0.75,accuracy=0.9453125avg_precscore=0.9953450486871359f1_score=0.9453125rocauc=0.990748528174937,test_setmodelrocconfusion


to track results use .show() or .logs() or in CLI: 
!mlrun get run d529fa66c384488199e744f5dbddd394  , !mlrun logs d529fa66c384488199e744f5dbddd394 
[mlrun] 2020-04-02 15:28:20,008 run executed, status=completed
