In [1]:
# nuclio: ignore
import nuclio

In [2]:
import json
import os

from cloudpickle import dump, load

from sklearn import metrics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split
from sklearn import metrics

from typing import List
from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem
from mlrun.artifacts import PlotArtifact

from mlutils.models import get_model_configs, create_class
from mlutils.plots import plot_roc, plot_importance, gcf_clear

import warnings

# install a global filter that suppresses every FutureWarning raised from
# here on, keeping the job logs free of deprecation chatter
warnings.simplefilter(action="ignore", category=FutureWarning)

def train_model(
    context: MLClientCtx,
    model_pkg_class: str = "",
    data_key: str = "data",
    sample: int = -1,
    label_column: str = "labels",
    model_key: str = "model",
    test_size: float = 0.05,
    train_val_split: float = 0.75,
    test_set_key: str = "test_set",
    rng: int = 1,
    models_dest: str = "models",
    plots_dest: str = "plots",
    score_method: str = "micro",
    model_pkg_file: str = "",
    file_ext: str = "parquet"
) -> None:
    """train a classifier.

    Reads a csv/parquet table, splits it into train / validation / test
    sets, fits the configured classifier on the training set, stores the
    held-out test set and the pickled model as artifacts, and logs
    validation metrics (rocauc, avg_precscore, accuracy, f1_score) together
    with ROC and confusion-matrix plots.

    :param context:           the function context
    :param model_pkg_class:   the model to train, e.g,
                              "sklearn.neural_network.MLPClassifier",
                              or json model config
    :param data_key:          ("data") path of the raw data file; must end
                              with ".csv", "parquet" or "pq"
    :param sample:            (-1) -1 uses all rows, n >= 1 selects the
                              first n rows, any other value (e.g. n < -1)
                              selects a random sample of -n rows
    :param label_column:      ground-truth (y) labels
    :param model_key:         ("model") name of model in artifact store,
                              points to a directory
    :param test_size:         (0.05) test set size
    :param train_val_split:   (0.75) Once the test set has been removed the
                              training set gets this proportion.
    :param test_set_key:      store the test data set under this key in the
                              artifact store
    :param rng:               (1) sklearn rng seed
    :param models_dest:       models subfolder on artifact path
    :param plots_dest:        plot subfolder on artifact path
    :param score_method:      for multiclass classification
    :param model_pkg_file:    json model config file
    :param file_ext:          format for test_set_key hold out data

    :raises ValueError:       if the data file type is not handled, or if
                              neither model_pkg_file nor model_pkg_class
                              is provided
    """
    # extract file name from DataItem
    srcfilepath = str(data_key)

    # TODO: this should be part of data's metadata dealt with in another step get a data set, sample, etc...
    # load the table once, whatever sampling mode is requested; the branches
    # must be mutually exclusive, otherwise a csv file falls through to the
    # "unhandled" error
    if srcfilepath.endswith(".csv"):
        raw = pd.read_csv(srcfilepath)
    elif srcfilepath.endswith(("parquet", "pq")):
        raw = pd.read_parquet(srcfilepath)
    else:
        raise ValueError("file type unhandled")
    raw = raw.dropna()

    # get all data or a sample
    if sample == -1:
        # all rows: deliberately no slicing here, `iloc[:-1]` would drop
        # the last row
        pass
    elif sample >= 1:
        # contiguous sample starting at the first row
        raw = raw.iloc[:sample, :]
    else:
        # grab a random sample
        raw = raw.sample(sample * -1)
    labels = raw.pop(label_column)

    # TODO: this should be part of data's metadata dealt with in another step
    context.header = raw.columns.values

    # TODO: all of this should be part of a spitter component that does cv too, dealt with in another step
    # make a hot encode copy of labels before the split.
    # The classes are passed in sorted order so the encoded columns line up
    # with the columns of `predict_proba` (estimators keep `classes_`
    # sorted); order-of-appearance would mis-align them and, for binary
    # labels, could flip the positive class.
    classes = np.unique(labels)
    yb = label_binarize(labels, classes=classes)  # binary labels yield a single column
    n_encoded = yb.shape[1]

    # double split to generate 3 data sets: train, validation and test
    # with xtest,ytest set aside
    # here we hide the binary encoded labels inside the X matrix so that when
    # splitting we preserve order in both the encoded and non-encoded labels:
    x, xtest, y, ytest = train_test_split(
        np.concatenate([raw, yb], axis=1), labels,
        test_size=test_size, random_state=rng)
    xtrain, xvalid, ytrain, yvalid = train_test_split(
        x, y, train_size=train_val_split, random_state=rng)

    # now extract the hot_encoded labels (the trailing `n_encoded` columns)
    ytrainb = xtrain[:, -n_encoded:].copy()
    xtrain = xtrain[:, :-n_encoded].copy()
    yvalidb = xvalid[:, -n_encoded:].copy()
    xvalid = xvalid[:, :-n_encoded].copy()
    ytestb = xtest[:, -n_encoded:].copy()
    xtest = xtest[:, :-n_encoded].copy()

    # set-aside test_set
    test_set = pd.concat(
        [pd.DataFrame(data=xtest, columns=context.header),
         pd.DataFrame(data=ytest.values, columns=[label_column])],
        axis=1,)
    context.log_dataset(test_set_key, df=test_set, format=file_ext, index=False)

    # a json config file takes precedence over a class name
    if model_pkg_file:
        with open(model_pkg_file, "r") as config_file:
            model_config = json.load(config_file)
    elif model_pkg_class:
        model_config = get_model_configs(model_pkg_class)
    else:
        raise ValueError('model_pkg_file or model_pkg_class must be provided')

    # run parameters prefixed with CLASS_ / FIT_ override the constructor /
    # fit arguments of the model config (prefix stripped)
    for k, v in context.parameters.items():
        if k.startswith('CLASS_'):
            model_config['CLASS'][k[6:]] = v
        if k.startswith('FIT_'):
            model_config['FIT'][k[4:]] = v

    model_config["FIT"].update({"X": xtrain, "y": ytrain.values})

    # create class and fit
    ClassifierClass = create_class(model_config["META"]["class"])
    model = ClassifierClass(**model_config["CLASS"])
    model.fit(**model_config["FIT"])

    # save model (best effort: a serialization failure is reported but does
    # not abort the run, the validation metrics are still computed)
    filepath = os.path.join(context.artifact_path, f"{models_dest}/{model_key}.pkl")
    os.makedirs(os.path.join(context.artifact_path, models_dest), exist_ok=True)
    try:
        with open(filepath, "wb") as model_file:
            dump(model, model_file)
        context.log_artifact(model_key, local_path=models_dest)
    except Exception as e:
        print("SERIALIZE MODEL ERROR:", str(e))

    # compute validation metrics
    ypred = model.predict(xvalid)
    y_score = model.predict_proba(xvalid)
    context.logger.info(f"y_score.shape {y_score.shape}")
    context.logger.info(f"yvalidb.shape {yvalidb.shape}")
    if yvalidb.shape[1] > 1:
        # label encoding was applied:
        average_precision = metrics.average_precision_score(yvalidb,
                                                            y_score,
                                                            average=score_method)
        context.log_result("rocauc", metrics.roc_auc_score(yvalidb, y_score))
    else:
        # binary case: a single encoded column, scored against the
        # probability of the second class
        average_precision = metrics.average_precision_score(yvalidb,
                                                            y_score[:, 1],
                                                            average=score_method)
        context.log_result("rocauc", metrics.roc_auc_score(yvalidb, y_score[:, 1]))

    context.log_result("avg_precscore", average_precision)
    context.log_result("accuracy", float(model.score(xvalid, yvalid)))
    context.log_result("f1_score", metrics.f1_score(yvalid, ypred,
                                                    average=score_method))

    # TODO: missing validation plots, callbacks need to reintroduced

    plot_roc(context, yvalidb, y_score)
    gcf_clear(plt)
    # use sklearn >= v0.22 built in:
    # NOTE(review): newer scikit-learn releases replace this helper with
    # ConfusionMatrixDisplay - confirm against the version in the job image
    metrics.plot_confusion_matrix(model, xvalid, yvalid, labels=labels.unique(), normalize='true')
    context.log_artifact(PlotArtifact("confusion", body=plt.gcf()), local_path=f"{plots_dest}/confusion.html")

In [3]:
# nuclio: end-code

### save

In [4]:
from mlrun import code_to_function 
# create job function object from notebook code
# `handler` is given as the handler's name (a string) rather than the
# function object, so the stored function spec only holds plain values
fn = code_to_function("sklearn_classifier", kind="job", with_doc=True,
                      handler="train_model", image="mlrun/ml-models:0.4.6")

# add metadata (for templates and reuse)
fn.spec.default_handler = "train_model"
fn.spec.description = "train any classifier using scikit-learn's API"
fn.metadata.categories = ["models", "classifier"]
fn.spec.image_pull_policy = "Always"
fn.metadata.labels = {"author": "yjb"}

# store the function, then write its spec to a local yaml file
fn.save()
fn.export("function.yaml")

[mlrun] 2020-03-26 14:17:25,458 saving function: sklearn-classifier, tag: latest
[mlrun] 2020-03-26 14:17:25,497 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f76f821a810>

### test

In [5]:
from mlrun import import_function, mount_v3io

# load the function published on the hub, then attach the v3io mount
func = import_function("hub://sklearn_classifier")
func = func.apply(mount_v3io())
# alternative: load the spec exported by the "save" cell
# func = import_function("function.yaml").apply(mount_v3io())

In [6]:
# task definition for the 2-class example (logistic regression on the
# breast cancer table)
task_params = dict(
    name="tasks train a classifier",
    params=dict(
        # choose your model and change some default parameters
        model_pkg_class="sklearn.linear_model.LogisticRegression",
        CLASS_random_state=1,
        CLASS_solver="liblinear",

        # point this to your data
        data_key="/User/artifacts/breast_cancer.parquet",  # 2 classes
        sample=-1,
        label_column="labels",
        test_size=0.10,
        train_val_split=0.75,
        rng=1,
    ),
)
    

from mlrun import NewTask
# wrap the parameters in a task object, then submit it to the function
task = NewTask(**task_params)
run = func.run(task, artifact_path="/User/artifacts")

[mlrun] 2020-03-26 14:17:25,542 starting run tasks train a classifier uid=7f3ee074dc3d43a29a07522bb71ecf20  -> http://mlrun-api:8080
[mlrun] 2020-03-26 14:17:25,810 Job is running in the background, pod: tasks-train-a-classifier-xg5gd
No handles with labels found to put in legend.
[mlrun] 2020-03-26 14:17:36,533 log artifact test_set at /User/artifacts/test_set.parquet, size: 35553, db: Y
[mlrun] 2020-03-26 14:17:37,247 log artifact model at /User/artifacts/models, size: None, db: Y
[mlrun] 2020-03-26 14:17:37,248 y_score.shape (128, 2)
[mlrun] 2020-03-26 14:17:37,248 yvalidb.shape (128, 1)
[mlrun] 2020-03-26 14:17:37,364 log artifact roc at /User/artifacts/plots/roc.html, size: 31058, db: Y
[mlrun] 2020-03-26 14:17:37,497 log artifact confusion at /User/artifacts/plots/confusion.html, size: 20784, db: Y

[mlrun] 2020-03-26 14:17:37,548 run executed, status=completed
final state: succeeded


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...1ecf20,0,Mar 26 14:17:36,completed,tasks train a classifier,host=tasks-train-a-classifier-xg5gdkind=jobowner=admin,,CLASS_random_state=1CLASS_solver=liblineardata_key=/User/artifacts/breast_cancer.parquetlabel_column=labelsmodel_pkg_class=sklearn.linear_model.LogisticRegressionrng=1sample=-1test_size=0.1train_val_split=0.75,accuracy=0.9453125avg_precscore=0.9953450486871359f1_score=0.9453125rocauc=0.990748528174937,test_setmodelrocconfusion


to track results use .show() or .logs() or in CLI: 
!mlrun get run 7f3ee074dc3d43a29a07522bb71ecf20  , !mlrun logs 7f3ee074dc3d43a29a07522bb71ecf20 
[mlrun] 2020-03-26 14:17:45,185 run executed, status=completed


In [7]:
# task definition for the multiclass example (model described by a json
# config file, trained on the iris table)
task_params = dict(
    name="tasks train a classifier",
    params=dict(
        # choose your model and change some default parameters
        model_pkg_file="/User/functions/sklearn_classifier/sample-configs/XGBClassifier.json",
        CLASS_random_state=1,
        CLASS_num_class=3,

        # point this to your data
        data_key="/User/artifacts/iris.parquet",  # 3 classes
        # data_key="/User/artifacts/wine.parquet",
        # data_key="/User/artifacts/breast_cancer.parquet",
        sample=-1,
        label_column="labels",
        test_size=0.10,
        train_val_split=0.75,
        rng=1,
    ),
)
    

from mlrun import NewTask
# wrap the parameters in a task object, then submit it to the function
task = NewTask(**task_params)
run = func.run(task, artifact_path="/User/artifacts")

[mlrun] 2020-03-26 14:17:45,191 starting run tasks train a classifier uid=c08466a8343840b19f421fc193893235  -> http://mlrun-api:8080
[mlrun] 2020-03-26 14:17:45,305 Job is running in the background, pod: tasks-train-a-classifier-ltkln
No handles with labels found to put in legend.
[mlrun] 2020-03-26 14:17:55,790 log artifact test_set at /User/artifacts/test_set.parquet, size: 4292, db: Y
[mlrun] 2020-03-26 14:17:56,580 log artifact model at /User/artifacts/models, size: None, db: Y
[mlrun] 2020-03-26 14:17:56,581 y_score.shape (34, 3)
[mlrun] 2020-03-26 14:17:56,581 yvalidb.shape (34, 3)
[mlrun] 2020-03-26 14:17:56,730 log artifact roc at /User/artifacts/plots/roc.html, size: 31290, db: Y
[mlrun] 2020-03-26 14:17:56,863 log artifact confusion at /User/artifacts/plots/confusion.html, size: 23580, db: Y

[mlrun] 2020-03-26 14:17:56,886 run executed, status=completed
final state: succeeded


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...893235,0,Mar 26 14:17:55,completed,tasks train a classifier,host=tasks-train-a-classifier-ltklnkind=jobowner=admin,,CLASS_num_class=3CLASS_random_state=1data_key=/User/artifacts/iris.parquetlabel_column=labelsmodel_pkg_file=/User/functions/sklearn_classifier/sample-configs/XGBClassifier.jsonrng=1sample=-1test_size=0.1train_val_split=0.75,accuracy=0.9117647058823529avg_precscore=0.9676813220337889f1_score=0.9117647058823528rocauc=0.9678194028194028,test_setmodelrocconfusion


to track results use .show() or .logs() or in CLI: 
!mlrun get run c08466a8343840b19f421fc193893235  , !mlrun logs c08466a8343840b19f421fc193893235 
[mlrun] 2020-03-26 14:18:04,557 run executed, status=completed
