In [1]:
# nuclio: ignore
import nuclio

In [2]:
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

import json
import os

from cloudpickle import dumps, load

from sklearn import metrics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split
from sklearn import metrics

from typing import List
from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem
from mlrun.artifacts import PlotArtifact
from mlrun.mlutils import (get_class_fit, create_class,
                           plot_roc, plot_importance,
                           gcf_clear)

def _split_encoded_labels(matrix, n_label_cols):
    """Separate the trailing binarized-label columns from a feature matrix.

    :param matrix:        2-D array whose last ``n_label_cols`` columns hold
                          the binarized labels
    :param n_label_cols:  number of trailing label columns
    :returns: tuple ``(features, encoded_labels)``, both independent copies
    """
    return matrix[:, :-n_label_cols].copy(), matrix[:, -n_label_cols:].copy()


def train_model(
    context: MLClientCtx,
    model_pkg_class: str,
    dataset: DataItem,
    label_column: str = "labels",
    sample: int = -1,
    test_size: float = 0.05,
    train_val_split: float = 0.75,
    rng: int = 1,
    model_filename: str = "model",
    models_dest: str = "",
    plots_dest: str = "",
    score_method: str = "micro",
    file_ext: str = "parquet",
    model_pkg_file: str = "",
) -> None:
    """Train a classifier, log it, and log validation metrics and plots.

    The data is split three ways: a held-out test set (logged as the
    ``test_set`` dataset), a training set, and a validation set on which
    the metrics and plots are computed.

    :param context:           the function context
    :param model_pkg_class:   the model to train, e.g.
                              "sklearn.neural_network.MLPClassifier";
                              ignored when ``model_pkg_file`` is given
    :param dataset:           ("data") raw data file (csv, parquet or pq)
    :param label_column:      name of the ground-truth (y) labels column
    :param sample:            -1 (default) uses all rows, n >= 1 uses the
                              first n rows, n < -1 uses a random sample of
                              abs(n) rows
    :param test_size:         (0.05) test set size
    :param train_val_split:   (0.75) once the test set has been removed the
                              training set gets this proportion
    :param rng:               (1) sklearn rng seed
    :param model_filename:    model file name, without extension (".pkl"
                              is appended)
    :param models_dest:       models subfolder on artifact path,
                              defaults to "models"
    :param plots_dest:        plot subfolder on artifact path,
                              defaults to "plots/<context.name>"
    :param score_method:      averaging method for multiclass scores
    :param file_ext:          file format of the logged test set
    :param model_pkg_file:    json model config file; takes precedence over
                              ``model_pkg_class``

    :raises ValueError: on an unsupported file type, a ``sample`` of 0, or
                        when neither model source is provided
    """
    srcfilepath = str(dataset)

    models_dest = models_dest or 'models'
    plots_dest = plots_dest or f'plots/{context.name}'

    if srcfilepath.endswith("csv"):
        reader = pd.read_csv
    elif srcfilepath.endswith(("parquet", "pq")):
        reader = pd.read_parquet
    else:
        raise ValueError(f"file type unhandled {srcfilepath}")

    if -1 < sample < 1:
        # an empty selection would only fail later, inside the splitter
        raise ValueError(
            f"sample must be -1, >= 1 or < -1, got {sample}")

    raw = reader(srcfilepath).dropna()
    if sample < -1:
        # grab a random sample of abs(sample) rows
        raw = raw.sample(-sample)
    labels = raw.pop(label_column)
    if sample >= 1:
        # contiguous sample starting at the first row; sample == -1 must
        # NOT be sliced, since iloc[:-1] would drop the last row
        raw = raw.iloc[:sample, :]
        labels = labels.iloc[:sample]

    context.header = raw.columns.values

    # sorted classes, so the binarized columns line up with the column
    # order of predict_proba (estimators sort their classes)
    classes = np.unique(labels)
    yb = label_binarize(labels, classes=classes)
    n_label_cols = yb.shape[1]

    # here we hide the binary encoded labels inside the X matrix so that
    # when splitting we preserve order in both the encoded and non-encoded
    # labels:
    x, xtest, y, ytest = train_test_split(
        np.concatenate([raw, yb], axis=1), labels,
        test_size=test_size, random_state=rng)
    xtrain, xvalid, ytrain, yvalid = train_test_split(
        x, y, train_size=train_val_split, random_state=rng)

    # now strip the encoded labels back out; only the validation ones
    # are needed below
    xtrain, _ = _split_encoded_labels(xtrain, n_label_cols)
    xvalid, yvalidb = _split_encoded_labels(xvalid, n_label_cols)
    xtest, _ = _split_encoded_labels(xtest, n_label_cols)

    test_set = pd.concat(
        [pd.DataFrame(data=xtest, columns=context.header),
         pd.DataFrame(data=ytest.values, columns=[label_column])],
        axis=1,)
    context.log_dataset('test_set', df=test_set, format=file_ext, index=False)

    if model_pkg_file:
        with open(model_pkg_file, "r") as config_file:
            model_config = json.load(config_file)
    elif model_pkg_class:
        model_config = get_class_fit(model_pkg_class)
    else:
        raise ValueError('model_pkg_file or model_pkg_class must be provided')

    # run parameters prefixed CLASS_/FIT_ override the constructor/fit
    # arguments of the model config
    for k, v in context.parameters.items():
        if k.startswith('CLASS_'):
            model_config['CLASS'][k[len('CLASS_'):]] = v
        if k.startswith('FIT_'):
            model_config['FIT'][k[len('FIT_'):]] = v

    model_config["FIT"].update({"X": xtrain, "y": ytrain.values})

    ClassifierClass = create_class(model_config["META"]["class"])
    model = ClassifierClass(**model_config["CLASS"])
    model.fit(**model_config["FIT"])

    # serialization is best-effort: a model that cannot be pickled should
    # not prevent the metrics from being reported
    try:
        data = dumps(model)
        context.log_artifact('model', body=data,
                             local_path=f"{models_dest}/{model_filename}.pkl")
    except Exception as e:
        print("SERIALIZE MODEL ERROR:", str(e))

    ypred = model.predict(xvalid)
    y_score = model.predict_proba(xvalid)

    # the metrics below are best-effort too: a failure is logged with its
    # cause and the remaining metrics are still attempted
    try:
        # binary problems binarize to a single column, to be compared with
        # the probability of the positive (second) class
        scores = y_score if n_label_cols > 1 else y_score[:, 1]
        average_precision = metrics.average_precision_score(
            yvalidb, scores, average=score_method)
        context.log_result("avg_precision", float(average_precision))
        context.log_result("rocauc", metrics.roc_auc_score(yvalidb, scores))
    except Exception as exc:
        context.logger.info(f'Error while calculating precision: {exc}')

    try:
        context.log_result("accuracy", float(model.score(xvalid, yvalid)))
    except Exception as exc:
        context.logger.info(f'Error while calculating accuracy: {exc}')

    try:
        context.log_result("f1_score", metrics.f1_score(yvalid, ypred,
                                                        average=score_method))
    except Exception as exc:
        context.logger.info(f'Error while calculating f1_score: {exc}')

    # TODO: missing validation plots, callbacks need to reintroduced

    plot_roc(context, yvalidb, y_score, key="roc", plots_dir=plots_dest)
    gcf_clear(plt)
    # NOTE(review): plot_confusion_matrix was removed from newer
    # scikit-learn releases; revisit when the image is upgraded
    metrics.plot_confusion_matrix(model, xvalid, yvalid, labels=classes,
                                  normalize='true')
    context.log_artifact(PlotArtifact("confusion", body=plt.gcf()),
                         local_path=f"{plots_dest}/confusion.html")

In [3]:
# nuclio: end-code

### save

In [4]:
from mlrun import code_to_function 
# Build a job-kind function object out of this notebook's code cells.
function_options = {
    "kind": "job",
    "with_doc": True,
    "image": "mlrun/ml-models:test",
}
fn = code_to_function("sklearn_classifier", **function_options)

# Descriptive metadata (for templates and reuse).
fn.metadata.labels = {"author": "yjb"}
fn.metadata.categories = ["models", "classifier"]
fn.spec.description = "train any classifier using scikit-learn's API"
fn.spec.default_handler = "train_model"

# Persist the function, then write its spec to a local yaml file.
fn.save()
fn.export("function.yaml")

[mlrun] 2020-04-20 19:27:01,536 saving function: sklearn-classifier, tag: latest
[mlrun] 2020-04-20 19:27:01,589 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f896453ecc0>

### test

In [5]:
from mlrun import import_function, mount_v3io

# Fetch the published function from the hub, then apply the v3io mount.
hub_function = import_function("hub://sklearn_classifier")
func = hub_function.apply(mount_v3io())

In [7]:
from mlrun import NewTask

# CHOOSE YOUR MODEL AND CHANGE SOME DEFAULT PARAMETERS
model_params = {
    "model_pkg_class": "sklearn.linear_model.LogisticRegression",
    "CLASS_random_state": 1,
    # "CLASS_num_class": 4,
    # "CLASS_tree_method": "gpu_hist",
}

# Sampling and train/validation/test split settings.
split_params = {
    "sample": -1,
    "label_column": "labels",
    "test_size": 0.10,
    "train_val_split": 0.75,
    "rng": 1,
}

task_params = {
    "name": "tasks train a classifier",
    "params": {**model_params, **split_params},
}

# Working directory and artifact path point at the same folder.
artifacts_dir = "/User/artifacts"
task = NewTask(**task_params)
run = func.run(task,
               inputs={"dataset": "classifier-data.csv"},
               workdir=artifacts_dir,
               artifact_path=artifacts_dir)

[mlrun] 2020-04-20 19:28:07,259 starting run tasks train a classifier uid=c60e017d672d437a8fe143ea77ef5631  -> http://mlrun-api:8080
[mlrun] 2020-04-20 19:28:07,417 Job is running in the background, pod: tasks-train-a-classifier-7fjzt
No handles with labels found to put in legend.
[mlrun] 2020-04-20 19:28:16,600 log artifact test_set at /User/artifacts/test_set.parquet, size: 6285088, db: Y
[mlrun] 2020-04-20 19:28:17,128 log artifact model at /User/artifacts/models/model.pkl, size: 1212, db: Y
[mlrun] 2020-04-20 19:28:17,347 log artifact roc at /User/artifacts/plots/tasks train a classifier/roc.html, size: 38606, db: Y
[mlrun] 2020-04-20 19:28:17,631 log artifact confusion at /User/artifacts/plots/tasks train a classifier/confusion.html, size: 20356, db: Y

[mlrun] 2020-04-20 19:28:17,751 run executed, status=completed
final state: succeeded


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...77ef5631,0,Apr 20 19:28:14,completed,tasks train a classifier,host=tasks-train-a-classifier-7fjztkind=jobowner=iguaziov3io_user=iguazio,dataset,CLASS_random_state=1label_column=labelsmodel_pkg_class=sklearn.linear_model.LogisticRegressionrng=1sample=-1test_size=0.1train_val_split=0.75,accuracy=0.8466666666666667f1_score=0.8466666666666667rocauc=0.9211772799854459,test_setmodelrocconfusion


to track results use .show() or .logs() or in CLI: 
!mlrun get run c60e017d672d437a8fe143ea77ef5631  , !mlrun logs c60e017d672d437a8fe143ea77ef5631 
[mlrun] 2020-04-20 19:28:27,052 run executed, status=completed
