# WIP

This notebook function handles training and logging of **only** XGBoost models, exposing both the scikit-learn and low-level APIs.

In [1]:
# nuclio: ignore
import nuclio

In [None]:
def gen_xgb_model(xgb_params: dict, xgb_type: str, model_pkg_file: str = ""):
    """generate an xgboost model

    Multiple model types that can be estimated using
    the XGBoost Scikit-Learn API.

    The model config is a dict with 'CLASS' (constructor kwargs), 'FIT'
    (fit kwargs) and 'META' (holds the class name under 'class') sections.
    It is loaded from `model_pkg_file` when that is given, otherwise it is
    generated for the class selected by `xgb_type`.

    :param xgb_params:     parameters passed through the
                           function execution context. Keys prefixed
                           with 'CLASS_' update the 'CLASS' section and
                           keys prefixed with 'FIT_' update the 'FIT'
                           section (prefix stripped); all other keys
                           are ignored
    :param xgb_type:       one of 'classifier', 'regressor',
                           'ranker', 'rf_classifier', or
                           'rf_regressor'
    :param model_pkg_file: optional path of a json model config file;
                           when set, `xgb_type` is not used

    :returns: tuple of (unfitted model instance, model config dict)
    :raises ValueError: if no config file is given and `xgb_type`
                        is not one of the supported names
    """
    import json

    # dispatch table: model type name -> fully qualified xgboost class
    xgb_classes = {
        "classifier": "xgboost.XGBClassifier",
        "regressor": "xgboost.XGBRegressor",
        "ranker": "xgboost.XGBRanker",
        "rf_regressor": "xgboost.XGBRFRegressor",
        "rf_classifier": "xgboost.XGBRFClassifier",
    }

    # validate before touching mlrun so a bad type fails fast
    if not model_pkg_file and xgb_type not in xgb_classes:
        raise ValueError(f'unknown trainer type {xgb_type}')

    # NOTE(review): the traceback captured in this notebook shows get_class_fit
    # living in mlrun/mlutils/models.py - confirm mlrun.utils re-exports both names
    from mlrun.utils import get_class_fit, create_class

    # generate model and fit function
    if model_pkg_file:
        with open(model_pkg_file, "r") as fp:
            model_config = json.load(fp)
    else:
        model_config = get_class_fit(xgb_classes[xgb_type])

    # route prefixed parameters to the constructor or fit sections
    for k, v in xgb_params.items():
        if k.startswith('CLASS_'):
            model_config['CLASS'][k[6:]] = v
        elif k.startswith('FIT_'):
            model_config['FIT'][k[4:]] = v

    # instantiate outside the loop so an empty params dict still works
    model_class = create_class(model_config["META"]["class"])
    model = model_class(**model_config["CLASS"])

    return model, model_config

In [None]:
def dump_xgb_model(
    context: MLClientCtx,
    model,
    dump_type: str,
    dest_folder: str,
    dest_name: str
):
    """serialize/log model

    XGBoost model can be saved in 3 different ways:
    1. pickle the internal _booster object, inside the model
    2. using model.save_model(fn) using a legacy binary xgb format
    3. using model.save_model(fn.json) using a portable json format

    Whatever the dump type, the pickled booster is also logged to the
    context as the 'model' artifact. Serialization is best-effort: any
    error raised while saving or logging is printed, not propagated.

    :param context:     the function's execution context
    :param model:       the fitted xgboost model
    :param dump_type:   'pickle', 'legacy', or 'json'
    :param dest_folder: path for serialized model
    :param dest_name:   name for serialized model file

    :raises ValueError: if `dump_type` is not one of the supported names
    """
    if dump_type not in ("pickle", "legacy", "json"):
        raise ValueError(f"unknown dump type {dump_type}")

    try:
        if dump_type == "legacy":
            # https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBClassifier.save_model
            model.save_model(f"{dest_folder}/{dest_name}-legacy-save.pkl")
        elif dump_type == "json":
            # see https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html
            # this saves all contents as json
            model.save_model(f"{dest_folder}/{dest_name}-exp-save.json")

        _booster = model.get_booster()
        if dump_type == "pickle":
            # this saves all internal contents as pickle
            with open(f"{dest_folder}/{dest_name}-legacy-dump.pkl", "wb") as fp:
                dump(_booster, fp)

        # log model needs to be spec'ed:
        data = dumps(_booster)
        context.log_artifact('model', body=data, local_path=f"{dest_folder}/{dest_name}.pkl")
    except Exception as e:
        print("SERIALIZE MODEL ERROR:", str(e))

In [None]:
def gen_sample(src: str, sample: int, label_column: str):
    """generate data sample to be split

    Rows containing missing values are dropped before sampling.

    :param src:          path of the data file; must end with 'csv',
                         'parquet' or 'pq'
    :param sample:       -1 keeps all rows, n >= 1 keeps the first n rows,
                         any other value draws a random sample of
                         `-sample` rows
    :param label_column: name of the column removed from the features
                         and returned as the labels

    :returns: tuple of (features DataFrame, labels Series,
              array of feature column names)
    :raises ValueError: if the file extension is not handled
    """
    # read data function
    if src.endswith("csv"):
        reader = pd.read_csv
    elif src.endswith(("parquet", "pq")):
        reader = pd.read_parquet
    else:
        raise ValueError(f"file type unhandled {src}")

    raw = reader(src).dropna()

    # get sample; -1 means "all rows" and must not be used as a slice bound,
    # since iloc[:-1] would silently drop the last row
    if sample >= 1:
        # contiguous sample starting at the first row
        raw = raw.iloc[:sample, :]
    elif sample != -1:
        # grab a random sample
        raw = raw.sample(sample * -1)

    labels = raw.pop(label_column)
    header = raw.columns.values

    return raw, labels, header


In [2]:
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

import json
import os

from cloudpickle import dumps, load, dump

from sklearn import metrics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split
from sklearn import metrics

from typing import List
from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem
from mlrun.artifacts import PlotArtifact
from mlrun.mlutils import   (plot_roc, plot_importance,
                           gcf_clear)

def train_model(
    context: MLClientCtx,
    model_pkg_class: str,
    dataset: DataItem,
    label_column: str = "labels",
    sample: int = -1,
    test_size: float = 0.05,
    train_val_split: float = 0.75,
    rng: int = 1,
    model_filename: str = "model",
    models_dest: str = "",
    plots_dest: str = "",
    score_method: str = "micro",
    file_ext: str = "parquet",
    model_pkg_file: str = "",
    model_type: str = "classifier",
) -> None:
    """train an xgboost model.

    Reads the dataset, splits it into train/validation/test sets, logs the
    held-out test set, fits the model, serializes it, then logs validation
    metrics and plots.

    :param context:           the function context
    :param model_pkg_class:   name of the model class; kept for interface
                              compatibility, not read by this function
    :param dataset:           ("data") name of raw data file
    :param label_column:      ground-truth (y) labels
    :param sample:            -1 selects all rows, n >= 1 selects the first
                              n rows, any other value selects a random
                              sample of `-sample` rows
    :param model_filename:    base name used for the serialized model files
    :param test_size:         (0.05) test set size
    :param train_val_split:   (0.75) Once the test set has been removed the
                              training set gets this proportion.
    :param rng:               (1) sklearn rng seed
    :param models_dest:       models subfolder on artifact path
    :param plots_dest:        plot subfolder on artifact path
    :param score_method:      for multiclass classification

    :param file_ext:          format for test_set_key hold out data
    :param model_pkg_file:    json model config file; kept for interface
                              compatibility, not read by this function
    :param model_type:        xgboost model type handed to `gen_xgb_model`,
                              e.g. 'classifier'
    """
    srcfilepath = str(dataset)

    models_dest = models_dest or 'models'
    plots_dest = plots_dest or f'plots/{context.name}'

    def gen_sample(src: str, sample: int, label_column: str):
        """read `src` and return (features, labels, feature column names)"""
        # read data function
        if src.endswith("csv"):
            reader = pd.read_csv
        elif src.endswith(("parquet", "pq")):
            reader = pd.read_parquet
        else:
            raise Exception(f"file type unhandled {src}")

        raw = reader(src).dropna()

        # -1 means "all rows"; it must not be used as a slice bound because
        # iloc[:-1] would silently drop the last row
        if sample >= 1:
            # contiguous sample starting at the first row
            raw = raw.iloc[:sample, :]
        elif sample != -1:
            # grab a random sample
            raw = raw.sample(sample * -1)

        labels = raw.pop(label_column)
        return raw, labels, raw.columns.values

    def dump_xgb_model(
        context: MLClientCtx,
        model,
        dest_folder: str,
        dest_name: str
    ):
        """serialize/log model (best effort: errors are printed, not raised)"""
        try:
            # the save calls below fail if the target folder is missing
            os.makedirs(dest_folder, exist_ok=True)

            # save model,  incomplete
            # https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBClassifier.save_model
            model.save_model(f"{dest_folder}/{dest_name}-legacy-save.pkl")

            # see https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html
            # this saves all contents as json
            model.save_model(f"{dest_folder}/{dest_name}-exp-save.json")

            # this saves all internal contents as pickle
            _booster = model.get_booster()
            with open(f"{dest_folder}/{dest_name}-legacy-dump.pkl", "wb") as fp:
                dump(_booster, fp)

            data = dumps(_booster)
            context.log_artifact('model', body=data, local_path=f"{dest_folder}/{dest_name}.pkl")
        except Exception as e:
            print("SERIALIZE MODEL ERROR:", str(e))

    # load the features, labels and feature names
    raw, labels, header = gen_sample(srcfilepath, sample, label_column)
    # kept on the context as the original helper did
    context.header = header

    # labeling
    yb = label_binarize(labels, classes=labels.unique())
    n_encoded = yb.shape[1]

    # splits
    # here we hide the binary encoded labels inside the X matrix so that when splitting we preserve order in both the encoded
    # and non-encoded labels:
    x, xtest, y, ytest = train_test_split(np.concatenate([raw, yb], axis=1), labels, test_size=test_size, random_state=rng)
    xtrain, xvalid, ytrain, yvalid = train_test_split(x, y, train_size=train_val_split, random_state=rng)
    # strip the encoded label columns from the train and test features
    xtrain = xtrain[:, :-n_encoded].copy()
    xtest = xtest[:, :-n_encoded].copy()
    # extract the encoded labels of the validation set, used for scoring
    yvalidb = xvalid[:, -n_encoded:].copy()
    xvalid = xvalid[:, :-n_encoded].copy()

    test_set = pd.concat(
        [pd.DataFrame(data=xtest, columns=header),
         pd.DataFrame(data=ytest.values, columns=[label_column])],
        axis=1,)
    context.log_dataset('test_set', df=test_set, format=file_ext, index=False)

    # get the correct xgboost model and model config; the CLASS_/FIT_
    # overrides come from the run parameters held by the context
    model, model_config = gen_xgb_model(context.parameters, model_type)

    # update the model config with training data and callbacks
    model_config["FIT"].update({"X": xtrain, "y": ytrain.values})

    # run the fit
    model.fit(**model_config["FIT"])

    # serialize and log the fitted model
    dump_xgb_model(context, model, models_dest, model_filename)

    # generate predictions
    ypred = model.predict(xvalid)
    y_score = model.predict_proba(xvalid)

    # generate probabilities
    try:
        # multiclass scoring uses the whole probability matrix, binary
        # scoring only the second probability column
        scores = y_score if yvalidb.shape[1] > 1 else y_score[:, 1]
        average_precision = metrics.average_precision_score(yvalidb,
                                                            scores,
                                                            average=score_method)
        context.log_result("avg_precscore", average_precision)
        context.log_result("rocauc", metrics.roc_auc_score(yvalidb, scores))
    except Exception as err:
        context.logger.info(f'Error while calculating precision: {err}')

    try:
        context.log_result("accuracy", float(model.score(xvalid, yvalid)))
    except Exception as err:
        context.logger.info(f'Error while calculating accuracy: {err}')
    try:
        context.log_result("f1_score", metrics.f1_score(yvalid, ypred,
                                                        average=score_method))
    except Exception as err:
        context.logger.info(f'Error while calculating f1_score: {err}')

    # generate plots
    plot_roc(context, yvalidb, y_score, key="roc", plots_dir=plots_dest)
    gcf_clear(plt)
    metrics.plot_confusion_matrix(model, xvalid, yvalid, labels=labels.unique(), normalize='true')
    context.log_artifact(PlotArtifact("confusion", body=plt.gcf()), local_path=f"{plots_dest}/confusion.html")

In [3]:
# nuclio: end-code

### mlconfig

In [4]:
from mlrun import mlconf

In [5]:
# Keep an already configured mlrun DB path; otherwise fall back to './'.
mlconf.dbpath = mlconf.dbpath or './'
# Bare expression: shown as the notebook cell output.
mlconf.dbpath

'http://mlrun-api:8080'

In [6]:
# Branch of the mlrun/functions repository used as the default function hub.
vcs_branch = 'development'
base_vcs = f'https://raw.githubusercontent.com/mlrun/functions/{vcs_branch}/'

# '{name}' is deliberately NOT an f-string field: it stays a literal
# placeholder in the hub url (no `name` variable exists in this notebook).
mlconf.hub_url = mlconf.hub_url or base_vcs + '{name}/function.yaml'
mlconf.hub_url

'/User/repos/functions/{name}/function.yaml'

In [7]:
import os
# Keep an already configured artifact path; otherwise default to
# "<V3IO_HOME>/artifacts".
# NOTE(review): the fallback raises KeyError when it is evaluated and the
# V3IO_HOME environment variable is not set.
mlconf.artifact_path = mlconf.artifact_path or f'{os.environ["V3IO_HOME"]}/artifacts'
# Bare expression: shown as the notebook cell output.
mlconf.artifact_path

'/User/artifacts'

In [8]:
import os
# Tag read from the MLRUN_COMMIT environment variable (KeyError if unset).
TAG = os.environ['MLRUN_COMMIT']

### save

In [9]:
from mlrun import code_to_function 
# create job function object from notebook code
# the handler is given by name, matching spec.default_handler below, so the
# cell does not depend on train_model being defined in the running kernel
fn = code_to_function("xgb_trainer", kind="job", with_doc=True,
                      handler="train_model",
                      image=f"mlrun/ml-models:{TAG}")

# add metadata (for templates and reuse)
fn.spec.default_handler = "train_model"
fn.spec.description = "train any classifier using scikit-learn's API"
fn.metadata.categories = ["models", "classifier"]
fn.metadata.labels = {"author": "yjb"}

# store the function in the mlrun DB and write its spec to a local yaml file
fn.save()
fn.export("function.yaml")

[mlrun] 2020-04-27 09:29:16,493 saving function: xgb-trainer, tag: latest
[mlrun] 2020-04-27 09:29:16,780 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7fb92d2509b0>

### test

In [10]:
from mlrun import import_function, mount_v3io, NewTask, run_local

# NOTE(review): `func` is imported from the hub but the mounts below (and the
# run cell) operate on `fn`, the function built in this notebook - confirm
# which of the two is meant to be tested.
func = import_function("hub://xgb_trainer")


# membership is tested on the environment mapping directly
if "V3IO_HOME" in os.environ:
    fn.apply(mount_v3io())
else:
    # if you set up mlrun using the instructions at
    # https://github.com/mlrun/mlrun/blob/master/hack/local/README.md
    from mlrun.platforms import mount_pvc
    fn.apply(mount_pvc('nfsvol', 'nfsvol', '/home/joyan/data'))

In [11]:
# Task definition for the trainer run: a display name plus the run parameters.
task_params = dict(
    name="tasks xgb cpu trainer",
    params=dict(
        model_type='classifier',  # choose regressor, ranker, rfclassifier...
        num_class=2,  # do not use this when binary
        CLASS_tree_method="gpu_hist",
        CLASS_objective="binary:logistic",  # have this chosen by default
        CLASS_random_state=1,
        sample=-1,
        label_column="labels",
        test_size=0.10,
        train_val_split=0.75,
    ),
)

### run remotely

In [12]:
# Run the function with the task built from task_params; the "dataset" input
# is breast_cancer.parquet under the configured artifact path, which is also
# passed as the run's artifact path.
run = fn.run(
    NewTask(**task_params),
    inputs={"dataset"  : os.path.join(mlconf.artifact_path, 'breast_cancer.parquet')},
    artifact_path=mlconf.artifact_path)

[mlrun] 2020-04-27 09:29:17,107 starting run tasks xgb cpu trainer uid=44bd009f3baf49b0a428e9c5c188054c  -> http://mlrun-api:8080
[mlrun] 2020-04-27 09:29:17,924 Job is running in the background, pod: tasks-xgb-cpu-trainer-svhsg
[mlrun] 2020-04-27 09:29:28,457 log artifact test_set at /User/artifacts/test_set.parquet, size: 35544, db: Y
[mlrun] 2020-04-27 09:29:28,466 Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/mlrun/runtimes/local.py", line 184, in exec_from_params
    val = handler(*args_list)
  File "main.py", line 110, in train_model
    model_config = get_class_fit(model_pkg_class)
  File "/opt/conda/lib/python3.7/site-packages/mlrun/mlutils/models.py", line 12, in get_class_fit
    model_ = getattr(import_module(".".join(splits[:-1])), splits[-1])
  File "/opt/conda/lib/python3.7/importlib/__init__.py", line 127, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1003,

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...c188054c,0,Apr 27 09:29:27,error,tasks xgb cpu trainer,host=tasks-xgb-cpu-trainer-svhsgkind=jobowner=adminv3io_user=admin,dataset,CLASS_objective=binary:logisticCLASS_random_state=1label_column=labelsmodel_filename=XGBClassifiermodel_pkg_class=XGBClassifiersample=-1test_size=0.1train_val_split=0.75,,test_set


to track results use .show() or .logs() or in CLI: 
!mlrun get run 44bd009f3baf49b0a428e9c5c188054c  , !mlrun logs 44bd009f3baf49b0a428e9c5c188054c 
[mlrun] 2020-04-27 09:29:38,829 run executed, status=error
runtime error: Empty module name


RunError: Empty module name