**Please be sure to run the notebook [get-remote](get-remote.ipynb) before running this one.**

## train model

In [4]:
# Standard library
import json
import os

# Third-party
import mlrun

# Point the MLRun client at the API endpoint that the runs below report to.
mlrun.mlconf.dbpath = 'http://mlrun-api:8080'

## function

In [12]:
# Load the classifier function from its YAML spec, then apply the v3io mount
# modifier to it.
FUNCTION_SPEC = '/User/functions/sklearn_classifier/function.yaml'
train_fn = mlrun.import_function(FUNCTION_SPEC)
train_fn = train_fn.apply(mlrun.mount_v3io())

## parameters

Please see
**[LightGBM parameters](https://lightgbm.readthedocs.io/en/latest/Parameters.html)** for more details on all configurable LightGBM parameters.

In [13]:
# Specify the model class and its configuration -- in this case, a LightGBM classifier.
SKLEARN_CLASSIFIER = 'lightgbm.sklearn.LGBMClassifier'

# Read the JSON configuration inside a context manager so the file handle is
# closed as soon as parsing finishes, rather than being left open until the
# garbage collector gets to it.
with open('/User/functions/sklearn_classifier/configs/lightgbm-conf.json', 'r') as conf_file:
    MODEL_CONF = json.load(conf_file)

In [14]:
# --- input data ---
# Dataset produced by a previous notebook, and the name of its label column.
DATA_KEY = '/User/lightgbm/artifacts/higgs.pqt'
LABEL_COLUMN = 'labels'

# --- sampling ---
# negative n: random sample of n observations
# -1:         the entire dataset
# positive n: n consecutive rows
SAMPLE_SIZE = -30_000

# --- splitting ---
TEST_SIZE = 0.1         # 10% of the data is set aside for testing
TRAIN_VAL_SPLIT = 0.75  # the remainder is split into train and validation sets
RNG = 1                 # for reproducibility

In [15]:
# Output settings for the training run.
MODEL_KEY = 'model'                # key for the model output
MODEL_NAME = 'lgb-classifier.pkl'  # file name for the model
# Directory passed to the task as its artifact path.
ARTIFACT_PATH = '/User/functions/sklearn_classifier/artifacts'

In [16]:
# Create the artifact directory (and any missing parents) up front;
# exist_ok=True keeps this cell from failing when it is re-run.
os.makedirs(name=ARTIFACT_PATH, exist_ok=True)

## task

In [21]:
# Parameters handed to the training function.
train_params = {
    # data selection and splitting
    'data_key': DATA_KEY,
    'sample': SAMPLE_SIZE,
    'label_column': LABEL_COLUMN,
    'test_size': TEST_SIZE,
    'train_val_split': TRAIN_VAL_SPLIT,
    'rng': RNG,
    # taken from the JSON model configuration
    'class_params': MODEL_CONF['CLASS_PARAMS'],
    'fit_params': MODEL_CONF['FIT_PARAMS'],
}

# Describe the run as a task named 'train' whose artifacts go to ARTIFACT_PATH.
train_task = mlrun.NewTask('train', params=train_params, artifact_path=ARTIFACT_PATH)

# Execute the task with the imported function and keep the resulting run object.
tf = train_fn.run(train_task)

[mlrun] 2020-03-11 13:05:24,321 starting run train uid=ac543eaa958d426d902a25606a41f8e4  -> http://mlrun-api:8080
[mlrun] 2020-03-11 13:05:24,439 Job is running in the background, pod: train-9lml7
Intel(R) Data Analytics Acceleration Library (Intel(R) DAAL) solvers for sklearn enabled: https://intelpython.github.io/daal4py/sklearn.html
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
[mlrun] 2020-03-11 13:05:43,167 log artifact test-set at /User/functions-refac/sklearn_classifier/artifacts/test-set.csv, size: None, db: Y
[mlrun] 2020-03-11 13:05:44,101 log artifact model at /User/functions-refac/sklearn_classifier/artifacts/models/model.pkl, size: None, db: Y
[mlrun] 2020-03-11 13:05:44,362 log artifact roc at /User/functions-refac/sklearn_classifier/artifacts/plots/roc.png, size: 46918, db: Y
[mlrun] 2020-

uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...41f8e4,0,Mar 11 13:05:35,completed,train,host=train-9lml7kind=jobowner=admin,,"class_params={'boosting_type': 'gbdt', 'colsample_bytree': 1, 'importance_type': 'split', 'learning_rate': 0.1, 'max_depth': 50, 'min_child_samples': 20, 'min_split_gain': 0.0, 'n_estimators': 300, 'n_jobs': 16, 'num_leaves': 300, 'objective': 'binary', 'random_state': 1, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'silent': True, 'subsample': 1}data_key=/User/lightgbm/artifacts/higgs.pqtfit_params={'early_stopping_rounds': 10, 'verbose': False}label_column=labelsrng=1sample=-30000test_size=0.1train_val_split=0.75",accuracy=0.7112592592592593avg_precscore=0.8065259954834598f1_score=0.7269160711783662rocauc=0.7894231743202476,test-setmodelrochistoryconfusionfeatimp.pngfeatimp.csv


to track results use .show() or .logs() or in CLI: 
!mlrun get run ac543eaa958d426d902a25606a41f8e4  , !mlrun logs ac543eaa958d426d902a25606a41f8e4 
[mlrun] 2020-03-11 13:05:53,893 run executed, status=completed


## tests

In [35]:
# refactor: take a simple make_classification dataset...

In [33]:
# Keys of the run's outputs, in the order the run object reports them.
outkeys = list(tf.outputs.keys())

# NOTE(review): the run log earlier in this notebook reports the test-set
# artifact as 'test-set' (hyphen), while this list expects 'test_set'
# (underscore) -- confirm which key the current function version emits.
expected_outkeys = [
    'accuracy',
    'avg_precscore',
    'f1_score',
    'rocauc',
    'test_set',
    'model',
    'roc',
    'history',
    'confusion',
    'featimp.png',
    'featimp.csv',
]

# The comparison is order-sensitive. Both lists go in the failure message so a
# mismatch can be diagnosed without re-running the cell by hand.
assert outkeys == expected_outkeys, (
    f'unexpected run outputs: got {outkeys}, expected {expected_outkeys}')