**Please be sure to run the notebook [get-remote](get-remote.ipynb) before running this one.**

## train model

In [31]:
import mlrun, os, json
mlrun.mlconf.dbpath = 'http://mlrun-api:8080'

## function

In [32]:
train_fn = mlrun.import_function('/User/functions/sklearn_classifier/function.yaml').apply(mlrun.mount_v3io())

## parameters

Please see 
**[LightGBM parameters](https://lightgbm.readthedocs.io/en/latest/Parameters.html)** for more details on all configurable LighGBM parameters.

In [33]:
# specify model class
SKLEARN_CLASSIFIER = "sklearn.svm.SVC"

In [37]:
# data params
DATA_KEY      = '/User/lightgbm-project/artifacts/higgs.pqt'   # (from a previous notebook)
LABEL_COLUMN  = 'labels'

# -n for random sample of n obs, -1 for entire dataset, +n for n consecutive rows
SAMPLE_SIZE      = -30_000

TEST_SIZE        = 0.1       # 10% set aside
TRAIN_VAL_SPLIT  = 0.75      # remainder split into train and val
RNG              = 1         # for reproducibility

In [38]:
# specify output params
MODEL_KEY          = 'model'
MODEL_NAME         = 'lgb-classifier.pkl'
ARTIFACT_PATH      = '/User/functions/sklearn_classifier/artifacts'

## task

In [39]:
train_task = mlrun.NewTask(
    'train',
    params={
        'model_pkg_class' : SKLEARN_CLASSIFIER,
        'data_key'        : DATA_KEY,
        'sample'          : SAMPLE_SIZE,
        'label_column'    : LABEL_COLUMN,
        'model_key'       : "models",
        'test_size'       : TEST_SIZE,
        'train_val_split' : TRAIN_VAL_SPLIT,
        'rng'             : RNG,
        'class_updates'   : {
            'random_state' : 1,
            'probability'  : True
        },
        'fit_updates'    : {}},
    artifact_path=ARTIFACT_PATH)

tf = train_fn.run(train_task)

[mlrun] 2020-03-19 13:01:31,410 starting run train uid=54fc1639f45e4502b8bac8df2bf97751  -> http://mlrun-api:8080
[mlrun] 2020-03-19 13:01:31,702 Job is running in the background, pod: train-xpjv7
[mlrun] 2020-03-19 13:01:44,564 Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/mlrun/runtimes/local.py", line 179, in exec_from_params
    val = handler(*args_list)
  File "main.py", line 251, in train_model
    x, xtest, y, ytest = train_test_split(np.concatenate([raw, yb], axis=0), labels, test_size=test_size, random_state=rng)
  File "/opt/conda/lib/python3.7/site-packages/sklearn/model_selection/_split.py", line 2096, in train_test_split
    arrays = indexable(*arrays)
  File "/opt/conda/lib/python3.7/site-packages/sklearn/utils/validation.py", line 230, in indexable
    check_consistent_length(*result)
  File "/opt/conda/lib/python3.7/site-packages/sklearn/utils/validation.py", line 205, in check_consistent_length
    " samples: %r" % [int(l) for l in l

uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...f97751,0,Mar 19 13:01:36,error,train,host=train-xpjv7kind=jobowner=admin,,"class_updates={'probability': True, 'random_state': 1}data_key=/User/lightgbm-project/artifacts/higgs.pqtfit_updates={}label_column=labelsmodel_key=modelsmodel_pkg_class=sklearn.svm.SVCrng=1sample=-30000test_size=0.1train_val_split=0.75",,


to track results use .show() or .logs() or in CLI: 
!mlrun get run 54fc1639f45e4502b8bac8df2bf97751  , !mlrun logs 54fc1639f45e4502b8bac8df2bf97751 
[mlrun] 2020-03-19 13:01:50,911 run executed, status=error
runtime error: Found input variables with inconsistent numbers of samples: [60000, 30000]


RunError: Found input variables with inconsistent numbers of samples: [60000, 30000]

## tests

In [35]:
# refactor: take a simple make_classification dataset...

In [33]:
outkeys = list(tf.outputs.keys())

assert outkeys == ['accuracy',
                   'avg_precscore',
                   'f1_score',
                   'rocauc',
                   'test_set',
                   'model',
                   'roc',
                   'history',
                   'confusion',
                   'featimp.png',
                   'featimp.csv']