# deploying yaml on optimized python images

* one node
* lightgbm
* 10 million samples / 20 features
* code stored as yaml in github
* precompiled images using CPU-optimized Python libraries

## imports

In [1]:
import mlrun
import os
import numpy as np
# Point the MLRun client at the API service. An MLRUN_DBPATH environment
# variable (when set and non-empty) takes precedence, so the notebook can be
# run against another deployment without editing this cell; otherwise the
# original in-cluster address is used.
mlrun.mlconf.dbpath = os.environ.get('MLRUN_DBPATH') or 'http://mlrun-api:8080'

## parameters

In [2]:
# --- locations -------------------------------------------------------------
# Root of the local checkout holding the function YAML specs.
TARGET_CODE_BASE = '/User/repos/functions/'
# Directory and file name for the generated dataset.
TARGET_DATA_PATH = '/User/mlrun/sklearn-classifier'
FILE_NAME = 'simdata.pqt'
# Artifact key under which the dataset is logged.
KEY = 'simdata'

# --- simulated data --------------------------------------------------------
N_SAMPLES = 10_000_000  # size of HIGGS data
M_FEATURES = 20
NEG_WEIGHT = 0.5
RNG = 1  # passed as random_state to both the data generator and the trainer

# --- model -----------------------------------------------------------------
SKLEARN_CLASSIFIER = 'lightgbm.sklearn.LGBMClassifier'
MODEL_KEY = 'model'
MODEL_NAME = MODEL_KEY  # the model name deliberately mirrors its artifact key
VERBOSE = False

## generate some binary classification data

In [3]:
# Import the data-generation function from its YAML spec in the local
# functions repo, then apply the v3io mount modifier to it.
# Every path component goes through os.path.join (rather than being glued on
# with '+'), so the result no longer depends on TARGET_CODE_BASE ending in '/'.
binarydatagen = mlrun.import_function(
    os.path.join(TARGET_CODE_BASE, 'datagen', 'classification', 'binary.yaml')
).apply(mlrun.mount_v3io())

In [4]:
# Deploy step, intentionally left disabled -- presumably unnecessary because
# this notebook targets precompiled images (see the overview at the top).
# Uncomment the line below to run it.
# binarydatagen.deploy()

In [5]:
# Task template for the data-generation run.
task1 = mlrun.NewTask()

# Parameters handed to the generator, all taken from the parameters cell.
datagen_params = dict(
    n_samples=N_SAMPLES,
    m_features=M_FEATURES,
    weight=NEG_WEIGHT,
    target_path=TARGET_DATA_PATH,
    filename=FILE_NAME,
    key=KEY,
    random_state=RNG,
)
task1.with_params(**datagen_params)

<mlrun.model.RunTemplate at 0x7f49c0296710>

In [6]:
# Execute the data-generation task; the returned run object is queried
# further down for the location of the generated artifact.
tsk1 = binarydatagen.run(
    task1,
    handler='create_binary_classification',
)

[mlrun] 2020-01-22 22:48:03,974 starting run create_binary_classification uid=ad9df1228d034fd5a11d732502f64aa2  -> http://mlrun-api:8080
[mlrun] 2020-01-22 22:48:04,072 Job is running in the background, pod: create-binary-classification-bnqlx
[mlrun] 2020-01-22 22:48:53,079 log artifact simdata at /User/mlrun/sklearn-classifier/simdata.pqt, size: None, db: Y

[mlrun] 2020-01-22 22:48:53,341 run executed, status=completed
  result = infer_dtype(pandas_collection)
final state: succeeded


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...f64aa2,0,Jan 22 22:48:16,completed,binary,host=create-binary-classification-bnqlxkind=jobowner=admin,,filename=simdata.pqtkey=simdatam_features=20n_samples=10000000random_state=1target_path=/User/mlrun/sklearn-classifierweight=0.5,,simdata


to track results use .show() or .logs() or in CLI: 
!mlrun get run ad9df1228d034fd5a11d732502f64aa2  , !mlrun logs ad9df1228d034fd5a11d732502f64aa2 
[mlrun] 2020-01-22 22:48:56,442 run executed, status=completed


____
# tests

In [7]:
import pandas as pd

# Read the generated parquet file back so its dimensions can be checked.
simdata_path = os.path.join(TARGET_DATA_PATH, FILE_NAME)
df = pd.read_parquet(simdata_path, engine='pyarrow')

In [8]:
# The run must report its artifact at exactly the path that was requested.
expected_path = os.path.join(TARGET_DATA_PATH, FILE_NAME)
assert tsk1.output(KEY) == expected_path, "binary.yaml failed to create a file"

# One row per sample and one column more than the number of features
# (presumably the extra column holds the labels).
expected_shape = (N_SAMPLES, M_FEATURES + 1)
assert df.shape == expected_shape, "simulation data artifact is not of the correct dimensions"

_____
## train a classifier

In [9]:
# Import the training function from its YAML spec in the local functions
# repo, then apply the v3io mount modifier to it.
# The original passed a single pre-concatenated string to os.path.join, which
# is a no-op and only works while TARGET_CODE_BASE ends in '/'; joining the
# components makes the path independent of that trailing separator.
trainfn = mlrun.import_function(
    os.path.join(TARGET_CODE_BASE, 'train', 'sklearn-classifier.yaml')
).apply(mlrun.mount_v3io())

In [10]:
# Deploy step, intentionally left disabled -- presumably unnecessary because
# this notebook targets precompiled images (see the overview at the top).
# Uncomment the line below to run it.
# trainfn.deploy()

In [11]:
# Task template for the training run.
task2 = mlrun.NewTask()

# Training parameters: the input file is whatever the data-generation run
# reported for its artifact; the rest comes from the parameters cell.
train_params = dict(
    src_file=tsk1.output(KEY),
    SKClassifier=SKLEARN_CLASSIFIER,
    name=MODEL_NAME,
    key=MODEL_KEY,
    verbose=VERBOSE,
    random_state=RNG,
    callbacks=[],
)
task2.with_params(**train_params)

<mlrun.model.RunTemplate at 0x7f49b8b9b3c8>

In [12]:
# Execute the training task; the returned run object exposes the results
# and artifacts inspected in the next cell.
tsk2 = trainfn.run(
    task2,
    handler='train',
)

[mlrun] 2020-01-22 22:49:00,573 starting run train uid=902dec5bdd8a4d4baeb9333ac6d5e15e  -> http://mlrun-api:8080
[mlrun] 2020-01-22 22:49:00,663 Job is running in the background, pod: train-99tsp
This may cause significantly different results comparing to the previous versions of LightGBM.
Try to set boost_from_average=false, if your old models produce bad results
[mlrun] 2020-01-22 22:50:23,162 log artifact model at model, size: None, db: Y
[mlrun] 2020-01-22 22:50:23,333 log artifact xtest at xtest.pkl, size: None, db: Y
[mlrun] 2020-01-22 22:50:23,454 log artifact ytest at ytest.pkl, size: None, db: Y

[mlrun] 2020-01-22 22:50:23,466 run executed, status=completed
  labels = getattr(columns, 'labels', None) or [
  return pd.MultiIndex(levels=new_levels, labels=labels, names=columns.names)
  labels, = index.labels
final state: succeeded


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...d5e15e,0,Jan 22 22:49:07,completed,sklearn-classifier,host=train-99tspkind=jobowner=admin,,SKClassifier=lightgbm.sklearn.LGBMClassifiercallbacks=[]key=modelname=modelrandom_state=1src_file=/User/mlrun/sklearn-classifier/simdata.pqtverbose=False,train_accuracy=0.9671342173532174,modelxtestytest


to track results use .show() or .logs() or in CLI: 
!mlrun get run 902dec5bdd8a4d4baeb9333ac6d5e15e  , !mlrun logs 902dec5bdd8a4d4baeb9333ac6d5e15e 
[mlrun] 2020-01-22 22:50:29,828 run executed, status=completed


In [13]:
# Display what the training run produced: the train_accuracy result plus the
# logged model / xtest / ytest artifact locations.
tsk2.outputs

{'train_accuracy': 0.9671342173532174,
 'model': 'model',
 'xtest': 'xtest.pkl',
 'ytest': 'ytest.pkl'}

## evaluation

**TODO:** run evaluation plots here

## model optimization

**TODO:** ONNX conversion / model optimization here