# deploying yaml on optimized python images

## imports

In [1]:
import os

import mlrun
import numpy as np
import pandas as pd
# Point mlrun's configuration at the mlrun-api service; the run logs further
# down show runs being submitted to this same address.
mlrun.mlconf.dbpath = 'http://mlrun-api:8080'

## parameters

In [6]:
# --- location of the function specs (YAML files) ---
TARGET_CODE_BASE = '/User/repos/functions/'

# --- simulated data: passed to the generator as n_samples / m_features /
# --- weight, plus the random_state used by both runs ---
N_SAMPLES = 10_000_000
M_FEATURES = 20
NEG_WEIGHT = 0.5
RNG = 1

# --- where the generated data is written and the key it is logged under ---
TARGET_DATA_PATH = '/User/mlrun/sklearn-classifier'
FILE_NAME = 'simdata.pqt'
KEY = 'simdata'

# --- training: classifier class path, model key/name, verbosity flag ---
SKLEARN_CLASSIFIER = 'lightgbm.sklearn.LGBMClassifier'
MODEL_KEY = 'model'
MODEL_NAME = MODEL_KEY  # the model's name is deliberately the same as its key
VERBOSE = True

## generate some binary classification data

In [7]:
# Import the binary-classification data generator from its YAML function spec
# and apply mlrun.mount_v3io() to it.
# Each path component is passed to os.path.join separately so the resulting
# path is correct whether or not TARGET_CODE_BASE ends with a path separator.
binarydatagen = mlrun.import_function(
    os.path.join(TARGET_CODE_BASE, 'datagen', 'classification', 'binary.yaml')
).apply(mlrun.mount_v3io())

In [9]:
# Build the function's image; the log below reports a remote build of
# .mlrun/func-default-binary-latest. The cell displays deploy()'s return value.
binarydatagen.deploy()

[mlrun] 2020-01-22 22:34:26,684 starting remote build, image: .mlrun/func-default-binary-latest
[36mINFO[0m[0000] Resolved base name yjbds/mlrun-ds:latest to yjbds/mlrun-ds:latest 
[36mINFO[0m[0000] Resolved base name yjbds/mlrun-ds:latest to yjbds/mlrun-ds:latest 
[36mINFO[0m[0000] Downloading base image yjbds/mlrun-ds:latest 
[36mINFO[0m[0000] Error while retrieving image from cache: getting file info: stat /cache/sha256:e4dd2f2f98d45ea9b78e8776e998e0c5f4d19099676464c0dd486139d6f391dc: no such file or directory 
[36mINFO[0m[0000] Downloading base image yjbds/mlrun-ds:latest 
[36mINFO[0m[0000] Built cross stage deps: map[]                
[36mINFO[0m[0000] Downloading base image yjbds/mlrun-ds:latest 
[36mINFO[0m[0000] Error while retrieving image from cache: getting file info: stat /cache/sha256:e4dd2f2f98d45ea9b78e8776e998e0c5f4d19099676464c0dd486139d6f391dc: no such file or directory 
[36mINFO[0m[0000] Downloading base image yjbds/mlrun-ds:latest 
[36mINFO[0m[00

True

In [10]:
# Task template for the data-generation run.
task1 = mlrun.NewTask()

# Parameters handed to the generator, gathered in one mapping and then
# unpacked as keyword arguments.
datagen_params = {
    'n_samples': N_SAMPLES,
    'm_features': M_FEATURES,
    'weight': NEG_WEIGHT,
    'target_path': TARGET_DATA_PATH,
    'filename': FILE_NAME,
    'key': KEY,
    'random_state': RNG,
}
task1.with_params(**datagen_params)

<mlrun.model.RunTemplate at 0x7f2112e96828>

In [11]:
# Run the generator's 'create_binary_classification' handler with the
# parameters set on task1. The returned run object is kept as tsk1 so later
# cells can look up its outputs (e.g. tsk1.output(KEY)).
tsk1 = binarydatagen.run(task1, handler='create_binary_classification')

[mlrun] 2020-01-22 22:35:44,742 starting run create_binary_classification uid=9330db1734df40afabbaf41cd386930c  -> http://mlrun-api:8080
[mlrun] 2020-01-22 22:35:44,823 Job is running in the background, pod: create-binary-classification-gcwdk
[mlrun] 2020-01-22 22:36:38,971 log artifact simdata at /User/mlrun/sklearn-classifier/simdata.pqt, size: None, db: Y

[mlrun] 2020-01-22 22:36:39,218 run executed, status=completed
  result = infer_dtype(pandas_collection)
final state: succeeded


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...86930c,0,Jan 22 22:36:01,completed,binary,host=create-binary-classification-gcwdkkind=jobowner=admin,,filename=simdata.pqtkey=simdatam_features=20n_samples=10000000random_state=1target_path=/User/mlrun/sklearn-classifierweight=0.5,,simdata


to track results use .show() or .logs() or in CLI: 
!mlrun get run 9330db1734df40afabbaf41cd386930c  , !mlrun logs 9330db1734df40afabbaf41cd386930c 
[mlrun] 2020-01-22 22:36:45,628 run executed, status=completed


____
# tests

In [12]:
# Read the generated dataset back from the location the generator was asked
# to write to, so the following checks can inspect it.
# pandas is imported with the rest of the imports in the notebook's first cell.
df = pd.read_parquet(os.path.join(TARGET_DATA_PATH, FILE_NAME), engine='pyarrow')

In [13]:
# The run must report its artifact at exactly the path it was asked to write.
expected_path = os.path.join(TARGET_DATA_PATH, FILE_NAME)
assert tsk1.output(KEY) == expected_path, "binary.yaml failed to create a file"

# The loaded frame must have one row per sample and one more column than the
# number of features.
expected_shape = (N_SAMPLES, M_FEATURES + 1)
assert df.shape == expected_shape, "simulation data artifact is not of the correct dimensions"

_____
## train a classifier

In [14]:
# Import the sklearn-classifier training function from its YAML function spec
# and apply mlrun.mount_v3io() to it.
# Each path component is passed to os.path.join separately; joining a single
# pre-concatenated string does nothing and only produced a valid path because
# TARGET_CODE_BASE happens to end with a path separator.
trainfn = mlrun.import_function(
    os.path.join(TARGET_CODE_BASE, 'train', 'sklearn-classifier.yaml')
).apply(mlrun.mount_v3io())

In [15]:
# Build the training function's image; the log below reports a remote build of
# .mlrun/func-default-sklearn-classifier-latest. The cell displays deploy()'s
# return value.
trainfn.deploy()

[mlrun] 2020-01-22 22:36:49,836 starting remote build, image: .mlrun/func-default-sklearn-classifier-latest
[36mINFO[0m[0000] Resolved base name yjbds/mlrun-ds:latest to yjbds/mlrun-ds:latest 
[36mINFO[0m[0000] Resolved base name yjbds/mlrun-ds:latest to yjbds/mlrun-ds:latest 
[36mINFO[0m[0000] Downloading base image yjbds/mlrun-ds:latest 
[36mINFO[0m[0000] Error while retrieving image from cache: getting file info: stat /cache/sha256:e4dd2f2f98d45ea9b78e8776e998e0c5f4d19099676464c0dd486139d6f391dc: no such file or directory 
[36mINFO[0m[0000] Downloading base image yjbds/mlrun-ds:latest 
[36mINFO[0m[0000] Built cross stage deps: map[]                
[36mINFO[0m[0000] Downloading base image yjbds/mlrun-ds:latest 
[36mINFO[0m[0000] Error while retrieving image from cache: getting file info: stat /cache/sha256:e4dd2f2f98d45ea9b78e8776e998e0c5f4d19099676464c0dd486139d6f391dc: no such file or directory 
[36mINFO[0m[0000] Downloading base image yjbds/mlrun-ds:latest 
[36

True

In [16]:
# Task template for the training run.
task2 = mlrun.NewTask()

# Training parameters, gathered in one mapping and then unpacked as keyword
# arguments; the input file is the artifact reported by the generator run.
train_params = {
    'src_file': tsk1.output(KEY),
    'SKClassifier': SKLEARN_CLASSIFIER,
    'name': MODEL_NAME,
    'key': MODEL_KEY,
    'verbose': VERBOSE,
    'random_state': RNG,
    'callbacks': [],
}
task2.with_params(**train_params)

<mlrun.model.RunTemplate at 0x7f21604144e0>

In [17]:
# Run the training function's 'train' handler with task2's parameters. The
# run object is kept as tsk2 so its outputs can be inspected afterwards.
tsk2 = trainfn.run(task2, handler='train')

[mlrun] 2020-01-22 22:38:03,841 starting run train uid=fcb2e3cad46c42648f8e08b5a834dc49  -> http://mlrun-api:8080
[mlrun] 2020-01-22 22:38:03,933 Job is running in the background, pod: train-p29xk
This may cause significantly different results comparing to the previous versions of LightGBM.
Try to set boost_from_average=false, if your old models produce bad results
[LightGBM] [Info] Number of positive: 3375747, number of negative: 3374252
[LightGBM] [Info] Total Bins 5120
[LightGBM] [Info] Number of data: 6749999, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500111 -> initscore=0.000443
[LightGBM] [Info] Start training from score 0.000443
[1]	train's binary_logloss: 0.60911	valid's binary_logloss: 0.609096
[2]	train's binary_logloss: 0.540195	valid's binary_logloss: 0.540177
[3]	train's binary_logloss: 0.482774	valid's binary_logloss: 0.482749
[4]	train's binary_logloss: 0.434212	valid's binary_logloss: 0.434186
[5]	train's binary_logloss: 0.392909	vali

uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...34dc49,0,Jan 22 22:38:10,completed,sklearn-classifier,host=train-p29xkkind=jobowner=admin,,SKClassifier=lightgbm.sklearn.LGBMClassifiercallbacks=[]key=modelname=modelrandom_state=1src_file=/User/mlrun/sklearn-classifier/simdata.pqtverbose=True,train_accuracy=0.9671342173532174,modelxtestytest


to track results use .show() or .logs() or in CLI: 
!mlrun get run fcb2e3cad46c42648f8e08b5a834dc49  , !mlrun logs fcb2e3cad46c42648f8e08b5a834dc49 
[mlrun] 2020-01-22 22:39:32,871 run executed, status=completed


In [19]:
# Display what the training run recorded: its results and its artifact keys.
tsk2.outputs

{'train_accuracy': 0.9671342173532174,
 'model': 'model',
 'xtest': 'xtest.pkl',
 'ytest': 'ytest.pkl'}

## evaluation

**TODO:** run plots here

## model optimization

**TODO:** onnx here