# ML Pipeline example - XGBoost Training

In [1]:
# nuclio: ignore
# if the nuclio-jupyter package is not installed run !pip install nuclio-jupyter
import nuclio 

### Install and register package dependencies and build commands
These will be converted to container build instructions

In [None]:
%%nuclio cmd 
pip install sklearn
pip install xgboost

In [3]:
# base image used for the function's container build
%nuclio config spec.build.baseImage = "python:3.6-jessie"
#%nuclio config spec.image = ".mlrun/xgb:latest"

%nuclio: setting spec.build.baseImage to 'python:3.6-jessie'


## ML Training code

In [2]:
import xgboost as xgb
import os
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score

# Module-level cache of the prepared data; filled in by load_dataset() the
# first time xgb_train() finds it empty.
dtrain = dtest = Y_test = None

def load_dataset(test_size=0.2, random_state=None):
    """Load the iris dataset, split it, and store the result in module globals.

    Side effects: sets the globals ``dtrain`` and ``dtest`` (``xgb.DMatrix``
    objects built from the train/test features and labels) and ``Y_test``
    (the held-out labels, used later for scoring).

    :param test_size:    fraction of samples held out for testing
                         (passed to ``train_test_split``), default 0.2
    :param random_state: seed for the train/test split; the default ``None``
                         keeps the original unseeded (non-reproducible) split,
                         pass an int to get the same split on every run
    """
    global dtrain, dtest, Y_test
    iris = load_iris()
    X, y = iris['data'], iris['target']
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    dtrain = xgb.DMatrix(X_train, label=Y_train)
    dtest = xgb.DMatrix(X_test, label=Y_test)


def xgb_train(context, model_name='iris_v1.bst',
            max_depth=6,
            num_class=10,
            eta=0.2,
            gamma=0.1,
            steps=20):
    """Train an XGBoost multi-class model and log its accuracy and model file.

    The dataset is loaded lazily through ``load_dataset()`` the first time the
    handler runs and is then reused from the module-level globals.

    :param context:    run context; used here for ``log_result`` and
                       ``log_artifact``
    :param model_name: path of the file the trained model is saved to
    :param max_depth:  XGBoost ``max_depth`` parameter
    :param num_class:  XGBoost ``num_class`` parameter
                       NOTE(review): the iris data has 3 classes; the default
                       of 10 is kept for interface compatibility - confirm
    :param eta:        XGBoost ``eta`` parameter
    :param gamma:      XGBoost ``gamma`` parameter
    :param steps:      number of boosting rounds passed to ``xgb.train``
    """
    global dtrain, dtest, Y_test

    if dtrain is None:
        load_dataset()

    # Build the XGBoost parameter dict from the handler arguments
    param = {"max_depth": max_depth,
             "eta": eta, "nthread": 4,
             "num_class": num_class,
             "gamma": gamma,
             "objective": "multi:softprob"}

    # Train model
    xgb_model = xgb.train(param, dtrain, steps)

    # multi:softprob yields one row of class probabilities per sample;
    # the predicted class is the index of the largest probability in each row
    preds = xgb_model.predict(dtest)
    best_preds = np.argmax(preds, axis=1)

    context.log_result('accuracy', float(accuracy_score(Y_test, best_preds)))

    # Make sure the directory the model is written to exists (only needed when
    # model_name contains a directory component)
    model_file = model_name
    model_dir = os.path.dirname(model_file)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    xgb_model.save_model(model_file)
    context.log_artifact('model', src_path=model_file, labels={'framework': 'xgboost'})

def dummy(context, x=5):
    """Return ``x`` formatted after the prefix ``'xxx '``; ``context`` is not used."""
    template = 'xxx {}'
    return template.format(x)

In [3]:
# nuclio: end-code
# (end-code marker tells nuclio to stop parsing the notebook from this cell)

# set the mlrun db path through the MLRUN_DBPATH environment variable
# (can also be specified in the run_start command)
%env MLRUN_DBPATH=/User/mlrun
# optional: uncomment to set MLRUN_PACKAGE_PATH to the mlrun development branch
#%env MLRUN_PACKAGE_PATH=git+https://github.com/mlrun/mlrun.git@development

env: MLRUN_DBPATH=/User/mlrun
env: MLRUN_PACKAGE_PATH=git+https://github.com/mlrun/mlrun.git@development


In [4]:
from mlrun import new_function, code_to_function, NewRun, RunTemplate, get_run_db
from mlrun.platforms import mount_v3io, v3io_cred
import kfp
from kfp import dsl

## Test the code locally 

In [5]:
# run the handler locally over three eta values; selector='max.accuracy'
# asks for the iteration with the highest logged accuracy
task = (
    NewRun(handler=xgb_train, out_path='/User/mlrun/data')
    .with_hyper_params({'eta': [0.1, 0.2, 0.3]}, selector='max.accuracy')
)
run = new_function().run(task)

[mlrun] 2019-09-15 22:29:45,313 starting run None uid=8538d82ad98d4f478c424d327bc06510


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...c06510,0,Sep 15 22:29:45,completed,,kind=handlerowner=iguazio,,,best_iteration=1accuracy=0.9666666666666667,modeliteration_results.csv


type result.show() to see detailed results/progress or use CLI:
!mlrun get run --uid 8538d82ad98d4f478c424d327bc06510 
[mlrun] 2019-09-15 22:29:46,190 run executed, status=completed


## Create a containerized function from the notebook code
We create a function object which defines the code, metadata, execution and build instructions <br>

Later on we build the image (so we don't have to repeat this on every run)

In [6]:
# create a function object named 'training' from this notebook's code
# (the iguazio data fabric (v3io) mount is applied later, on the pipeline step)
fn = code_to_function('training')

In [7]:
# build the function's container image once (tagged mlrun/xgb:latest) so it does not have to be rebuilt on every run
fn.build(image='mlrun/xgb:latest')

[mlrun] 2019-09-15 22:29:50,692 building image (mlrun/xgb:latest)
FROM python:3.6-jessie
WORKDIR /run
RUN pip install sklearn
RUN pip install xgboost
RUN pip install git+https://github.com/mlrun/mlrun.git@development
ENV PYTHONPATH /run
[mlrun] 2019-09-15 22:29:50,694 using in-cluster config.
[mlrun] 2019-09-15 22:29:50,710 Pod mlrun-build-brtpk created
..
[36mINFO[0m[0000] Downloading base image python:3.6-jessie     
2019/09/15 22:29:52 No matching credentials were found, falling back on anonymous
[36mINFO[0m[0000] Unpacking rootfs as cmd RUN pip install sklearn requires it. 
[36mINFO[0m[0010] Taking snapshot of full filesystem...        
[36mINFO[0m[0014] Skipping paths under /kaniko, as it is a whitelisted directory 
[36mINFO[0m[0014] Skipping paths under /empty, as it is a whitelisted directory 
[36mINFO[0m[0014] Skipping paths under /var/run, as it is a whitelisted directory 
[36mINFO[0m[0015] Skipping paths under /dev, as it is a whitelisted directory 
[36mINFO[0

<mlrun.runtimes.local.LocalRuntime at 0x7f8d576e9320>

## Create and run the pipeline

In [8]:
# base path for this example; reused below as the run db path
this_path = '/User/mlrun'
db_path = this_path
# output location for pipeline artifacts
# NOTE(review): {{workflow.uid}} looks like a template placeholder filled in by the workflow engine at run time - confirm
artifacts_path = 'v3io:///bigdata/mlrun/{{workflow.uid}}/'

In [9]:
@dsl.pipeline(
    name='My XGBoost training pipeline',
    description='Shows how to use mlrun.'
)
def xgb_pipeline(eta=[0.1, 0.2, 0.3], gamma=0.2):
    # Single-step pipeline: run the 'xgb_train' handler once per eta value
    # with a fixed gamma, writing outputs under artifacts_path.
    train_task = (
        NewRun(handler='xgb_train', out_path=artifacts_path, outputs=['model'])
        .with_hyper_params({'eta': eta}, selector='max.accuracy')
        .with_params(gamma=gamma)
    )

    # Turn the function (with its code embedded) into a pipeline step and
    # apply the v3io mount to it.
    step_fn = fn.with_code()
    train_step = step_fn.to_step(train_task).apply(mount_v3io())
    

### Create a Kubeflow Pipelines client and submit the pipeline with parameters

In [10]:
# for debug generate the pipeline dsl
#kfp.compiler.Compiler().compile(xgb_pipeline, 'mlrunpipe.yaml')

In [11]:
# create a client for the 'default-tenant' namespace and submit the pipeline,
# overriding the default eta values
client = kfp.Client(namespace='default-tenant')
arguments = {'eta': [0.1, 0.2, 0.4]}
run_result = client.create_run_from_pipeline_func(
    xgb_pipeline,
    arguments,
    run_name='xgb 1',
    experiment_name='xgb',
)

### See the run status and results in the run database

In [12]:
# connect to the run db located at db_path
db = get_run_db(db_path).connect()

In [13]:
# query the DB, filtering on this workflow's ID so only its runs are shown
workflow_label = f'workflow={run_result.run_id}'
workflow_runs = db.list_runs('', labels=workflow_label)
workflow_runs.show()

uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...39ea2a,3,Sep 15 22:32:13,completed,training,workflow=557efa7f-9632-458e-b4fa-3199e0b0098ekind=localowner=roothost=my-xgboost-training-pipeline-62nz8-308038855,,gamma=0.2eta=0.4,accuracy=0.9333333333333333,model
...39ea2a,2,Sep 15 22:32:13,completed,training,workflow=557efa7f-9632-458e-b4fa-3199e0b0098ekind=localowner=roothost=my-xgboost-training-pipeline-62nz8-308038855,,gamma=0.2eta=0.2,accuracy=0.9666666666666667,model
...39ea2a,1,Sep 15 22:32:13,completed,training,workflow=557efa7f-9632-458e-b4fa-3199e0b0098ekind=localowner=roothost=my-xgboost-training-pipeline-62nz8-308038855,,gamma=0.2eta=0.1,accuracy=1.0,model
...39ea2a,0,Sep 15 22:32:12,completed,training,workflow=557efa7f-9632-458e-b4fa-3199e0b0098ekind=localowner=root,,gamma=0.2,best_iteration=1accuracy=1.0,modeliteration_results.csv
