# Train an XGBoost Model With Hyper-Parameters Using Serverless Functions

In [1]:
# nuclio: ignore
import nuclio

### Define function dependencies

In [None]:
%%nuclio cmd
pip install scikit-learn
pip install xgboost
pip install matplotlib
pip install mlrun

In [2]:
%nuclio config spec.build.baseImage = "python:3.6-jessie"

%nuclio: setting spec.build.baseImage to 'python:3.6-jessie'


### Function code

In [3]:
import xgboost as xgb
import os
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score
from mlrun.artifacts import TableArtifact, PlotArtifact
import pandas as pd


def iris_generator(context, target=''):
    """Load the scikit-learn iris data and log it as a table artifact.

    The feature columns (named after ``iris.feature_names``) and a ``label``
    column holding ``iris.target`` are joined into a single DataFrame, which
    is logged through ``context`` as a ``TableArtifact`` named ``iris_dataset``.

    :param context: run context; supplies ``logger`` and ``log_artifact``
    :param target:  value handed to the artifact as its ``target_path``
    """
    raw = load_iris()
    features = pd.DataFrame(data=raw.data, columns=raw.feature_names)
    labels = pd.DataFrame(data=raw.target, columns=['label'])
    table = pd.concat([features, labels], axis=1)

    context.logger.info('saving iris dataframe to {}'.format(target))
    artifact = TableArtifact('iris_dataset', df=table, target_path=target)
    context.log_artifact(artifact)
    

def xgb_train(context,
              dataset='',
              model_name='model.bst',
              max_depth=6,
              num_class=10,
              eta=0.2,
              gamma=0.1,
              steps=20,
              test_size=0.2,
              random_state=None):
    """Train a multi-class XGBoost model from a labelled CSV and log the results.

    The CSV is split into train and test parts, a booster is trained with the
    ``multi:softprob`` objective, and the test-set accuracy plus the raw model
    bytes are logged through ``context``.

    :param context:      run context; supplies ``log_result`` and ``log_artifact``
    :param dataset:      CSV source readable by ``pd.read_csv``; must contain a
                         ``label`` column, all other columns are used as features
    :param model_name:   target path under which the model artifact is stored
    :param max_depth:    XGBoost ``max_depth`` booster parameter
    :param num_class:    XGBoost ``num_class`` booster parameter
    :param eta:          XGBoost ``eta`` booster parameter
    :param gamma:        XGBoost ``gamma`` booster parameter
    :param steps:        number of boosting rounds passed to ``xgb.train``
    :param test_size:    share of the rows held out for evaluation
    :param random_state: seed for the train/test split; ``None`` (default) keeps
                         the split random, an int makes the reported accuracy
                         reproducible and comparable across hyper-parameter
                         iterations
    """
    df = pd.read_csv(dataset)
    X = df.drop(['label'], axis=1)
    y = df['label']

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    dtrain = xgb.DMatrix(X_train, label=Y_train)
    dtest = xgb.DMatrix(X_test, label=Y_test)

    # Booster parameters assembled from the handler arguments
    param = {"max_depth": max_depth,
             "eta": eta, "nthread": 4,
             "num_class": num_class,
             "gamma": gamma,
             "objective": "multi:softprob"}

    # Train model
    xgb_model = xgb.train(param, dtrain, steps)

    # multi:softprob yields one probability column per class, so the
    # predicted class of each row is the arg-max along axis 1
    preds = xgb_model.predict(dtest)
    best_preds = np.argmax(preds, axis=1)

    # log results and artifacts
    context.log_result('accuracy', float(accuracy_score(Y_test, best_preds)))
    context.log_artifact('model', body=bytes(xgb_model.save_raw()),
                         target_path=model_name, labels={'framework': 'xgboost'})
   
    
import matplotlib
import matplotlib.pyplot as plt
from io import BytesIO

def plot_iter(context, iterations, col='accuracy', num_bins=10):
    """Log a normalized histogram of one result column across iterations.

    :param context:    run context; supplies ``log_artifact``
    :param iterations: input item whose ``get()`` returns the CSV content
                       (bytes) of the per-iteration results table
    :param col:        result name; the column plotted is ``output.<col>``
    :param num_bins:   number of histogram bins
    """
    df = pd.read_csv(BytesIO(iterations.get()))
    values = df['output.{}'.format(col)]
    fig, ax = plt.subplots(figsize=(6, 6))
    # density=True normalizes the bars to a probability density, so the
    # y axis shows density rather than raw counts
    ax.hist(values, num_bins, density=True)
    # label the x axis with the column actually plotted
    ax.set_xlabel(col)
    ax.set_ylabel('Density')
    context.log_artifact(PlotArtifact('myfig', body=fig))

In [4]:
# nuclio: end-code
# marks the end of a code section

## Import MLRun, and run the data collection and training locally

In [5]:
from mlrun import new_function, code_to_function, NewTask, mount_v3io, new_model_server, mlconf, get_run_db
# location of the MLRun run DB that runs are recorded in (the run logs show it as "-> <dbpath>");
# for local DB path use 'User/mlrun' instead
mlconf.dbpath = 'http://mlrun-db:8080'

### Generate the iris dataset and store it in a CSV file

In [6]:
# target path of the generated iris CSV; reused as the training input further down
df_path = '/User/mlrun/df.csv'
# run iris_generator as a local handler, passing df_path as its 'target' parameter
gen = new_function().run(name='iris_gen', handler=iris_generator, params={'target': df_path})

[mlrun] 2019-11-18 12:02:55,255 starting run iris_gen uid=d04439c48b5d4708977cfa5b53ea2996  -> http://mlrun-db:8080
[mlrun] 2019-11-18 12:02:55,384 saving iris dataframe to /User/mlrun/df.csv



uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...ea2996,0,Nov 18 12:02:55,completed,iris_gen,kind=handlerowner=adminhost=jupyter-1hxq4i7w8a-ii3h9-65659db544-5zp95,,target=/User/mlrun/df.csv,,iris_dataset


type result.show() to see detailed results/progress or use CLI:
!mlrun get run --uid d04439c48b5d4708977cfa5b53ea2996 
[mlrun] 2019-11-18 12:02:55,448 run executed, status=completed


### Define a training task with hyper-parameters (grid search) and run it locally

In [22]:
# create a task and test our function locally with multiple parameters:
# every listed value of each parameter is a candidate for the run
parameters = {
    "eta": [0.05, 0.10, 0.20],
    "max_depth": [3, 4, 6, 8, 10],
    "gamma": [0.0, 0.1, 0.3],
}

# the task runs xgb_train on the generated CSV; the grid is attached with
# 'max.accuracy' as the selector
task = NewTask(
    handler=xgb_train,
    out_path='/User/mlrun/data',
    inputs={'dataset': df_path},
)
task = task.with_hyper_params(parameters, 'max.accuracy')

In [10]:
# execute the hyper-parameter task locally through a default function object
run = new_function().run(task)

[mlrun] 2019-11-18 12:03:10,209 starting run xgb_train uid=141cedd49d074792934da530076ddc63  -> http://mlrun-db:8080


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...6ddc63,0,Nov 18 12:03:10,completed,xgb_train,kind=handlerowner=admin,dataset,,best_iteration=4accuracy=1.0,modeliteration_results


type result.show() to see detailed results/progress or use CLI:
!mlrun get run --uid 141cedd49d074792934da530076ddc63 
[mlrun] 2019-11-18 12:03:14,014 run executed, status=completed


## Deploy XGB function to Nuclio (with parallelism), and run remotely

In [24]:
# create the function from the notebook code + annotations, add volumes and parallel HTTP trigger
# (the HTTP trigger is configured with 16 workers)
xgbfn = code_to_function('xgb', runtime='nuclio:mlrun')
xgbfn.add_volume('User','~/').with_http(workers=16).with_v3io()

<mlrun.runtimes.function.RemoteRuntime at 0x7f3c58fb62b0>

In [25]:
# deploy the function to the cluster
xgbfn.deploy(project='iris')

[mlrun] 2019-11-18 12:14:05,432 deploy started
[nuclio.deploy] 2019-11-18 12:14:06,506 (info) Building processor image
[nuclio.deploy] 2019-11-18 12:14:11,547 (info) Build complete
[nuclio.deploy] 2019-11-18 12:14:20,010 (info) Function deploy complete
[nuclio.deploy] 2019-11-18 12:14:20,022 done updating xgb, function address: 3.133.112.240:32290


'http://3.133.112.240:32290'

In [23]:
# run the same hyper-parameter task on the deployed function, invoking its xgb_train handler
nrun = xgbfn.run(task, handler='xgb_train')

[mlrun] 2019-11-18 12:11:57,449 starting run xgb_train uid=42e9c80152664a948f62e7a0b5fbb67a  -> http://mlrun-db:8080


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...fbb67a,0,Nov 18 12:11:57,completed,xgb_train,kind=remoteowner=admin,dataset,,best_iteration=1accuracy=1.0,modeliteration_results


type result.show() to see detailed results/progress or use CLI:
!mlrun get run --uid 42e9c80152664a948f62e7a0b5fbb67a 
[mlrun] 2019-11-18 12:11:57,628 run executed, status=completed


## Create a multi-stage KubeFlow Pipeline from our functions
* Load Iris dataset into a CSV
* Train a model using XGBoost with hyper-parameters
* Deploy the model using Nuclio-serving
* Generate a plot of the training results

In [14]:
import kfp
from kfp import dsl

In [15]:
# root path for pipeline artifacts (passed as out_path to the pipeline steps);
# NOTE(review): {{workflow.uid}} looks like a workflow-engine template placeholder
# that is filled in per pipeline run - confirm
artifacts_path = 'v3io:///users/admin/mlrun/kfp/{{workflow.uid}}/'

In [27]:
@dsl.pipeline(
    name='My XGBoost training pipeline',
    description='Shows how to use mlrun.'
)
def xgb_pipeline(
   eta = [0.1, 0.2, 0.3], gamma = [0.0, 0.1, 0.2, 0.3]
):
    """Pipeline: ingest iris -> train with hyper-params -> plot results and deploy the model.

    :param eta:   list of ``eta`` values tried by the training step
    :param gamma: list of ``gamma`` values tried by the training step
    """

    # step 1: generate the iris dataset
    ingest = xgbfn.as_step(name='ingest_iris', handler='iris_generator',
                          params = {'target': df_path},
                          outputs=['iris_dataset'], out_path=artifacts_path).apply(mount_v3io())

    # step 2: train over the eta/gamma values, using 'max.accuracy' as the selector
    train = xgbfn.as_step(name='xgb_train', handler='xgb_train',
                          hyperparams = {'eta': eta, 'gamma': gamma},
                          selector='max.accuracy',
                          inputs = {'dataset': ingest.outputs['iris_dataset']}, 
                          outputs=['model'], out_path=artifacts_path).apply(mount_v3io())

    # step 3: plot the per-iteration results; plot_iter logs its figure as the
    # 'myfig' artifact, so that is the output declared for this step
    plot = xgbfn.as_step(name='plot', handler='plot_iter',
                         inputs={'iterations': train.outputs['iteration_results']},
                         outputs=['myfig'], out_path=artifacts_path).apply(mount_v3io())


    # define a nuclio-serving function, generated from a notebook file
    srvfn = new_model_server('iris-serving', model_class='XGBoostModel', filename='nuclio_serving.ipynb')
    
    # deploy the model serving function with inputs from the training stage
    deploy = srvfn.with_v3io('User','~/').deploy_step(project = 'iris', models={'iris_v1': train.outputs['model']})

### Create a KubeFlow client and submit the pipeline with parameters

In [28]:
# for debug generate the pipeline dsl
#kfp.compiler.Compiler().compile(xgb_pipeline, 'mlrunpipe.yaml')

In [29]:
# KubeFlow Pipelines client for the 'default-tenant' namespace
client = kfp.Client(namespace='default-tenant')
# values for the xgb_pipeline parameters (eta, gamma) used in this run
arguments = {'eta': [0.05, 0.10, 0.30], 'gamma': [0.0, 0.1, 0.2, 0.3]}
# submit xgb_pipeline as run 'xgb 1' under the 'xgb' experiment
run_result = client.create_run_from_pipeline_func(xgb_pipeline, arguments, run_name='xgb 1', experiment_name='xgb')

In [30]:
# connect to the run db 
db = get_run_db().connect()

In [None]:
# query the DB with filter on workflow ID (only show this workflow) 
db.list_runs('', labels=f'workflow={run_result.run_id}').show()

In [9]:
# use this to suppress XGB FutureWarning
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)