# Train an XGBoost Model with Hyperparameters Using Serverless Functions

In [1]:
# nuclio: ignore
import nuclio

In [None]:
%%nuclio cmd
pip install scikit-learn
pip install xgboost
pip install mlrun


In [3]:
# Record the base image for the function build in spec.build.baseImage.
%nuclio config spec.build.baseImage = "python:3.6-jessie"

%nuclio: setting spec.build.baseImage to 'python:3.6-jessie'


In [2]:
import xgboost as xgb
import os
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score

# Module-level cache for the train/test DMatrix objects and the test labels.
# They start as None and are filled in by load_dataset(), which xgb_train()
# calls only while dtrain is still None.
dtrain = dtest = Y_test = None

def load_dataset(test_size=0.2, random_state=None):
    """Load the Iris dataset and populate the module-level train/test globals.

    Splits the data into train and test partitions and stores them in the
    globals ``dtrain`` and ``dtest`` (as ``xgb.DMatrix`` objects carrying their
    labels) and ``Y_test`` (the held-out labels, used later for scoring).

    Args:
        test_size: Passed to ``train_test_split``; defaults to 0.2, the value
            that was previously hard-coded.
        random_state: Passed to ``train_test_split``. The default ``None``
            keeps the original unseeded behaviour; pass an int to get the same
            split on every call.

    Returns:
        None. Results are published through the module-level globals.
    """
    global dtrain, dtest, Y_test
    iris = load_iris()
    features, labels = iris['data'], iris['target']
    X_train, X_test, Y_train, Y_test = train_test_split(
        features, labels, test_size=test_size, random_state=random_state)
    dtrain = xgb.DMatrix(X_train, label=Y_train)
    dtest = xgb.DMatrix(X_test, label=Y_test)


def xgb_train(context, model_name='model.bst',
            max_depth=6,
            num_class=10,
            eta=0.2,
            gamma=0.1,
            steps=20):
    """Train an XGBoost multi-class model and log its accuracy and model file.

    Args:
        context: Run context object; only ``log_result`` and ``log_artifact``
            are called on it.
        model_name: Path the trained booster is saved to. If it contains a
            directory component, that directory is created when missing.
        max_depth: Forwarded as the ``max_depth`` training parameter.
        num_class: Forwarded as the ``num_class`` training parameter.
            NOTE(review): the dataset loaded by ``load_dataset`` is Iris
            (3 classes), so the default of 10 looks larger than needed.
        eta: Forwarded as the ``eta`` training parameter.
        gamma: Forwarded as the ``gamma`` training parameter.
        steps: Third positional argument to ``xgb.train`` (number of boosting
            rounds).

    Returns:
        None. The accuracy is logged under ``'accuracy'`` and the saved model
        file is logged as the ``'model'`` artifact.
    """
    global dtrain, dtest, Y_test

    # Load the dataset only once; later calls in the same process reuse it.
    if dtrain is None:
        load_dataset()

    # Assemble the booster parameters from the function arguments.
    param = {"max_depth": max_depth,
             "eta": eta, "nthread": 4,
             "num_class": num_class,
             "gamma": gamma,
             "objective": "multi:softprob"}

    # Train model
    xgb_model = xgb.train(param, dtrain, steps)

    # Each prediction row holds per-class probabilities; the predicted label
    # is the index of the largest one.
    preds = xgb_model.predict(dtest)
    best_preds = np.argmax(preds, axis=1)

    # log results and artifacts
    context.log_result('accuracy', float(accuracy_score(Y_test, best_preds)))

    # Create the directory that will actually hold the model file (if any),
    # rather than an unrelated hard-coded one.
    # NOTE(review): every call with the same model_name writes to the same
    # path; confirm this is acceptable when iterations run concurrently.
    model_file = model_name
    model_dir = os.path.dirname(model_file)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    xgb_model.save_model(model_file)
    context.log_artifact('model', src_path=model_file, labels={'framework': 'xgboost'})

## Import MLRun and run the training locally

In [3]:
# nuclio: end-code
# NOTE(review): the annotation above presumably marks the end of the code that
# is packaged into the function; the cells below drive it from the notebook.
# Set the MLRun DB path through the MLRUN_DBPATH environment variable
# (can also be specified in run_start command).
%env MLRUN_DBPATH=/User/mlrun
from mlrun import new_function, code_to_function, NewTask

env: MLRUN_DBPATH=/User/mlrun


### Define a task with hyperparameters (grid search)

In [4]:
# Candidate values for each tunable argument of xgb_train; used to test the
# function locally with multiple parameter combinations.
parameters = dict(
    eta=[0.05, 0.10, 0.20, 0.30],
    max_depth=[3, 4, 5, 6, 8, 10],
    gamma=[0.0, 0.1, 0.2, 0.3],
)

# Build the task around the local handler first, then attach the grid.
# 'max.accuracy' is presumably the selector for the best iteration (highest
# logged 'accuracy') - confirm against the MLRun docs.
task = NewTask(handler=xgb_train, out_path='/User/mlrun/data')
task = task.with_hyper_params(parameters, 'max.accuracy')

In [7]:
%%timeit -n 1 -r 1
run = new_function().run(task)

[mlrun] 2019-09-24 12:48:25,755 starting run None uid=4d5e8c02e7d04b3eb46094fe8f912331


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...912331,0,Sep 24 12:48:25,completed,,kind=handlerowner=iguazio,,,best_iteration=8accuracy=0.9666666666666667,modeliteration_results


type result.show() to see detailed results/progress or use CLI:
!mlrun get run --uid 4d5e8c02e7d04b3eb46094fe8f912331 
[mlrun] 2019-09-24 12:48:48,871 run executed, status=completed
23.1 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


## Deploy and run a Nuclio function configured for parallelism

In [9]:
# create the function from the notebook code + annotations, add volumes and parallel HTTP trigger
fn = code_to_function('xgb_train', runtime='nuclio')
# Attach a volume and an HTTP trigger with 32 workers. The chained call is the
# cell's last expression, so its return value is displayed as the output.
# NOTE(review): the meaning of the two add_volume arguments ('User', '~/') is
# not visible here - confirm against the MLRun API.
fn.add_volume('User','~/').with_http(workers=32)

<mlrun.runtimes.function.RemoteRuntime at 0x7fa9b64c9080>

In [10]:
# deploy the function to the cluster
# The return value (the function's HTTP address in the recorded output) is
# displayed as the cell result.
fn.deploy(kind='mlrun')

[nuclio.deploy] 2019-09-24 12:50:02,485 (info) Building processor image
[nuclio.deploy] 2019-09-24 12:50:07,524 (info) Build complete
[nuclio.deploy] 2019-09-24 12:50:15,142 (info) Function deploy complete
[nuclio.deploy] 2019-09-24 12:50:15,147 done updating xgb-train, function address: 3.15.147.113:32237


'http://3.15.147.113:32237'

In [11]:
%%timeit -n 1 -r 1
nrun = fn.run(task, handler='xgb_train')

[mlrun] 2019-09-24 12:50:28,679 starting run xgb_train uid=ef376139a2cd44f29a988c817290f791


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...90f791,0,Sep 24 12:50:28,completed,xgb_train,kind=remoteowner=iguazio,,,best_iteration=1accuracy=1.0,modeliteration_results


type result.show() to see detailed results/progress or use CLI:
!mlrun get run --uid ef376139a2cd44f29a988c817290f791 
[mlrun] 2019-09-24 12:50:31,293 run executed, status=completed
2.61 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


# Nuclio ran about 9x faster than local execution (2.61 s vs. 23.1 s in the runs above)