# Train XGBoost Model With Hyper-Params Using Serverless Functions

In [1]:
# nuclio: ignore
import nuclio

In [None]:
%%nuclio cmd
pip install scikit-learn
pip install xgboost
pip install mlrun

In [2]:
# Set the base image used when the function image is built.
# NOTE(review): this tag pins Python 3.6 on Debian "jessie" — confirm it is still the intended/supported image.
%nuclio config spec.build.baseImage = "python:3.6-jessie"

%nuclio: setting spec.build.baseImage to 'python:3.6-jessie'


In [3]:
import xgboost as xgb
import os
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score

# Module-level cache for the train/test DMatrix objects and the test labels.
# All three start as None and are filled in by load_dataset() the first time xgb_train runs.
dtrain = dtest = Y_test = None

def load_dataset(test_size=0.2, random_state=42):
    """Load the Iris dataset, split it, and cache the result in module globals.

    Populates the module-level ``dtrain`` and ``dtest`` (``xgb.DMatrix``) and
    ``Y_test`` (labels of the held-out rows) so that ``xgb_train`` can reuse
    them across calls.

    The split is seeded by default. Without a seed every process that calls
    this function (e.g. each parallel worker of a deployed function) draws a
    different random test set, which makes the accuracies of different
    hyper-parameter iterations non-comparable and the run non-reproducible.

    :param test_size:    fraction of rows held out for testing (default 0.2)
    :param random_state: seed forwarded to ``train_test_split``; pass ``None``
                         to get a different random split on every call
    """
    global dtrain, dtest, Y_test
    iris = load_iris()
    features, labels = iris['data'], iris['target']
    X_train, X_test, Y_train, Y_test = train_test_split(
        features, labels, test_size=test_size, random_state=random_state)
    dtrain = xgb.DMatrix(X_train, label=Y_train)
    dtest = xgb.DMatrix(X_test, label=Y_test)


def xgb_train(context, model_name='model.bst',
            max_depth=6,
            num_class=10,
            eta=0.2,
            gamma=0.1,
            steps=20):
    """Train an XGBoost multi-class model, then log its accuracy and model file.

    Training and test data come from the module-level ``dtrain`` / ``dtest`` /
    ``Y_test`` cache, which is built via ``load_dataset()`` the first time this
    handler runs in a process.

    :param context:    run context; ``log_result`` and ``log_artifact`` are called on it
    :param model_name: path the trained booster is saved to
    :param max_depth:  value of the ``max_depth`` booster parameter
    :param num_class:  value of the ``num_class`` booster parameter
    :param eta:        value of the ``eta`` booster parameter
    :param gamma:      value of the ``gamma`` booster parameter
    :param steps:      third positional argument of ``xgb.train`` (number of boosting rounds)
    """
    # Build the cached datasets lazily, once per process.
    if dtrain is None:
        load_dataset()

    # Booster configuration assembled from the handler arguments.
    booster_params = dict(
        max_depth=max_depth,
        eta=eta,
        nthread=4,
        num_class=num_class,
        gamma=gamma,
        objective="multi:softprob",
    )

    booster = xgb.train(booster_params, dtrain, steps)

    # One row of per-class scores per test sample; the predicted label is the
    # index of the highest score in each row.
    class_scores = booster.predict(dtest)
    predicted_labels = np.argmax(class_scores, axis=1)

    # Report the hold-out accuracy as a run result.
    accuracy = float(accuracy_score(Y_test, predicted_labels))
    context.log_result('accuracy', accuracy)

    # NOTE(review): the 'models' directory is created here, but the booster is
    # written to `model_name` exactly as given, not underneath it — confirm intent.
    os.makedirs('models', exist_ok=True)
    booster.save_model(model_name)
    context.log_artifact('model', src_path=model_name, labels={'framework': 'xgboost'})

## Import MLRun and run the training locally

In [4]:
# nuclio: end-code
# NOTE(review): the marker above presumably ends the section exported as function code,
# leaving the cells below as notebook-only driver code — confirm against the nuclio-jupyter docs.
# set mlrun db path (use file or http option)
#%env MLRUN_DBPATH=http://mlrun-db:8080
%env MLRUN_DBPATH=/User/mlrun
from mlrun import new_function, code_to_function, NewTask

env: MLRUN_DBPATH=/User/mlrun


### Define a task with hyper-parameters (grid search)

In [5]:
# test our function locally with multiple parameters
# Each key matches a keyword argument of xgb_train; each list holds the candidate
# values for that argument (4 eta x 6 max_depth x 4 gamma values).
parameters = {
     "eta":       [0.05, 0.10, 0.20, 0.30],
     "max_depth": [3, 4, 5, 6, 8, 10],
     "gamma":     [0.0, 0.1, 0.2, 0.3],
     }

# Bind the handler and the output path into a task and attach the hyper-parameter lists.
# 'max.accuracy' is the selector string; presumably it picks the iteration with the
# highest logged 'accuracy' result — confirm against the mlrun docs.
task = NewTask(handler=xgb_train, out_path='/User/mlrun/data').with_hyper_params(parameters, 'max.accuracy')

In [6]:
%%timeit -n 1 -r 1
# Run the task once locally and time it (-n 1 -r 1 = one loop, one repeat).
# NOTE(review): names assigned inside a %%timeit cell may not persist in the notebook
# namespace; use %%time instead if `run` is needed in later cells — confirm.
run = new_function().run(task)

[mlrun] 2019-11-11 13:15:28,603 starting run mlrun-bc3f92 uid=7ae5206901934a869921c4660008d630  -> /User/mlrun


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...08d630,0,Nov 11 13:15:28,completed,mlrun-bc3f92,v3io_user=adminkind=handlerowner=iguazio,,,best_iteration=1accuracy=0.9333333333333333,modeliteration_results


type result.show() to see detailed results/progress or use CLI:
!mlrun get run --uid 7ae5206901934a869921c4660008d630 
[mlrun] 2019-11-11 13:15:42,699 run executed, status=completed
14.1 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


## Deploy and Run Nuclio Function - configured for parallelism

In [7]:
# create the function from the notebook code + annotations, add volumes and parallel HTTP trigger
# The function is named 'xgb_train' and uses the 'nuclio:mlrun' runtime.
fn = code_to_function('xgb_train', runtime='nuclio:mlrun')
# The HTTP trigger is configured with 32 workers.
# NOTE(review): 'User' / '~/' are passed to add_volume as-is; presumably they mount the
# user's home so that the /User/... paths used by the task resolve — verify.
fn.add_volume('User','~/').with_http(workers=32)

<mlrun.runtimes.function.RemoteRuntime at 0x7fc1ef4d0588>

In [8]:
# deploy the function to the cluster
# (the recorded log shows a processor image build; the cell evaluates to the function's HTTP address)
fn.deploy()

[mlrun] 2019-11-11 13:19:08,581 deploy started
[nuclio.deploy] 2019-11-11 13:19:09,670 (info) Building processor image
[nuclio.deploy] 2019-11-11 13:19:15,719 (info) Build complete
[nuclio.deploy] 2019-11-11 13:19:27,408 (info) Function deploy complete
[nuclio.deploy] 2019-11-11 13:19:27,414 done updating xgb-train, function address: 13.58.34.174:30356


'http://13.58.34.174:30356'

In [9]:
%%timeit -n 1 -r 1
# Run the same task once through the deployed function and time it (one loop, one repeat).
# NOTE(review): names assigned inside a %%timeit cell may not persist in the notebook
# namespace; use %%time instead if `nrun` is needed in later cells — confirm.
nrun = fn.run(task, handler='xgb_train')

[mlrun] 2019-11-11 13:19:33,777 starting run xgb-train uid=e292f1910e784de48d5a9f92433a50fa  -> /User/mlrun


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...3a50fa,0,Nov 11 13:19:33,completed,xgb-train,v3io_user=adminkind=remoteowner=iguazio,,,best_iteration=2accuracy=1.0,modeliteration_results


type result.show() to see detailed results/progress or use CLI:
!mlrun get run --uid e292f1910e784de48d5a9f92433a50fa 
[mlrun] 2019-11-11 13:19:41,202 run executed, status=completed
2.43 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
