# Train XGBoost Model With Hyper-Params Using Serverless Functions

In [1]:
# nuclio: ignore
import nuclio

In [None]:
%%nuclio cmd
pip install scikit-learn
pip install xgboost
pip install git+https://github.com/mlrun/mlrun.git@development

In [2]:
# Set the function spec's build base image (spec.build.baseImage) to a Python 3.6 image.
%nuclio config spec.build.baseImage = "python:3.6-jessie"

%nuclio: setting spec.build.baseImage to 'python:3.6-jessie'


In [3]:
import xgboost as xgb
import os
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score

# Module-level dataset cache: all three start empty and are filled in by
# load_dataset() the first time the training data is needed.
dtrain = None
dtest = None
Y_test = None

def load_dataset(random_state=42):
    """Load the iris dataset and cache an 80/20 train/test split in module globals.

    Fills the module-level ``dtrain`` and ``dtest`` (``xgb.DMatrix`` objects
    built from the train and test portions) and ``Y_test`` (the held-out
    labels), which ``xgb_train`` reads.

    :param random_state: seed forwarded to ``train_test_split``. A fixed seed
        gives the same split on every load, so accuracies reported by
        different hyper-parameter iterations (possibly evaluated by different
        workers) are measured on the same test set. Pass ``None`` to get the
        previous unseeded, non-reproducible split.
    """
    global dtrain, dtest, Y_test
    iris = load_iris()
    X, y = iris['data'], iris['target']
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state)
    dtrain = xgb.DMatrix(X_train, label=Y_train)
    dtest = xgb.DMatrix(X_test, label=Y_test)


def xgb_train(context, model_name='iris_v1.bst',
            max_depth=6,
            num_class=10,
            eta=0.2,
            gamma=0.1,
            steps=20):
    """Train an XGBoost multi-class model on the cached iris split and log the outcome.

    The dataset is loaded through ``load_dataset`` on the first call only and
    reused from the module globals afterwards.

    :param context:    run context; used for ``log_result`` and ``log_artifact``
    :param model_name: file path the trained booster is saved to
    :param max_depth:  ``max_depth`` booster parameter
    :param num_class:  ``num_class`` booster parameter
                       NOTE(review): the default of 10 is larger than the 3
                       iris classes; kept unchanged for compatibility
    :param eta:        ``eta`` booster parameter
    :param gamma:      ``gamma`` booster parameter
    :param steps:      number of boosting rounds passed to ``xgb.train``
    """
    global dtrain, dtest, Y_test

    if dtrain is None:
        load_dataset()

    # Booster parameters assembled from the function arguments
    param = {"max_depth": max_depth,
             "eta": eta, "nthread": 4,
             "num_class": num_class,
             "gamma": gamma,
             "objective": "multi:softprob"}

    # Train model
    xgb_model = xgb.train(param, dtrain, steps)

    # multi:softprob yields one row of class probabilities per sample;
    # the predicted class is the column with the highest probability.
    preds = xgb_model.predict(dtest)
    best_preds = np.argmax(preds, axis=1)

    context.log_result('accuracy', float(accuracy_score(Y_test, best_preds)))

    # Create the directory the model is actually written to (if the name
    # carries one) instead of an unrelated, unused 'models' directory.
    model_file = model_name
    model_dir = os.path.dirname(model_file)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    xgb_model.save_model(model_file)
    context.log_artifact('model', src_path=model_file, labels={'framework': 'xgboost'})

def dummy(context, x=5):
    """Return the string 'xxx ' followed by the formatted value of ``x``.

    ``context`` is accepted but not used.
    """
    template = 'xxx {}'
    return template.format(x)

## Import MLRun and run the training locally

In [1]:
# nuclio: end-code
# set mlrun db path (can also be specified in run_start command)
%env MLRUN_DBPATH=/User/mlrun
# mlrun client helpers used by the cells below to define and run the task
from mlrun import new_function, code_to_function, NewRun

env: MLRUN_DBPATH=/User/mlrun


### Define a Task with Hyper-Parameters (Grid Search)

In [6]:
# Hyper-parameter grid for testing the function with multiple parameter
# values: 4 eta x 6 max_depth x 4 gamma values.
parameters = dict(
    eta=[0.05, 0.10, 0.20, 0.30],
    max_depth=[3, 4, 5, 6, 8, 10],
    gamma=[0.0, 0.1, 0.2, 0.3],
)

# Task that runs xgb_train over the grid, using 'max.accuracy' as the selector.
task = (
    NewRun(handler=xgb_train, out_path='/User/mlrun/data')
    .with_hyper_params(parameters, 'max.accuracy')
)

In [7]:
%%timeit -n 1 -r 1
# Time a single pass (-n 1 -r 1) of the hyper-parameter task on a default local function.
# NOTE(review): names assigned inside a %%timeit cell are not kept in the notebook
# namespace, so `run` is not available to later cells - confirm this is intended.
run = new_function().run(task)

[mlrun] 2019-09-15 22:04:45,274 starting run None uid=8207961672e2481197020e588abf9cf3


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...bf9cf3,0,Sep 15 22:04:45,completed,,kind=handlerowner=iguazio,,,best_iteration=1accuracy=0.9666666666666667,modeliteration_results.csv


type result.show() to see detailed results/progress or use CLI:
!mlrun get run --uid 8207961672e2481197020e588abf9cf3 
[mlrun] 2019-09-15 22:05:07,906 run executed, status=completed
22.6 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [36]:
# List the per-iteration parameters and results of a run through the mlrun CLI.
# NOTE(review): this uid (247d7eb3...) differs from the uid of the run started above
# (82079616...); it appears to come from an earlier run - update it before re-running.
!mlrun get run --uid 247d7eb3ffed4f4da69aeb359ed53fe9 

    name    uid                                 iter  start            state      parameters                       results
--  ------  --------------------------------  ------  ---------------  ---------  -------------------------------  ---------------------------
 0          247d7eb3ffed4f4da69aeb359ed53fe9      96  Sep 15 21:54:44  completed  eta=0.3,max_depth=10,gamma=0.3   accuracy=0.9666666666666667
 1          247d7eb3ffed4f4da69aeb359ed53fe9      95  Sep 15 21:54:43  completed  eta=0.2,max_depth=10,gamma=0.3   accuracy=0.9666666666666667
 2          247d7eb3ffed4f4da69aeb359ed53fe9      94  Sep 15 21:54:43  completed  eta=0.1,max_depth=10,gamma=0.3   accuracy=0.9666666666666667
 3          247d7eb3ffed4f4da69aeb359ed53fe9      93  Sep 15 21:54:43  completed  eta=0.05,max_depth=10,gamma=0.3  accuracy=0.9666666666666667
 4          247d7eb3ffed4f4da69aeb359ed53fe9      92  Sep 15 21:54:43  completed  eta=0.3,max_depth=8,gamma=0.3    accuracy=0.9666666666666667
 5          247d7eb

## Deploy and Run Nuclio Function - configured for parallelism

In [8]:
# Build a nuclio function from the notebook code + annotations, then add a
# volume and an HTTP trigger configured with 32 workers for parallel requests.
fn = code_to_function('xgb_train', runtime='nuclio')
(
    fn
    .add_volume('User', '~/')
    .with_http(workers=32)
)

<mlrun.runtimes.function.RemoteRuntime at 0x7f6712b54828>

In [9]:
# deploy the function to the cluster; the call returns the deployed function's HTTP address
fn.deploy(kind='mlrun')

[nuclio.deploy] 2019-09-15 22:05:29,726 (info) Building processor image
[nuclio.deploy] 2019-09-15 22:06:51,368 (info) Build complete
[nuclio.deploy] 2019-09-15 22:06:58,571 done updating xgb-train, function address: 18.221.60.238:32759


'http://18.221.60.238:32759'

In [10]:
%%timeit -n 1 -r 1
# Time a single pass (-n 1 -r 1) of the same hyper-parameter task on the deployed function.
# NOTE(review): names assigned inside a %%timeit cell are not kept in the notebook
# namespace, so `nrun` is not available to later cells - confirm this is intended.
nrun = fn.run(task, handler='xgb_train')

[mlrun] 2019-09-15 22:07:43,151 starting run xgb_train uid=37314c602e7a4bf59185efe7f1100624


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...100624,0,Sep 15 22:07:43,completed,xgb_train,kind=mlrunowner=iguazio,,,best_iteration=2accuracy=1.0,modeliteration_results.csv


type result.show() to see detailed results/progress or use CLI:
!mlrun get run --uid 37314c602e7a4bf59185efe7f1100624 
[mlrun] 2019-09-15 22:07:45,710 run executed, status=completed
2.56 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


# Nuclio runs roughly 9x faster than local execution (2.56 s vs. 22.6 s in the runs above)