# serverless lightgbm

In [None]:
# locations: function specs are loaded from CODE_BASE, artifacts go under TARGET_PATH
CODE_BASE = '/User/repos/functions/'
TARGET_PATH = '/User/mlrun/models'

# acquire params
ARCHIVE_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz"
FILE_NAME = 'higgs.pqt'
FILE_PATH = '/'.join((TARGET_PATH, FILE_NAME))
RAW_FILE_KEY = 'raw'
CHUNK_SIZE = 10_000
# column names passed to the acquire step; the label column comes first
HEADER = [
    'labels',
    'lepton_pT', 'lepton_eta', 'lepton_phi',
    'missing_energy_magnitude', 'missing_energy_phi',
    'jet_1_pt', 'jet_1_eta', 'jet_1_phi', 'jet_1_b-tag',
    'jet_2_pt', 'jet_2_eta', 'jet_2_phi', 'jet_2_b-tag',
    'jet_3_pt', 'jet_3_eta', 'jet_3_phi', 'jet_3_b-tag',
    'jet_4_pt', 'jet_4_eta', 'jet_4_phi', 'jet_4_b-tag',
    'm_jj', 'm_jjj', 'm_lv', 'm_jlv', 'm_bb', 'm_wbb', 'm_wwbb',
]

# split params
RNG = 1

# model params
SKLEARN_CLASSIFIER = 'lightgbm.sklearn.LGBMClassifier'
MODEL_FILE = 'lgb-classifier.pkl'
MODEL_KEY = 'model'
VERBOSE = False

# evaluation params
XTEST_FILE = TARGET_PATH + '/xtest.pqt'
YTEST_FILE = TARGET_PATH + '/ytest.pqt'

In [2]:
import mlrun
# Keep an MLRun API address that is already configured (e.g. via the
# MLRUN_DBPATH environment variable) and only fall back to the in-cluster
# service URL when none is set, instead of always overriding it.
mlrun.mlconf.dbpath = mlrun.mlconf.dbpath or 'http://mlrun-api:8080'

In [3]:
# Load the archive-to-parquet function from its YAML spec, then apply the v3io mount.
acquire_job = mlrun.import_function(CODE_BASE + 'fileutils/arc_to_parquet/arc_to_parquet.yaml')
acquire_job = acquire_job.apply(mlrun.mount_v3io())

# Last expression of the cell: the value returned by deploy() is shown as the cell output.
acquire_job.deploy(
    with_mlrun=False,
    skip_deployed=True,
)

'ready'

In [4]:
# Load the train/valid/test splitter function from its YAML spec, then apply the v3io mount.
split_job = mlrun.import_function(CODE_BASE + 'datagen/splitters/train_valid_test.yaml')
split_job = split_job.apply(mlrun.mount_v3io())

# Last expression of the cell: the value returned by deploy() is shown as the cell output.
split_job.deploy(
    with_mlrun=False,
    skip_deployed=True,
)

'ready'

In [5]:
# Load the sklearn-classifier training function from its YAML spec, then apply the v3io mount.
train_job = mlrun.import_function(CODE_BASE + 'train/sklearn-classifier.yaml')
train_job = train_job.apply(mlrun.mount_v3io())

# Last expression of the cell: the value returned by deploy() is shown as the cell output.
train_job.deploy(
    with_mlrun=False,
    skip_deployed=True,
)

'ready'

In [6]:
# Load the classifier evaluation function from its YAML spec, then apply the v3io mount.
test_job = mlrun.import_function(CODE_BASE + 'evaluation/test-classifier.yaml')
test_job = test_job.apply(mlrun.mount_v3io())

# Last expression of the cell: the value returned by deploy() is shown as the cell output.
test_job.deploy(
    with_mlrun=False,
    skip_deployed=True,
)

'ready'

<a id="pipeline"></a>
### create a kubeflow pipeline

In [7]:
import kfp
from kfp import dsl

In [8]:
# Model-serving function named 'classifier', built from the serving notebook
# under CODE_BASE and using its 'ClassifierModel' class.
srvfn = mlrun.new_model_server(
    'classifier',
    filename=CODE_BASE + 'serving/classifier_server.ipynb',
    model_class='ClassifierModel',
)

# Apply the v3io mount; as the last expression, the result is displayed by the cell.
srvfn.apply(mlrun.mount_v3io())

<mlrun.runtimes.function.RemoteRuntime at 0x7f8ba9770438>

In [13]:
@dsl.pipeline(name='LGBM', description='lightgbm classifier')
def lgbm_pipeline():
    """Acquire the HIGGS archive, split it, train and test a classifier, and deploy a model server.

    Every stage is a step created from one of the functions defined in the
    earlier cells of this notebook. Stages are chained by feeding the outputs
    of one step into the inputs of the next. All tunable values come from the
    constants in the configuration cell.
    """
    # Fetch the remote archive and store it as FILE_NAME under TARGET_PATH.
    acquire_step = acquire_job.as_step(
            name='acquire_remote_data',
            handler='arc_to_parquet',
            params={
                'archive_url': ARCHIVE_URL,
                'header':      HEADER,
                'name':        FILE_NAME,
                'target_path': TARGET_PATH,
                'key':         RAW_FILE_KEY},
            outputs=['header']).apply(mlrun.mount_v3io())

    # Split the acquired file into train / validation / test sets.
    # Consuming the acquire step's 'header' output makes this step run after it.
    split_step = split_job.as_step(
            name='split_data',
            handler='train_valid_test_splitter',
            inputs={'header' : acquire_step.outputs['header']},
            params={
                'src_file':         FILE_NAME,
                'sample':           100_000,
                'target_path':      TARGET_PATH,
                'random_state':     RNG},
            outputs=['header', 'xtrain', 'ytrain', 'xvalid', 'yvalid', 'xtest', 'ytest']
    ).apply(mlrun.mount_v3io())

    # Train the classifier named by SKLEARN_CLASSIFIER on the train/validation splits.
    train_step = train_job.as_step(
            name='train_model',
            handler='train',
            inputs={'xtrain' : split_step.outputs['xtrain'],
                    'ytrain' : split_step.outputs['ytrain'],
                    'xvalid' : split_step.outputs['xvalid'],
                    'yvalid' : split_step.outputs['yvalid']
                    },
            params={
                'SKClassifier':     SKLEARN_CLASSIFIER,
                'target_path':      TARGET_PATH,
                'name':             MODEL_KEY,
                'key' :             MODEL_KEY,
                # use the notebook-level setting rather than a hard-coded literal
                'verbose':          VERBOSE},
            outputs=['model']).apply(mlrun.mount_v3io())

    # Evaluate the trained model on the held-out test split.
    test_step = test_job.as_step(
            name='test_model',
            handler='test_model',
            inputs={'model' : train_step.outputs['model'],
                    'xtest' : split_step.outputs['xtest'],
                    'ytest' : split_step.outputs['ytest']}).apply(mlrun.mount_v3io())

    # Deploy the serving function with the trained model registered as
    # 'classifier_gen'. NOTE(review): this step consumes only the training
    # output, so it is not ordered after test_model — confirm that is intended.
    srvfn.deploy_step(
        project='default',
        models={'classifier_gen': train_step.outputs['model']})

<a id="compile the pipeline"></a>
### compile the pipeline

We can compile our Kubeflow pipeline and produce a YAML description of the pipeline workflow:

In [14]:
# Compile the pipeline function and write the result to mlrunpipe.yaml under TARGET_PATH.
kfp.compiler.Compiler().compile(
    lgbm_pipeline,
    TARGET_PATH + '/mlrunpipe.yaml',
)

In [15]:
# Kubeflow Pipelines client for the 'default-tenant' namespace; used below to submit the run.
client = kfp.Client(namespace='default-tenant')

Finally, the following cell runs the pipeline as a job:

In [16]:
# The pipeline function takes no parameters, so no run arguments are passed.
arguments = dict()

# Submit the pipeline function as a run named 'my classifier run' in the 'classifier' experiment.
run_result = client.create_run_from_pipeline_func(
    lgbm_pipeline, arguments,
    experiment_name='classifier',
    run_name='my classifier run',
)