# serverless lightgbm

In [1]:
# --- data source -----------------------------------------------------------
# Gzip-compressed CSV of the HIGGS dataset hosted by the UCI ML repository.
ARCHIVE_URL = (
    'https://archive.ics.uci.edu/ml/machine-learning-databases/'
    '00280/HIGGS.csv.gz'
)
# Passed to the acquire step as 'name' and to the train step as 'src_file'.
FILE_NAME = 'higgs.parquet'
# NOTE(review): not referenced by any cell visible in this notebook.
CHUNK_SIZE = 10_000

# --- artifacts -------------------------------------------------------------
# Used as target/output path of the pipeline steps and for the compiled pipeline yaml.
TARGET_PATH = '/User/mlrun/models/'
# Passed to the train step as 'name'.
MODEL_NAME = 'lgb-classifier.pkl'

In [2]:
# Column names for the HIGGS csv, in file order: the label column first, then the
# lepton / missing-energy columns, four attributes for each of jets 1-4, and
# finally the 'm_*' columns.
HIGGS_HEADER = (
    [
        'labels',
        'lepton_pT',
        'lepton_eta',
        'lepton_phi',
        'missing_energy_magnitude',
        'missing_energy_phi',
    ]
    # jet_1_pt, jet_1_eta, jet_1_phi, jet_1_b-tag, jet_2_pt, ... jet_4_b-tag
    + [
        f'jet_{jet}_{attr}'
        for jet in range(1, 5)
        for attr in ('pt', 'eta', 'phi', 'b-tag')
    ]
    + ['m_jj', 'm_jjj', 'm_lv', 'm_jlv', 'm_bb', 'm_wbb', 'm_wwbb']
)

In [7]:
import mlrun
# Point mlrun's configuration (mlconf.dbpath) at the mlrun API service URL.
# NOTE(review): the host name looks like an in-cluster service name — confirm it
# resolves from wherever this notebook is run.
mlrun.mlconf.dbpath = 'http://mlrun-api:8080'

In [8]:
# Load the arc_to_parquet function definition from its yaml spec, then apply the
# v3io mount modifier to it.
acquire_job = mlrun.import_function(
    '/User/repos/functions/fileutils/arc_to_parquet/arc_to_parquet.yaml'
)
acquire_job = acquire_job.apply(mlrun.mount_v3io())

# Deploy the function, passing skip_deployed=True and with_mlrun=False; the call's
# return value is the cell output.
acquire_job.deploy(skip_deployed=True, with_mlrun=False)

[mlrun] 2020-01-26 01:19:01,850 starting remote build, image: .mlrun/func-default-arc-to-parquet-latest


True

In [10]:
# Load the train/valid/test splitter function definition from its yaml spec, then
# apply the v3io mount modifier to it.
split_job = mlrun.import_function(
    '/User/repos/functions/datagen/splitters/train_valid_test.yaml'
)
split_job = split_job.apply(mlrun.mount_v3io())

# Deploy the function, passing skip_deployed=True and with_mlrun=False; the call's
# return value is the cell output.
split_job.deploy(skip_deployed=True, with_mlrun=False)

'ready'

In [11]:
# Load the sklearn-classifier training function definition from its yaml spec,
# then apply the v3io mount modifier to it.
train_job = mlrun.import_function(
    '/User/repos/functions/train/sklearn-classifier.yaml'
)
train_job = train_job.apply(mlrun.mount_v3io())

# Deploy the function, passing skip_deployed=True and with_mlrun=False; the call's
# return value is the cell output.
train_job.deploy(skip_deployed=True, with_mlrun=False)

[mlrun] 2020-01-26 01:19:13,467 starting remote build, image: .mlrun/func-default-sklearn-classifier-latest


True

In [12]:
# Load the test-classifier evaluation function definition from its yaml spec,
# then apply the v3io mount modifier to it.
test_job = mlrun.import_function(
    '/User/repos/functions/evaluation/test-classifier.yaml'
)
test_job = test_job.apply(mlrun.mount_v3io())

# Deploy the function, passing skip_deployed=True and with_mlrun=False; the call's
# return value is the cell output.
test_job.deploy(skip_deployed=True, with_mlrun=False)

'ready'

<a id="pipeline"></a>
### Create a Kubeflow pipeline (INCOMPLETE)

In [13]:
import kfp
from kfp import dsl

In [14]:
# Model-serving function named 'classifier', created from the classifier_server
# notebook with 'ClassifierModel' as its model class.
srvfn = mlrun.new_model_server(
    'classifier',
    filename='/User/repos/functions/serving/classifier_server.ipynb',
    model_class='ClassifierModel',
)

# Apply the v3io mount modifier; the returned runtime object is the cell output.
srvfn.apply(mlrun.mount_v3io())

<mlrun.runtimes.function.RemoteRuntime at 0x7f73f582c550>

In [None]:
@dsl.pipeline(name='LGBM', description='lightgbm classifier')
def lgbm_pipeline(learning_rate=[0.1, 0.3], num_leaves=[31, 32]):
    """Acquire the HIGGS data, train a classifier and deploy it behind a model server.

    Steps, in the order they are chained:
      1. ``acquire_remote_data`` - runs the ``arc_to_parquet`` handler of
         ``acquire_job`` against ``ARCHIVE_URL`` and exposes a ``header`` output.
      2. ``train_model`` - runs the ``train`` handler of ``train_job``; it takes
         the ``header`` output of step 1 as an input and exposes a ``model`` output.
      3. deploys ``srvfn`` with the ``model`` output of step 2 registered under
         the key ``classifier_gen``.

    Args:
        learning_rate: pipeline parameter; not referenced by any step in this body.
        num_leaves: pipeline parameter; not referenced by any step in this body.
    """
    acquire_step = acquire_job.as_step(
        name='acquire_remote_data',
        handler='arc_to_parquet',
        params={
            'archive_url': ARCHIVE_URL,
            'header':      HIGGS_HEADER,
            'name':        FILE_NAME,
            'target_path': TARGET_PATH,
        },
        outputs=['header'],
        out_path=TARGET_PATH,
    ).apply(mlrun.mount_v3io())

    train_step = train_job.as_step(
        name='train_model',
        handler='train',
        # Consuming the acquire step's output makes this step depend on it.
        inputs={'header': acquire_step.outputs['header']},
        params={
            'src_file':         FILE_NAME,
            'header':           '/User/mlrun/models/header.pkl',
            # TODO: the original cell had a dangling "'model_logger':" key with no
            # value here, which made the whole cell a SyntaxError. Re-add the key
            # once the intended value is known.
            'sample':           20000,
            'test_size':        0.1,
            'train_val_split':  0.75,
            'target_path':      TARGET_PATH,
            'name':             MODEL_NAME,
            'key':              'model',
            'verbose':          False,
        },
        outputs=['model'],
        out_path=TARGET_PATH,
    ).apply(mlrun.mount_v3io())

    srvfn.deploy_step(
        project='default',
        models={'classifier_gen': train_step.outputs['model']})

<a id="compile the pipeline"></a>
### compile the pipeline

We can compile our Kubeflow pipeline and produce a yaml description of the pipeline workflow:

In [None]:
# Compile the pipeline function into a yaml file under TARGET_PATH.
# rstrip('/') guards against a doubled separator ('.../models//mlrunpipe.yaml')
# when TARGET_PATH already ends with '/'.
kfp.compiler.Compiler().compile(lgbm_pipeline, TARGET_PATH.rstrip('/') + '/mlrunpipe.yaml')

In [None]:
# Create the Kubeflow Pipelines client, passing 'default-tenant' as its namespace.
client = kfp.Client(namespace='default-tenant')

Finally, the following cell will run the pipeline as a job:

In [None]:
# Empty mapping: no pipeline arguments are passed for this run.
arguments = dict()

# Submit lgbm_pipeline as a run named 'my classifier run' in the 'classifier'
# experiment and keep the returned handle.
run_result = client.create_run_from_pipeline_func(
    lgbm_pipeline,
    arguments,
    experiment_name='classifier',
    run_name='my classifier run',
)