# archive to parquet

HIGGS

In [23]:
import mlrun
import os
mlrun.mlconf.dbpath = 'http://mlrun-api:8080'

## parameters

In [24]:
FUNCTION           = 'arc_to_parquet'
DESCRIPTION        = 'retrieve archive table and save as parquet file'

BASE_IMAGE         = 'yjbds/mlrun_dev-files:latest'
JOB_KIND           = 'job'
TASK_NAME          = 'user-task-arc-to-parq'

CODE_BASE          = 'https://raw.githubusercontent.com/yjb-ds/functions/lgbm-serving/fileutils'

TARGET_PATH        = '/User/mlrun/models'

ARCHIVE_SAMPLE     = "https://fpsignals-public.s3.amazonaws.com/higgs-small.tar.gz"
ARCHIVE            = "https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz"

FILE_NAME          = 'higgs.pqt'
KEY                = 'higgs'

HEADER = ['labels', 'lepton_pT', 'lepton_eta', 'lepton_phi', 'missing_energy_magnitude', 
          'missing_energy_phi', 'jet_1_pt', 'jet_1_eta', 'jet_1_phi', 'jet_1_b-tag', 
          'jet_2_pt', 'jet_2_eta', 'jet_2_phi', 'jet_2_b-tag', 'jet_3_pt', 'jet_3_eta',
          'jet_3_phi', 'jet_3_b-tag', 'jet_4_pt', 'jet_4_eta', 'jet_4_phi', 'jet_4_b-tag',
          'm_jj', 'm_jjj', 'm_lv', 'm_jlv', 'm_bb', 'm_wbb', 'm_wwbb']

In [25]:
os.makedirs(TARGET_PATH, exist_ok=True)

#### load and configure function

In [26]:
func_py    = os.path.join(CODE_BASE, FUNCTION, 'function.py')
func_yaml = os.path.join(CODE_BASE, FUNCTION, 'function.yaml')

arctoparq = mlrun.new_function(command=func_py, kind=JOB_KIND)

arctoparq.spec.description = DESCRIPTION
arctoparq.spec.build.base_image = BASE_IMAGE

In [27]:
arctoparq.export(func_yaml)

[mlrun] 2020-01-29 12:23:04,377 function spec saved to path: /User/repos/functions/fileutils/arc_to_parquet/function.yaml


In [28]:
arctoparq.apply(mlrun.mount_v3io())

<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f8a74540358>

#### ...or load from yaml

In [29]:
# arctoparq = mlrun.import_function(func_yaml).apply(mlrun.mount_v3io())

#### deploy / build

The following triggers a build when run for the first time using specs found in the yaml file above.  Unless that file changes, this only needs to be run once, even after the notebook has been restarted:

In [30]:
arctoparq.deploy(skip_deployed=True, with_mlrun=False)

'ready'

In [33]:
# create and run the task
arc_to_parq_task = mlrun.NewTask(
    TASK_NAME,
    handler=FUNCTION,  
    params={
        'target_path': TARGET_PATH,
        'name'       : FILE_NAME, 
        'key'        : KEY,
        'archive_url': ARCHIVE,
        'header'     : None},
    outputs=[KEY])

# run
rn = arctoparq.run(arc_to_parq_task)

[mlrun] 2020-01-29 12:23:17,789 starting run user-task-arc-to-parq uid=c3c3a9ade23d413781b1f62fba0f7593  -> http://mlrun-api:8080
[mlrun] 2020-01-29 12:23:17,864 Job is running in the background, pod: user-task-arc-to-parq-nx92p
[mlrun] 2020-01-29 12:23:22,149 destination file does not exist, downloading
[mlrun] 2020-01-29 12:28:19,478 saved table to /User/mlrun/models/higgs.pqt
[mlrun] 2020-01-29 12:28:19,492 log artifact higgs at /User/mlrun/models/higgs.pqt, size: None, db: Y

[mlrun] 2020-01-29 12:28:19,514 run executed, status=completed
final state: succeeded


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...0f7593,0,Jan 29 12:23:22,completed,function,host=user-task-arc-to-parq-nx92pkind=jobowner=admin,,archive_url=https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gzheader=Nonekey=higgsname=higgs.pqttarget_path=/User/mlrun/models,,higgs


to track results use .show() or .logs() or in CLI: 
!mlrun get run c3c3a9ade23d413781b1f62fba0f7593  , !mlrun logs c3c3a9ade23d413781b1f62fba0f7593 
[mlrun] 2020-01-29 12:28:28,186 run executed, status=completed


In [34]:
rn.outputs

{'higgs': '/User/mlrun/models/higgs.pqt'}

___

### tests

In [35]:
import os
import numpy as np
import pandas as pd

In [36]:
# add more context tests
# convert these to real tests

In [37]:
assert KEY in rn.outputs.keys(), f"mlrun.functions: key {KEY} not found in outputs"
assert os.path.isfile(TARGET_PATH+'/'+ FILE_NAME),  f"mlrun.functions: artifact source not found at {TARGET_PATH+'/'+ FILE_NAME}"

In [38]:
copied   = pd.read_parquet(TARGET_PATH+'/'+ FILE_NAME, engine="pyarrow")

In [39]:
copied.head()

Unnamed: 0,1.000000000000000000e+00,8.692932128906250000e-01,-6.350818276405334473e-01,2.256902605295181274e-01,3.274700641632080078e-01,-6.899932026863098145e-01,7.542022466659545898e-01,-2.485731393098831177e-01,-1.092063903808593750e+00,0.000000000000000000e+00,...,-1.045456994324922562e-02,-4.576716944575309753e-02,3.101961374282836914e+00,1.353760004043579102e+00,9.795631170272827148e-01,9.780761599540710449e-01,9.200048446655273438e-01,7.216574549674987793e-01,9.887509346008300781e-01,8.766783475875854492e-01
0,1.0,0.907542,0.329147,0.359412,1.49797,-0.31301,1.095531,-0.557525,-1.58823,2.173076,...,-1.13893,-0.000819,0.0,0.30222,0.833048,0.9857,0.978098,0.779732,0.992356,0.798343
1,1.0,0.798835,1.470639,-1.635975,0.453773,0.425629,1.104875,1.282322,1.381664,0.0,...,1.128848,0.900461,0.0,0.909753,1.10833,0.985692,0.951331,0.803252,0.865924,0.780118
2,0.0,1.344385,-0.876626,0.935913,1.99205,0.882454,1.786066,-1.646778,-0.942383,0.0,...,-0.678379,-1.360356,0.0,0.946652,1.028704,0.998656,0.728281,0.8692,1.026736,0.957904
3,1.0,1.105009,0.321356,1.522401,0.882808,-1.205349,0.681466,-1.070464,-0.921871,0.0,...,-0.373566,0.113041,0.0,0.755856,1.361057,0.98661,0.838085,1.133295,0.872245,0.808487
4,0.0,1.595839,-0.607811,0.007075,1.81845,-0.111906,0.84755,-0.566437,1.581239,2.173076,...,-0.654227,-1.274345,3.101961,0.823761,0.938191,0.971758,0.789176,0.430553,0.961357,0.957818


In [40]:
copied.shape

(10999999, 29)

### cleanup

In [17]:
# os.remove(parquet_file_path)