# archive to parquet

In [1]:
import mlrun
import os
# Point the MLRun client at the API service. A non-empty MLRUN_DBPATH
# environment variable takes precedence, so the notebook can target another
# deployment without being edited; otherwise the in-cluster default is used.
mlrun.mlconf.dbpath = os.environ.get('MLRUN_DBPATH') or 'http://mlrun-api:8080'

## parameters

In [2]:
# locations
CODE_BASE   = '/User/repos/functions/'   # base directory the function sources are joined onto
TARGET_PATH = '/User/mlrun/models'       # passed to the job as its target_path parameter

# source archive and the name/key given to the resulting parquet artifact
# ARCHIVE   = "https://fpsignals-public.s3.amazonaws.com/higgs-small.tar.gz"   # alternative source, kept for reference
ARCHIVE     = "https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz"
FILE_NAME   = 'higgs.pqt'
KEY         = 'higgs'

# column names handed to the job: label and lepton/missing-energy columns first,
# then pt/eta/phi/b-tag for each of jets 1-4, then the seven m_* columns
HEADER = (
    ['labels', 'lepton_pT', 'lepton_eta', 'lepton_phi',
     'missing_energy_magnitude', 'missing_energy_phi']
    + [f'jet_{jet}_{attr}'
       for jet in range(1, 5)
       for attr in ('pt', 'eta', 'phi', 'b-tag')]
    + ['m_jj', 'm_jjj', 'm_lv', 'm_jlv', 'm_bb', 'm_wbb', 'm_wwbb']
)

In [3]:
# create the output directory up front; exist_ok=True makes re-running this cell a no-op
os.makedirs(TARGET_PATH, exist_ok=True)

#### load and configure function

In [4]:
# wrap the local Python source as an MLRun job, set its build image,
# and write the resulting function spec next to the source as yaml
func_dir = os.path.join(CODE_BASE, 'fileutils/arc_to_parquet')
source_file = os.path.join(func_dir, 'arc_to_parquet.py')

arctoparq = mlrun.code_to_function(filename=source_file, kind='job')
arctoparq.build_config(base_image='yjbds/mlrun-files:latest', commands=[])

yaml_name = os.path.join(func_dir, 'arc_to_parquet.yaml')
arctoparq.export(yaml_name)

[mlrun] 2020-01-27 08:12:26,749 function spec saved to path: /User/repos/functions/fileutils/arc_to_parquet/arc_to_parquet.yaml


In [5]:
# re-create the function object from the exported yaml spec, then apply the v3io mount to it
spec_path = os.path.join(CODE_BASE, 'fileutils/arc_to_parquet', 'arc_to_parquet.yaml')
arctoparq = mlrun.import_function(spec_path).apply(mlrun.mount_v3io())

#### deploy / build

The following cell triggers an image build the first time it is run, using the spec found in the yaml file above. Unless that file changes, this only needs to be run once, even after the notebook has been restarted:

In [14]:
# deploy the function; skip_deployed=True asks MLRun not to redo an existing deployment.
# The call's return value is this cell's output.
arctoparq.deploy(skip_deployed=True, with_mlrun=False)

'ready'

In [7]:
# parameters forwarded to the 'arc_to_parquet' handler
task_params = {
    'target_path': TARGET_PATH,
    'name': FILE_NAME,
    'key': KEY,
    'archive_url': ARCHIVE,
    'header': HEADER,
}

# describe the task: handler to invoke, its parameters, and the output key to collect
arc_to_parq_task = mlrun.NewTask(
    'arc2parq',
    handler='arc_to_parquet',
    params=task_params,
    outputs=[KEY],
)

# submit the task to the function and keep the run object for the checks below
run = arctoparq.run(arc_to_parq_task)

[mlrun] 2020-01-27 08:13:34,366 starting run arc2parq uid=ca75db580ec146038a8a932e85b64ac1  -> http://mlrun-api:8080
[mlrun] 2020-01-27 08:13:34,456 Job is running in the background, pod: arc2parq-2rtrg
[mlrun] 2020-01-27 08:13:42,564 destination file does not exist, downloading
[mlrun] 2020-01-27 08:18:45,530 saved table to /User/mlrun/models/higgs.pqt
[mlrun] 2020-01-27 08:18:45,545 log artifact higgs at /User/mlrun/models/higgs.pqt, size: None, db: Y
[mlrun] 2020-01-27 08:18:45,558 log artifact header at /User/mlrun/models/header.pkl, size: None, db: Y

[mlrun] 2020-01-27 08:18:45,581 run executed, status=completed
final state: succeeded


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...b64ac1,0,Jan 27 08:13:42,completed,arc-to-parquet,host=arc2parq-2rtrgkind=jobowner=admin,,"archive_url=https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gzheader=['labels', 'lepton_pT', 'lepton_eta', 'lepton_phi', 'missing_energy_magnitude', 'missing_energy_phi', 'jet_1_pt', 'jet_1_eta', 'jet_1_phi', 'jet_1_b-tag', 'jet_2_pt', 'jet_2_eta', 'jet_2_phi', 'jet_2_b-tag', 'jet_3_pt', 'jet_3_eta', 'jet_3_phi', 'jet_3_b-tag', 'jet_4_pt', 'jet_4_eta', 'jet_4_phi', 'jet_4_b-tag', 'm_jj', 'm_jjj', 'm_lv', 'm_jlv', 'm_bb', 'm_wbb', 'm_wwbb']key=higgsname=higgs.pqttarget_path=/User/mlrun/models",,higgsheader


to track results use .show() or .logs() or in CLI: 
!mlrun get run ca75db580ec146038a8a932e85b64ac1  , !mlrun logs ca75db580ec146038a8a932e85b64ac1 
[mlrun] 2020-01-27 08:18:54,929 run executed, status=completed


___

### tests

In [8]:
import os
import numpy as np
import pandas as pd

In [9]:
# TODO: add more tests that exercise the run context
# TODO: convert the assertions below into real tests

In [10]:
# Check the run two ways: the expected key is among the run's outputs, and the
# parquet file exists on disk. The path is built once with os.path.join
# so a trailing slash on TARGET_PATH cannot produce a malformed path.
artifact_path = os.path.join(TARGET_PATH, FILE_NAME)
assert KEY in run.outputs.keys(), f"mlrun.functions: key {KEY} not found in outputs"
assert os.path.isfile(artifact_path), f"mlrun.functions: artifact source not found at {artifact_path}"

In [11]:
# read the parquet file back into a DataFrame for inspection; the path is
# joined with os.path.join rather than manual '/' concatenation
copied = pd.read_parquet(os.path.join(TARGET_PATH, FILE_NAME), engine="pyarrow")

In [12]:
# preview the first rows to eyeball column names and values
copied.head()

Unnamed: 0,labels,lepton_pT,lepton_eta,lepton_phi,missing_energy_magnitude,missing_energy_phi,jet_1_pt,jet_1_eta,jet_1_phi,jet_1_b-tag,...,jet_4_eta,jet_4_phi,jet_4_b-tag,m_jj,m_jjj,m_lv,m_jlv,m_bb,m_wbb,m_wwbb
0,1.0,0.869293,-0.635082,0.22569,0.32747,-0.689993,0.754202,-0.248573,-1.092064,0.0,...,-0.010455,-0.045767,3.101961,1.35376,0.979563,0.978076,0.920005,0.721657,0.988751,0.876678
1,1.0,0.907542,0.329147,0.359412,1.49797,-0.31301,1.095531,-0.557525,-1.58823,2.173076,...,-1.13893,-0.000819,0.0,0.30222,0.833048,0.9857,0.978098,0.779732,0.992356,0.798343
2,1.0,0.798835,1.470639,-1.635975,0.453773,0.425629,1.104875,1.282322,1.381664,0.0,...,1.128848,0.900461,0.0,0.909753,1.10833,0.985692,0.951331,0.803252,0.865924,0.780118
3,0.0,1.344385,-0.876626,0.935913,1.99205,0.882454,1.786066,-1.646778,-0.942383,0.0,...,-0.678379,-1.360356,0.0,0.946652,1.028704,0.998656,0.728281,0.8692,1.026736,0.957904
4,1.0,1.105009,0.321356,1.522401,0.882808,-1.205349,0.681466,-1.070464,-0.921871,0.0,...,-0.373566,0.113041,0.0,0.755856,1.361057,0.98661,0.838085,1.133295,0.872245,0.808487


In [13]:
# the table should have exactly one column per HEADER entry; fail loudly on a
# mismatch instead of relying on a visual check, then display the shape
assert copied.shape[1] == len(HEADER), (
    f"mlrun.functions: expected {len(HEADER)} columns, found {copied.shape[1]}")
copied.shape

(11000000, 29)

### cleanup

In [None]:
# os.remove(os.path.join(TARGET_PATH, FILE_NAME))