# archive to parquet

Convert a remote archive or CSV file (or a local `file://` path) to Parquet format.

In [None]:
import mlrun
import os
# Point the MLRun client at the API service. Keep an address that is already
# configured (presumably picked up from the environment, e.g. MLRUN_DBPATH —
# confirm against the MLRun docs) and fall back to the in-cluster default
# only when none is set.
mlrun.mlconf.dbpath = mlrun.mlconf.dbpath or 'http://mlrun-api:8080'

In [2]:
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

## parameters
Dataset URLs from **[h2oai](https://github.com/h2oai/h2o-2/wiki/Hacking-Airline-DataSet-with-H2O)**:

In [3]:
# Airline on-time CSV files hosted in the h2o-airlines-unpacked S3 bucket.
ARCHIVE_BIG = "https://s3.amazonaws.com/h2o-airlines-unpacked/allyears_10.csv"
# Full dataset: 123,534,969 rows (see the shape check in the tests section).
ARCHIVE = "https://s3.amazonaws.com/h2o-airlines-unpacked/allyears.csv"
# Small sample for testing and development: 43,978 rows.
ARCHIVE_SMALL = "https://s3.amazonaws.com/h2o-airlines-unpacked/allyears2k.csv"

In [26]:
# Archive to convert (one of the ARCHIVE* URLs) and the root output folder.
USE_ARCHIVE = ARCHIVE
TARGET_PATH = '/User/mlrun/airlines/dataset'

# Sub-folder of TARGET_PATH that receives the partitioned table, and the
# columns the table is partitioned by.
PARTITIONS_DEST = 'partitions'
PARTITION_COLS = ['Year', 'Month']

In [27]:
# Make sure the destination folder for the partitioned table exists
# (no error if it is already there).
partitions_dir = os.path.join(TARGET_PATH, PARTITIONS_DEST)
os.makedirs(partitions_dir, exist_ok=True)

In [28]:
# Base container image for the function build (passed to build_config as base_image).
# NOTE(review): ':latest' is a floating tag — consider pinning a version for reproducible builds.
BASE_IMAGE         = 'yjbds/mlrun-files:latest'

In [29]:
# Root folder holding the function sources; the trailing comment keeps a raw-GitHub alternative.
CODE_BASE          = '/User/repos/functions/' # 'https://raw.githubusercontent.com/yjb-ds/functions/lgbm-serving/'
# Path, relative to CODE_BASE, of the folder with arc_to_parquet.py and arc_to_parquet.yaml.
FUNCTION           = 'fileutils/arc_to_parquet'

**For testing and development, set `USE_ARCHIVE` (defined above) to `ARCHIVE_SMALL`.**

In [30]:
# Output file name and the key under which the result is expected in the run outputs.
FILE_NAME = 'airlines.pqt'
KEY = 'airlines'

# Full column list of the source files. Not needed for the conversion because
# the files already contain a header row; kept for reference.
HEADER = [
    'Year', 'Month', 'DayofMonth', 'DayOfWeek',
    'DepTime', 'CRSDepTime', 'ArrTime', 'CRSArrTime',
    'UniqueCarrier', 'FlightNum', 'TailNum',
    'ActualElapsedTime', 'CRSElapsedTime', 'AirTime',
    'ArrDelay', 'DepDelay',
    'Origin', 'Dest', 'Distance',
    'TaxiIn', 'TaxiOut',
    'Cancelled', 'CancellationCode', 'Diverted',
    'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay',
    'LateAircraftDelay',
]

# Columns passed to the conversion task as 'inc_cols'.
INC_COLS = [
    'Year', 'Month', 'DayofMonth', 'DayOfWeek',
    'DepTime', 'CRSDepTime', 'ArrTime', 'CRSArrTime',
    'UniqueCarrier', 'FlightNum',
    'CRSElapsedTime', 'AirTime',
    'Origin', 'Dest', 'Distance',
    'TaxiIn', 'TaxiOut',
    'Cancelled',
    'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay',
    'LateAircraftDelay',
]

# Text encoding passed to the conversion task.
ENCODING = 'latin-1'

# Explicit per-column dtypes passed to the conversion task.
# NOTE(review): 'TailNum' has a dtype here but is not listed in INC_COLS.
DTYPES_COLS = {
    'CRSElapsedTime': 'float32',
    'TailNum': 'str',
    'Distance': 'float32',
    'TaxiIn': 'float32',
    'TaxiOut': 'float32',
    'ArrTime': 'float32',
    'AirTime': 'float32',
    'DepTime': 'float32',
    'CarrierDelay': 'float32',
    'WeatherDelay': 'float32',
    'NASDelay': 'float32',
    'SecurityDelay': 'float32',
    'LateAircraftDelay': 'float32',
}

In [31]:
# NOTE(review): not referenced by any cell visible in this notebook and not one of
# INC_COLS — presumably consumed by a downstream notebook; confirm or remove.
LABEL_COLUMN = "IsArrDelayed"

## load and configure function

**If running for the first time, create the function:**

In [32]:
# Build the function object from the local Python source, configure its image
# build, and export the resulting spec to YAML next to the source file.
function_dir = os.path.join(CODE_BASE, FUNCTION)
arctoparq = mlrun.code_to_function(
    filename=os.path.join(function_dir, 'arc_to_parquet.py'),
    kind='job',
)
arctoparq.build_config(base_image=BASE_IMAGE, commands=[])
yaml_name = os.path.join(function_dir, 'arc_to_parquet.yaml')
arctoparq.export(yaml_name)

[mlrun] 2020-01-27 23:25:02,696 function spec saved to path: /User/repos/functions/fileutils/arc_to_parquet/arc_to_parquet.yaml


**Otherwise, load it from the exported YAML spec:**

In [33]:
# Load the function from its exported YAML spec and apply the v3io mount
# modifier (mlrun.mount_v3io) to it.
yaml_path = os.path.join(CODE_BASE, FUNCTION, 'arc_to_parquet.yaml')
arctoparq = mlrun.import_function(yaml_path).apply(mlrun.mount_v3io())

## deploy / build

The following cell triggers an image build the first time it is run, using the spec found in the YAML file above.

In [34]:
# Build/deploy the function image. skip_deployed=True presumably skips the build when the
# function is already deployed — TODO confirm against the MLRun docs.
arctoparq.deploy(skip_deployed=True, with_mlrun=False)

'ready'

In [35]:
# Parameters forwarded to the 'arc_to_parquet' handler.
task_params = {
    'target_path': TARGET_PATH,
    'name': FILE_NAME,
    'key': KEY,
    'archive_url': USE_ARCHIVE,
    'dataset': PARTITIONS_DEST,
    'part_cols': PARTITION_COLS,
    'encoding': ENCODING,
    'inc_cols': INC_COLS,
    'dtype': DTYPES_COLS,
}

# Describe the task, then submit it to the function and keep the run handle.
arc_to_parq_task = mlrun.NewTask(
    'arc2parq',
    handler='arc_to_parquet',
    params=task_params,
)
run = arctoparq.run(arc_to_parq_task)

[mlrun] 2020-01-27 23:25:09,450 starting run arc2parq uid=c8f9525e5258489ea1211312348b21e1  -> http://mlrun-api:8080
[mlrun] 2020-01-27 23:25:09,545 Job is running in the background, pod: arc2parq-lw6ww
[mlrun] 2020-01-27 23:25:14,326 destination file does not exist, downloading
[mlrun] 2020-01-27 23:36:53,211 saved table to /User/mlrun/airlines/dataset/partitions

[mlrun] 2020-01-27 23:36:53,223 run executed, status=completed
final state: succeeded


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...8b21e1,0,Jan 27 23:25:14,completed,arc-to-parquet,host=arc2parq-lw6wwkind=jobowner=admin,,"archive_url=https://s3.amazonaws.com/h2o-airlines-unpacked/allyears.csvdataset=partitionsdtype={'AirTime': 'float32', 'ArrTime': 'float32', 'CRSElapsedTime': 'float32', 'CarrierDelay': 'float32', 'DepTime': 'float32', 'Distance': 'float32', 'LateAircraftDelay': 'float32', 'NASDelay': 'float32', 'SecurityDelay': 'float32', 'TailNum': 'str', 'TaxiIn': 'float32', 'TaxiOut': 'float32', 'WeatherDelay': 'float32'}encoding=latin-1inc_cols=['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime', 'ArrTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum', 'CRSElapsedTime', 'AirTime', 'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut', 'Cancelled', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']key=airlinesname=airlines.pqtpart_cols=['Year', 'Month']target_path=/User/mlrun/airlines/dataset",,


to track results use .show() or .logs() or in CLI: 
!mlrun get run c8f9525e5258489ea1211312348b21e1  , !mlrun logs c8f9525e5258489ea1211312348b21e1 
[mlrun] 2020-01-27 23:37:01,852 run executed, status=completed


___

## tests

### a partitioned parquet table

In [21]:
# Open the partitioned table written by the run and load it into pandas.
partitions_path = os.path.join(TARGET_PATH, PARTITIONS_DEST)
dataset = pq.ParquetDataset(partitions_path)
table = dataset.read()
df = table.to_pandas()

In [22]:
# Move the partition columns into the index. The guard keeps the cell safe to
# re-run: once the columns are in the index, a second set_index on them would
# raise KeyError. Rebinding `df` replaces the in-place mutation.
if list(df.index.names) != PARTITION_COLS:
    df = df.set_index(PARTITION_COLS)

In [23]:
# Preview the first rows of the table, indexed by the partition columns.
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,CRSElapsedTime,AirTime,...,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
Year,Month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1992,1,7,2,640.0,640,851.0,853,US,53,133.0,,...,IND,644.0,,,0,,,,,
1992,1,8,3,639.0,640,837.0,853,US,53,133.0,,...,IND,644.0,,,0,,,,,
1992,1,9,4,644.0,640,905.0,853,US,53,133.0,,...,IND,644.0,,,0,,,,,
1992,1,11,6,640.0,640,834.0,853,US,53,133.0,,...,IND,644.0,,,0,,,,,
1992,1,12,7,639.0,640,832.0,853,US,53,133.0,,...,IND,644.0,,,0,,,,,


In [24]:
# Expected (rows, columns) of the indexed frame for each archive whose size is
# known. The column count excludes the partition columns, which live in the index.
# Archives without an entry (e.g. ARCHIVE_BIG) are not checked.
expected_shapes = {
    ARCHIVE: (123_534_969, 21),
    ARCHIVE_SMALL: (43_978, 21),
}
expected_shape = expected_shapes.get(USE_ARCHIVE)
if expected_shape is not None:
    assert df.shape == expected_shape, (
        f"mlrun.functions: expected shape {expected_shape} for {USE_ARCHIVE}, got {df.shape}"
    )

## cleanup

In [25]:
import shutil

# Delete everything the run wrote under TARGET_PATH.
# NOTE: the single-parquet-file checks further down read from TARGET_PATH, so
# run them before this cell.
# The existence check keeps the cell re-runnable: shutil.rmtree raises
# FileNotFoundError when the folder is already gone.
if os.path.isdir(TARGET_PATH):
    shutil.rmtree(TARGET_PATH)

### single parquet file

Run this section only when the task was run with `dataset=False`, and before the cleanup cell above, which deletes `TARGET_PATH`.

In [None]:
# The run must expose the artifact under KEY, and the parquet file must exist on disk.
assert KEY in run.outputs.keys(), f"mlrun.functions: key {KEY} not found in outputs"
artifact_path = TARGET_PATH+'/'+ FILE_NAME
assert os.path.isfile(artifact_path),  f"mlrun.functions: artifact source not found at {artifact_path}"

In [None]:
# Read the single parquet file back and index it by the partition columns.
copied = (
    pd.read_parquet(TARGET_PATH+'/'+ FILE_NAME, engine="pyarrow")
    .set_index(PARTITION_COLS)
)

In [None]:
# Preview the first rows of the single-file table.
copied.head()