# archive to parquet

Convert a remote archive or CSV file (or a local one, via `file://`) to Parquet format.

In [1]:
import mlrun
import os
# set mlrun's dbpath config to the url of the mlrun API service
mlrun.mlconf.dbpath = 'http://mlrun-api:8080'

In [2]:
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

## parameters


In [3]:
# image handed to build_config as the function's base image
BASE_IMAGE = 'yjbds/mlrun-files:latest'

# root folder of the functions repo; a remote alternative is
# 'https://raw.githubusercontent.com/yjb-ds/functions/lgbm-serving/'
CODE_BASE = '/User/repos/functions/'
# sub-folder of CODE_BASE holding arc_to_parquet.py / arc_to_parquet.yaml
PROJECT = 'fileutils/arc_to_parquet'

# folder the parquet output is written to
TARGET_PATH = '/User/mlrun/airlines/dataset'

# candidate source csv files; USE_ARCHIVE selects the one that gets converted
ARCHIVE_BIG = "https://s3.amazonaws.com/h2o-airlines-unpacked/allyears_10.csv"
ARCHIVE = "https://s3.amazonaws.com/h2o-airlines-unpacked/allyears.csv"
ARCHIVE_SMALL = "https://s3.amazonaws.com/h2o-airlines-unpacked/allyears2k.csv"

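**For testing and development, use `ARCHIVE_SMALL` (set `USE_ARCHIVE = ARCHIVE_SMALL` in the cell below); the recorded run on the full `ARCHIVE` took about 12.5 minutes:**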

In [4]:
# archive that is passed to the job as 'archive_url'; switch to ARCHIVE_SMALL for quick test runs
USE_ARCHIVE = ARCHIVE

In [5]:
# name of the parquet output and the key passed to the job alongside it
FILE_NAME = 'airlines.pqt'
KEY = 'airlines'

# full list of csv column names -- not needed, as the files contain a header
HEADER = [
    'Year', 'Month', 'DayofMonth', 'DayOfWeek',
    'DepTime', 'CRSDepTime', 'ArrTime', 'CRSArrTime',
    'UniqueCarrier', 'FlightNum', 'TailNum',
    'ActualElapsedTime', 'CRSElapsedTime', 'AirTime',
    'ArrDelay', 'DepDelay',
    'Origin', 'Dest', 'Distance',
    'TaxiIn', 'TaxiOut',
    'Cancelled', 'CancellationCode', 'Diverted',
    'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay',
    'LateAircraftDelay',
]

# subset of columns passed to the job as 'inc_cols'
INC_COLS = [
    'Year', 'Month', 'DayofMonth', 'DayOfWeek',
    'DepTime', 'CRSDepTime', 'ArrTime', 'CRSArrTime',
    'UniqueCarrier', 'FlightNum',
    'CRSElapsedTime', 'AirTime',
    'Origin', 'Dest', 'Distance',
    'TaxiIn', 'TaxiOut',
    'Cancelled',
    'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay',
    'LateAircraftDelay',
]

# text encoding passed to the job as 'encoding'
ENCODING = 'latin-1'

# per-column dtypes passed to the job as 'dtype'
DTYPES_COLS = {
    'CRSElapsedTime': 'float32',
    'TailNum': 'str',
    'Distance': 'float32',
    'TaxiIn': 'float32',
    'TaxiOut': 'float32',
    'ArrTime': 'float32',
    'AirTime': 'float32',
    'DepTime': 'float32',
    'CarrierDelay': 'float32',
    'WeatherDelay': 'float32',
    'NASDelay': 'float32',
    'SecurityDelay': 'float32',
    'LateAircraftDelay': 'float32',
}

# passed to the job as 'dataset' and 'part_cols' respectively
USE_PARTITIONS = True
PARTITION_COLS = ['Year', 'Month']

In [6]:
# make sure the output folder exists (exist_ok=True: no error if it is already there)
os.makedirs(TARGET_PATH, exist_ok=True)

## load and configure function

**When running for the first time, create the function:**

In [7]:
# path the exported function spec is written to
yaml_name = os.path.join(CODE_BASE, PROJECT, 'arc_to_parquet.yaml')

# build a 'job' function object from the local Python source file
arctoparq = mlrun.code_to_function(
    kind='job',
    filename=os.path.join(CODE_BASE, PROJECT, 'arc_to_parquet.py'))

# set the base image (no extra build commands), then save the spec as yaml
arctoparq.build_config(commands=[], base_image=BASE_IMAGE)
arctoparq.export(yaml_name)

[mlrun] 2020-01-27 19:37:40,654 function spec saved to path: /User/repos/functions/fileutils/arc_to_parquet/arc_to_parquet.yaml


**otherwise load it:**

In [8]:
# read the function back from its exported yaml spec ...
arctoparq = mlrun.import_function(
    os.path.join(CODE_BASE, PROJECT, 'arc_to_parquet.yaml'))
# ... and apply the v3io mount modifier to it
arctoparq = arctoparq.apply(mlrun.mount_v3io())

## deploy / build

The following triggers a build when run for the first time using specs found in the yaml file above.

In [9]:
# deploy the function; the returned state is shown as the cell output
# NOTE(review): skip_deployed=True presumably avoids a rebuild when the function is already deployed -- confirm against the mlrun docs
arctoparq.deploy(skip_deployed=True, with_mlrun=False)

'ready'

In [10]:
%%time
# create and run the task
arc_to_parq_task = mlrun.NewTask(
    'arc2parq', 
    handler='arc_to_parquet',  
    params={
        'target_path': TARGET_PATH,
        'name'       : FILE_NAME, 
        'key'        : KEY,
        'archive_url': USE_ARCHIVE,
        'dataset'    : USE_PARTITIONS,
        'part_cols'  : PARTITION_COLS,
        'encoding'   : ENCODING,
        'inc_cols'   : INC_COLS,
        'dtype'      : DTYPES_COLS})
# run
run = arctoparq.run(arc_to_parq_task)

[mlrun] 2020-01-27 19:37:40,720 starting run arc2parq uid=647251d1ef46416bb2a1dc9a76310e54  -> http://mlrun-api:8080
[mlrun] 2020-01-27 19:37:40,821 Job is running in the background, pod: arc2parq-mgrvp
[mlrun] 2020-01-27 19:37:45,590 destination file does not exist, downloading
[mlrun] 2020-01-27 19:50:05,061 saved table to /User/mlrun/airlines/dataset/airlines.pqt
[mlrun] 2020-01-27 19:50:05,076 log artifact airlines at /User/mlrun/airlines/dataset/airlines.pqt, size: None, db: Y
[mlrun] 2020-01-27 19:50:05,095 log artifact header at /User/mlrun/airlines/dataset/header.pkl, size: None, db: Y

[mlrun] 2020-01-27 19:50:05,114 run executed, status=completed
final state: succeeded


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...310e54,0,Jan 27 19:37:45,completed,arc-to-parquet,host=arc2parq-mgrvpkind=jobowner=admin,,"archive_url=https://s3.amazonaws.com/h2o-airlines-unpacked/allyears.csvdataset=Truedtype={'AirTime': 'float32', 'ArrTime': 'float32', 'CRSElapsedTime': 'float32', 'CarrierDelay': 'float32', 'DepTime': 'float32', 'Distance': 'float32', 'LateAircraftDelay': 'float32', 'NASDelay': 'float32', 'SecurityDelay': 'float32', 'TailNum': 'str', 'TaxiOut': 'float32', 'WeatherDelay': 'float32'}encoding=latin-1inc_cols=['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime', 'ArrTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum', 'CRSElapsedTime', 'AirTime', 'Origin', 'Dest', 'Distance', 'TaxiOut', 'Cancelled', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']key=airlinesname=airlines.pqtpart_cols=['Year', 'Month']target_path=/User/mlrun/airlines/dataset",,airlinesheader


to track results use .show() or .logs() or in CLI: 
!mlrun get run 647251d1ef46416bb2a1dc9a76310e54  , !mlrun logs 647251d1ef46416bb2a1dc9a76310e54 
[mlrun] 2020-01-27 19:50:13,248 run executed, status=completed
CPU times: user 400 ms, sys: 45.2 ms, total: 445 ms
Wall time: 12min 32s


___

## tests

### a partitioned parquet table

In [11]:
# open the folder written by the job as a (partitioned) parquet dataset
dataset = pq.ParquetDataset(TARGET_PATH)

In [15]:
# read the whole dataset into an arrow table and convert it to a pandas DataFrame
df = dataset.read().to_pandas()

In [16]:
# Index the frame by (Year, Month).
# The guard makes the cell safe to re-run: once the two columns have been moved
# into the index, calling set_index on them again would raise a KeyError.
# inplace=True is kept on purpose so the (very large) frame is not copied.
if list(df.index.names) != ['Year', 'Month']:
    df.set_index(['Year', 'Month'], inplace=True)

In [17]:
# preview the first rows of the frame
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,CRSElapsedTime,AirTime,Origin,Dest,Distance,TaxiOut,Cancelled,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
Year,Month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2007,9,21,5,1951.0,1815,2058.0,1901,EV,4318,46.0,25.0,CSG,ATL,83.0,5.0,0,0.0,0.0,21.0,0.0,96.0
2007,9,23,7,1826.0,1815,1915.0,1901,EV,4318,46.0,22.0,CSG,ATL,83.0,8.0,0,0.0,0.0,0.0,0.0,0.0
2007,9,24,1,1827.0,1815,1906.0,1901,EV,4318,46.0,19.0,CSG,ATL,83.0,7.0,0,0.0,0.0,0.0,0.0,0.0
2007,9,25,2,1840.0,1815,1915.0,1901,EV,4318,46.0,22.0,CSG,ATL,83.0,3.0,0,0.0,0.0,0.0,0.0,0.0
2007,9,26,3,1815.0,1815,1847.0,1901,EV,4318,46.0,17.0,CSG,ATL,83.0,5.0,0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Shape check, only meaningful for the full archive.
if USE_ARCHIVE == ARCHIVE:
    # Columns moved into the index no longer count towards shape[1], so the
    # expected width is derived from INC_COLS and the current index instead of
    # being hard-coded. The former literal 20 matched the recorded run, whose
    # inc_cols had no 'TaxiIn'; with the current INC_COLS the width is 21.
    expected_cols = len(set(INC_COLS) - set(df.index.names))
    # 123_534_969 is the row count expected for the full ARCHIVE
    assert df.shape == (123_534_969, expected_cols), f"unexpected shape {df.shape}"

## cleanup

In [None]:
# Remove the output folder and everything in it.
# NOTE: the single-parquet-file checks further down read from TARGET_PATH,
# so run them before this cell when they are needed.
import shutil
shutil.rmtree(TARGET_PATH)

### single parquet file

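Run the following checks only when the task was run with `dataset=False` (i.e. `USE_PARTITIONS = False`), and before the cleanup cell above has deleted `TARGET_PATH`.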

In [None]:
# the run must report an output under KEY ...
assert KEY in run.outputs.keys(), (
    f"mlrun.functions: key {KEY} not found in outputs"
)
# ... and the parquet file itself must exist in the target folder
assert os.path.isfile(TARGET_PATH + '/' + FILE_NAME), (
    f"mlrun.functions: artifact source not found at {TARGET_PATH+'/'+ FILE_NAME}"
)

In [None]:
# load the single parquet file back into a DataFrame using the pyarrow engine
copied   = pd.read_parquet(TARGET_PATH+'/'+ FILE_NAME, engine="pyarrow")