# archive to parquet - partitioned data

Airlines data

In [None]:
import os
import shutil

import mlrun
# Keep an API endpoint that is already configured and only fall back to the
# in-cluster default when none is set, instead of overwriting it unconditionally.
mlrun.mlconf.dbpath = mlrun.mlconf.dbpath or 'http://mlrun-api:8080'

## parameters

In [2]:
# name of the function to import and of the handler the task will call
FUNCTION           = 'arc_to_parquet'
DESCRIPTION        = 'retrieve archive table and save as partitioned parquet dataset'

BASE_IMAGE         = 'yjbds/mlrun-base:dev'
JOB_KIND           = 'dask'
TASK_NAME          = 'user-task-arc-to-part-parq'
# reference URL kept as a comment (a bare URL is not valid Python in a code cell):
# https://raw.githubusercontent.com/yjb-ds/functions/lgbm-serving/tests/describe.py
CODE_BASE          = 'https://raw.githubusercontent.com/yjb-ds/functions/lgbm-serving/fileutils'

# available source archives; USE_ARCHIVE selects the one this notebook processes
ARCHIVE_BIG        = "https://s3.amazonaws.com/h2o-airlines-unpacked/allyears_10.csv"
ARCHIVE            = "https://s3.amazonaws.com/h2o-airlines-unpacked/allyears.csv"
ARCHIVE_SMALL      = "https://s3.amazonaws.com/h2o-airlines-unpacked/allyears2k.csv"

USE_ARCHIVE        = ARCHIVE_SMALL
TARGET_PATH        = '/User/mlrun/airlines/dataset-small'

# expected DataFrame shapes, compared against the loaded data in the tests section
FILE_SHAPE         = (123_534_969, 21) # (rows, cols) for ARCHIVE
SMALL_FILE_SHAPE   = (43_978, 21) # (rows, cols) for ARCHIVE_SMALL

FILE_NAME          = 'airlines.pqt'
KEY                = 'airlines'

In [3]:
# columns the dataset is partitioned by (sent to the job as `part_cols`)
PARTITION_COLS = ["Year", "Month"]
# sub-directory of TARGET_PATH that holds the partitioned dataset
PARTITIONS_DEST = "partitions"

In [4]:
# all column names of the airlines table
# (presumably in the archive's header order -- confirm against the CSV)
HEADER = [
    'Year', 'Month', 'DayofMonth', 'DayOfWeek',
    'DepTime', 'CRSDepTime', 'ArrTime', 'CRSArrTime',
    'UniqueCarrier', 'FlightNum', 'TailNum',
    'ActualElapsedTime', 'CRSElapsedTime', 'AirTime',
    'ArrDelay', 'DepDelay',
    'Origin', 'Dest', 'Distance',
    'TaxiIn', 'TaxiOut',
    'Cancelled', 'CancellationCode', 'Diverted',
    'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay',
    'LateAircraftDelay',
]

# columns sent to the job as `inc_cols`: every HEADER column except the ones dropped here
INC_COLS = [
    col for col in HEADER
    if col not in ('TailNum', 'ActualElapsedTime', 'ArrDelay', 'DepDelay',
                   'CancellationCode', 'Diverted')
]

ENCODING = 'latin-1'

# explicit dtypes sent to the job as `dtype`: float32 for all listed columns, str for TailNum
DTYPES_COLS = {
    col: ('str' if col == 'TailNum' else 'float32')
    for col in (
        'CRSElapsedTime', 'TailNum', 'Distance', 'TaxiIn', 'TaxiOut',
        'ArrTime', 'AirTime', 'DepTime',
        'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay',
        'LateAircraftDelay',
    )
}

In [5]:
# name of the label column (not referenced elsewhere in the visible cells)
LABEL_COLUMN = 'IsArrDelayed'

In [6]:
# Start every run from an empty partition directory. makedirs(exist_ok=True) on
# its own keeps whatever an earlier run left behind; the shape check at the end
# of this notebook saw exactly twice the expected rows (87956 vs 43978), which
# is presumably the result of new partition files being added next to old ones.
partitions_dir = os.path.join(TARGET_PATH, PARTITIONS_DEST)
if os.path.isdir(partitions_dir):
    shutil.rmtree(partitions_dir)
os.makedirs(partitions_dir, exist_ok=True)

#### load function

In [7]:
# CODE_BASE is a URL, so its parts are joined with '/' explicitly;
# os.path.join would use the platform separator (a backslash on Windows).
func_yaml = f"{CODE_BASE.rstrip('/')}/{FUNCTION}/function.yaml"

# load the function object from its remote yaml definition
arctoparq = mlrun.import_function(func_yaml)

# apply the v3io mount modifier to the function
# (presumably what makes the /User paths visible to the job -- confirm)
arctoparq.apply(mlrun.mount_v3io())

# build/deploy the function image before running tasks on it
arctoparq.deploy() #skip_deployed=True, with_mlrun=False)

[mlrun] 2020-01-30 01:19:26,578 starting remote build, image: .mlrun/func-default-function-latest
[36mINFO[0m[0000] Resolved base name yjbds/mlrun-base:dev to yjbds/mlrun-base:dev 
[36mINFO[0m[0000] Resolved base name yjbds/mlrun-base:dev to yjbds/mlrun-base:dev 
[36mINFO[0m[0000] Downloading base image yjbds/mlrun-base:dev  
[36mINFO[0m[0000] Error while retrieving image from cache: getting file info: stat /cache/sha256:2bbe9095ff126252340957bde01f8d26d7742ee802d9b07a1490ad87e13ea3eb: no such file or directory 
[36mINFO[0m[0000] Downloading base image yjbds/mlrun-base:dev  
[36mINFO[0m[0001] Built cross stage deps: map[]                
[36mINFO[0m[0001] Downloading base image yjbds/mlrun-base:dev  
[36mINFO[0m[0001] Error while retrieving image from cache: getting file info: stat /cache/sha256:2bbe9095ff126252340957bde01f8d26d7742ee802d9b07a1490ad87e13ea3eb: no such file or directory 
[36mINFO[0m[0001] Downloading base image yjbds/mlrun-base:dev  
[36mINFO[0m[0001

True

In [8]:
# parameters forwarded to the handler
task_params = dict(
    target_path=TARGET_PATH,
    name=FILE_NAME,
    key=KEY,
    archive_url=USE_ARCHIVE,
    dataset=PARTITIONS_DEST,
    part_cols=PARTITION_COLS,
    encoding=ENCODING,
    inc_cols=INC_COLS,
    dtype=DTYPES_COLS,
)

# describe the task: which handler to call and with which parameters
arc_to_parq_task = mlrun.NewTask(TASK_NAME, handler=FUNCTION, params=task_params)

# submit the task to the function and keep the run object
run = arctoparq.run(arc_to_parq_task)

[mlrun] 2020-01-30 01:20:30,226 starting run user-task-arc-to-part-parq uid=e98743f403fc4c1aabb5fd293ae16613  -> http://mlrun-api:8080
[mlrun] 2020-01-30 01:20:30,314 Job is running in the background, pod: user-task-arc-to-part-parq-km9tw
[mlrun] 2020-01-30 01:20:36,058 destination file does not exist, downloading
[mlrun] 2020-01-30 01:20:36,537 saved table to /User/mlrun/airlines/dataset-small/partitions
[mlrun] 2020-01-30 01:20:36,564 log artifact airlines at /User/mlrun/airlines/dataset-small/partitions, size: None, db: Y

[mlrun] 2020-01-30 01:20:36,578 run executed, status=completed
final state: succeeded


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...e16613,0,Jan 30 01:20:36,completed,function,host=user-task-arc-to-part-parq-km9twkind=jobowner=admin,,"archive_url=https://s3.amazonaws.com/h2o-airlines-unpacked/allyears2k.csvdataset=partitionsdtype={'AirTime': 'float32', 'ArrTime': 'float32', 'CRSElapsedTime': 'float32', 'CarrierDelay': 'float32', 'DepTime': 'float32', 'Distance': 'float32', 'LateAircraftDelay': 'float32', 'NASDelay': 'float32', 'SecurityDelay': 'float32', 'TailNum': 'str', 'TaxiIn': 'float32', 'TaxiOut': 'float32', 'WeatherDelay': 'float32'}encoding=latin-1inc_cols=['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime', 'ArrTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum', 'CRSElapsedTime', 'AirTime', 'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut', 'Cancelled', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']key=airlinesname=airlines.pqtpart_cols=['Year', 'Month']target_path=/User/mlrun/airlines/dataset-small",,airlines


to track results use .show() or .logs() or in CLI: 
!mlrun get run e98743f403fc4c1aabb5fd293ae16613  , !mlrun logs e98743f403fc4c1aabb5fd293ae16613 
[mlrun] 2020-01-30 01:20:39,512 run executed, status=completed


## tests

### a partitioned parquet table

In [9]:
import os
import pandas as pd
import pyarrow.parquet as pq

In [10]:
# open the partitioned dataset written under TARGET_PATH/PARTITIONS_DEST
# and load it into a single pandas DataFrame
partitions_dir = os.path.join(TARGET_PATH, PARTITIONS_DEST)
dataset = pq.ParquetDataset(partitions_dir)
df = dataset.read().to_pandas()

In [11]:
# Index the frame by the partition columns. Rebinding `df` instead of using
# inplace=True, and skipping the call once the index is already in place,
# keeps this cell safe to re-run (a second in-place call raises KeyError
# because the columns have already moved into the index).
if list(df.index.names) != list(PARTITION_COLS):
    df = df.set_index(PARTITION_COLS)

In [12]:
# preview the first rows of the indexed frame
df.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,CRSElapsedTime,AirTime,...,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
Year,Month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1987,10,14,3,741.0,730,912.0,849,PS,1451,79.0,,...,SFO,447.0,,,0,,,,,
1987,10,15,4,729.0,730,903.0,849,PS,1451,79.0,,...,SFO,447.0,,,0,,,,,
1987,10,17,6,741.0,730,918.0,849,PS,1451,79.0,,...,SFO,447.0,,,0,,,,,
1987,10,18,7,729.0,730,847.0,849,PS,1451,79.0,,...,SFO,447.0,,,0,,,,,
1987,10,19,1,749.0,730,922.0,849,PS,1451,79.0,,...,SFO,447.0,,,0,,,,,


In [13]:
# Compare the loaded frame with the known shape of the selected archive.
# No shape is recorded for ARCHIVE_BIG, so that case is not checked.
if USE_ARCHIVE == ARCHIVE:
    expected_shape = FILE_SHAPE
elif USE_ARCHIVE == ARCHIVE_SMALL:
    expected_shape = SMALL_FILE_SHAPE
else:
    expected_shape = None

if expected_shape is not None:
    # report both sides so a mismatch (e.g. duplicated rows) is easy to read
    assert df.shape == expected_shape, f"expected {expected_shape}, got {df.shape}"

AssertionError: (87956, 21)

## cleanup

In [None]:
# Disabled on purpose: uncomment both lines to delete TARGET_PATH and
# everything stored under it.
# import shutil
# shutil.rmtree(TARGET_PATH)