# archive to parquet - partitioned data

Airlines data

In [1]:
import mlrun
import os
# Keep an API endpoint that is already configured for this environment
# (mlrun can pre-populate `mlconf.dbpath`, e.g. from MLRUN_DBPATH) and only
# fall back to the in-cluster default when none is set.
mlrun.mlconf.dbpath = mlrun.mlconf.dbpath or 'http://mlrun-api:8080'

## parameters

In [2]:
# --- function identity ---
# FUNCTION is used both as the sub-directory that holds function.yaml and as the task handler name.
FUNCTION = "arc_to_parquet"
DESCRIPTION = "retrieve archive table and save as partitioned parquet dataset"

# --- runtime settings ---
# BASE_IMAGE and JOB_KIND are not referenced by the other visible cells.
BASE_IMAGE = "yjbds/mlrun_dev-files:latest"
JOB_KIND = "job"
TASK_NAME = "user-task-arc-to-part-parq"

# Root folder; the function spec is looked up at <CODE_BASE>/<FUNCTION>/function.yaml.
CODE_BASE = "/User/repos/functions/fileutils"

# --- available source archives ---
ARCHIVE_BIG = "https://s3.amazonaws.com/h2o-airlines-unpacked/allyears_10.csv"
ARCHIVE = "https://s3.amazonaws.com/h2o-airlines-unpacked/allyears.csv"
ARCHIVE_SMALL = "https://s3.amazonaws.com/h2o-airlines-unpacked/allyears2k.csv"

# Archive processed by this run and the folder its output is written under.
# NOTE(review): the folder name says "small" - presumably it should change together with USE_ARCHIVE.
USE_ARCHIVE = ARCHIVE_SMALL
TARGET_PATH = "/User/mlrun/airlines/dataset-small"

# Expected (rows, cols) of the loaded frame; compared against df.shape in the tests section.
FILE_SHAPE = (123_534_969, 21)
SMALL_FILE_SHAPE = (43_978, 21)

# Passed to the task as the `name` and `key` parameters.
FILE_NAME = "airlines.pqt"
KEY = "airlines"

In [3]:
# Sub-directory of TARGET_PATH that holds the partitioned dataset (task parameter `dataset`).
PARTITIONS_DEST = "partitions"

# Partitioning columns (task parameter `part_cols`); the tests section also uses them as the index.
PARTITION_COLS = ["Year", "Month"]

In [4]:
# Presumably the column header of the source table; not referenced by the other visible cells.
HEADER = [
    'Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime', 'ArrTime',
    'CRSArrTime', 'UniqueCarrier', 'FlightNum', 'TailNum', 'ActualElapsedTime',
    'CRSElapsedTime', 'AirTime', 'ArrDelay', 'DepDelay', 'Origin', 'Dest', 'Distance',
    'TaxiIn', 'TaxiOut', 'Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay',
    'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay',
]

# Columns handed to the job as `inc_cols`: HEADER, in its original order, minus the dropped ones.
_DROPPED_COLS = {
    'TailNum', 'ActualElapsedTime', 'ArrDelay', 'DepDelay', 'CancellationCode', 'Diverted',
}
INC_COLS = [name for name in HEADER if name not in _DROPPED_COLS]

# Text encoding handed to the job as `encoding`.
ENCODING = 'latin-1'

# dtype overrides handed to the job as `dtype`: everything listed is float32 except TailNum.
# NOTE(review): TailNum is not part of INC_COLS - confirm the override is still needed.
DTYPES_COLS = {
    column: 'str' if column == 'TailNum' else 'float32'
    for column in (
        'CRSElapsedTime', 'TailNum', 'Distance', 'TaxiIn', 'TaxiOut', 'ArrTime', 'AirTime',
        'DepTime', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay',
        'LateAircraftDelay',
    )
}

In [5]:
# Name of the label column. It is not referenced by the other visible cells
# and is not one of the columns listed in INC_COLS.
LABEL_COLUMN = "IsArrDelayed"

In [6]:
# Create <TARGET_PATH>/<PARTITIONS_DEST> up front; an already existing directory is fine.
partitions_dir = os.path.join(TARGET_PATH, PARTITIONS_DEST)
os.makedirs(partitions_dir, exist_ok=True)

#### load function

In [7]:
# The function spec lives at <CODE_BASE>/<FUNCTION>/function.yaml.
func_yaml = os.path.join(CODE_BASE, FUNCTION, 'function.yaml')

# Import the function object and attach the v3io mount in one chained expression.
arctoparq = mlrun.import_function(func_yaml).apply(mlrun.mount_v3io())

# Kept as the last expression so the returned state is displayed by the cell.
# NOTE(review): skip_deployed=True presumably avoids rebuilding an existing image and
# with_mlrun=False presumably skips adding mlrun to it - confirm for the mlrun version in use.
arctoparq.deploy(skip_deployed=True, with_mlrun=False)

'ready'

In [8]:
# Parameters forwarded to the handler, collected first so the task definition stays short.
task_params = dict(
    target_path=TARGET_PATH,
    name=FILE_NAME,
    key=KEY,
    archive_url=USE_ARCHIVE,
    dataset=PARTITIONS_DEST,
    part_cols=PARTITION_COLS,
    encoding=ENCODING,
    inc_cols=INC_COLS,
    dtype=DTYPES_COLS,
)

# Describe the task: which handler to call and with which parameters.
arc_to_parq_task = mlrun.NewTask(TASK_NAME, handler=FUNCTION, params=task_params)

# Submit the task to the loaded function; `run` keeps the returned run object.
run = arctoparq.run(arc_to_parq_task)

[mlrun] 2020-01-29 12:30:19,645 starting run user-task-arc-to-part-parq uid=963c75c5d76642da9bbae845f527e361  -> http://mlrun-api:8080
[mlrun] 2020-01-29 12:30:19,808 Job is running in the background, pod: user-task-arc-to-part-parq-c8fjx
[mlrun] 2020-01-29 12:30:24,158 destination file does not exist, downloading
[mlrun] 2020-01-29 12:30:24,614 saved table to /User/mlrun/airlines/dataset-small/partitions
[mlrun] 2020-01-29 12:30:24,647 log artifact airlines at /User/mlrun/airlines/dataset-small/partitions, size: None, db: Y

[mlrun] 2020-01-29 12:30:24,667 run executed, status=completed
final state: succeeded


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...27e361,0,Jan 29 12:30:24,completed,function,host=user-task-arc-to-part-parq-c8fjxkind=jobowner=admin,,"archive_url=https://s3.amazonaws.com/h2o-airlines-unpacked/allyears2k.csvdataset=partitionsdtype={'AirTime': 'float32', 'ArrTime': 'float32', 'CRSElapsedTime': 'float32', 'CarrierDelay': 'float32', 'DepTime': 'float32', 'Distance': 'float32', 'LateAircraftDelay': 'float32', 'NASDelay': 'float32', 'SecurityDelay': 'float32', 'TailNum': 'str', 'TaxiIn': 'float32', 'TaxiOut': 'float32', 'WeatherDelay': 'float32'}encoding=latin-1inc_cols=['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime', 'ArrTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum', 'CRSElapsedTime', 'AirTime', 'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut', 'Cancelled', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']key=airlinesname=airlines.pqtpart_cols=['Year', 'Month']target_path=/User/mlrun/airlines/dataset-small",,airlines


to track results use .show() or .logs() or in CLI: 
!mlrun get run 963c75c5d76642da9bbae845f527e361  , !mlrun logs 963c75c5d76642da9bbae845f527e361 
[mlrun] 2020-01-29 12:30:25,977 run executed, status=completed


___

## tests

### a partitioned parquet table

In [None]:
import os
import pandas as pd
import pyarrow.parquet as pq

In [None]:
# Read the partitioned dataset from <TARGET_PATH>/<PARTITIONS_DEST> into a pandas frame.
partitions_path = os.path.join(TARGET_PATH, PARTITIONS_DEST)
dataset = pq.ParquetDataset(partitions_path)
table = dataset.read()
df = table.to_pandas()

In [None]:
# Move the partition columns into the index. The guard makes the cell safe to
# re-run: once the columns are in the index they no longer exist as columns,
# so an unguarded second call would raise KeyError.
# The in-place form is kept on purpose - presumably to avoid holding a second
# copy of a potentially very large frame.
if list(df.index.names) != PARTITION_COLS:
    df.set_index(PARTITION_COLS, inplace=True)

In [None]:
# Preview the first rows of the re-indexed frame (displayed as the cell's last expression).
df.head()

In [None]:
# Expected (rows, cols) per archive. Archives without a recorded shape are not checked.
expected_shapes = {
    ARCHIVE: FILE_SHAPE,
    ARCHIVE_SMALL: SMALL_FILE_SHAPE,
}
expected_shape = expected_shapes.get(USE_ARCHIVE)

if expected_shape is None:
    # Say so explicitly instead of letting the test cell pass without checking anything.
    print(f"no expected shape recorded for {USE_ARCHIVE}; shape {df.shape} not validated")
else:
    # Same message format for every archive so a failure always shows both shapes.
    assert df.shape == expected_shape, f"expected {expected_shape}, got {df.shape}"

## cleanup

In [None]:
# import shutil
# shutil.rmtree(TARGET_PATH)