# archive to parquet

Convert a remote archive or csv file (or local file://), to parquet format

In [1]:
import mlrun
import os
mlrun.mlconf.dbpath = 'http://mlrun-api:8080'

In [2]:
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

## parameters


In [15]:
BASE_IMAGE         = 'yjbds/mlrun-files:latest'

CODE_BASE          = '/User/repos/functions/'
PROJECT            = 'fileutils/arc_to_parquet'

TARGET_PATH        = '/User/mlrun/airlines/dataset'

ARCHIVE_BIG        = "https://s3.amazonaws.com/h2o-airlines-unpacked/allyears_10.csv"
ARCHIVE            = "https://s3.amazonaws.com/h2o-airlines-unpacked/allyears.csv"
ARCHIVE_SMALL      = "https://s3.amazonaws.com/h2o-airlines-unpacked/allyears2k.csv"

**For testing and development use ARCHIVE_SMALL:**

In [4]:
USE_ARCHIVE = ARCHIVE

In [19]:
FILE_NAME          = 'airlines.pqt'
KEY                = 'airlines'

# no need for this as the files contain a header:
HEADER = ['Year','Month','DayofMonth','DayOfWeek','DepTime','CRSDepTime','ArrTime','CRSArrTime',
          'UniqueCarrier','FlightNum','TailNum','ActualElapsedTime','CRSElapsedTime','AirTime',
          'ArrDelay','DepDelay','Origin','Dest','Distance','TaxiIn','TaxiOut','Cancelled',
          'CancellationCode','Diverted','CarrierDelay','WeatherDelay','NASDelay','SecurityDelay',
          'LateAircraftDelay']
INC_COLS = ['Year','Month','DayofMonth','DayOfWeek','DepTime','CRSDepTime','ArrTime','CRSArrTime',
          'UniqueCarrier','FlightNum', 'CRSElapsedTime','AirTime',
          'Origin','Dest','Distance','TaxiOut','Cancelled',
          'CarrierDelay','WeatherDelay','NASDelay','SecurityDelay',
          'LateAircraftDelay']

ENCODING = 'latin-1'

DTYPES_COLS = {
     'CRSElapsedTime': 'float64', 
     'TailNum': 'str', 
     'Distance': 'float64', 
     'TaxiOut': 'float64',
     'ArrTime': 'float64',
     'DepTime':'float64', 
     'CarrierDelay': 'float64', 
     'WeatherDelay': 'float64', 
     'NASDelay':'float64', 
     'SecurityDelay':'float64', 
     'LateAircraftDelay':'float64'}

USE_PARTITIONS = True
PARTITION_COLS = ['Year', 'Month']

In [6]:
os.makedirs(TARGET_PATH, exist_ok=True)

## load and configure function

**If run the first time, create the function:**

In [7]:
# load function from a local Python file
arctoparq = mlrun.code_to_function(
    filename=os.path.join(CODE_BASE, PROJECT, 'arc_to_parquet.py'), 
    kind='job')
arctoparq.build_config(base_image=BASE_IMAGE, commands=[])
yaml_name = os.path.join(CODE_BASE, PROJECT, 'arc_to_parquet.yaml')
arctoparq.export(yaml_name)

[mlrun] 2020-01-27 19:05:25,830 function spec saved to path: /User/repos/functions/fileutils/arc_to_parquet/arc_to_parquet.yaml


**otherwise load it:**

In [8]:
arctoparq = mlrun.import_function(
    os.path.join(CODE_BASE, PROJECT, 'arc_to_parquet.yaml')
).apply(mlrun.mount_v3io())

## deploy / build

The following triggers a build when run for the first time using specs found in the yaml file above.

In [9]:
arctoparq.deploy(skip_deployed=True, with_mlrun=False)

'ready'

In [None]:
%%time
# create and run the task
arc_to_parq_task = mlrun.NewTask(
    'arc2parq', 
    handler='arc_to_parquet',  
    params={
        'target_path': TARGET_PATH,
        'name'       : FILE_NAME, 
        'key'        : KEY,
        'archive_url': USE_ARCHIVE,
        'dataset'    : USE_PARTITIONS,
        'part_cols'  : PARTITION_COLS,
        'encoding'   : ENCODING,
        'inc_cols'   : INC_COLS,
        'dtype'      : DTYPES_COLS})
# run
run = arctoparq.run(arc_to_parq_task)

[mlrun] 2020-01-27 19:21:20,254 starting run arc2parq uid=d3a5446edb94436d91efe4b2d7c64c2b  -> http://mlrun-api:8080
[mlrun] 2020-01-27 19:21:20,379 Job is running in the background, pod: arc2parq-5qs84


___

## tests

### a partitioned parquet table

In [16]:
dataset = pq.ParquetDataset(TARGET_PATH)

ValueError: Schema in /User/mlrun/airlines/dataset/1278d2c85afc40cabc8e5add8d12892e.parquet was different. 
Year: int64
Month: int64
DayofMonth: int64
DayOfWeek: int64
DepTime: double
CRSDepTime: int64
ArrTime: int64
CRSArrTime: int64
UniqueCarrier: string
FlightNum: int64
CRSElapsedTime: double
AirTime: int64
Origin: string
Dest: string
Distance: double
TaxiOut: double
Cancelled: int64
CarrierDelay: double
WeatherDelay: double
NASDelay: double
SecurityDelay: double
LateAircraftDelay: double
metadata
--------
OrderedDict([(b'pandas',
              b'{"index_columns": [{"kind": "range", "name": null, "start": '
              b'95550000, "stop": 95560000, "step": 1}], "column_indexes": ['
              b'{"name": null, "field_name": null, "pandas_type": "unicode",'
              b' "numpy_type": "object", "metadata": {"encoding": "UTF-8"}}]'
              b', "columns": [{"name": "Year", "field_name": "Year", "pandas'
              b'_type": "int64", "numpy_type": "int64", "metadata": null}, {'
              b'"name": "Month", "field_name": "Month", "pandas_type": "int6'
              b'4", "numpy_type": "int64", "metadata": null}, {"name": "Dayo'
              b'fMonth", "field_name": "DayofMonth", "pandas_type": "int64",'
              b' "numpy_type": "int64", "metadata": null}, {"name": "DayOfWe'
              b'ek", "field_name": "DayOfWeek", "pandas_type": "int64", "num'
              b'py_type": "int64", "metadata": null}, {"name": "DepTime", "f'
              b'ield_name": "DepTime", "pandas_type": "float64", "numpy_type'
              b'": "float64", "metadata": null}, {"name": "CRSDepTime", "fie'
              b'ld_name": "CRSDepTime", "pandas_type": "int64", "numpy_type"'
              b': "int64", "metadata": null}, {"name": "ArrTime", "field_nam'
              b'e": "ArrTime", "pandas_type": "int64", "numpy_type": "int64"'
              b', "metadata": null}, {"name": "CRSArrTime", "field_name": "C'
              b'RSArrTime", "pandas_type": "int64", "numpy_type": "int64", "'
              b'metadata": null}, {"name": "UniqueCarrier", "field_name": "U'
              b'niqueCarrier", "pandas_type": "unicode", "numpy_type": "obje'
              b'ct", "metadata": null}, {"name": "FlightNum", "field_name": '
              b'"FlightNum", "pandas_type": "int64", "numpy_type": "int64", '
              b'"metadata": null}, {"name": "CRSElapsedTime", "field_name": '
              b'"CRSElapsedTime", "pandas_type": "float64", "numpy_type": "f'
              b'loat64", "metadata": null}, {"name": "AirTime", "field_name"'
              b': "AirTime", "pandas_type": "int64", "numpy_type": "int64", '
              b'"metadata": null}, {"name": "Origin", "field_name": "Origin"'
              b', "pandas_type": "unicode", "numpy_type": "object", "metadat'
              b'a": null}, {"name": "Dest", "field_name": "Dest", "pandas_ty'
              b'pe": "unicode", "numpy_type": "object", "metadata": null}, {'
              b'"name": "Distance", "field_name": "Distance", "pandas_type":'
              b' "float64", "numpy_type": "float64", "metadata": null}, {"na'
              b'me": "TaxiOut", "field_name": "TaxiOut", "pandas_type": "flo'
              b'at64", "numpy_type": "float64", "metadata": null}, {"name": '
              b'"Cancelled", "field_name": "Cancelled", "pandas_type": "int6'
              b'4", "numpy_type": "int64", "metadata": null}, {"name": "Carr'
              b'ierDelay", "field_name": "CarrierDelay", "pandas_type": "flo'
              b'at64", "numpy_type": "float64", "metadata": null}, {"name": '
              b'"WeatherDelay", "field_name": "WeatherDelay", "pandas_type":'
              b' "float64", "numpy_type": "float64", "metadata": null}, {"na'
              b'me": "NASDelay", "field_name": "NASDelay", "pandas_type": "f'
              b'loat64", "numpy_type": "float64", "metadata": null}, {"name"'
              b': "SecurityDelay", "field_name": "SecurityDelay", "pandas_ty'
              b'pe": "float64", "numpy_type": "float64", "metadata": null}, '
              b'{"name": "LateAircraftDelay", "field_name": "LateAircraftDel'
              b'ay", "pandas_type": "float64", "numpy_type": "float64", "met'
              b'adata": null}], "creator": {"library": "pyarrow", "version":'
              b' "0.15.1"}, "pandas_version": "0.25.3"}'),
             (b'ARROW:schema',
              b'/////4AQAAAQAAAAAAAKAA4ABgAFAAgACgAAAAABAwAQAAAAAAAKAAwAAAAE'
              b'AAgACgAAAJwLAAAEAAAAAQAAAAwAAAAIAAwABAAIAAgAAAAIAAAAEAAAAAYA'
              b'AABwYW5kYXMAAGcLAAB7ImluZGV4X2NvbHVtbnMiOiBbeyJraW5kIjogInJh'
              b'bmdlIiwgIm5hbWUiOiBudWxsLCAic3RhcnQiOiA5NTU1MDAwMCwgInN0b3Ai'
              b'OiA5NTU2MDAwMCwgInN0ZXAiOiAxfV0sICJjb2x1bW5faW5kZXhlcyI6IFt7'
              b'Im5hbWUiOiBudWxsLCAiZmllbGRfbmFtZSI6IG51bGwsICJwYW5kYXNfdHlw'
              b'ZSI6ICJ1bmljb2RlIiwgIm51bXB5X3R5cGUiOiAib2JqZWN0IiwgIm1ldGFk'
              b'YXRhIjogeyJlbmNvZGluZyI6ICJVVEYtOCJ9fV0sICJjb2x1bW5zIjogW3si'
              b'bmFtZSI6ICJZZWFyIiwgImZpZWxkX25hbWUiOiAiWWVhciIsICJwYW5kYXNf'
              b'dHlwZSI6ICJpbnQ2NCIsICJudW1weV90eXBlIjogImludDY0IiwgIm1ldGFk'
              b'YXRhIjogbnVsbH0sIHsibmFtZSI6ICJNb250aCIsICJmaWVsZF9uYW1lIjog'
              b'Ik1vbnRoIiwgInBhbmRhc190eXBlIjogImludDY0IiwgIm51bXB5X3R5cGUi'
              b'OiAiaW50NjQiLCAibWV0YWRhdGEiOiBudWxsfSwgeyJuYW1lIjogIkRheW9m'
              b'TW9udGgiLCAiZmllbGRfbmFtZSI6ICJEYXlvZk1vbnRoIiwgInBhbmRhc190'
              b'eXBlIjogImludDY0IiwgIm51bXB5X3R5cGUiOiAiaW50NjQiLCAibWV0YWRh'
              b'dGEiOiBudWxsfSwgeyJuYW1lIjogIkRheU9mV2VlayIsICJmaWVsZF9uYW1l'
              b'IjogIkRheU9mV2VlayIsICJwYW5kYXNfdHlwZSI6ICJpbnQ2NCIsICJudW1w'
              b'eV90eXBlIjogImludDY0IiwgIm1ldGFkYXRhIjogbnVsbH0sIHsibmFtZSI6'
              b'ICJEZXBUaW1lIiwgImZpZWxkX25hbWUiOiAiRGVwVGltZSIsICJwYW5kYXNf'
              b'dHlwZSI6ICJmbG9hdDY0IiwgIm51bXB5X3R5cGUiOiAiZmxvYXQ2NCIsICJt'
              b'ZXRhZGF0YSI6IG51bGx9LCB7Im5hbWUiOiAiQ1JTRGVwVGltZSIsICJmaWVs'
              b'ZF9uYW1lIjogIkNSU0RlcFRpbWUiLCAicGFuZGFzX3R5cGUiOiAiaW50NjQi'
              b'LCAibnVtcHlfdHlwZSI6ICJpbnQ2NCIsICJtZXRhZGF0YSI6IG51bGx9LCB7'
              b'Im5hbWUiOiAiQXJyVGltZSIsICJmaWVsZF9uYW1lIjogIkFyclRpbWUiLCAi'
              b'cGFuZGFzX3R5cGUiOiAiaW50NjQiLCAibnVtcHlfdHlwZSI6ICJpbnQ2NCIs'
              b'ICJtZXRhZGF0YSI6IG51bGx9LCB7Im5hbWUiOiAiQ1JTQXJyVGltZSIsICJm'
              b'aWVsZF9uYW1lIjogIkNSU0FyclRpbWUiLCAicGFuZGFzX3R5cGUiOiAiaW50'
              b'NjQiLCAibnVtcHlfdHlwZSI6ICJpbnQ2NCIsICJtZXRhZGF0YSI6IG51bGx9'
              b'LCB7Im5hbWUiOiAiVW5pcXVlQ2FycmllciIsICJmaWVsZF9uYW1lIjogIlVu'
              b'aXF1ZUNhcnJpZXIiLCAicGFuZGFzX3R5cGUiOiAidW5pY29kZSIsICJudW1w'
              b'eV90eXBlIjogIm9iamVjdCIsICJtZXRhZGF0YSI6IG51bGx9LCB7Im5hbWUi'
              b'OiAiRmxpZ2h0TnVtIiwgImZpZWxkX25hbWUiOiAiRmxpZ2h0TnVtIiwgInBh'
              b'bmRhc190eXBlIjogImludDY0IiwgIm51bXB5X3R5cGUiOiAiaW50NjQiLCAi'
              b'bWV0YWRhdGEiOiBudWxsfSwgeyJuYW1lIjogIkNSU0VsYXBzZWRUaW1lIiwg'
              b'ImZpZWxkX25hbWUiOiAiQ1JTRWxhcHNlZFRpbWUiLCAicGFuZGFzX3R5cGUi'
              b'OiAiZmxvYXQ2NCIsICJudW1weV90eXBlIjogImZsb2F0NjQiLCAibWV0YWRh'
              b'dGEiOiBudWxsfSwgeyJuYW1lIjogIkFpclRpbWUiLCAiZmllbGRfbmFtZSI6'
              b'ICJBaXJUaW1lIiwgInBhbmRhc190eXBlIjogImludDY0IiwgIm51bXB5X3R5'
              b'cGUiOiAiaW50NjQiLCAibWV0YWRhdGEiOiBudWxsfSwgeyJuYW1lIjogIk9y'
              b'aWdpbiIsICJmaWVsZF9uYW1lIjogIk9yaWdpbiIsICJwYW5kYXNfdHlwZSI6'
              b'ICJ1bmljb2RlIiwgIm51bXB5X3R5cGUiOiAib2JqZWN0IiwgIm1ldGFkYXRh'
              b'IjogbnVsbH0sIHsibmFtZSI6ICJEZXN0IiwgImZpZWxkX25hbWUiOiAiRGVz'
              b'dCIsICJwYW5kYXNfdHlwZSI6ICJ1bmljb2RlIiwgIm51bXB5X3R5cGUiOiAi'
              b'b2JqZWN0IiwgIm1ldGFkYXRhIjogbnVsbH0sIHsibmFtZSI6ICJEaXN0YW5j'
              b'ZSIsICJmaWVsZF9uYW1lIjogIkRpc3RhbmNlIiwgInBhbmRhc190eXBlIjog'
              b'ImZsb2F0NjQiLCAibnVtcHlfdHlwZSI6ICJmbG9hdDY0IiwgIm1ldGFkYXRh'
              b'IjogbnVsbH0sIHsibmFtZSI6ICJUYXhpT3V0IiwgImZpZWxkX25hbWUiOiAi'
              b'VGF4aU91dCIsICJwYW5kYXNfdHlwZSI6ICJmbG9hdDY0IiwgIm51bXB5X3R5'
              b'cGUiOiAiZmxvYXQ2NCIsICJtZXRhZGF0YSI6IG51bGx9LCB7Im5hbWUiOiAi'
              b'Q2FuY2VsbGVkIiwgImZpZWxkX25hbWUiOiAiQ2FuY2VsbGVkIiwgInBhbmRh'
              b'c190eXBlIjogImludDY0IiwgIm51bXB5X3R5cGUiOiAiaW50NjQiLCAibWV0'
              b'YWRhdGEiOiBudWxsfSwgeyJuYW1lIjogIkNhcnJpZXJEZWxheSIsICJmaWVs'
              b'ZF9uYW1lIjogIkNhcnJpZXJEZWxheSIsICJwYW5kYXNfdHlwZSI6ICJmbG9h'
              b'dDY0IiwgIm51bXB5X3R5cGUiOiAiZmxvYXQ2NCIsICJtZXRhZGF0YSI6IG51'
              b'bGx9LCB7Im5hbWUiOiAiV2VhdGhlckRlbGF5IiwgImZpZWxkX25hbWUiOiAi'
              b'V2VhdGhlckRlbGF5IiwgInBhbmRhc190eXBlIjogImZsb2F0NjQiLCAibnVt'
              b'cHlfdHlwZSI6ICJmbG9hdDY0IiwgIm1ldGFkYXRhIjogbnVsbH0sIHsibmFt'
              b'ZSI6ICJOQVNEZWxheSIsICJmaWVsZF9uYW1lIjogIk5BU0RlbGF5IiwgInBh'
              b'bmRhc190eXBlIjogImZsb2F0NjQiLCAibnVtcHlfdHlwZSI6ICJmbG9hdDY0'
              b'IiwgIm1ldGFkYXRhIjogbnVsbH0sIHsibmFtZSI6ICJTZWN1cml0eURlbGF5'
              b'IiwgImZpZWxkX25hbWUiOiAiU2VjdXJpdHlEZWxheSIsICJwYW5kYXNfdHlw'
              b'ZSI6ICJmbG9hdDY0IiwgIm51bXB5X3R5cGUiOiAiZmxvYXQ2NCIsICJtZXRh'
              b'ZGF0YSI6IG51bGx9LCB7Im5hbWUiOiAiTGF0ZUFpcmNyYWZ0RGVsYXkiLCAi'
              b'ZmllbGRfbmFtZSI6ICJMYXRlQWlyY3JhZnREZWxheSIsICJwYW5kYXNfdHlw'
              b'ZSI6ICJmbG9hdDY0IiwgIm51bXB5X3R5cGUiOiAiZmxvYXQ2NCIsICJtZXRh'
              b'ZGF0YSI6IG51bGx9XSwgImNyZWF0b3IiOiB7ImxpYnJhcnkiOiAicHlhcnJv'
              b'dyIsICJ2ZXJzaW9uIjogIjAuMTUuMSJ9LCAicGFuZGFzX3ZlcnNpb24iOiAi'
              b'MC4yNS4zIn0AFgAAAHwEAAA4BAAAAAQAAMgDAACQAwAAWAMAACQDAADsAgAA'
              b'tAIAAHwCAABEAgAAEAIAAOQBAAC4AQAAhAEAAFQBAAAcAQAA5AAAAKwAAAB4'
              b'AAAAQAAAAAQAAADs+///AAABAxgAAAAMAAAABAAAAAAAAAC2/P//AAACABEA'
              b'AABMYXRlQWlyY3JhZnREZWxheQAAACT8//8AAAEDGAAAAAwAAAAEAAAAAAAA'
              b'AO78//8AAAIADQAAAFNlY3VyaXR5RGVsYXkAAABY/P//AAABAxgAAAAMAAAA'
              b'BAAAAAAAAAAi/f//AAACAAgAAABOQVNEZWxheQAAAACI/P//AAABAxgAAAAM'
              b'AAAABAAAAAAAAABS/f//AAACAAwAAABXZWF0aGVyRGVsYXkAAAAAvPz//wAA'
              b'AQMYAAAADAAAAAQAAAAAAAAAhv3//wAAAgAMAAAAQ2FycmllckRlbGF5AAAA'
              b'APD8//8AAAECHAAAAAwAAAAEAAAAAAAAAOD8//8AAAABQAAAAAkAAABDYW5j'
              b'ZWxsZWQAAAAk/f//AAABAxgAAAAMAAAABAAAAAAAAADu/f//AAACAAcAAABU'
              b'YXhpT3V0AFD9//8AAAEDGAAAAAwAAAAEAAAAAAAAABr+//8AAAIACAAAAERp'
              b'c3RhbmNlAAAAAID9//8AAAEFFAAAAAwAAAAEAAAAAAAAABj///8EAAAARGVz'
              b'dAAAAACo/f//AAABBRQAAAAMAAAABAAAAAAAAABA////BgAAAE9yaWdpbgAA'
              b'0P3//wAAAQIcAAAADAAAAAQAAAAAAAAAwP3//wAAAAFAAAAABwAAAEFpclRp'
              b'bWUAAP7//wAAAQMYAAAADAAAAAQAAAAAAAAAyv7//wAAAgAOAAAAQ1JTRWxh'
              b'cHNlZFRpbWUAADT+//8AAAECHAAAAAwAAAAEAAAAAAAAACT+//8AAAABQAAA'
              b'AAkAAABGbGlnaHROdW0AAABo/v//AAABBRgAAAAQAAAABAAAAAAAAAAEAAQA'
              b'BAAAAA0AAABVbmlxdWVDYXJyaWVyAAAAnP7//wAAAQIcAAAADAAAAAQAAAAA'
              b'AAAAjP7//wAAAAFAAAAACgAAAENSU0FyclRpbWUAAND+//8AAAECHAAAAAwA'
              b'AAAEAAAAAAAAAMD+//8AAAABQAAAAAcAAABBcnJUaW1lAAD///8AAAECHAAA'
              b'AAwAAAAEAAAAAAAAAPD+//8AAAABQAAAAAoAAABDUlNEZXBUaW1lAAA0////'
              b'AAABAyAAAAAUAAAABAAAAAAAAAAAAAYACAAGAAYAAAAAAAIABwAAAERlcFRp'
              b'bWUAaP///wAAAQIcAAAADAAAAAQAAAAAAAAAWP///wAAAAFAAAAACQAAAERh'
              b'eU9mV2VlawAAAJz///8AAAECHAAAAAwAAAAEAAAAAAAAAIz///8AAAABQAAA'
              b'AAoAAABEYXlvZk1vbnRoAADQ////AAABAhwAAAAMAAAABAAAAAAAAADA////'
              b'AAAAAUAAAAAFAAAATW9udGgAAAAQABQACAAGAAcADAAAABAAEAAAAAAAAQIk'
              b'AAAAFAAAAAQAAAAAAAAACAAMAAgABwAIAAAAAAAAAUAAAAAEAAAAWWVhcgAA'
              b'AAA=')])

vs

Year: int64
Month: int64
DayofMonth: int64
DayOfWeek: int64
DepTime: double
CRSDepTime: int64
ArrTime: double
CRSArrTime: int64
UniqueCarrier: string
FlightNum: int64
CRSElapsedTime: double
AirTime: double
Origin: string
Dest: string
Distance: double
TaxiOut: double
Cancelled: int64
CarrierDelay: double
WeatherDelay: double
NASDelay: double
SecurityDelay: double
LateAircraftDelay: double
metadata
--------
OrderedDict([(b'pandas',
              b'{"index_columns": [{"kind": "range", "name": null, "start": '
              b'90730000, "stop": 90740000, "step": 1}], "column_indexes": ['
              b'{"name": null, "field_name": null, "pandas_type": "unicode",'
              b' "numpy_type": "object", "metadata": {"encoding": "UTF-8"}}]'
              b', "columns": [{"name": "Year", "field_name": "Year", "pandas'
              b'_type": "int64", "numpy_type": "int64", "metadata": null}, {'
              b'"name": "Month", "field_name": "Month", "pandas_type": "int6'
              b'4", "numpy_type": "int64", "metadata": null}, {"name": "Dayo'
              b'fMonth", "field_name": "DayofMonth", "pandas_type": "int64",'
              b' "numpy_type": "int64", "metadata": null}, {"name": "DayOfWe'
              b'ek", "field_name": "DayOfWeek", "pandas_type": "int64", "num'
              b'py_type": "int64", "metadata": null}, {"name": "DepTime", "f'
              b'ield_name": "DepTime", "pandas_type": "float64", "numpy_type'
              b'": "float64", "metadata": null}, {"name": "CRSDepTime", "fie'
              b'ld_name": "CRSDepTime", "pandas_type": "int64", "numpy_type"'
              b': "int64", "metadata": null}, {"name": "ArrTime", "field_nam'
              b'e": "ArrTime", "pandas_type": "float64", "numpy_type": "floa'
              b't64", "metadata": null}, {"name": "CRSArrTime", "field_name"'
              b': "CRSArrTime", "pandas_type": "int64", "numpy_type": "int64'
              b'", "metadata": null}, {"name": "UniqueCarrier", "field_name"'
              b': "UniqueCarrier", "pandas_type": "unicode", "numpy_type": "'
              b'object", "metadata": null}, {"name": "FlightNum", "field_nam'
              b'e": "FlightNum", "pandas_type": "int64", "numpy_type": "int6'
              b'4", "metadata": null}, {"name": "CRSElapsedTime", "field_nam'
              b'e": "CRSElapsedTime", "pandas_type": "float64", "numpy_type"'
              b': "float64", "metadata": null}, {"name": "AirTime", "field_n'
              b'ame": "AirTime", "pandas_type": "float64", "numpy_type": "fl'
              b'oat64", "metadata": null}, {"name": "Origin", "field_name": '
              b'"Origin", "pandas_type": "unicode", "numpy_type": "object", '
              b'"metadata": null}, {"name": "Dest", "field_name": "Dest", "p'
              b'andas_type": "unicode", "numpy_type": "object", "metadata": '
              b'null}, {"name": "Distance", "field_name": "Distance", "panda'
              b's_type": "float64", "numpy_type": "float64", "metadata": nul'
              b'l}, {"name": "TaxiOut", "field_name": "TaxiOut", "pandas_typ'
              b'e": "float64", "numpy_type": "float64", "metadata": null}, {'
              b'"name": "Cancelled", "field_name": "Cancelled", "pandas_type'
              b'": "int64", "numpy_type": "int64", "metadata": null}, {"name'
              b'": "CarrierDelay", "field_name": "CarrierDelay", "pandas_typ'
              b'e": "float64", "numpy_type": "float64", "metadata": null}, {'
              b'"name": "WeatherDelay", "field_name": "WeatherDelay", "panda'
              b's_type": "float64", "numpy_type": "float64", "metadata": nul'
              b'l}, {"name": "NASDelay", "field_name": "NASDelay", "pandas_t'
              b'ype": "float64", "numpy_type": "float64", "metadata": null},'
              b' {"name": "SecurityDelay", "field_name": "SecurityDelay", "p'
              b'andas_type": "float64", "numpy_type": "float64", "metadata":'
              b' null}, {"name": "LateAircraftDelay", "field_name": "LateAir'
              b'craftDelay", "pandas_type": "float64", "numpy_type": "float6'
              b'4", "metadata": null}], "creator": {"library": "pyarrow", "v'
              b'ersion": "0.15.1"}, "pandas_version": "0.25.3"}'),
             (b'ARROW:schema',
              b'/////4AQAAAQAAAAAAAKAA4ABgAFAAgACgAAAAABAwAQAAAAAAAKAAwAAAAE'
              b'AAgACgAAAKQLAAAEAAAAAQAAAAwAAAAIAAwABAAIAAgAAAAIAAAAEAAAAAYA'
              b'AABwYW5kYXMAAG8LAAB7ImluZGV4X2NvbHVtbnMiOiBbeyJraW5kIjogInJh'
              b'bmdlIiwgIm5hbWUiOiBudWxsLCAic3RhcnQiOiA5MDczMDAwMCwgInN0b3Ai'
              b'OiA5MDc0MDAwMCwgInN0ZXAiOiAxfV0sICJjb2x1bW5faW5kZXhlcyI6IFt7'
              b'Im5hbWUiOiBudWxsLCAiZmllbGRfbmFtZSI6IG51bGwsICJwYW5kYXNfdHlw'
              b'ZSI6ICJ1bmljb2RlIiwgIm51bXB5X3R5cGUiOiAib2JqZWN0IiwgIm1ldGFk'
              b'YXRhIjogeyJlbmNvZGluZyI6ICJVVEYtOCJ9fV0sICJjb2x1bW5zIjogW3si'
              b'bmFtZSI6ICJZZWFyIiwgImZpZWxkX25hbWUiOiAiWWVhciIsICJwYW5kYXNf'
              b'dHlwZSI6ICJpbnQ2NCIsICJudW1weV90eXBlIjogImludDY0IiwgIm1ldGFk'
              b'YXRhIjogbnVsbH0sIHsibmFtZSI6ICJNb250aCIsICJmaWVsZF9uYW1lIjog'
              b'Ik1vbnRoIiwgInBhbmRhc190eXBlIjogImludDY0IiwgIm51bXB5X3R5cGUi'
              b'OiAiaW50NjQiLCAibWV0YWRhdGEiOiBudWxsfSwgeyJuYW1lIjogIkRheW9m'
              b'TW9udGgiLCAiZmllbGRfbmFtZSI6ICJEYXlvZk1vbnRoIiwgInBhbmRhc190'
              b'eXBlIjogImludDY0IiwgIm51bXB5X3R5cGUiOiAiaW50NjQiLCAibWV0YWRh'
              b'dGEiOiBudWxsfSwgeyJuYW1lIjogIkRheU9mV2VlayIsICJmaWVsZF9uYW1l'
              b'IjogIkRheU9mV2VlayIsICJwYW5kYXNfdHlwZSI6ICJpbnQ2NCIsICJudW1w'
              b'eV90eXBlIjogImludDY0IiwgIm1ldGFkYXRhIjogbnVsbH0sIHsibmFtZSI6'
              b'ICJEZXBUaW1lIiwgImZpZWxkX25hbWUiOiAiRGVwVGltZSIsICJwYW5kYXNf'
              b'dHlwZSI6ICJmbG9hdDY0IiwgIm51bXB5X3R5cGUiOiAiZmxvYXQ2NCIsICJt'
              b'ZXRhZGF0YSI6IG51bGx9LCB7Im5hbWUiOiAiQ1JTRGVwVGltZSIsICJmaWVs'
              b'ZF9uYW1lIjogIkNSU0RlcFRpbWUiLCAicGFuZGFzX3R5cGUiOiAiaW50NjQi'
              b'LCAibnVtcHlfdHlwZSI6ICJpbnQ2NCIsICJtZXRhZGF0YSI6IG51bGx9LCB7'
              b'Im5hbWUiOiAiQXJyVGltZSIsICJmaWVsZF9uYW1lIjogIkFyclRpbWUiLCAi'
              b'cGFuZGFzX3R5cGUiOiAiZmxvYXQ2NCIsICJudW1weV90eXBlIjogImZsb2F0'
              b'NjQiLCAibWV0YWRhdGEiOiBudWxsfSwgeyJuYW1lIjogIkNSU0FyclRpbWUi'
              b'LCAiZmllbGRfbmFtZSI6ICJDUlNBcnJUaW1lIiwgInBhbmRhc190eXBlIjog'
              b'ImludDY0IiwgIm51bXB5X3R5cGUiOiAiaW50NjQiLCAibWV0YWRhdGEiOiBu'
              b'dWxsfSwgeyJuYW1lIjogIlVuaXF1ZUNhcnJpZXIiLCAiZmllbGRfbmFtZSI6'
              b'ICJVbmlxdWVDYXJyaWVyIiwgInBhbmRhc190eXBlIjogInVuaWNvZGUiLCAi'
              b'bnVtcHlfdHlwZSI6ICJvYmplY3QiLCAibWV0YWRhdGEiOiBudWxsfSwgeyJu'
              b'YW1lIjogIkZsaWdodE51bSIsICJmaWVsZF9uYW1lIjogIkZsaWdodE51bSIs'
              b'ICJwYW5kYXNfdHlwZSI6ICJpbnQ2NCIsICJudW1weV90eXBlIjogImludDY0'
              b'IiwgIm1ldGFkYXRhIjogbnVsbH0sIHsibmFtZSI6ICJDUlNFbGFwc2VkVGlt'
              b'ZSIsICJmaWVsZF9uYW1lIjogIkNSU0VsYXBzZWRUaW1lIiwgInBhbmRhc190'
              b'eXBlIjogImZsb2F0NjQiLCAibnVtcHlfdHlwZSI6ICJmbG9hdDY0IiwgIm1l'
              b'dGFkYXRhIjogbnVsbH0sIHsibmFtZSI6ICJBaXJUaW1lIiwgImZpZWxkX25h'
              b'bWUiOiAiQWlyVGltZSIsICJwYW5kYXNfdHlwZSI6ICJmbG9hdDY0IiwgIm51'
              b'bXB5X3R5cGUiOiAiZmxvYXQ2NCIsICJtZXRhZGF0YSI6IG51bGx9LCB7Im5h'
              b'bWUiOiAiT3JpZ2luIiwgImZpZWxkX25hbWUiOiAiT3JpZ2luIiwgInBhbmRh'
              b'c190eXBlIjogInVuaWNvZGUiLCAibnVtcHlfdHlwZSI6ICJvYmplY3QiLCAi'
              b'bWV0YWRhdGEiOiBudWxsfSwgeyJuYW1lIjogIkRlc3QiLCAiZmllbGRfbmFt'
              b'ZSI6ICJEZXN0IiwgInBhbmRhc190eXBlIjogInVuaWNvZGUiLCAibnVtcHlf'
              b'dHlwZSI6ICJvYmplY3QiLCAibWV0YWRhdGEiOiBudWxsfSwgeyJuYW1lIjog'
              b'IkRpc3RhbmNlIiwgImZpZWxkX25hbWUiOiAiRGlzdGFuY2UiLCAicGFuZGFz'
              b'X3R5cGUiOiAiZmxvYXQ2NCIsICJudW1weV90eXBlIjogImZsb2F0NjQiLCAi'
              b'bWV0YWRhdGEiOiBudWxsfSwgeyJuYW1lIjogIlRheGlPdXQiLCAiZmllbGRf'
              b'bmFtZSI6ICJUYXhpT3V0IiwgInBhbmRhc190eXBlIjogImZsb2F0NjQiLCAi'
              b'bnVtcHlfdHlwZSI6ICJmbG9hdDY0IiwgIm1ldGFkYXRhIjogbnVsbH0sIHsi'
              b'bmFtZSI6ICJDYW5jZWxsZWQiLCAiZmllbGRfbmFtZSI6ICJDYW5jZWxsZWQi'
              b'LCAicGFuZGFzX3R5cGUiOiAiaW50NjQiLCAibnVtcHlfdHlwZSI6ICJpbnQ2'
              b'NCIsICJtZXRhZGF0YSI6IG51bGx9LCB7Im5hbWUiOiAiQ2FycmllckRlbGF5'
              b'IiwgImZpZWxkX25hbWUiOiAiQ2FycmllckRlbGF5IiwgInBhbmRhc190eXBl'
              b'IjogImZsb2F0NjQiLCAibnVtcHlfdHlwZSI6ICJmbG9hdDY0IiwgIm1ldGFk'
              b'YXRhIjogbnVsbH0sIHsibmFtZSI6ICJXZWF0aGVyRGVsYXkiLCAiZmllbGRf'
              b'bmFtZSI6ICJXZWF0aGVyRGVsYXkiLCAicGFuZGFzX3R5cGUiOiAiZmxvYXQ2'
              b'NCIsICJudW1weV90eXBlIjogImZsb2F0NjQiLCAibWV0YWRhdGEiOiBudWxs'
              b'fSwgeyJuYW1lIjogIk5BU0RlbGF5IiwgImZpZWxkX25hbWUiOiAiTkFTRGVs'
              b'YXkiLCAicGFuZGFzX3R5cGUiOiAiZmxvYXQ2NCIsICJudW1weV90eXBlIjog'
              b'ImZsb2F0NjQiLCAibWV0YWRhdGEiOiBudWxsfSwgeyJuYW1lIjogIlNlY3Vy'
              b'aXR5RGVsYXkiLCAiZmllbGRfbmFtZSI6ICJTZWN1cml0eURlbGF5IiwgInBh'
              b'bmRhc190eXBlIjogImZsb2F0NjQiLCAibnVtcHlfdHlwZSI6ICJmbG9hdDY0'
              b'IiwgIm1ldGFkYXRhIjogbnVsbH0sIHsibmFtZSI6ICJMYXRlQWlyY3JhZnRE'
              b'ZWxheSIsICJmaWVsZF9uYW1lIjogIkxhdGVBaXJjcmFmdERlbGF5IiwgInBh'
              b'bmRhc190eXBlIjogImZsb2F0NjQiLCAibnVtcHlfdHlwZSI6ICJmbG9hdDY0'
              b'IiwgIm1ldGFkYXRhIjogbnVsbH1dLCAiY3JlYXRvciI6IHsibGlicmFyeSI6'
              b'ICJweWFycm93IiwgInZlcnNpb24iOiAiMC4xNS4xIn0sICJwYW5kYXNfdmVy'
              b'c2lvbiI6ICIwLjI1LjMifQAWAAAAdAQAADAEAAD4AwAAwAMAAIgDAABQAwAA'
              b'IAMAAOgCAACwAgAAeAIAAEACAAAQAgAA5AEAALgBAACEAQAAVAEAABwBAADk'
              b'AAAArAAAAHgAAABAAAAABAAAAPT7//8AAAEDGAAAAAwAAAAEAAAAAAAAAL78'
              b'//8AAAIAEQAAAExhdGVBaXJjcmFmdERlbGF5AAAALPz//wAAAQMYAAAADAAA'
              b'AAQAAAAAAAAA9vz//wAAAgANAAAAU2VjdXJpdHlEZWxheQAAAGD8//8AAAED'
              b'GAAAAAwAAAAEAAAAAAAAACr9//8AAAIACAAAAE5BU0RlbGF5AAAAAJD8//8A'
              b'AAEDGAAAAAwAAAAEAAAAAAAAAFr9//8AAAIADAAAAFdlYXRoZXJEZWxheQAA'
              b'AADE/P//AAABAxgAAAAMAAAABAAAAAAAAACO/f//AAACAAwAAABDYXJyaWVy'
              b'RGVsYXkAAAAA+Pz//wAAAQIcAAAADAAAAAQAAAAAAAAA6Pz//wAAAAFAAAAA'
              b'CQAAAENhbmNlbGxlZAAAACz9//8AAAEDGAAAAAwAAAAEAAAAAAAAAPb9//8A'
              b'AAIABwAAAFRheGlPdXQAWP3//wAAAQMYAAAADAAAAAQAAAAAAAAAIv7//wAA'
              b'AgAIAAAARGlzdGFuY2UAAAAAiP3//wAAAQUUAAAADAAAAAQAAAAAAAAAHP//'
              b'/wQAAABEZXN0AAAAALD9//8AAAEFFAAAAAwAAAAEAAAAAAAAAET///8GAAAA'
              b'T3JpZ2luAADY/f//AAABAxgAAAAMAAAABAAAAAAAAACi/v//AAACAAcAAABB'
              b'aXJUaW1lAAT+//8AAAEDGAAAAAwAAAAEAAAAAAAAAM7+//8AAAIADgAAAENS'
              b'U0VsYXBzZWRUaW1lAAA4/v//AAABAhwAAAAMAAAABAAAAAAAAAAo/v//AAAA'
              b'AUAAAAAJAAAARmxpZ2h0TnVtAAAAbP7//wAAAQUYAAAAEAAAAAQAAAAAAAAA'
              b'BAAEAAQAAAANAAAAVW5pcXVlQ2FycmllcgAAAKD+//8AAAECHAAAAAwAAAAE'
              b'AAAAAAAAAJD+//8AAAABQAAAAAoAAABDUlNBcnJUaW1lAADU/v//AAABAxgA'
              b'AAAMAAAABAAAAAAAAACe////AAACAAcAAABBcnJUaW1lAAD///8AAAECHAAA'
              b'AAwAAAAEAAAAAAAAAPD+//8AAAABQAAAAAoAAABDUlNEZXBUaW1lAAA0////'
              b'AAABAyAAAAAUAAAABAAAAAAAAAAAAAYACAAGAAYAAAAAAAIABwAAAERlcFRp'
              b'bWUAaP///wAAAQIcAAAADAAAAAQAAAAAAAAAWP///wAAAAFAAAAACQAAAERh'
              b'eU9mV2VlawAAAJz///8AAAECHAAAAAwAAAAEAAAAAAAAAIz///8AAAABQAAA'
              b'AAoAAABEYXlvZk1vbnRoAADQ////AAABAhwAAAAMAAAABAAAAAAAAADA////'
              b'AAAAAUAAAAAFAAAATW9udGgAAAAQABQACAAGAAcADAAAABAAEAAAAAAAAQIk'
              b'AAAAFAAAAAQAAAAAAAAACAAMAAgABwAIAAAAAAAAAUAAAAAEAAAAWWVhcgAA'
              b'AAA=')])

In [None]:
df = dataset.read().to_pandas().set_index(['Year', 'Month'], inplace=True)

## cleanup

In [None]:
import shutil
shutil.rmtree(TARGET_PATH)

### single parquet file

run this only when `dataset=False`

In [None]:
assert KEY in run.outputs.keys(), f"mlrun.functions: key {KEY} not found in outputs"
assert os.path.isfile(TARGET_PATH+'/'+ FILE_NAME),  f"mlrun.functions: artifact source not found at {TARGET_PATH+'/'+ FILE_NAME}"

In [None]:
copied   = pd.read_parquet(TARGET_PATH+'/'+ FILE_NAME, engine="pyarrow")