# CSV to parquet

This script takes a few hours to run.

We have many CSV files, mostly small ones, for each AEMO 'table' (dataframe). This script merges lots of small CSVs and creates one large parquet file (per table). "Compacting" into fewer, larger files improves performance later.

To see the advantages of parquet over csv, read [this](https://r4ds.hadley.nz/arrow#advantages-of-parquet).
The main benefits are:

* performance - e.g. if we only care about 2 out of 20 columns, we skip over 90% of data when reading from disk.
* type safety - the file keeps track of what's a float vs datetime etc. So we don't have to tell the code what each datatype is after this script.

We use pyarrow here, not Pandas. This is partly for performance reasons. Also because Pandas can't handle empty values for some datatypes.

When running, don't forget to change `base_data_dir`.


In [None]:
import os
import re
import json
import importlib
import shutil
from random import shuffle
import gc

from tqdm import tqdm # progress bar animation
import pandas as pd
import numpy as np

# pyarrow is like pandas, but works for datasets too big for memory.
import pyarrow
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.csv

# utils is our local utility module
# if we change utils.py, and re-run a normal 'import'
# python won't reload it by default. (Since it's already loaded.)
# So we force a reload
import utils
importlib.reload(utils)

## Constants and Configuration

In [None]:
# root folder for all input and output data
# change this before running
base_data_dir = 'data'

# output of the previous script
# (one sub-folder of CSVs per table)
source_dir = os.path.join(base_data_dir, '01-C-split-mapped-csv')

# the parquet files go here
# (one sub-folder per table)
dest_dir = os.path.join(base_data_dir, '01-D-parquet-pyarrow-dataset')

# once files are processed, we move them here
# if move_when_done
archive_dir = os.path.join(base_data_dir, '01-D-split-mapped-csv-done')
move_when_done = False

# JSON file describing each table's columns and their AEMO types
schema_path = os.path.join(base_data_dir, '01-aemo-schemas.json')

In [None]:
# names of the two hive partition keys in the source folder layout
# (source_dir / table / SCHEMA_VERSION=... / TOP_TIMESTAMP=... / file)
# NOTE(review): the conversion code below spells these strings out
# directly rather than reading these constants - keep them in sync
version_col_name = 'SCHEMA_VERSION'
top_timestamp_col_name = 'TOP_TIMESTAMP'

In [None]:
# logger from our local utils module, given the path of logs.txt
# inside base_data_dir
# (presumably messages are appended to that file - see utils.Logger)
logger = utils.Logger(os.path.join(base_data_dir, 'logs.txt'))
logger.info("Initialising Logger")

## Prepare Schemas

In [None]:
# load the table schemas (table name -> column definitions)
# be explicit about the encoding: JSON is UTF-8,
# and the default encoding of open() depends on the platform/locale
with open(schema_path, 'r', encoding='utf-8') as f:
    schemas = json.load(f)

In [None]:
def aemo_type_to_arrow_type(t: str, date_as_str=False) -> pa.DataType:
    """Map an AEMO (Oracle SQL) column type to a type arrow can use.

    Matching is case-insensitive. Examples:

    * VARCHAR2(10) -> pa.string()
    * CHAR(1)      -> pa.string()
    * NUMBER(2,0)  -> pa.int8()
    * NUMBER(15,5) -> pa.float64()
    * DATE         -> pa.timestamp('s')

    Args:
        t: the AEMO type, as written in AEMO's schema, e.g. "NUMBER(15,5)".
        date_as_str: if True, return string instead of a timestamp for
            DATE columns (because pyarrow can't always read datetimes
            when parsing from CSV).

    Returns:
        The pyarrow data type to use for the column.

    Raises:
        ValueError: if the type is not one we know how to convert.
    """
    t = t.upper()

    if re.match(r"VARCHAR(2)?\(\d+\)", t):
        return pa.string()

    if re.match(r"CHAR\((\d+)\)", t):
        # single character
        # arrow has no dedicated type for that
        # so use string
        # (could use categorical?)
        return pa.string()

    if t.startswith("NUMBER"):
        match = re.match(r"NUMBER ?\((\d+), ?(\d+)\)", t)
        if match:
            whole_digits = int(match.group(1))
            decimal_digits = int(match.group(2))
        else:
            # e.g. NUMBER(2)
            match = re.match(r"NUMBER ?\((\d+)", t)
            if not match:
                # raise (not assert) so the check survives `python -O`
                # and matches the error type used for other unknown types
                raise ValueError(f"Unsure how to cast {t} to arrow type")
            whole_digits = int(match.group(1))
            decimal_digits = 0

        if decimal_digits != 0:
            # we could use pa.decimal128(whole_digits, decimal_digits)
            # but we don't need that much accuracy
            return pa.float64()

        # integer
        # we assume signed (can't tell unsigned from the schema)
        # but how many bits? pick the smallest type that fits
        max_val = 10**whole_digits
        for bits, int_type in ((8, pa.int8), (16, pa.int16), (32, pa.int32)):
            if 2**(bits - 1) > max_val:
                return int_type()
        return pa.int64()

    if t == 'DATE':
        # watch out, when AEMO say "date" they mean "datetime"
        # for both dates and datetimes they say "date",
        # but both have a time component. (For actual dates, it's always midnight.)
        # and some dates go out as far as 9999-12-31 23:59:59.999
        # (and some dates are 9999-12-31 23:59:59.997)
        if date_as_str:
            return pa.string()
        # no timezone here
        # pyarrow can't assume timezone when reading from CSV
        # we treat them as timezone unaware for now
        # (the timezone-aware alternative would be
        # pa.timestamp('s', tz='Australia/Brisbane'))
        return pa.timestamp('s')

    if re.match(r"TIMESTAMP\((\d)\)", t):
        # this is the same as DATE, but with a sub-second component
        # https://github.com/apache/arrow/issues/39839
        # bug with pyarrow. It can't handle millisecond components
        # even with .%f
        # But this millisecond granularity type is quite rare
        # I don't think it happens in any of the tables we care about.
        # So we return string regardless of date_as_str
        # (instead of pa.timestamp('ms'))
        return pa.string()

    raise ValueError(f"Unsure how to convert AEMO type {t} to arrow type")


Note that if a table has columns specified in `columns_to_drop`, those will already be omitted from the source CSV, but this script will add them back in as NA/NULL. This doesn't waste space, because parquet compresses data well. This was a deliberate decision: how we handle `columns_to_drop` has changed over time, so some source files may still contain those columns. If they are in the source file but not in the schema, pyarrow will try to read the data of those columns to infer the datatype. That's slow, and can result in errors (e.g. it can't reconcile int and float).
So we don't exclude `columns_to_drop` when generating `csv_schema`.

In [None]:
def convert_csv_parquet(table):
    """Convert one table's folder of CSVs into a parquet dataset.

    `table` is the name of a folder inside source_dir. Its files are laid out like
    source_dir / table / SCHEMA_VERSION=2 / TOP_TIMESTAMP=2019_03_02_00_45_12 / something.CSV.gz
    The output is written to dest_dir / table, after deleting whatever
    was there before. If move_when_done is set, the source folder is then
    moved into archive_dir.
    """
    src_path = os.path.join(source_dir, table)
    out_path = os.path.join(dest_dir, table)

    logger.info(f"Preparing to process {table} from {src_path} to {out_path}")

    # column types come from the AEMO schema for this table
    csv_schema = {}
    for col_name, col_info in schemas[table]['columns'].items():
        csv_schema[col_name] = aemo_type_to_arrow_type(col_info['AEMO_type'], date_as_str=False)

    # the two hive partition keys are part of the folder names, not the CSVs
    partition_schema = {
        "SCHEMA_VERSION": pa.int8(),
        "TOP_TIMESTAMP": pa.string(),
    }
    full_schema = {**csv_schema, **partition_schema}

    # really .csv.gz, but pyarrow will figure that out
    csv_format = ds.CsvFileFormat(
        convert_options=pyarrow.csv.ConvertOptions(
            timestamp_parsers=[
                "%Y/%m/%d %H:%M:%S",
                "%Y/%m/%d %H:%M:%S.%f",
            ]
        )
    )
    hive_partitioning = ds.partitioning(pa.schema(partition_schema), flavor="hive")
    dataset = ds.dataset(
        source=src_path,
        format=csv_format,
        partitioning=hive_partitioning,
        schema=pyarrow.schema(full_schema),
    )

    # start from a clean output folder
    shutil.rmtree(out_path, ignore_errors=True)
    ds.write_dataset(
        data=dataset,
        base_dir=out_path,
        format="parquet",
        min_rows_per_group=1024*1024,
        existing_data_behavior="delete_matching",
    )
    logger.info(f"Finished writing {table} to {out_path}")

    if not move_when_done:
        return

    # move away,
    # so if the next table fails, and we re-run the script
    # we don't waste time re-doing this one
    archived_path = os.path.join(archive_dir, table)
    logger.info(f"Finished with {table}, moving {src_path} to {archived_path}")
    utils.create_dir(archive_dir)
    os.rename(src_path, archived_path)

# only convert tables from the schema that have a source folder on disk
tables = list(filter(
    lambda name: os.path.exists(os.path.join(source_dir, name)),
    schemas,
))
# process the tables in a random order
shuffle(tables)
logger.info(f"{len(tables)} Tables listed")
logger.flush = True

# one table at a time, no multiprocessing
# because max memory usage peaks very high
for table in tqdm(tables):
    # free what we can before starting the next table
    gc.collect()
    convert_csv_parquet(table)

In [None]:
# debugging aid: list the tables in the schema that have a source folder
# (a folder that has been moved away or is missing will not show up here)
[t for t in schemas if os.path.exists(os.path.join(source_dir, t))]

In [None]:
# debugging aid: show which folder the CSVs are read from
source_dir