# CSV to parquet

## Imports

If you don't have these libraries installed, run `pip install -r requirements.txt`.

In [None]:
%pip install -r requirements.txt

In [None]:
import os
from multiprocessing import Pool, current_process
import re
import gzip
import json
import importlib

from tqdm import tqdm # progress bar animation
import pandas as pd
import numpy as np

# pyarrow is like pandas, but works for datasets too big for memory.
import pyarrow
import pyarrow as pa
import pyarrow.csv
import pyarrow.compute as pc
import pyarrow.dataset as ds
import pyarrow.parquet

# utils is our local utility module
# if we change utils.py, and re-run a normal 'import'
# python won't reload it by default. (Since it's already loaded.)
# So we force a reload
import utils
importlib.reload(utils)

## Constants and Configuration

In [None]:
# Candidate data locations. Only laptop_data_dir and repo_data_dir are used
# below; the two external disks are kept for reference.
disk_one = '/media/matthew/Tux/AppliedEconometrics/data'
disk_two = '/media/matthew/nemweb/AppliedEconometrics/data'
repo_data_dir = '/home/matthew/Documents/TSE/AppliedEconometrics/repo/data/'
laptop_data_dir = '/home/matthew/data/'

# input: CSV files, output: one parquet file per table (still with duplicates)
source_dir = os.path.join(laptop_data_dir, '01-E-split-mapped-csv')
dest_dir = os.path.join(laptop_data_dir, '01-F-parquet-duplicated')

schema_path = os.path.join(repo_data_dir, 'schemas.json')

# process files with multiprocessing, to be faster.
# If set to False, will use a for loop, which gives clearer traceback error messages.
use_multiprocessing = True

# set to True to leave some CPU unused when multiprocessing
# so that you can still do other stuff on your laptop without your internet browser or whatever being laggy
# (this must be defined before num_processes is derived from it)
leave_unused_cpu = True

# os.cpu_count() can return None when the count is undetermined
num_processes = os.cpu_count() or 1
if leave_unused_cpu:
    # -2 because we assume hyperthreading (one physical core = 2 logical CPUs);
    # never go below one worker on small machines
    num_processes = max(1, num_processes - 2)

In [None]:
# Names of the two metadata columns added to every output table.
# The same names are used as the `KEY=value` prefixes of the source folders.
version_col_name = "SCHEMA_VERSION"
top_timestamp_col_name = "TOP_TIMESTAMP"

In [None]:
# Logger from our local utils module, constructed with the path of
# logs.txt inside the repo data directory.
logger = utils.Logger(
    os.path.join(repo_data_dir, 'logs.txt'),
)
logger.info("Initialising Logger")

## Prepare Schemas

In [None]:
# Load the table schemas from the JSON file.
# The encoding is given explicitly so the result does not depend on the
# machine's locale settings.
with open(schema_path, 'r', encoding='utf-8') as f:
    schemas = json.load(f)

In [None]:
def aemo_type_to_arrow_type(t: str, date_as_str=False) -> pa.DataType:
    """Map an AEMO (Oracle SQL) column type to a type arrow can use.

    Examples:
        DATE, TIMESTAMP(3) -> pa.timestamp('s', tz='Australia/Brisbane')
        NUMBER(2,0)        -> pa.int8()
        NUMBER(15,5)       -> pa.float64()
        VARCHAR2(10)       -> pa.string()
        CHAR(1)            -> pa.string()

    Args:
        t: the AEMO type, e.g. "NUMBER(15,5)". Case-insensitive.
        date_as_str: if True, return string instead of a timestamp type for
            dates (because pyarrow can't read these datetimes when parsing
            from CSV).

    Returns:
        The pyarrow data type to use for the column.

    Raises:
        ValueError: if the type is not recognised.
    """
    t = t.upper()

    if re.match(r"(VARCHAR(2)?|CHAR)\(\d+\)", t):
        # arrow has no dedicated type for a single character (CHAR),
        # so use string for that too
        # (could use categorical?)
        return pa.string()

    if t.startswith("NUMBER"):
        # NUMBER(15,5) gives both groups; NUMBER(2) gives only the first
        match = re.match(r"NUMBER ?\((\d+)(?:, ?(\d+)\))?", t)
        if not match:
            raise ValueError(f"Unsure how to cast {t} to arrow type")
        whole_digits = int(match.group(1))
        decimal_digits = int(match.group(2) or 0)

        if decimal_digits != 0:
            # we could use pa.decimal128(whole_digits, decimal_digits)
            # but we don't need that much accuracy
            return pa.float64()

        # integer
        # we assume signed (can't tell unsigned from the schema)
        # and pick the smallest width whose positive range exceeds
        # 10**whole_digits
        max_val = 10**whole_digits
        for bits, make_type in ((8, pa.int8), (16, pa.int16), (32, pa.int32)):
            if 2**(bits - 1) > max_val:
                return make_type()
        return pa.int64()

    if (t == 'DATE') or re.match(r"TIMESTAMP\((\d)\)", t):
        # watch out, when AEMO say "date" they mean "datetime"
        # for both dates and datetimes they say "date",
        # but both have a time component. (For actual dates, it's always midnight.)
        # and some dates go out as far as 9999-12-31 23:59:59.999
        # (and some dates are 9999-12-31 23:59:59.997)
        if date_as_str:
            return pa.string()
        return pa.timestamp('s', tz='Australia/Brisbane')

    raise ValueError(f"Unsure how to convert AEMO type {t} to arrow type")


In [None]:
# Create the output folder (and any missing parents).
# exist_ok=True makes this a no-op when the folder is already there.
os.makedirs(dest_dir, exist_ok=True)

In [None]:
# DISPATCHLOAD is quite large, and we only need some columns for our analysis.
# If we drop the columns we don't need, the whole thing will be far faster.
# Keep the ones we *might* use.
#
# The conversion reads column definitions from schemas[table]['columns'],
# so that is the mapping to prune. Fall back to the table entry itself in
# case the schema file has no 'columns' level.
dispatchload_schema = schemas['DISPATCHLOAD']
dispatchload_columns = dispatchload_schema.get('columns', dispatchload_schema)
unwanted_prefixes = ('RAISE', 'LOWER', 'VIOLATION', 'MARGINAL')

# iterate over a copy of the names, because we delete from the dict as we go
cols = list(dispatchload_columns)
for c in cols:
    if c.upper().startswith(unwanted_prefixes):
        print(f"Deleting column {c} from DISPATCHLOAD")
        del dispatchload_columns[c]

In [None]:
# TODO: document the datetime from string handling

In [None]:
# list the folders in source_dir that have no entry in schemas
# (shown as the cell output)
[folder for folder in os.listdir(source_dir) if folder not in schemas]

In [None]:
def convert_csv_parquet(table):
    """Convert all the CSV files of one table into a single parquet file.

    For `table`, the input files are laid out like
    source_dir / table / SCHEMA_VERSION=2 / TOP_TIMESTAMP=2019_03_02_00_45_12 / something.CSV.gz
    and the result is written to dest_dir / <table>.parquet.
    This will overwrite an existing file (from a previous run).

    Two extra columns are appended to every row, taken from the folder names:
    the schema version and the top timestamp.

    Args:
        table: name of the table. Must be a key of `schemas` and the name of
            a folder inside `source_dir`.

    Raises:
        ValueError: if a folder name does not match the
            SCHEMA_VERSION=<digits> / TOP_TIMESTAMP=<digits and underscores>
            layout.
    """
    table_dir = os.path.join(source_dir, table)
    parquet_file = os.path.join(dest_dir, table + '.parquet')

    # Dates are read from CSV as strings (input_schema) and parsed into
    # timestamps afterwards (output_schema).
    input_schema = {}
    output_schema = {}
    datetime_columns = set()
    column_names = []  # source columns
    for (c, s) in schemas[table]['columns'].items():
        assert isinstance(c, str), f"column_name is type {type(c)}"
        t = s['AEMO_type']
        input_schema[c] = aemo_type_to_arrow_type(t, date_as_str=True)
        output_schema[c] = aemo_type_to_arrow_type(t, date_as_str=False)
        if isinstance(output_schema[c], pa.TimestampType):
            datetime_columns.add(c)
        column_names.append(c)
    output_schema.update({
        version_col_name: pa.uint8(),
        # declared as string: we only use this as a sort key later
        # for deduplication
        top_timestamp_col_name: pa.string(),
    })

    input_schema = pyarrow.schema(input_schema)
    output_schema = pyarrow.schema(output_schema)

    # the same options apply to every file of this table, so build them once
    convert_options = pyarrow.csv.ConvertOptions(
        column_types=input_schema,
        include_missing_columns=True,
        include_columns=column_names,
    )

    with pyarrow.parquet.ParquetWriter(parquet_file, output_schema) as writer:
        # sorted() so that the row order of the output does not depend on
        # the order in which the filesystem happens to list the folders
        for schema_subdir in sorted(os.listdir(table_dir)):
            schema_dir = os.path.join(table_dir, schema_subdir)
            match = re.match(rf"{version_col_name}=(\d+)", schema_subdir)
            if not match:
                raise ValueError(f"Unable to extract schema version from {schema_dir}")
            schema_version = int(match.group(1))

            for top_timestamp_subdir in sorted(os.listdir(schema_dir)):
                timestamp_dir = os.path.join(schema_dir, top_timestamp_subdir)
                match = re.match(rf"{top_timestamp_col_name}=([\d_]+)", top_timestamp_subdir)
                if not match:
                    raise ValueError(f"Unable to extract top_timestamp from {timestamp_dir}")
                top_timestamp = match.group(1)

                for csv_file in sorted(os.listdir(timestamp_dir)):
                    csv_path = os.path.join(timestamp_dir, csv_file)
                    csv_reader = pyarrow.csv.open_csv(csv_path, convert_options=convert_options)

                    # the file is streamed batch by batch, so it never has
                    # to fit in memory all at once
                    for batch in csv_reader:
                        columns = []

                        for column_name in column_names:
                            column_data = batch[column_name]
                            if column_name in datetime_columns:
                                # Parse the date strings, then label the
                                # result as Brisbane local time so that it
                                # matches output_schema.
                                # Values that don't fit the format become null.
                                column_data = pc.strptime(column_data, format="%Y/%m/%d %H:%M:%S", unit="s", error_is_null=True)
                                column_data = pc.assume_timezone(column_data, timezone='Australia/Brisbane')
                            columns.append(column_data)

                        # now add the metadata from folder names as columns,
                        # repeated once per row of this batch
                        columns.append(pa.Array.from_pandas(np.repeat(schema_version, batch.num_rows)))

                        # convert top timestamp to datetime too
                        # NOTE(review): output_schema declares this column as
                        # string while the value built here is a timestamp -
                        # confirm which of the two is intended.
                        tt = pa.array(np.repeat(top_timestamp, batch.num_rows), type=pa.string(), size=batch.num_rows)
                        tt = pc.strptime(tt, format="%Y_%m_%d_%H_%M_%S", unit="s", error_is_null=True)
                        tt = pc.assume_timezone(tt, timezone='Australia/Brisbane')
                        columns.append(tt)

                        updated_table = pyarrow.Table.from_arrays(
                            columns,
                            schema=output_schema
                        )
                        writer.write(updated_table)

# Convert every table we have both a schema and a source folder for.
tables = [t for t in schemas if os.path.exists(os.path.join(source_dir, t))]
print("Tables listed")
if use_multiprocessing:
    # os.cpu_count() can return None when the count is undetermined
    num_processes = os.cpu_count() or 1
    if leave_unused_cpu:
        # -2 because we assume hyperthreading (one physical core = 2 logical CPUs);
        # Pool() needs at least one worker
        num_processes = max(1, num_processes - 2)
    with Pool(num_processes) as p:
        # list() drains the lazy imap iterator, so that all the work is done
        # (and the progress bar advances) before the pool is closed
        list(tqdm(p.imap(convert_csv_parquet, tables), total=len(tables)))
else:
    # plain loop: slower, but gives clearer traceback error messages
    for table in tqdm(tables):
        convert_csv_parquet(table)