In [2]:
import csv
import errno
import gc
import gzip
import importlib
import json
import os
import re
import shutil
import urllib.parse
from collections import OrderedDict
from io import TextIOWrapper
from multiprocessing import Pool

from tqdm import tqdm

import polars as pl

# utils is our local utility module
# if we change utils.py, and re-run a normal 'import'
# python won't reload it by default. (Since it's already loaded.)
# So we force a reload
import utils
importlib.reload(utils)

<module 'utils' from '/home/matthew/Documents/TSE/AppliedEconometrics/repo/utils.py'>

In [3]:
# Root folders. The defaults are the original author's machine; set the
# AEMO_REPO_DATA_DIR / AEMO_LAPTOP_DATA_DIR environment variables to run this
# notebook somewhere else without editing the cell.
repo_data_dir = os.environ.get('AEMO_REPO_DATA_DIR', '/home/matthew/Documents/TSE/AppliedEconometrics/repo/data/')
laptop_data_dir = os.environ.get('AEMO_LAPTOP_DATA_DIR', '/home/matthew/data/')

# output of the previous script
#source_dir = os.path.join(laptop_data_dir, '01-C-split-mapped-csv')
source_dir = os.path.join(laptop_data_dir, '01-D-split-mapped-csv-done')

# save the list of tables we have already processed in here
# so with repeated runs, we don't redo work
partition_progress_file = os.path.join(laptop_data_dir, '01-D-consolidate-partition-progress.txt')
dedup_progress_file = os.path.join(laptop_data_dir, '01-D-consolidate-dedup-progress.txt')

# the partitioned, gzipped CSV files go here
# (first with duplicates, then a de-duplicated copy)
partitioned_duped_dir = os.path.join(laptop_data_dir, '01-D-consolidate-csv-partitioned')
partitioned_deduped_dir = os.path.join(laptop_data_dir, '01-D-consolidate-csv-partitioned-deduplicated')

# JSON file describing every table (columns, primary keys, ...)
schema_path = os.path.join(repo_data_dir, 'schemas.json')


In [4]:
# NOTE(review): this flag is not read anywhere in the visible cells;
# the driver loop at the bottom processes tables one at a time.
use_multiprocessing = True

In [5]:
# Names of the two bookkeeping columns.
# repartition() parses their values out of each source file's path and adds them to every row;
# dedup() sorts on them and then drops them from the final output.
version_col_name = 'SCHEMA_VERSION'
top_timestamp_col_name = 'TOP_TIMESTAMP'

In [6]:
# load the per-table schema definitions; later cells look them up by table name
with open(schema_path) as schema_file:
    schemas = json.load(schema_file)

In [7]:
# log messages go to a file in the repo data folder
log_path = os.path.join(repo_data_dir, 'logs.txt')
logger = utils.Logger(log_path)
logger.info("Initialising Logger")

In [8]:
# Derive, for every table, which columns to partition on and which to discard.
for (table, schema) in schemas.items():
    # partition on the primary keys, minus anything date-like
    # and minus a few explicitly excluded columns
    schema['partition_key_names'] = [
        c for c in schema['primary_keys']
        if 'DATE' not in c.upper()
        and c.upper() not in ['RUNNO', 'DISPATCHINTERVAL', 'CONTRACTID']
    ]

    # only these two tables have columns dropped
    schema['columns_to_drop'] = (
        [c for c in schema['columns'] if any(ss in c for ss in ('RAISE', 'LOWER', 'VIOLATION'))]
        if table in ['DISPATCHLOAD', 'DISPATCHREGIONSUM']
        else []
    )
    

In [9]:
def aemo_type_to_polars_type(t: str, tz=None, date_as_str=False):
    """Translate an AEMO (Oracle SQL) column type into a polars dtype.

    Args:
        t: Oracle type text from the schema, e.g. ``VARCHAR2(10)``,
            ``NUMBER(15,5)``, ``DATE``. Matching is case-insensitive.
        tz: Time zone handed to ``pl.Datetime`` for date/timestamp columns.
        date_as_str: If true, date/timestamp columns map to ``pl.String``
            instead of ``pl.Datetime`` (useful when the datetimes cannot be
            parsed directly while reading the CSV).

    Returns:
        A polars dtype instance.

    Raises:
        AssertionError: ``t`` starts with ``NUMBER`` but no digit count can be parsed.
        ValueError: ``t`` is not one of the recognised types.
    """
    t = t.upper()

    # Text columns. CHAR(n) is treated as a plain string too:
    # arrow has no dedicated character type (categorical might be an option).
    if re.match(r"VARCHAR(2)?\(\d+\)", t) or re.match(r"CHAR\((\d+)\)", t):
        return pl.String()

    if t.startswith("NUMBER"):
        two_part = re.match(r"NUMBER ?\((\d+), ?(\d+)\)", t)
        if two_part:
            whole_digits, decimal_digits = (int(g) for g in two_part.groups())
        else:
            # e.g. NUMBER(2)
            one_part = re.match(r"NUMBER ?\((\d+)", t)
            assert one_part, f"Unsure how to cast {t} to arrow type"
            whole_digits = int(one_part.group(1))
            decimal_digits = 0

        if decimal_digits != 0:
            # pa.decimal128(whole_digits, decimal_digits) would be exact,
            # but that much accuracy is not needed
            return pl.Float64()

        # Integer: assume signed (the schema can't tell us otherwise)
        # and pick the narrowest width that holds the declared digit count.
        max_val = 10**whole_digits
        for bits, int_type in ((8, pl.Int8), (16, pl.Int16), (32, pl.Int32)):
            if 2**(bits-1) > max_val:
                return int_type()
        return pl.Int64()

    if (t == 'DATE') or re.match(r"TIMESTAMP\((\d)\)", t):
        # Watch out: AEMO say "date" for both dates and datetimes, and both
        # carry a time component (always midnight for actual dates).
        # Some values go out as far as 9999-12-31 23:59:59.999
        # (and some are 9999-12-31 23:59:59.997).
        if date_as_str:
            return pl.String()
        return pl.Datetime(time_unit='ms', time_zone=tz)

    raise ValueError(f"Unsure how to convert AEMO type {t} to polars type")


In [None]:
def _open_partition_file(dest_path, mode, file_handles, evict_count=5):
    """Open a gzipped text file, closing cached handles if the process has run out of descriptors.

    Args:
        dest_path: Path of the ``.csv.gz`` file to open.
        mode: ``'wt'`` for a new file or ``'at'`` to append to an existing one.
        file_handles: ``OrderedDict`` of ``key -> (file, writer)`` kept in
            least-recently-used-first order. Evicted entries are removed from it.
        evict_count: How many cached handles to close before retrying.

    Returns:
        The opened text-mode file object.

    Raises:
        OSError: for any failure other than EMFILE, or for EMFILE when there
            is nothing left to evict, or if the retry fails as well.
    """
    try:
        return gzip.open(dest_path, mode, compresslevel=2, newline='')
    except OSError as e:
        # compare the error number, not the (platform/locale dependent) message text
        if e.errno != errno.EMFILE or not file_handles:
            raise
        # callers move a handle to the end each time it is used, so the
        # front of the dict holds the least recently used handles
        for _ in range(min(evict_count, len(file_handles))):
            (_old_keys, (old_f, _old_writer)) = file_handles.popitem(last=False)
            old_f.close()
        return gzip.open(dest_path, mode, compresslevel=2, newline='')


def repartition(table):
    """Rewrite all source CSVs of ``table`` into one gzipped CSV per partition-key combination.

    Every row gets the schema version and top timestamp parsed from its source
    file's path, and is appended to
    ``<partitioned_duped_dir>/<table>/<KEY>=<value>/.../data.csv.gz``.
    Partition-key columns and the schema's ``columns_to_drop`` are left out of
    the written rows. Any previous output for the table is deleted first.
    On success the table name is appended to ``partition_progress_file``.

    Args:
        table: Table name; must be a key of ``schemas`` and a sub-folder of ``source_dir``.

    Raises:
        AssertionError: a source path lacks the version or timestamp component.
        OSError: a file could not be opened or written.
    """
    table_source_dir = os.path.join(source_dir, table)
    table_partitioned_duped_dir = os.path.join(partitioned_duped_dir, table)
    schema = schemas[table]
    partition_key_names = schema['partition_key_names']
    in_columns = list(schema['columns'].keys())
    out_columns = [
        c for c in in_columns + [version_col_name, top_timestamp_col_name]
        if (c not in schema['columns_to_drop']) and (c not in partition_key_names)
    ]

    # raw strings, so that \d reaches the regex engine instead of being an
    # (invalid) string escape; compiled once rather than once per file
    version_pattern = re.compile(rf"/{version_col_name}=(\d+)/")
    top_timestamp_pattern = re.compile(rf"/{top_timestamp_col_name}=([\d_]+)/")

    # start from scratch: files are appended to, so leftovers would be duplicated
    shutil.rmtree(table_partitioned_duped_dir, ignore_errors=True)

    # partition key values -> (open file, DictWriter), least recently used first
    file_handles = OrderedDict()
    try:
        # utils.walk presumably yields every file path under the folder - TODO confirm
        for csv_path in utils.walk(table_source_dir):
            match = version_pattern.search(csv_path)
            assert match, f"Unable to extract schema version from {csv_path}"
            schema_version = int(match.group(1))

            match = top_timestamp_pattern.search(csv_path)
            assert match, f"Unable to extract top_timestamp from {csv_path}"
            top_timestamp = match.group(1)

            with gzip.open(csv_path, 'rt', newline='') as f_src_str:
                for row in csv.DictReader(f_src_str):
                    row.update({
                        version_col_name: schema_version,
                        top_timestamp_col_name: top_timestamp
                    })

                    partition_key_values = tuple(row.get(c, None) for c in partition_key_names)

                    if partition_key_values in file_handles:
                        (f, writer) = file_handles[partition_key_values]
                        # mark as most recently used
                        file_handles.move_to_end(partition_key_values)
                    else:
                        # decide where to save it
                        partition_subdirs = [
                            f"{k}={urllib.parse.quote_plus(v)}"
                            for (k, v) in zip(partition_key_names, partition_key_values)
                        ]
                        dest_path = os.path.join(table_partitioned_duped_dir, *partition_subdirs, 'data.csv.gz')

                        # an existing file means its handle was evicted earlier:
                        # append to it, and don't repeat the header
                        is_new_file = not os.path.exists(dest_path)
                        if is_new_file:
                            # presumably creates the parent folder of dest_path - TODO confirm
                            utils.create_dir(file=dest_path)
                        f = _open_partition_file(dest_path, 'wt' if is_new_file else 'at', file_handles)
                        writer = csv.DictWriter(f, fieldnames=out_columns, extrasaction='ignore')
                        if is_new_file:
                            writer.writeheader()

                        file_handles[partition_key_values] = (f, writer)

                    writer.writerow(row)
            logger.flush()
        logger.info(f"Finished for {table}")
    finally:
        # tidy up
        for (f, writer) in file_handles.values():
            f.close()

    # only reached when everything above succeeded
    with open(partition_progress_file, 'a') as f:
        f.write(table + '\n')


def _read_progress(progress_path):
    """Return the stripped, non-blank lines of a progress file ([] if it does not exist)."""
    try:
        with open(progress_path, 'r') as progress_file:
            return [line.strip() for line in progress_file if line.strip()]
    except FileNotFoundError:
        # first run
        return []


# one sub-folder per table in the source folder
tables = os.listdir(source_dir)

# tables finished by earlier runs of each stage
already_partitioned = _read_progress(partition_progress_file)
already_deduped = _read_progress(dedup_progress_file)



def dedup(table):
    """De-duplicate every partition file of ``table``, keeping the newest row per primary key.

    Each gzipped CSV under ``<partitioned_duped_dir>/<table>`` is read, sorted
    descending by schema version, top timestamp and (when the table has it)
    ``LASTCHANGED``, reduced to the first row per primary key (partition keys
    excluded), stripped of the two bookkeeping columns, and written to the
    same relative path under ``<partitioned_deduped_dir>/<table>``.
    On success the table name is appended to ``dedup_progress_file``.

    Args:
        table: Table name; must be a key of ``schemas``.

    Raises:
        ValueError: the partitioned source folder for ``table`` does not exist.
        Exception: anything raised while processing a file is re-raised after
            printing which table and file failed.
    """
    table_partitioned_duped_dir = os.path.join(partitioned_duped_dir, table)
    table_partitioned_deduped_dir = os.path.join(partitioned_deduped_dir, table)

    table_schema = schemas[table]
    partition_key_names = table_schema['partition_key_names']

    # same columns repartition() wrote out; dates are read as strings
    src_columns = [
        c for c in table_schema['columns']
        if c not in table_schema['columns_to_drop'] and c not in partition_key_names
    ]
    schema = {
        c: aemo_type_to_polars_type(table_schema['columns'][c]['AEMO_type'], date_as_str=True)
        for c in src_columns
    }
    schema.update({
        version_col_name: pl.UInt8(),
        top_timestamp_col_name: pl.String(),
    })

    # newest first once sorted descending
    sort_keys = [version_col_name, top_timestamp_col_name]
    if 'LASTCHANGED' in schema:
        sort_keys.append('LASTCHANGED')

    # partition keys are constant within a file, so they aren't columns here
    primary_keys = [p for p in table_schema['primary_keys'] if p not in partition_key_names]

    if not os.path.exists(table_partitioned_duped_dir):
        raise ValueError(f"Source folder {table_partitioned_duped_dir} does not exist")
    logger.info(f"Listing files in {table_partitioned_duped_dir}, with {schema}")
    for src_path in utils.walk(table_partitioned_duped_dir):
        try:
            sub_path = os.path.relpath(path=src_path, start=table_partitioned_duped_dir)
            dest_path = os.path.join(table_partitioned_deduped_dir, sub_path)

            # polars can't write to .csv.gz directly, only .csv
            # but it can write to a file-like object of a gzipped file
            utils.create_dir(file=dest_path)
            logger.info(f"Openning {src_path=}")
            logger.flush()
            with gzip.open(dest_path, 'wt', compresslevel=4, newline='') as f_dest:
                (
                    pl.read_csv(src_path, dtypes=schema)
                    .sort(sort_keys, descending=True)
                    # unique() defaults to keep='any', which may keep an arbitrary
                    # duplicate; ask explicitly for the first (= newest) row and
                    # preserve the sorted order so the output is deterministic
                    .unique(subset=primary_keys or None, keep='first', maintain_order=True)
                    .select(pl.exclude(version_col_name, top_timestamp_col_name))
                    .write_csv(f_dest)
                )
        except Exception:
            print(f"Error with {table=} {src_path=}")
            raise

    # only reached when every file succeeded
    with open(dedup_progress_file, 'a') as f:
        f.write(table + '\n')

# Run both stages table by table, skipping whatever earlier runs already finished.
# No multiprocessing: with several workers we would hit the
# open-file limit even sooner.
try:
    for table in tqdm(tables):
        gc.collect()
        needs_partition = table not in already_partitioned
        if needs_partition:
            repartition(table)
        # a freshly partitioned table must always be de-duplicated again
        if needs_partition or table not in already_deduped:
            dedup(table)
finally:
    logger.flush()


 14%|████████████████████▍                                                                                                                            | 20/142 [00:00<00:02, 44.61it/s]

In [None]:
# TODO: partition P5MIN_INTERCONNECTORSOLN by year(INTERVAL_DATETIME)