In [32]:
import csv
import errno
import gc
import gzip
import importlib
import json
import os
import re
import shutil
import urllib.parse
from collections import OrderedDict
from io import TextIOWrapper
from multiprocessing import Pool

from tqdm import tqdm

import polars as pl

# utils is our local utility module
# if we change utils.py, and re-run a normal 'import'
# python won't reload it by default. (Since it's already loaded.)
# So we force a reload
import utils
importlib.reload(utils)

<module 'utils' from '/home/matthew/Documents/TSE/AppliedEconometrics/repo/utils.py'>

In [33]:
# Base directories that every other path in this notebook is built from.
repo_data_dir = '/home/matthew/Documents/TSE/AppliedEconometrics/repo/data/'
laptop_data_dir = '/home/matthew/data/'

# JSON file describing the tables (loaded into `schemas` further down).
schema_path = os.path.join(repo_data_dir, 'schemas.json')

# Input: output of the previous script.
# source_dir = os.path.join(laptop_data_dir, '01-C-split-mapped-csv')
source_dir = os.path.join(laptop_data_dir, '01-D-split-mapped-csv-done')

# Names of the tables that have already been processed are appended to this
# file, so that repeated runs do not redo work.
progress_file = os.path.join(laptop_data_dir, '01-D-consolidate-partition-progress.txt')

# Outputs: the partitioned files (repartition() writes gzip-compressed CSV here),
# and the directory for the deduplicated copy of those partitions.
partitioned_duped_dir, partitioned_deduped_dir = (
    os.path.join(laptop_data_dir, dir_name)
    for dir_name in (
        '01-D-consolidate-csv-partitioned',
        '01-D-consolidate-csv-partitioned-deduplicated',
    )
)


In [34]:
# NOTE(review): this flag is not read by any of the visible cells (the
# repartition loop processes tables one at a time) - confirm whether it is
# still needed.
use_multiprocessing = True

In [35]:
# Names of two extra columns that repartition() adds to every row. Their values
# are parsed from the `<name>=<value>` directory components of each source CSV
# path.
version_col_name = 'SCHEMA_VERSION'
top_timestamp_col_name = 'TOP_TIMESTAMP'

In [36]:
# Load the per-table schema definitions (a JSON object keyed by table name).
with open(schema_path) as schema_file:
    schemas = json.load(schema_file)

In [37]:
# Logger from the local utils module, constructed with the path of logs.txt in
# the repo data directory.
# NOTE(review): presumably messages are written to that file - confirm in utils.Logger.
logger = utils.Logger(os.path.join(repo_data_dir, 'logs.txt'))
logger.info("Initialising Logger")

In [39]:
# Annotate every table schema (in place) with two derived entries:
#   'partition_key_names': the primary keys, minus any key whose name contains
#                          'DATE' and minus a few explicitly excluded keys.
#   'columns_to_drop':     columns left out of the output; only DISPATCHLOAD
#                          drops any (those whose name contains RAISE, LOWER
#                          or VIOLATION).
for (table, schema) in schemas.items():
    schema['partition_key_names'] = [
        key
        for key in schema['primary_keys']
        if 'DATE' not in key.upper()
        and key.upper() not in ['RUNNO', 'DISPATCHINTERVAL', 'CONTRACTID']
    ]

    schema['columns_to_drop'] = (
        [
            col
            for col in schema['columns']
            if any(marker in col for marker in ('RAISE', 'LOWER', 'VIOLATION'))
        ]
        if table == 'DISPATCHLOAD'
        else []
    )
    

In [30]:
def _open_partition_file(dest_path, mode, file_handles, evict_count=5):
    """Open one partition's gzip CSV in text mode, freeing handles if needed.

    If the open fails because the process has too many open files (EMFILE),
    up to `evict_count` of the least recently used entries of `file_handles`
    are closed and removed, and the open is retried once.

    Args:
        dest_path: path of the gzip file to open.
        mode: gzip text mode, 'wt' (create) or 'at' (append).
        file_handles: OrderedDict mapping partition key values to
            (file, writer) tuples, ordered from least to most recently used.
        evict_count: maximum number of handles to close before retrying.

    Returns:
        The open text-mode file object.

    Raises:
        OSError: for any failure other than EMFILE, or if the retry fails too.
    """
    try:
        return gzip.open(dest_path, mode, compresslevel=2, newline='')
    except OSError as e:
        # compare the error code rather than the (locale dependent) message
        if e.errno != errno.EMFILE:
            raise

    # repartition() moves a handle to the end each time it is used, so the
    # front of the dict holds the least recently used handles.
    # Never try to pop more entries than there are.
    for _ in range(min(evict_count, len(file_handles))):
        (_old_keys, (old_f, _old_writer)) = file_handles.popitem(last=False)
        old_f.close()
    return gzip.open(dest_path, mode, compresslevel=2, newline='')


def repartition(table):
    """Rewrite all source CSVs of one table into one gzip CSV per partition.

    Every `.csv.gz` found under `source_dir/<table>` is read row by row. The
    schema version and top timestamp are parsed from the file's path and added
    to each row. Each row is then appended to
    `partitioned_duped_dir/<table>/<key>=<value>/.../data.csv.gz`, where the
    keys are the table's `partition_key_names`. Partition key columns and
    `columns_to_drop` are not written to the output files.

    Any existing output directory for the table is deleted first. On success
    the table name is appended to `progress_file`.

    Args:
        table: table name; a key of `schemas` and a subdirectory of `source_dir`.

    Raises:
        AssertionError: if the schema version or top timestamp cannot be
            extracted from a source file path.
    """
    table_source_dir = os.path.join(source_dir, table)
    table_partitioned_duped_dir = os.path.join(partitioned_duped_dir, table)
    schema = schemas[table]
    partition_key_names = schema['partition_key_names']

    in_columns = list(schema['columns'].keys())
    out_columns = in_columns + [version_col_name, top_timestamp_col_name]
    out_columns = [c for c in out_columns if (c not in schema['columns_to_drop']) and (c not in partition_key_names)]

    # raw f-strings, so that \d reaches the regex engine untouched
    version_pattern = re.compile(rf"/{version_col_name}=(\d+)/")
    top_timestamp_pattern = re.compile(rf"/{top_timestamp_col_name}=([\d_]+)/")

    # start from scratch, otherwise a re-run would append duplicate rows
    shutil.rmtree(table_partitioned_duped_dir, ignore_errors=True)

    # partition key values -> (file, writer), least recently used first
    file_handles = OrderedDict()
    try:
        for csv_path in utils.walk(table_source_dir):
            match = version_pattern.search(csv_path)
            assert match, f"Unable to extract schema version from {csv_path}"
            schema_version = int(match.group(1))

            match = top_timestamp_pattern.search(csv_path)
            assert match, f"Unable to extract top_timestamp from {csv_path}"
            top_timestamp = match.group(1)

            with gzip.open(csv_path, 'rt', newline='') as f_src_str:
                reader = csv.DictReader(f_src_str)
                for row in reader:

                    row.update({
                        version_col_name: schema_version,
                        top_timestamp_col_name: top_timestamp
                    })

                    partition_key_values = tuple(row.get(c, None) for c in partition_key_names)

                    if partition_key_values in file_handles:
                        (f, writer) = file_handles[partition_key_values]
                        # mark as most recently used
                        file_handles.move_to_end(partition_key_values)
                    else:
                        # decide where to save it
                        partition_subdirs = [f"{k}={urllib.parse.quote_plus(v)}" for (k,v) in zip(partition_key_names, partition_key_values)]
                        dest_path = os.path.join(table_partitioned_duped_dir, *partition_subdirs, 'data.csv.gz')

                        # The file already exists if its handle was evicted
                        # earlier in this run: append, and skip the header.
                        is_new_file = not os.path.exists(dest_path)
                        if is_new_file:
                            utils.create_dir(file=dest_path)
                        f = _open_partition_file(dest_path, 'wt' if is_new_file else 'at', file_handles)
                        writer = csv.DictWriter(f, fieldnames=out_columns, extrasaction='ignore')
                        if is_new_file:
                            writer.writeheader()

                        file_handles[partition_key_values] = (f, writer)

                    writer.writerow(row)
            logger.flush()
        logger.info(f"Finished for {table}")
    finally:
        # tidy up
        for (f, writer) in file_handles.values():
            f.close()

    # only reached if no exception was raised above
    with open(progress_file, 'a') as f:
        f.write(table + '\n')


# Candidate tables: one entry per item in the source directory.
tables = os.listdir(source_dir)

# Skip the tables recorded in the progress file by an earlier run.
try:
    with open(progress_file, 'r') as f:
        already_processed = [name for name in map(str.strip, f) if name]
except FileNotFoundError:
    # first run
    pass
else:
    tables = [t for t in tables if t not in already_processed]

def dedup(table):
    """Deduplicate the partitioned files of one table (not implemented yet).

    Walks every file under `partitioned_duped_dir/<table>`, works out the
    matching destination path under `partitioned_deduped_dir/<table>` and the
    column schema of the file. The deduplication itself has not been written.

    Args:
        table: table name; a key of `schemas`.

    Raises:
        NotImplementedError: as soon as the first source file is reached.
    """
    table_partitioned_duped_dir = os.path.join(partitioned_duped_dir , table)
    table_partitioned_deduped_dir = os.path.join(partitioned_deduped_dir , table)

    for src_file in utils.walk(table_partitioned_duped_dir):
        # mirror the partition sub-directories in the destination tree
        sub_path = os.path.relpath(path=src_file, start=table_partitioned_duped_dir)
        dest_file = os.path.join(table_partitioned_deduped_dir, sub_path)

        # columns stored in the file: everything except dropped columns and partition keys
        src_columns = [c for c in schemas[table]['columns'] if c not in schemas[table]['columns_to_drop'] and c not in schemas[table]['partition_key_names']]
        # NOTE(review): aemo_type_to_arrow_type is not defined or imported in
        # the visible cells - confirm where it lives (the utils module?).
        schema = {c: aemo_type_to_arrow_type(schemas[table]['columns'][c]) for c in src_columns}
        schema.update({
            version_col_name: pl.UInt8(),
            top_timestamp_col_name: pl.String(),
        })

        # do the dedup
        raise NotImplementedError()
        


# Process the tables one at a time rather than with multiprocessing:
# repartition() keeps many partition files open at once, so parallel workers
# would hit the max open file handle count sooner.
for table in tqdm(tables):
    # NOTE(review): presumably frees memory left over from the previous table - confirm it is needed
    gc.collect()
    repartition(table)

  0%|                                                                                                                                                                 | 0/131 [00:04<?, ?it/s]


KeyboardInterrupt: 

In [40]:
# Scratch check: evaluate pl.String() so the notebook displays its repr.
pl.String()

String