In [31]:
import gzip
import os
import importlib
import json
import csv
import re
from io import TextIOWrapper

from tqdm import tqdm

# utils is our local utility module.
# If utils.py has changed since it was first imported, re-running a plain
# 'import' will not pick up the changes, because the module is already loaded.
# So we force a reload.
import utils
importlib.reload(utils)

<module 'utils' from '/home/matthew/Documents/TSE/AppliedEconometrics/repo/utils.py'>

In [23]:
# Base directories: data kept inside the repo, and data kept on the laptop.
repo_data_dir = '/home/matthew/Documents/TSE/AppliedEconometrics/repo/data/'
laptop_data_dir = '/home/matthew/data/'

# JSON schema definitions, stored with the repo data.
schema_path = os.path.join(repo_data_dir, 'schemas.json')

# Input: the output of the previous script.
source_dir = os.path.join(laptop_data_dir, '01-D-split-mapped-csv-done')

# Output: the consolidated CSV files are written here.
dest_dir = os.path.join(laptop_data_dir, '01-D-consolidate-csv')


In [24]:
# Names of the two extra columns added to each row during consolidation:
# the schema version and the top-level timestamp.
version_col_name, top_timestamp_col_name = 'SCHEMA_VERSION', 'TOP_TIMESTAMP'

In [25]:
# Parse the JSON schema file into `schemas`.
with open(schema_path) as schema_file:
    schemas = json.load(schema_file)

In [26]:
# The log file sits in the repo data directory.
log_file_path = os.path.join(repo_data_dir, 'logs.txt')
logger = utils.Logger(log_file_path)
logger.info("Initialising Logger")

In [44]:
# Consolidate every per-partition gzip CSV of one table into a single gzip CSV,
# appending the schema version and top timestamp (parsed from each source
# file's path) as two extra columns.
table = 'DISPATCHLOAD'
table_source_dir = os.path.join(source_dir, table)
dest_path = os.path.join(dest_dir, table + '.csv.gz')

in_columns = list(schemas[table]['columns'].keys())
out_columns = in_columns + [version_col_name, top_timestamp_col_name]

# Each source path is expected to contain '/<version_col_name>=<digits>/' and
# '/<top_timestamp_col_name>=<digits and underscores>/' path components.
# Raw f-strings keep '\d' a regex escape rather than an (invalid) string
# escape; compiling once avoids rebuilding the patterns for every file.
version_pattern = re.compile(rf"/{re.escape(version_col_name)}=(\d+)/")
top_timestamp_pattern = re.compile(rf"/{re.escape(top_timestamp_col_name)}=([\d_]+)/")

# Write to a sibling temporary file and only move it over dest_path once the
# whole run has succeeded, so an interrupted re-run never truncates or deletes
# a previously completed output.
tmp_dest_path = dest_path + '.tmp'

try:
    utils.create_dir(file=dest_path)
    with gzip.open(tmp_dest_path, 'wt', newline='', compresslevel=2) as f_dest_str:
        writer = csv.DictWriter(f_dest_str, fieldnames=out_columns)
        writer.writeheader()

        # NOTE(review): utils.walk presumably yields the path of every file
        # under table_source_dir - confirm against utils.py.
        for csv_path in tqdm(utils.walk(table_source_dir)):
            match = version_pattern.search(csv_path)
            if match is None:
                raise ValueError(f"Unable to extract schema version from {csv_path}")
            schema_version = int(match.group(1))

            match = top_timestamp_pattern.search(csv_path)
            if match is None:
                raise ValueError(f"Unable to extract top_timestamp from {csv_path}")
            top_timestamp = match.group(1)

            # These values are constant for every row of this file, so build
            # the dict once per file instead of once per row.
            partition_values = {
                version_col_name: schema_version,
                top_timestamp_col_name: top_timestamp,
            }

            with gzip.open(csv_path, 'rt', newline='') as f_src_str:
                reader = csv.DictReader(f_src_str)
                for row in reader:
                    row.update(partition_values)
                    # DictWriter writes '' for columns missing from this row
                    # and raises ValueError for keys not in out_columns.
                    writer.writerow(row)

    # Publish the finished file (os.replace overwrites an existing dest_path).
    os.replace(tmp_dest_path, dest_path)

except BaseException:
    # Deliberately includes KeyboardInterrupt: on any failure, discard the
    # partial temporary file, then re-raise.
    if os.path.exists(tmp_dest_path):
        os.remove(tmp_dest_path)
    raise
        

216it [55:57, 15.54s/it]


KeyboardInterrupt: 