In [None]:
import zipfile
import os
from multiprocessing import Pool, current_process
import shutil
from typing import List, Dict, Tuple, Optional, TextIO, BinaryIO
import csv
import traceback
import json
from random import shuffle
import re
from uuid import uuid4
import io
import gzip

from tqdm import tqdm # progress bar animation
import pandas as pd 

## Path constants

You will probably need to change these if you are rerunning the code on your own machine.

In [None]:
# Root locations: two external disks plus the data folder inside the repository
repo_data_dir = '/home/matthew/Documents/TSE/AppliedEconometrics/repo/data/'
disk_one = '/media/matthew/Tux/AppliedEconometrics/data'
disk_two = '/media/matthew/nemweb/AppliedEconometrics/data'

# Raw inputs to read, and the folder they are moved to once processed
source_dir, archive_dir = (
    os.path.join(disk_one, folder) for folder in ('01-A-raw', '01-A-raw-done')
)
#source_dir = os.path.join(repo_data_dir, 'debug', 'test-in')

# Outputs are written under temp_dir first, then moved under dest_dir
temp_dir, dest_dir = (
    os.path.join(repo_data_dir, folder)
    for folder in ('01-B-split-mapped-csv-temp/', '01-E-split-mapped-csv/')
)

# Debugging aids: copies of inputs that could not be processed, and the log file
debug_path = os.path.join(repo_data_dir, 'debug/')
bad_zip_dir, bad_csv_dir = (
    os.path.join(disk_two, folder) for folder in ('error/bad_zip/', 'error/bad_csv/')
)
log_file = os.path.join(repo_data_dir, 'logs.txt')

# Spreadsheet used to map report name/subtype pairs to table names
metadata_path = os.path.join(repo_data_dir, 'MMS_Data_Model_Table_to_File_to_Report_Relationships_51.xlsx')

In [None]:
# When a raw file has been processed, move it to archive_dir,
# so if we stop and restart the notebook, we don't redo that file.
# Set to True when you think the code is good.
# Set to False when still writing the code and testing,
# or if you've changed any of the code.
move_when_done = True


# Process files with multiprocessing, to be faster.
# If set to False, will use a for loop, which gives clearer traceback error messages.
use_multiprocessing = True

# Set to True to leave some CPU unused when multiprocessing,
# so that you can still do other stuff on your laptop without your internet browser or whatever being laggy
leave_unused_cpu = True

if leave_unused_cpu:
    # Leave 2 logical CPUs free (i.e. one physical core, assuming hyperthreading).
    # os.cpu_count() can return None, and on a machine with 1 or 2 CPUs the
    # subtraction would give 0 or less, which multiprocessing.Pool rejects,
    # so always keep at least one worker.
    num_processes = max(1, (os.cpu_count() or 1) - 2)
else:
    num_processes = os.cpu_count() or 1

# for Matt, this ended up being optimal
# higher means more CPUs
# but too high and the disk is the bottleneck. Don't do too many concurrent writes.
#num_processes = 4

# How hardcore the gzip algorithm is for .csv.gz files. Low to prioritise speed over size
# but if you have an external hard drive, too low is probably slow and large
# (it's about disk write speed vs CPU compression speed)
compresslevel = 6

## Other constants

You'll rarely need to change these.

In [None]:
# Where to look inside the spreadsheet at metadata_path:
# the worksheet name, then the three columns we read from it
sheet = 'MMSDM_v5-1'
table_name_column = 'MMSDM_TABLE_NAME'
report_name_column = 'PDR_REPORT_NAME'
report_subtype_column = 'PDR_REPORT_SUB_TYPE'

# Text substituted for a missing report name / report subtype
REPORT_NAME_NULL_PLACEHOLDER = REPORT_SUBTYPE_NULL_PLACEHOLDER = 'NULL'

# AEMO data is always in UTC+10
TIMEZONE_OFFSET = 10

In [None]:
# I've found some particular files have funny encodings
# just ignore them. We don't need them for our analysis.
# Each entry is a regex pattern, tested with re.search against the CSV file name.
csvgz_files_to_ignore = [
    # dots are escaped so they only match a literal '.', not any character
    r"7_days_High_Impact_Outages_\d+\.csv\.gz",
    r"High_Impact_Outages_\d+",
]

# Stuff that should go in a shared library

In [None]:
# On Linux and Mac this makes this python process lower priority,
# so when it's running and using up all your CPU, your interface won't lag.
# So you can keep browsing the web, typing documents etc
try:
    os.nice(20)
except (AttributeError, OSError):
    # AttributeError: os.nice does not exist on this platform (e.g. Windows)
    # OSError: the operating system refused to change the priority
    # Either way this is only a nicety, so carry on at normal priority.
    pass

In [None]:
class Logger:
    """Tiny file logger: one message per line, prefixed with its level.

    Creating a Logger empties the file at `path`, then keeps it open in
    append mode. Every message is flushed as soon as it is written.
    """

    def __init__(self, path=log_file):
        self.path = path
        # truncate first, then hold an append-mode handle for all later writes
        self.reset()
        self.f = open(self.path, 'a')

    def reset(self):
        """Erase the log file, leaving an empty file in place."""
        # opening in `w` mode and writing nothing truncates the file
        open(self.path, 'w').close()

    def flush(self):
        """Flush buffered text to the file."""
        self.f.flush()

    def close(self):
        """Close the underlying file handle."""
        self.f.close()

    def write(self, msg, flush=None):
        """Write `msg` as a single line, minus trailing whitespace, then flush.

        `flush` is accepted but unused: the file is flushed unconditionally.
        """
        line = msg.rstrip() + '\n'
        self.f.write(line)
        self.flush()

    def debug(self, msg):
        """Log `msg` at DEBUG level."""
        self.write(f"DEBUG: {msg}")

    def info(self, msg):
        """Log `msg` at INFO level."""
        self.write(f"INFO: {msg}")

    def warn(self, msg, flush=None):
        """Log `msg` at WARNING level (`flush` is accepted but unused)."""
        self.write(f"WARNING: {msg}")

    def warning(self, msg):
        """Alias of `warn`."""
        self.warn(msg)

    def error(self, msg):
        """Log `msg` at ERROR level."""
        self.write(f"ERROR: {msg}")


logger = Logger()
logger.info("Initialising Logger")

# Zip and split

## Unzip

In [None]:
# make sure the working folders exist before anything is written to them
for folder in (temp_dir, dest_dir, archive_dir):
    if os.path.exists(folder):
        continue
    os.makedirs(folder)

In [None]:
# Wrapper around process_raw() for use when processing many files:
# catch (almost) all errors, and return them instead of raising,
# then after we process all files, check these errors.
def _process_raw(path) -> Optional[Exception]:
    """Process one raw file and archive it, reporting failure as a return value.

    Parameters:
        path: path of a raw file on disk, handed to process_raw().

    Returns:
        None on success, otherwise the exception that was raised while
        processing or archiving the file (also written to the log).

    KeyboardInterrupt is not an Exception subclass, so it is not caught here
    and still stops the run.
    """
    try:
        process_raw(path)
    except Exception as ex:
        logger.error(f"Failed to unzip {path}: {ex}")
        return ex

    if move_when_done:
        # move the file to another folder,
        # so if we stop and restart, we will not process it again
        archive_path = os.path.join(archive_dir, os.path.basename(path))
        try:
            os.rename(path, archive_path)
        except OSError as ex:
            # report this like any other failure, rather than aborting the whole batch
            logger.error(f"Failed to archive {path} to {archive_path}: {ex}")
            return ex
    return None

# Entry point for one raw file on disk, given as a path string.
# .csv and .zip files are processed; anything else is logged and ignored.
def process_raw(path: str):
    """Dispatch `path` to process_csv or process_zip based on its extension.

    The extension check is case-insensitive. CSV files are opened as text with
    newline='' (as the csv module wants), zip files as binary.
    """
    lowered = path.lower()

    if lowered.endswith('.csv'):
        with open(path, 'r', newline='') as text_file:
            process_csv([path], text_file)
        return

    if lowered.endswith('.zip'):
        with open(path, 'rb') as binary_file:
            process_zip([path], binary_file)
        return

    logger.warning(f"Ignoring file {path}")

# fnames is the chain of file names leading to this zip,
# e.g. if a/myzip.zip on disk contains otherzip.zip,
# then for the inner zip fnames is ['a/myzip.zip', 'otherzip.zip'].
# The names are only used for debugging and logging;
# the content is read from the file-like object f.
def process_zip(fnames: List[str], f: BinaryIO):
    """Process every member of the zip readable from `f`.

    Members ending in .zip are processed recursively, members ending in .csv
    are passed to process_csv as text, and anything else is logged as unknown.
    An error in one member does not stop the others: errors are collected and
    the first one is raised once all members have been tried.

    If the archive itself is not a valid zip, a copy of it and an
    `.error.txt` report are written under bad_zip_dir and the BadZipFile
    error is re-raised.
    """
    if isinstance(fnames, str):
        fnames = [fnames]

    deferred_errors = []
    try:
        with zipfile.ZipFile(f) as archive:
            for member in archive.namelist():
                member_lower = member.lower()
                try:
                    if member_lower.endswith('.zip'):
                        with archive.open(member, mode='r') as inner_zip:
                            process_zip(fnames + [member], inner_zip)
                    elif member_lower.endswith('.csv'):
                        with archive.open(member, mode='r') as raw_csv, \
                                io.TextIOWrapper(raw_csv, newline='') as text_csv:
                            process_csv(fnames + [member], text_csv)
                    else:
                        paths_debug = ' -> '.join(fnames)
                        logger.warning(f"Found unknown file {member} in zip {paths_debug}")
                except KeyboardInterrupt:
                    raise
                except Exception as member_error:
                    # remember it, keep going with the remaining members,
                    # and raise it after the loop
                    deferred_errors.append(member_error)
    except zipfile.BadZipFile as bad_zip:
        # keep a copy of the broken archive for later inspection
        f.seek(0)  # rewind, so the whole file is copied
        debug_path = os.path.join(bad_zip_dir, os.path.basename(fnames[-1]))
        debug_dir = os.path.dirname(debug_path)
        if not os.path.exists(debug_dir):
            try:
                os.makedirs(debug_dir)
            except FileExistsError:
                # another process created it first
                pass
        with open(debug_path, 'wb') as copy_out:
            shutil.copyfileobj(f, copy_out)

        # write the error details next to the copy
        with open(debug_path + '.error.txt', 'w') as report:
            report.write(f"Issue is in file heirachy: {fnames}\n")
            report.write(str(bad_zip) + "\n")
            traceback.print_exc(file=report)
        raise

    if deferred_errors:
        # surface the first failure seen among this zip's members
        raise deferred_errors[0]

## Split

In [None]:
class AtomicFile: #(io.TextIOBase):
    """Text file that is written at a temporary path and moved into place on close.

    Data is written to `temp_path` (gzip-compressed when that path ends in
    '.gz'). `close()` moves the finished file to `final_path`, replacing any
    existing file there; `abort()` discards it instead. Parent directories are
    created as needed, and the temporary file's directory is removed again
    once it is empty.
    """

    # newline='' is for csv.writer
    def __init__(self, temp_path, final_path, newline=''):
        self.temp_path = temp_path
        self.final_path = final_path

        # create destination folder if it doesn't exist
        self._ensure_parent_dir(temp_path)
        if temp_path.lower().endswith('.gz'):
            self.temp_f = gzip.open(temp_path, mode='wt', compresslevel=compresslevel, newline=newline)
        else:
            self.temp_f = open(temp_path, mode='w', newline=newline)
        self.closed = False

    @staticmethod
    def _ensure_parent_dir(path):
        """Create the directory containing `path`, if it has one and it is missing."""
        parent = os.path.dirname(path)
        if parent:
            # exist_ok avoids the check-then-create race when several
            # processes create the same folder at once
            os.makedirs(parent, exist_ok=True)

    def read(self, size=-1):
        """Read from the temporary file (delegates to the underlying file object)."""
        assert not self.closed
        return self.temp_f.read(size)

    def write(self, s):
        """Write text `s` to the temporary file and return the underlying write() result."""
        assert not self.closed
        return self.temp_f.write(s)

    def close(self):
        """Finish writing and move the temporary file to `final_path`.

        Does nothing if the file was already closed or aborted.
        """
        if not self.closed:
            self.temp_f.close()
            self.closed = True

            self._ensure_parent_dir(self.final_path)

            # os.replace overwrites an existing destination on every platform,
            # whereas os.rename fails on Windows if the destination exists
            os.replace(self.temp_path, self.final_path)
            self.delete_temp_dir()

    def delete_temp_file(self):
        """Delete the temporary file, if it exists."""
        try:
            os.remove(self.temp_path)
        except FileNotFoundError:
            pass

    def delete_final_file(self):
        """Delete the file at `final_path`, if it exists."""
        try:
            os.remove(self.final_path)
        except FileNotFoundError:
            pass

    def delete_temp_dir(self):
        """Remove the temporary file's directory if it is now empty."""
        # in our case, the parent directory is likely now empty
        # delete it, if there are no other files in it
        try:
            os.rmdir(os.path.dirname(self.temp_path))
        except OSError:
            # another file is in the same directory
            # (unlikely, given how we partition the directory)
            pass

    # like close, but delete everything
    def abort(self):
        """Close and delete the temporary file without creating `final_path`.

        Does nothing if the file was already closed or aborted.
        """
        if not self.closed:
            self.temp_f.close()
            self.closed = True
            self.delete_temp_file()
            self.delete_temp_dir()

    def tell(self):
        """Return the current position reported by the underlying file object."""
        return self.temp_f.tell()


In [None]:

# takes a CSV
# fnames is a list of files
# first is the raw file on disk
# last is the CSV itself
# e.g. if this is a.csv inside b.zip inside c.zip
# fnames is ['c.zip', 'b.zip', 'a.csv']
#
# we write to a random filename, then move to the destination
# so that if two processes are unzipping different files to the same destination concurrently, they won't corrupt the file.
def process_csv(fnames: List[str], f_in: TextIO) -> int:
    """Split one multi-section CSV into one gzipped CSV per 'I' section.

    The input is read row by row. The first cell of each row gives its type:
      * 'C' - comment row. The first row must be one (it carries the file
        timestamp); a 'C' row whose second cell is 'END OF REPORT' ends the file.
      * 'I' - header row that starts a new section (report name, subtype,
        version, then the column names).
      * 'D' - data row belonging to the most recent 'I' row.

    For each section that maps to a table we keep, the column names and data
    rows (minus the first four cells) are written to
    dest_dir/<report name>/<subtype>/SCHEMA_VERSION=<v>/TOP_TIMESTAMP=<t>/<csv name>.gz

    Parameters:
        fnames: chain of file names leading to this CSV, used for logging and
            to name the output. A bare string is treated as a one-item list.
        f_in: text file object to read the CSV from.

    Returns:
        The number of characters written to output files, before compression
        (the same as bytes for ASCII data). 0 if the file is ignored.

    Raises:
        Whatever error occurred while splitting, after a copy of the input
        and an `.error.txt` report have been written under bad_csv_dir.
        Errors in files matching csvgz_files_to_ignore are logged and
        swallowed instead.
    """
    if isinstance(fnames, str):
        fnames = [fnames]
    csv_name = os.path.basename(fnames[-1])

    # Names used for the hive-style partition folders
    version_col_name = 'SCHEMA_VERSION'
    top_timestamp_col_name = 'TOP_TIMESTAMP'
    # leading cells of I/D rows that are not data: row type, report name, subtype, version
    cols_to_skip = 4

    # Everything the error handler looks at is initialised up front,
    # so the handler never has to guess whether a name exists yet.
    bytes_written = 0
    csv_r = None
    record_num = 0
    f_out = None
    csv_w = None
    temp_path = None
    skip = None  # None until the first I row has been seen
    num_d_rows = 0

    try:
        csv_r = csv.reader(f_in)
        first_row = next(csv_r)
        assert isinstance(first_row[0], str), f"First row is wrong type, probably bytes not string. Is {type(first_row[0])}"
        # this keeps track of the 'row'
        # but note that some data values are text including a newline character
        # this is the number of records in the spreadsheet, not lines of text
        record_num = 1
        if first_row[0] != 'C':
            # some other files end up in the dataset
            # e.g. int668_v1_schedule_log_rpt_1~20231124133149.csv.gz
            # just ignore files like that
            logger.warning(f"First line does not start with C, {first_row[:10]=} in {fnames}. Ignoring")
            return bytes_written

        # the first row contains a timestamp for when the file was created
        # e.g. 2020/06/04,00:03:11 (as two CSV cells)
        # change that to 2020_06_04_00_03_11
        # (only characters that can go in a folder name)
        top_timestamp_s = first_row[5]
        if len(first_row) >= 7:
            top_timestamp_s = top_timestamp_s + '_' + first_row[6]
        for bad_char in ('/', ' ', ':', '\\'):
            top_timestamp_s = top_timestamp_s.replace(bad_char, '_')
        assert '/' not in top_timestamp_s, f"Bad top timestamp: {first_row=}"

        row = next(csv_r)
        record_num += 1

        while True:
            kind = row[0]

            if kind in ('C', 'I'):
                # close off the last output file
                csv_w = None
                if f_out is not None:
                    f_out.close()
                    if num_d_rows == 0:
                        logger.warning(f"No data rows, only header row written for {fnames}, so deleting output file")
                        f_out.delete_final_file()
                    f_out = None

            if kind == 'C':
                # 2nd C line, which should be the last line of the file
                if row[1] == 'END OF REPORT':
                    assert f_in.read().strip() == '', "Remainder after end of file"

                    checksum = int(row[-1])
                    if csv_name != 'PUBLIC_PDR_CONFIG_non_mms_data_model.CSV':
                        # sometimes AEMO do the checksum wrong, and there's a -1 there
                        assert checksum in (record_num, record_num - 1), f"Checksum on last line doesn't match in {fnames}, {checksum=} {record_num=}"

                    break
                elif row[1] == 'SETTLEMENTS RESIDUE CONTRACT REPORT':
                    logger.info(f"Ignoring C line {csv_r.line_num} in {fnames}: {row[1]}")
                else:
                    # additional C lines can be used for arbitrary comments
                    logger.warning(f"Unexpected C line {csv_r.line_num} in {fnames}: {row}")
            elif kind == 'I':
                # start of a new section, i.e. a new output file
                # remainder is the list of column names
                _, report_name, report_subtype, version, *remainder = row

                if report_name in ['', None]:
                    report_name = REPORT_NAME_NULL_PLACEHOLDER
                if report_subtype in ['', None]:
                    report_subtype = REPORT_SUBTYPE_NULL_PLACEHOLDER

                # which SQL 'table' does this map to?
                # e.g. 'DISPATCH', 'PRICE' maps to 'DISPATCHPRICE'
                table = map_table(report_name, report_subtype)

                # NOTE(review): this test used to read `table not in tables_to_skip`,
                # which wrote ONLY the tables listed as "skipped" and dropped
                # everything else. It now matches what tables_to_skip is
                # documented to mean.
                if (table is None) or (table in tables_to_skip):
                    if (table is None) and ((report_name, report_subtype) not in packages_to_ignore):
                        logger.warning(f"Skipping/ignoring {report_name}, {report_subtype} -> {table} in {fnames} with columns {remainder}")
                    skip = True
                else:
                    # for the file, include two columns in the folder, not the file itself
                    # i.e. hive-style partitioning
                    # (this saves space compared to repeating it on each line.)
                    # we use these values for deduplicating rows across files.
                    subdir = os.path.join(report_name, report_subtype, f"{version_col_name}={version}", f"{top_timestamp_col_name}={top_timestamp_s}")

                    final_path = os.path.join(dest_dir, subdir, csv_name + '.gz')
                    temp_path = os.path.join(temp_dir, subdir, csv_name + '.' + str(uuid4()) + '.tmp.gz')

                    f_out = AtomicFile(temp_path, final_path)
                    csv_w = csv.writer(f_out)
                    # writerow returns whatever the file's write() returned
                    bytes_written += csv_w.writerow(remainder) or 0
                    skip = False

                num_d_rows = 0
            else:
                assert kind == 'D', f"Unexpected non-D row: {row[:100]}"
                assert skip is not None, f"D row before any I row: {row[:100]}"
                if not skip:
                    assert csv_w is not None, f"D row with no open output file: {row[:100]}"
                    bytes_written += csv_w.writerow(row[cols_to_skip:]) or 0
                num_d_rows += 1

            row = next(csv_r)
            record_num += 1
        return bytes_written
    except Exception as e:
        # (KeyboardInterrupt is not an Exception, so it passes straight through)
        # discard the output file if one is still open
        if f_out is not None:
            f_out.abort()
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                # already moved or deleted
                pass
        if any(re.search(ptn, csv_name) for ptn in csvgz_files_to_ignore):
            logger.info(f"Ignoring error splitting {csv_name}")
            return bytes_written
        else:
            # save the file itself (since it's inside a zip, it's annoying to find for debugging)
            f_in.seek(0) # go back to the start of the file
            debug_path = os.path.join(bad_csv_dir, csv_name) + '.gz'
            debug_dir = os.path.dirname(debug_path)
            if debug_dir:
                # exist_ok: another process may create it at the same time
                os.makedirs(debug_dir, exist_ok=True)
            with gzip.open(debug_path, 'wt') as f_out_copy:
                s = f_in.read(1)
                assert isinstance(s, str), f"s is {type(s)}"
                f_in.seek(0)
                shutil.copyfileobj(f_in, f_out_copy)

            with open(debug_path + '.error.txt', 'w') as f_err:
                f_err.write(f"Issue is in file heirachy: {fnames}\n")
                if csv_r is not None:
                    f_err.write(f"On text line {csv_r.line_num}\n")
                f_err.write(f"CSV record {record_num}\n")
                f_err.write(str(e) + '\n')
                traceback.print_exc(file=f_err)
            raise

## Map

In [None]:
# Load the spreadsheet that relates report name + report subtype to table name
metadata_df = pd.read_excel(metadata_path, sheet_name=sheet)
# Replace missing report names/subtypes with the placeholder text.
# Assign the result back: calling fillna(inplace=True) on a selected column
# is chained assignment, which pandas warns about and which leaves
# metadata_df unchanged once Copy-on-Write is enabled.
metadata_df[report_name_column] = metadata_df[report_name_column].fillna(REPORT_NAME_NULL_PLACEHOLDER)
metadata_df[report_subtype_column] = metadata_df[report_subtype_column].fillna(REPORT_SUBTYPE_NULL_PLACEHOLDER)

# There are some packages I do not know how to map to tables.
# Each entry is a (report name, report subtype) pair.
# Pairs listed here are skipped without logging a warning.
packages_to_ignore = [
    # maybe there's a table for these,
    # or they're legacy tables
    # these are ones we don't care about
    ('DINT', REPORT_SUBTYPE_NULL_PLACEHOLDER),
    ('TINT', REPORT_SUBTYPE_NULL_PLACEHOLDER),
    ('DCONS', REPORT_SUBTYPE_NULL_PLACEHOLDER),
    ('DREGION', REPORT_SUBTYPE_NULL_PLACEHOLDER),
    ('SPDCPC', REPORT_SUBTYPE_NULL_PLACEHOLDER),
    ('SRAFINANCIALS', 'RECONCILIATION_SUMMARY'),
    ('DAILY', 'MLF'), # electrical transmission loss factors.
    ('BILLING_CONFIG', 'BILLSMELTERRATE'), # aluminium smelter info. Possibly relevant? (Smeltering makes up 30% of NSW load)
    ('BILLING', 'CSP_SUPPORTDATA_SRA'),
    ('BILLING', 'ASPAYMENT_SUMMARY'), # probably belongs to BILLINGASPAYMENTS table, but this is not relevant to us so I haven't bothered checking
    ('BILLING', 'DIRECTION_CRA'),
    ('TRADING', 'CUMULATIVE_PRICE'),
    ('TREGION', REPORT_SUBTYPE_NULL_PLACEHOLDER),
    ('DAILY', 'WDR_NO_SCADA'),
    ('METER_DATA', 'GEN_DUID'),
    ('SEVENDAYOUTLOOK', 'PEAK'),
    ('TUNIT', REPORT_SUBTYPE_NULL_PLACEHOLDER),
    ('GPG', 'MARKET_SUMMARY'),
    ('GPG', 'CASESOLUTION'),
    ('GPG', 'CONSTRAINTSOLUTION'),
    ('GPG', 'PRICESOLUTION'),
    ('GPG', 'INTERCONNECTORSOLUTION'),
    ('CAUSER_PAYS_SCADA', 'NETWORK'),
    ('DUNIT', REPORT_SUBTYPE_NULL_PLACEHOLDER),
    ('YESTBID', 'BIDDAYOFFER'),
    ('YESTBID', 'BIDPEROFFER'),
    ('RESIDUE_PRICE_OFFER', REPORT_SUBTYPE_NULL_PLACEHOLDER),
    ('RESIDUE_PRICE_BID', REPORT_SUBTYPE_NULL_PLACEHOLDER), # columns don't match the RESIDUE_PRICE_BID table, almost match RESIDUE_PRICE_FUNDS_BID
    ('IBEI', 'PUBLISHING'),
    ('EMSLIMITS', 'LIM_ALTLIM'),
    ('DEMAND', 'HISTORIC'),

    # manifest metadata
    ('PDR_REPORT', 'TABLE'),
    ('PDR_REPORT', 'FILE'),
    ('PDR_REPORT', 'MAPPING'),
    ('PDR_REPORT', 'COLUMN'),

    # Gas data
    ('GSH', 'PARTICIPANT_OPSTATE'),
    ('GSH', 'PARTICIPANTS'),
    ('GSH', 'AUCTION_CURTAILMENT_NOTICE'),
    ('GSH', 'AUCTION_PRICE_VOLUME'),
    ('GSH', 'BENCHMARK_PRICE'),
    ('GSH', 'PARK_SERVICES'),
    ('GSH', 'AUCTION_QUANTITIES'),
    ('GSH', 'FACILITIES'),
    ('GSH', 'TRANSACTION_SUMMARY'),
    ('GSH', 'HISTORICAL_SUMMARY'),
    ('GSH', 'NOTIONAL_POINTS'),
    ('GSH', 'REVISED_AUCTION_QUANTITIES'),
    ('GSH', 'PIPELINE_SEGMENTS'),
    ('GSH', 'CAPACITY_TRANSACTION'),
    ('GSH', 'ZONES'),
    ('GSH', 'SERVICE_POINTS'),

    # we should maybe double check
    ('FORCE_MAJEURE', 'MARKETSUSREGION'),
    ('FORCE_MAJEURE', 'MARKETSUSPENSION'),
]


# There are some packages which aren't in the spreadsheet, but we can guess them.
# Keys are (report name, report subtype) pairs, values are the table name to use.
# In particular there's a CO2 one (last entry) that gets a table name of its own.
package_exceptions = {
    ('DISPATCH', 'CASESOLUTION'): 'DISPATCHCASESOLUTION',
    ('GENCONSETTRK', REPORT_SUBTYPE_NULL_PLACEHOLDER): 'GENCONSETTRK',
    ('DISPATCH', 'REGIONFCASREQUIREMENT'): 'DISPATCH_FCAS_REQ',

    # this one is possibly useful
    # example file is CO2EII_AVAILABLE_GENERATORS.CSV.gz
    # It looks like it might go into BILLING_CO2E_PUBLICATION, but the columns are different
    # Example:
    #STATIONNAME,DUID,GENSETID,REGIONID,CO2E_EMISSIONS_FACTOR,CO2E_ENERGY_SOURCE,CO2E_DATA_SOURCE
    #"Appin Power Plant",APPIN,APPIN,NSW1,0.56318004,"Coal seam methane",NGA2022
    # so define a new table for it
    ('CO2EII', 'PUBLISHING'): 'CO2EII_AVAILABLE_GENERATORS',
}


In [None]:
# Table names (as returned by map_table) that we skip just because they are huge.
# Most of these are each larger than the 200 smallest tables.
tables_to_skip = [
    'P5MIN_UNITSOLUTION', 
    'P5MIN_CASESOLUTION',
    'P5MIN_CONSTRAINTSOLUTION', 
    'P5MIN_LOCALPRICE', # not what people get paid. Related to electrical constraints
    'DISPATCH_LOCALPRICE', # not what people get paid. Related to electrical constraints
    'NETWORK_OUTAGEDETAIL',
    'MCC_CASESOLUTION',
    'MCC_CONSTRAINTSOLUTION',
    'DISPATCHOFFERTRK', 
    'DISPATCHCASESOLUTION',
    'P5MIN_REGIONSOLUTION', 
    'DISPATCHCONSTRAINT',
    'SETFCASREGIONRECOVERY',
    'TRADINGPRICE', # surprisingly big
]

In [None]:

# Look up which table a (report name, report subtype) pair belongs to.
# Returns None if we can't find exactly one table.
def map_table(report_name, report_subtype) -> Optional[str]:
    """Return the table name for a report name/subtype pair, or None.

    The pair is looked up in metadata_df. If the spreadsheet has no match,
    package_exceptions is consulted. If the matching rows name more than one
    distinct table (or there is no match anywhere), None is returned.
    """
    name_matches = metadata_df[report_name_column] == report_name
    subtype_matches = metadata_df[report_subtype_column] == report_subtype
    # duplicate rows are fine, as long as they all name the same table
    distinct_tables = set(metadata_df.loc[name_matches & subtype_matches, table_name_column])

    key = (report_name, report_subtype)
    if not distinct_tables and key in package_exceptions:
        return package_exceptions[key]

    if len(distinct_tables) == 1:
        (only_table,) = distinct_tables
        return only_table
    return None

# unit testing
assert map_table('DISPATCH', 'PRICE') == 'DISPATCHPRICE'

## Run the above code

In [None]:

# __name__ thing to fix a multiprocessing issue
if __name__ == '__main__':
    logger.reset()

    files = [os.path.join(source_dir, p) for p in os.listdir(source_dir)]

    # shuffle file order
    # so that we don't do all the small files first
    # then all the large files last
    # (or the other way around)
    # this makes the progress bar more accurate
    shuffle(files) # mutates list in place

    print("Starting")

    # One status per file, in the same order as `files`:
    # None for success, otherwise the exception that was raised.
    # (imap keeps the input order, so statuses lines up with files.)
    if use_multiprocessing:
        with Pool(num_processes) as p:
            statuses = list(tqdm(p.imap(_process_raw, files), total=len(files)))
    else:
        statuses = [_process_raw(file) for file in tqdm(files)]
    assert all(s is None for s in statuses), "some files failed to be processed, to debug, run the next cell"

In [None]:
# List the files that failed, with the start of each error message.
# Check the files which didn't download properly:
# do they matter for our case? Can you find the data elsewhere?
# e.g. PUBLIC_ROOFTOP_PV_ACTUAL_MEASUREMENT_20231226053000_0000000406806404.zip was corrupted
# but can be found inside PUBLIC_ROOFTOP_PV_ACTUAL_MEASUREMENT_20231221.zip
[(str(status)[:100], path) for status, path in zip(statuses, files) if status]