In [102]:
import importlib
import os
import re
import urllib.parse

import polars as pl
from tqdm import tqdm


# `utils` is our local utility module (utils.py in this repository).
# A plain `import` is a no-op when the module is already loaded in the kernel,
# so edits made to utils.py would not be picked up on re-running this cell.
# Force a reload so the notebook always uses the current version of the file.
import utils
importlib.reload(utils)

<module 'utils' from '/home/matthew/Documents/TSE/AppliedEconometrics/repo/utils.py'>

In [104]:
# Filesystem layout for this notebook. All locations hang off `data_dir`.
# NOTE: `data_dir` and `source_dir` already end in "/", so the f-strings and
# os.path.join below produce paths containing a doubled "//".
data_dir = "/home/matthew/data/"
# Input: parquet tables, one sub-directory per table.
source_dir = f"{data_dir}/01-D-parquet-pyarrow-dataset/"
# Input: the DISPATCHLOAD table read by the per-DUID loop.
source_path = f"{source_dir}/DISPATCHLOAD/part-0.parquet"
# Intermediate output: DISPATCHLOAD rewritten as a partitioned dataset.
# NOTE(review): the trailing "/" may matter to utils.create_dir(file=...) — confirm before changing.
intermediate_dir = f"{data_dir}/03-A-polars-partitioned-by-region/DISPATCHLOAD/"
# Final output: per-region aggregated energy / emissions.
dest_dir = f"{data_dir}/03-A-polars-joined/"
# Input: table used to look up the region of each DUID.
dudetailsummary_path = os.path.join(source_dir, 'DUDETAILSUMMARY', 'part-0.parquet')

In [None]:
# The data has 5-minute resolution: 60 minutes / 5 minutes = 12 intervals per hour.
# Used to convert average power (MW) over an interval into energy (MWh).
intervals_per_hour = 60 // 5

In [99]:
# Every distinct DUID that appears in the DISPATCHLOAD table.
# The query is lazy and selects only the DUID column before collecting;
# the result is a one-column DataFrame.
duids = pl.scan_parquet(source_path, low_memory=True).select(
    pl.col("DUID").unique()
).collect()

In [100]:
# Flatten the one-column DataFrame of DUIDs into a plain Python list.
# This cell rebinds `duids` to a different type, so it is guarded: re-running
# it when `duids` is no longer a DataFrame is a no-op instead of raising
# AttributeError on `.to_dicts()`.
if isinstance(duids, pl.DataFrame):
    duids = duids.get_column("DUID").to_list()

In [117]:

# Emissions intensity per DUID, built lazily in three steps.

# 1. One row per generating set: keep only GENERATOR rows and, for each
#    GENSETID, the record with the latest LASTCHANGED.
genunits_path = os.path.join(source_dir, 'GENUNITS', 'part-0.parquet')
genset_emissions = (
    pl.scan_parquet(genunits_path)
    .filter(pl.col("GENSETTYPE").eq("GENERATOR"))
    .sort("LASTCHANGED", descending=True)
    .unique(subset="GENSETID", keep="first")
    .select(["GENSETID", "CO2E_EMISSIONS_FACTOR", "MAXCAPACITY"])
)

# 2. Mapping from dispatchable unit (DUID) to its generating sets.
dualloc_path = os.path.join(source_dir, 'DUALLOC', 'part-0.parquet')
duid_gensetid = pl.scan_parquet(dualloc_path).select(["DUID", "GENSETID"])

# 3. Capacity-weighted average emissions factor per DUID:
#    sum(factor * MAXCAPACITY) / sum(MAXCAPACITY) over the DUID's gensets.
emissions_by_duid = (
    duid_gensetid
    .join(genset_emissions, on="GENSETID")
    .group_by("DUID")
    .agg(
        (pl.col("CO2E_EMISSIONS_FACTOR") * pl.col("MAXCAPACITY")).sum().alias("SCALED_CO2"),
        pl.col("MAXCAPACITY").sum(),
    )
    .select(
        "DUID",
        (pl.col("SCALED_CO2") / pl.col("MAXCAPACITY")).alias("CO2E_EMISSIONS_FACTOR"),
    )
)

DUID,CO2E_EMISSIONS_FACTOR
str,f64


In [118]:
# Rewrite DISPATCHLOAD as a dataset partitioned by region / year / month / DUID.
# For every DUID this:
#   1. looks up the DUID's region,
#   2. de-duplicates its dispatch rows to one per SETTLEMENTDATE,
#   3. derives POWER as the mean of INITIALMW and the next interval's INITIALMW,
#   4. writes the result into `intermediate_dir`.
# The source file is scanned once per DUID, so this cell is slow (hours).
utils.create_dir(file=intermediate_dir)

for duid in tqdm(duids):

    # Region for this DUID: the row with the latest START_DATE, ignoring SNOWY1.
    region_duid = (
        pl.scan_parquet(dudetailsummary_path)
        .filter(pl.col("DUID") == pl.lit(duid))
        .filter(pl.col("REGIONID") != "SNOWY1")
        .sort("START_DATE", descending=False)
        .select("REGIONID", "DUID")
        .last()
        .collect()
    )

    (
        pl.scan_parquet(source_path, low_memory=False)
        .filter(pl.col("INTERVENTION") == 0)
        .filter(pl.col("DUID") == duid)
        .select(pl.exclude("INTERVENTION"))

        # now that it's filtered
        # deduplicate: order the candidates, then keep the first row per SETTLEMENTDATE
        .sort("SETTLEMENTDATE", "RUNNO", "SCHEMA_VERSION", "TOP_TIMESTAMP", "LASTCHANGED", descending=[False, False, True, True, True])
        .group_by("SETTLEMENTDATE")
        .first()
        # group_by does not guarantee the order of its output groups, and the
        # shift(-1) below is only correct on rows in ascending time order,
        # so restore that order explicitly.
        .sort("SETTLEMENTDATE")

        # now combine INITIALMW and TOTALCLEARED
        # NEXT_POWER is the value of INITIALMW in the next time (data sorted in ascending time above)
        # but for the last value (no next INITIALMW), choose TOTALCLEARED
        # the .shift() operator requires that we call .collect()
        # but by now the data is small enough to do that
        .select("SETTLEMENTDATE", "INITIALMW", "TOTALCLEARED")
        .collect()
        .with_columns(pl.coalesce(pl.col('INITIALMW').shift(-1), "TOTALCLEARED").alias("NEXT_POWER"))
        .with_columns(((pl.col("INITIALMW") + pl.col("NEXT_POWER"))/2).alias("POWER"))

        # add DUID back, as a literal (to be faster)
        .with_columns(pl.lit(duid).alias("DUID"))
        # add region (same value for all rows); inner join, so a DUID with no
        # region row produces no output rows
        .join(region_duid, left_on="DUID", right_on="DUID")

        # drop everything except what we care about
        .select("REGIONID", "DUID", "SETTLEMENTDATE", "POWER")
        .with_columns(pl.col("SETTLEMENTDATE").dt.year().alias("SETTLEMENTDATE_YEAR"))
        .with_columns(pl.col("SETTLEMENTDATE").dt.month().alias("SETTLEMENTDATE_MONTH"))
        .write_parquet(intermediate_dir,
                       use_pyarrow=True,
                       pyarrow_options={
                            "partition_cols": ["REGIONID", "SETTLEMENTDATE_YEAR", "SETTLEMENTDATE_MONTH", "DUID"],
                            "existing_data_behavior": "overwrite_or_ignore",
                            # Without a fixed template pyarrow names each file with a
                            # fresh GUID, so re-running this cell would add duplicate
                            # files next to the old ones instead of overwriting them.
                            # Files left behind by earlier runs still need to be
                            # removed by hand.
                            "basename_template": "part-{i}.parquet",
                        })
    )

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 543/543 [2:38:50<00:00, 17.55s/it]


In [None]:

# Emissions intensity per DUID (lazy). `emissions_by_duid` is consumed by the
# regional aggregation cell that follows.

# One row per generating set: GENERATOR rows only, and for each GENSETID the
# record with the latest LASTCHANGED.
genunits_path = os.path.join(source_dir, 'GENUNITS', 'part-0.parquet')
genset_emissions = (
    pl.scan_parquet(genunits_path)
    .filter(pl.col("GENSETTYPE").eq("GENERATOR"))
    .sort("LASTCHANGED", descending=True)
    .unique(subset="GENSETID", keep="first")
    .select(["GENSETID", "CO2E_EMISSIONS_FACTOR", "MAXCAPACITY"])
)

# Mapping from dispatchable unit (DUID) to its generating sets.
dualloc_path = os.path.join(source_dir, 'DUALLOC', 'part-0.parquet')
duid_gensetid = pl.scan_parquet(dualloc_path).select(["DUID", "GENSETID"])

# Capacity-weighted average emissions factor per DUID:
# sum(factor * MAXCAPACITY) / sum(MAXCAPACITY) over the DUID's gensets.
emissions_by_duid = (
    duid_gensetid
    .join(genset_emissions, on="GENSETID")
    .group_by("DUID")
    .agg(
        (pl.col("CO2E_EMISSIONS_FACTOR") * pl.col("MAXCAPACITY")).sum().alias("SCALED_CO2"),
        pl.col("MAXCAPACITY").sum(),
    )
    .select(
        "DUID",
        (pl.col("SCALED_CO2") / pl.col("MAXCAPACITY")).alias("CO2E_EMISSIONS_FACTOR"),
    )
)


In [95]:
# Aggregate the per-DUID intermediate files into one time series per region.
# For every region and every (year, month) partition:
#   * read each DUID's parquet file and tag its rows with the DUID parsed from the path,
#   * attach the DUID's emissions factor,
#   * convert interval power (MW) to energy (MWh) and emissions (CO2_T),
#   * sum over DUIDs for each SETTLEMENTDATE.
# All months of a region are then concatenated and written to a single file.
# Requires `emissions_by_duid`, `intervals_per_hour` and `dest_dir` from earlier cells.
intermediate_dir = "/home/matthew/data/03-A-polars-partitioned-by-region/DISPATCHLOAD"
# Every DUID encountered while walking the dataset.
# NOTE: this rebinds `duids`, replacing the list built earlier in the notebook.
duids = set()

for regionid in tqdm(['QLD1', 'NSW1', 'VIC1', 'SA1', 'TAS1']):
    region_lfs = []
    for year in range(2009, 2023+1):
        for month in range(1, 12+1):
            pq_src_dir = f"{intermediate_dir}/REGIONID={regionid}/SETTLEMENTDATE_YEAR={year}/SETTLEMENTDATE_MONTH={month}/"
            monthly_lfs = []
            # presumably utils.walk yields nothing for a missing directory — verify in utils.py
            for pq_src_path in utils.walk(pq_src_dir):
                # The DUID is not stored in the file; recover it from the
                # (URL-encoded) "DUID=<value>" partition directory name.
                match = re.search(r"DUID=([^/]+)", pq_src_path)
                assert match, f"Can't find DUID from {pq_src_path}"
                duid = urllib.parse.unquote(match.group(1))
                duids.add(duid)
                monthly_lfs.append(
                    pl.scan_parquet(pq_src_path)
                    .with_columns(pl.lit(duid).alias("DUID"))
                )
            if not monthly_lfs:
                # No data for this region/month (e.g. outside the data's date range).
                continue
            monthly_lf = (
                pl.concat(monthly_lfs)
                # Inner join: DUIDs absent from emissions_by_duid are dropped, so
                # their energy is not counted. NOTE(review): confirm this is intended.
                .join(emissions_by_duid, left_on="DUID", right_on="DUID")
                .with_columns((pl.col("POWER") / intervals_per_hour).alias("ENERGY_MWH"))
                .with_columns((pl.col("CO2E_EMISSIONS_FACTOR") * pl.col("ENERGY_MWH")).alias("CO2_T"))
                .group_by("SETTLEMENTDATE")
                # Sum only the two output measures rather than every column
                # (the frame also carries the string DUID column).
                .agg(
                    pl.col("ENERGY_MWH").sum(),
                    pl.col("CO2_T").sum(),
                )
                .select("SETTLEMENTDATE", "ENERGY_MWH", "CO2_T")
            )
            # Every month must be kept: a stray `break` here previously stopped
            # after the first month with data in each year.
            region_lfs.append(monthly_lf)

    if not region_lfs:
        # pl.concat cannot take an empty list; skip regions with no data at all.
        tqdm.write(f"Warning: no files found for region {regionid} in {intermediate_dir}")
        continue

    dest_path = f"{dest_dir}/REGIONID={regionid}/part-0.parquet"
    # Make sure the target directory exists before writing into it.
    os.makedirs(os.path.dirname(dest_path), exist_ok=True)
    pl.concat(region_lfs).sink_parquet(dest_path)

REGIONID,DUID
str,str
"""QLD1""","""CALL_A_2"""
