In [13]:
import os
import urllib.parse
import importlib
import re

import polars as pl


# utils is our local utility module
# if we change utils.py, and re-run a normal 'import'
# python won't reload it by default. (Since it's already loaded.)
# So we force a reload
import utils
importlib.reload(utils)

<module 'utils' from '/home/matthew/Documents/TSE/AppliedEconometrics/repo/utils.py'>

In [2]:
data_dir = "/home/matthew/data/"
source_dir = f"{data_dir}/01-D-parquet-pyarrow-dataset/"
intermediate_dir = f"{data_dir}/03-A-polars-partitioned-duplicated/DISPATCHLOAD/"

In [91]:

# get emissions data per genunit
genunits_path = os.path.join(source_dir, 'GENUNITS', 'part-0.parquet')
genset_emissions = (
    pl.scan_parquet(genunits_path)
    .filter(pl.col("GENSETTYPE") == "GENERATOR")
    .sort("LASTCHANGED", descending=True)
    .unique(subset=["GENSETID"], keep='first')
    .select("GENSETID", "CO2E_EMISSIONS_FACTOR", "MAXCAPACITY")
)

dualloc_path = os.path.join(source_dir, 'DUALLOC', 'part-0.parquet')
duid_gensetid = (
    pl.scan_parquet(dualloc_path)
    .select("DUID", "GENSETID")
)

emissions_by_duid = (
    duid_gensetid
    .join(genset_emissions, left_on="GENSETID", right_on="GENSETID")
    # now do a weighted average of CO2E_EMISSIONS_FACTOR
    # group by DUID
    # weight by MAXCAPACITY
    .with_columns((pl.col("CO2E_EMISSIONS_FACTOR") * pl.col("MAXCAPACITY")).alias("SCALED_CO2"))
    .group_by("DUID")
    .sum()
    .with_columns((pl.col("SCALED_CO2") / pl.col("MAXCAPACITY")).alias("CO2E_EMISSIONS_FACTOR"))
    .select("DUID", "CO2E_EMISSIONS_FACTOR")
)

lfs = []
for duid in ['BLOWERNG', 'BRAEMAR7']:
    dudetailsummary_path = os.path.join(source_dir, 'DUDETAILSUMMARY', 'part-0.parquet')
    region_duid = (
        pl.scan_parquet(dudetailsummary_path)
        .filter(pl.col("DUID") == pl.lit(duid))
        .filter(pl.col("REGIONID") != "SNOWY1")
        .sort("START_DATE", descending=False)
        .select("REGIONID", "DUID")
        .last()
    )
   
    
    dispatchload_path = os.path.join(intermediate_dir, "DUID=" + urllib.parse.quote(duid), 'part-0.parquet')

    lf = (
        pl.scan_parquet(dispatchload_path)
        .with_columns(pl.lit(duid).alias("DUID"))
        .join(region_duid, left_on="DUID", right_on="DUID")
        .join(emissions_by_duid, left_on="DUID", right_on="DUID")
    )
    lfs.append(lf)

lf = pl.concat(lfs)

# now group by region
# sum across SETTLEMENTDATE
lf = (
    lf
    .with_columns((pl.col("POWER") / 12).alias("ENERGY_MWH"))
    .with_columns((pl.col("ENERGY_MWH") * pl.col("CO2E_EMISSIONS_FACTOR")).alias("CO2_T"))
    .group_by("REGIONID", "SETTLEMENTDATE")
    .sum()
    .select("REGIONID", "SETTLEMENTDATE", "ENERGY_MWH", "CO2_T")
)

lf.fetch()

REGIONID,SETTLEMENTDATE,ENERGY_MWH,CO2_T
str,datetime[ms],f64,f64


In [76]:
(
    duid_gensetid
    .join(genset_emissions, left_on="GENSETID", right_on="GENSETID")
    .filter(pl.col("GENSETID").is_null())
    .fetch()
)

DUID,GENSETID,CO2E_EMISSIONS_FACTOR,MAXCAPACITY
str,str,f64,i32


In [26]:
emissions_by_duid = (
    duid_gensetid
    .join(genset_emissions, left_on="GENSETID", right_on="GENSETID")
    # now do a weighted average of CO2E_EMISSIONS_FACTOR
    # group by DUID
    # weight by MAXCAPACITY
    .with_columns((pl.col("CO2E_EMISSIONS_FACTOR") * pl.col("MAXCAPACITY")).alias("SCALED_CO2"))
    .group_by("DUID")
    .sum()
    .with_columns((pl.col("SCALED_CO2") / pl.col("MAXCAPACITY")).alias("CO2E_EMISSIONS_FACTOR"))
    .select("DUID", "CO2E_EMISSIONS_FACTOR")
)
emissions_by_duid.fetch()

NameError: name 'duid_gensetid' is not defined

In [27]:

# get emissions data per genunit
genunits_path = os.path.join(source_dir, 'GENUNITS', 'part-0.parquet')
genset_emissions = (
    pl.scan_parquet(genunits_path)
    .filter(pl.col("GENSETTYPE") == "GENERATOR")
    .sort("LASTCHANGED", descending=True)
    .unique(subset=["GENSETID"], keep='first')
    .select("GENSETID", "CO2E_EMISSIONS_FACTOR", "MAXCAPACITY")
)

dualloc_path = os.path.join(source_dir, 'DUALLOC', 'part-0.parquet')
duid_gensetid = (
    pl.scan_parquet(dualloc_path)
    .select("DUID", "GENSETID")
)

emissions_by_duid = (
    duid_gensetid
    .join(genset_emissions, left_on="GENSETID", right_on="GENSETID")
    # now do a weighted average of CO2E_EMISSIONS_FACTOR
    # group by DUID
    # weight by MAXCAPACITY
    .with_columns((pl.col("CO2E_EMISSIONS_FACTOR") * pl.col("MAXCAPACITY")).alias("SCALED_CO2"))
    .group_by("DUID")
    .sum()
    .with_columns((pl.col("SCALED_CO2") / pl.col("MAXCAPACITY")).alias("CO2E_EMISSIONS_FACTOR"))
    .select("DUID", "CO2E_EMISSIONS_FACTOR")
)


In [34]:
intermediate_dir = "/home/matthew/data/03-A-polars-partitioned-by-region/DISPATCHLOAD"
duids = set()
all_lfs = []
for regionid in ['QLD1', 'NSW1', 'VIC1', 'SA1', 'TAS1']:
    for year in range(2009, 2023+1):
        for month in range(1, 12+1):
            pq_src_dir = f"{intermediate_dir}/REGIONID={regionid}/SETTLEMENTDATE_YEAR={year}/SETTLEMENTDATE_MONTH={month}/"
            monthly_lfs = []
            for pq_src_path in utils.walk(pq_src_dir):
                match = re.search(r"DUID=([^/]+)", pq_src_path)
                assert match, f"Can't find DUID from {pq_src_path}"
                duid = urllib.parse.unquote(match.group(1))
                duids.add(duid)
                monthly_lfs.append(
                    pl.scan_parquet(pq_src_path)
                    .with_columns(pl.lit(duid).alias("DUID"))
                )
            if monthly_lfs:
                monthly_lf = (
                    pl.concat(monthly_lfs)
                    .join(emissions_by_duid, left_on="DUID", right_on="DUID")
                    .with_columns((pl.col("POWER") / 12).alias("ENERGY_MWH"))
                    .with_columns((pl.col("CO2E_EMISSIONS_FACTOR") * pl.col("ENERGY_MWH")).alias("CO2_T"))
                    .group_by("SETTLEMENTDATE")
                    .sum()
                    .select("SETTLEMENTDATE", "ENERGY_MWH", "CO2_T")
                )
                all_lfs.append(monthly_lf)
                #print(f"Found {len(monthly_lfs)} files in {pq_src_dir}")
                break
            #else:
                #print(f"Warning: no files found in {pq_src_dir}")
            
pl.concat(all_lfs).fetch()

SETTLEMENTDATE,ENERGY_MWH,CO2_T
datetime[ms],f64,f64
2009-07-02 08:20:00,9.091105,6.550852
2009-07-01 08:50:00,9.2858025,7.375932
2009-07-02 01:15:00,3.744469,3.869341
2009-07-01 12:45:00,6.867803,4.50949
2009-07-02 00:30:00,4.218469,4.431249
2009-07-02 15:20:00,6.704333,7.404383
2009-07-02 07:30:00,9.448105,6.974061
2009-07-01 01:05:00,3.841708,4.029379
2009-07-02 13:45:00,8.345552,9.410181
2009-07-02 09:50:00,8.827105,6.237891


In [19]:
urllib.parse.unquote("a%20b")

'a b'

In [10]:
intermediate_dir

'/home/matthew/data/03-A-polars-partitioned-by-region/DISPATCHLOAD'