# Create Load Control Analysis

This script creates an output time series that can be analyzed in Power BI to find improperly functioning load controllers. The output time series has data for every timestamp of a meter during load control event(s). To be in the dataset, a meter must have all of its load controllers being controlled at that timestamp. For example, a meter with both dual fuel and water heater controllers on it will only have timestamps in the output dataset when there are load control events controlling both devices.

The script incrementally appends the output time series for new load control events that occurred since the last output save.  

There are four inputs to this process:
- A time series that represents the load control events.  This is created by upstream jobs and updated daily.
- A map that ties together load control events and controlled load types. This is manually maintained.
- Information for each meter that has one or more controlled loads.  This comes from iVUE and is updated daily.
- The full indexed meter data that is updated daily in upstream jobs.

In [0]:
%run ../../Utilities/ConfigUtilities

In [0]:
from pyspark.sql.types import IntegerType, StructType, StructField, StringType
from pyspark.sql.functions import max, min, count, when, sum, concat_ws, col, to_timestamp, lit
from delta.tables import *

# Debug switch: any truthy value makes the `if debug:` checks in this notebook
# print and display intermediate results.
debug = 1

### Get the load control, event to type map, and meter information.

In [0]:
# To incrementally update the output table, we need to find the last load control event that has been processed.
# The high-water mark is the largest EndMeterSampleIndex already saved in the output table.
try:
    lc_df_iwh_df = DeltaTable.forPath(spark, LOAD_CONTROL_METERTS_PATH).toDF()
    # `max` is the Spark aggregate imported from pyspark.sql.functions, not the Python builtin.
    last_processed_index = lc_df_iwh_df.select(max('EndMeterSampleIndex')).collect()[0][0]
except Exception as err:   # Output table could not be read (e.g. it has not been created yet).
    # Keep the best-effort fallback, but report the reason instead of hiding it.
    print(f"Could not read the last processed index; processing all events: {err}")
    last_processed_index = None

# The aggregate returns NULL (None) for a table that exists but has no rows. Comparing
# against None below would filter out every event, so fall back to the initial index.
if last_processed_index is None:
    last_processed_index = 1

if debug:
    print(last_processed_index)

In [0]:
# Load the full load control time series, then keep only the samples after the last processed index.
lc_ts_df = spark.read.format('parquet').load(LOAD_CONTROL_TIMESERIES_PATH)

lc_new_ts_df = lc_ts_df.where(lc_ts_df['MeterSampleIndex'] > last_processed_index)

# Nothing new to process: stop the notebook here.
new_sample_count = lc_new_ts_df.count()
if new_sample_count == 0:
    dbutils.notebook.exit("No new load control periods found.")

if debug:
    display(lc_new_ts_df)

In [0]:
# Load the Delta table that maps load control programs to meters.
meter_control_event_types_df = spark.read.load(LOAD_CONTROL_METER_PROGRAM_MAP_PATH, format='delta')

if debug:
    display(meter_control_event_types_df)

### Update the load control time series with the meter information.  
This involves ensuring that all controlled loads for a meter are being controlled. The analysis is not of value when a meter is only partially controlled.

In [0]:
# Attach the program-to-meter map rows to each new load control sample via ResourceName.
# The inner join drops samples whose resource has no mapped meter.
lc_meter_ts_df = lc_new_ts_df.join(meter_control_event_types_df, 'ResourceName', 'inner')

if debug:
    display(lc_meter_ts_df)
    # Spot check a single sample meter.
    display(lc_meter_ts_df.where(col('BI_MTR_NBR') == 65918693))

In [0]:
# First step in eliminating meters that don't have all load types being controlled for a MeterSampleIndex:
# count the load type rows present for every (MeterSampleIndex, meter) pair.
controlled_load_count = count('BI_LOAD_TYPE').alias('Count')
lc_meter_ts_dups_df = lc_meter_ts_df.groupBy(['MeterSampleIndex', 'BI_MTR_NBR']).agg(controlled_load_count)

if debug:
    display(lc_meter_ts_dups_df)
    # Spot check a single sample meter.
    display(lc_meter_ts_dups_df.where(col('BI_MTR_NBR') == 65918693))

In [0]:
# Count the load type rows each meter has in the program map.
program_count = count('BI_LOAD_TYPE').alias('ProgramCount')
meter_control_program_count_df = meter_control_event_types_df.groupBy(['BI_MTR_NBR']).agg(program_count)

if debug:
    display(meter_control_program_count_df)
    # Spot check a single sample meter.
    display(meter_control_program_count_df.where(col('BI_MTR_NBR') == 65918693))

In [0]:
# Put the per-meter program count next to the per-sample controlled count, then add
# ProgramDiff = ProgramCount - Count so the two can be compared.
meter_control_program_diff_df = (
    meter_control_program_count_df
    .join(lc_meter_ts_dups_df, 'BI_MTR_NBR', 'inner')
    .withColumn('ProgramDiff', col('ProgramCount') - col('Count'))
)

if debug:
    display(meter_control_program_diff_df)
    # Distribution of the differences.
    meters_per_diff = count('BI_MTR_NBR').alias('MeterCount')
    display(meter_control_program_diff_df.groupBy('ProgramDiff').agg(meters_per_diff))
    # Spot check a single sample meter.
    display(meter_control_program_diff_df.where(col('BI_MTR_NBR') == 65918693))

In [0]:
# Keep the (meter, sample) pairs where the controlled count equals the program count,
# then use them to restrict the load control time series.
is_fully_controlled = col('ProgramDiff') == 0
meter_control_program_fullfilled_df = (
    meter_control_program_diff_df
    .where(is_fully_controlled)
    .select('BI_MTR_NBR', 'MeterSampleIndex')
)

lc_meter_fullfilled_ts_df = lc_meter_ts_df.join(
    meter_control_program_fullfilled_df,
    ['BI_MTR_NBR', 'MeterSampleIndex'],
    'inner',
)

if debug:
    display(lc_meter_fullfilled_ts_df)
    display(lc_meter_fullfilled_ts_df.where(col('BI_MTR_NBR') == 65918693))
    # Compare the row counts before and after the restriction.
    print("All load control time series data points: " + str(lc_meter_ts_df.count()))
    print("Fulfilled load control time series data points: " + str(lc_meter_fullfilled_ts_df.count()))
                                                           

In [0]:
if debug:
    # Show the sample meter before and after the fully-controlled restriction.
    for sample_source_df in (lc_meter_ts_df, lc_meter_fullfilled_ts_df):
        display(sample_source_df.where(col('BI_MTR_NBR') == 65918693))

In [0]:
# Keep only the columns needed for the join with the meter data.
# The distinct() shouldn't be needed, but is included to avoid duplicates.
lc_meter_fullfilled_min_ts_df = (
    lc_meter_fullfilled_ts_df
    .select(
        'BI_ACCT',
        'BI_SRV_LOC_NBR',
        'BI_MTR_NBR',
        'MeterSampleIndex',
        'LoadControlEventID',
        'BI_LOAD_TYPE',
        'LoadControlEvent',
    )
    .distinct()
)

if debug:
    display(lc_meter_fullfilled_min_ts_df)
    display(lc_meter_fullfilled_min_ts_df.where(col('BI_MTR_NBR') == 65918693))

### Read the meter data and join 
The meter data is joined with the meters / samples that represent fulfilled programs.

In [0]:
# Read the meter data
meter_data_df = DeltaTable.forPath(spark, MDM_INDEXED_PATH).toDF()

# Earliest sample index among the fully controlled meter samples.
# `min` is the Spark aggregate imported from pyspark.sql.functions, not the Python builtin.
first_lc_meter_sample_index = lc_meter_fullfilled_min_ts_df.select(min(col('MeterSampleIndex'))).collect()[0][0]

# The aggregate returns NULL (None) when there are no fully controlled samples. Exit
# explicitly instead of relying on a NULL comparison to empty the filter below, which
# also avoids counting the meter data table for nothing.
if first_lc_meter_sample_index is None:
    dbutils.notebook.exit("No new data found.")

# Filter out old data to simplify the next join.  Also limit to forward flow and channel 1
meter_data_df = meter_data_df.filter((col('EndMeterSampleIndex') >= first_lc_meter_sample_index)
                                     & (col('FlowDirection') == 'F') & (col('Channel') == 1))

# If no new data, exit.
if meter_data_df.count() == 0:
    dbutils.notebook.exit("No new data found.")

if debug:
    print(first_lc_meter_sample_index)
    display(meter_data_df)

In [0]:
# Match each meter reading to the fully controlled (sample index, meter) rows:
# the reading's end index must equal the sample index and the meter numbers must agree.
same_sample = meter_data_df['EndMeterSampleIndex'] == lc_meter_fullfilled_min_ts_df['MeterSampleIndex']
same_meter = meter_data_df['MeterNumber'] == lc_meter_fullfilled_min_ts_df['BI_MTR_NBR']

load_control_df = meter_data_df.join(lc_meter_fullfilled_min_ts_df, same_sample & same_meter, 'inner')

if debug:
    print(load_control_df.count())
    display(load_control_df)
    display(load_control_df.where(col('BI_MTR_NBR') == 65918693))

In [0]:
# Spot check of a second sample meter. Only run it in debug mode, like the other
# diagnostic displays, so a normal run does not trigger an extra Spark job.
if debug:
    display(load_control_df.filter(col('BI_MTR_NBR') == 12151219))

In [0]:
# Clean up the dataframe: keep only the columns needed for the hourly aggregation and output.
load_control_df = load_control_df.select('BI_ACCT', 'BI_SRV_LOC_NBR', 'MeterNumber', 'UnitOfMeasure', 'FlowDirection', 'Channel', 'StartDateTime', 'EndDateTime', 'StartMeterSampleIndex', 'EndMeterSampleIndex', 'AMIValue', 'VEEValue', 'LoadControlEventId', 'BI_LOAD_TYPE', 'LoadControlEvent') 

if debug:
    display(load_control_df)
    # BI_MTR_NBR is no longer selected, so spot check on MeterNumber instead. The two
    # columns were required to be equal by the join that built load_control_df.
    display(load_control_df.filter(col('MeterNumber') == 65918693))
    display(load_control_df.filter(col('MeterNumber') == 12151219))

### Aggregate data to hourly.

In [0]:
# The time series must be in local time rather than UTC. Load the indexed calendar,
# which is trimmed below and joined with the load control data afterwards.
calendar_df = spark.read.parquet(INDEXED_CALENDAR_PATH)

if debug:
    # The first two indexes check a sample load control window starting at 3:00 PM and
    # ending at 7:00 PM; the third is an additional sample index.
    for sample_index in (371472, 371520, 448902):
        display(calendar_df.where(col('MeterSampleIndex') == sample_index))

# Eliminate the UTC time info by keeping only the index and the local time columns.
local_calendar_columns = [
    'MeterSampleIndex',
    'LocalTimeStamp',
    'LocalYear',
    'LocalMonth',
    'LocalDay',
    'LocalHour',
    'LocalMinute',
]
calendar_df = calendar_df.select(*local_calendar_columns)



In [0]:
# Look up the local calendar fields for the start index of every reading.
starts_at_calendar_index = load_control_df['StartMeterSampleIndex'] == calendar_df['MeterSampleIndex']
load_control_dates_df = load_control_df.join(calendar_df, starts_at_calendar_index, 'inner')

if debug:
    display(load_control_dates_df)

In [0]:
# Add the HourEnding column. The calendar join used the start index of each period,
# so the hour ending is the local hour plus one.
hour_ending = col('LocalHour') + 1
load_control_dates_df = load_control_dates_df.withColumn('HourEnding', hour_ending)

if debug:
    display(load_control_dates_df)
    # Spot check a single load control event.
    sample_event_id = "LREC.IRR_2025-06-02 16:00:00_2025-06-02 20:00:00"
    display(load_control_dates_df.where(col('LoadControlEventID') == sample_event_id))

In [0]:
from pyspark.sql.functions import sum, first, last

# Roll the readings up to one row per account, meter, event, load type and local hour ending.
hourly_group_columns = [
    'BI_ACCT', 'BI_SRV_LOC_NBR', 'MeterNumber', 'UnitOfMeasure', 'FlowDirection', 'Channel',
    'LocalYear', 'LocalMonth', 'LocalDay', 'HourEnding', 'LoadControlEventId', 'BI_LOAD_TYPE',
]

# sum / max / min are the Spark aggregates imported from pyspark.sql.functions.
hourly_aggregates = [
    sum("AMIValue").alias("HourlyAMIValue"),
    sum("VEEValue").alias("HourlyVEEValue"),
    max("EndMeterSampleIndex").alias("EndMeterSampleIndex"),
    min("LoadControlEvent").alias("LoadControlEvent"),
]

load_control_hourly_df = load_control_dates_df.groupBy(*hourly_group_columns).agg(*hourly_aggregates)

if debug:
    display(load_control_hourly_df)
    # Spot check a single load control event.
    sample_event_id = "LREC.PSWH8_2025-04-08 05:00:00_2025-04-08 11:30:00"
    display(load_control_hourly_df.where(col('LoadControlEventID') == sample_event_id))

In [0]:
if debug:
    # Hourly rows for a single sample meter.
    is_sample_meter = col('MeterNumber') == 65918693
    sample1 = load_control_hourly_df.where(is_sample_meter)
    display(sample1)


In [0]:
# Drop the "Local" prefix from the date columns so the names match the other data.
for local_name, output_name in (('LocalYear', 'Year'), ('LocalMonth', 'Month'), ('LocalDay', 'Day')):
    load_control_hourly_df = load_control_hourly_df.withColumnRenamed(local_name, output_name)



In [0]:
# Append the new hourly rows to the existing Delta output, with schema merging enabled.
hourly_writer = load_control_hourly_df.write.format("delta").mode("append")
hourly_writer = hourly_writer.option("mergeSchema", "true")
hourly_writer.save(LOAD_CONTROL_METERTS_PATH)

In [0]:
# Vacuum the output table at its storage path.
vacuum_statement = f"VACUUM '{LOAD_CONTROL_METERTS_PATH}'"
spark.sql(vacuum_statement)