# Create Hourly Meter Data
This notebook aggregates all data to an hour-ending time period. This is frequently needed to align meter sample rates, align with other inputs, and reduce data volume for visualization. The aggregation is done incrementally to reduce load.

In [0]:
%run ../Utilities/ConfigUtilities

In [0]:
# Imports and debug
# NOTE: `sum` and `max` imported below are the Spark column aggregate functions. From this
# cell onward they replace Python's built-in sum()/max() in the notebook namespace.
from pyspark.sql.functions import lit, sum, col, max
from delta.tables import DeltaTable

# Debug switch: when truthy, the cells below print/display their intermediate results.
debug = 1

In [0]:
# To incrementally update the output table, we need to find the last load control event that has been processed.
# That is the highest EndMeterSampleIndex already stored in the hourly table.
try:
    hourly_df = DeltaTable.forPath(spark, MDM_HOURLY_PATH).toDF()
    last_processed_index = hourly_df.select(max('EndMeterSampleIndex')).collect()[0][0]
except Exception as exc:   # Hourly table could not be read (e.g. it has not been created yet)
    # Best effort: start from the beginning, but say why instead of failing silently.
    # `Exception` (rather than a bare `except:`) lets KeyboardInterrupt/SystemExit propagate.
    print(f"Could not read the last processed index from {MDM_HOURLY_PATH}; starting from index 1. Cause: {exc!r}")
    last_processed_index = 1

# An existing but empty table does not raise: max() over zero rows returns NULL (None).
# Comparing a column with None matches no rows, so fall back to the same default.
if last_processed_index is None:
    last_processed_index = 1

if debug:
    print(last_processed_index)

In [0]:
# Get the new data from the indexed data: only rows whose end sample index lies beyond
# the last index that has already been processed.
new_data_df = (
    spark.read.format('delta')
    .load(MDM_INDEXED_PATH)
    .filter(col('EndMeterSampleIndex') > last_processed_index)
)

# Stop the notebook early when there is nothing to process. take(1) fetches at most one
# row, so Spark does not have to count every new row just to learn whether any exist.
if not new_data_df.take(1):
    dbutils.notebook.exit("No new data found.")

if debug:
    display(new_data_df)

In [0]:
# Load the indexed calendar. We work in local time, so only the sample index and the
# local-time columns are selected; the UTC time information is not carried forward.
calendar_df = (
    spark.read.format('parquet')
    .load(INDEXED_CALENDAR_PATH)
    .select(
        'MeterSampleIndex',
        'LocalTimeStamp',
        'LocalYear',
        'LocalMonth',
        'LocalDay',
        'LocalHour',
        'LocalMinute',
    )
)



In [0]:
# Attach the local calendar fields to every new reading. The match is made on the reading's
# start sample because that makes the hour ending easier to calculate
# (calendar hours run 0->23; we want 1->24).
start_matches_calendar = new_data_df['StartMeterSampleIndex'] == calendar_df['MeterSampleIndex']
new_data_dates_df = new_data_df.join(calendar_df, on=start_matches_calendar, how='inner')

if debug:
    display(new_data_dates_df)

In [0]:
# Derive the HourEnding column. Every row was joined on its start index, so the hour
# ending is simply the local start hour plus one.
hour_ending = col('LocalHour') + 1
new_data_dates_df = new_data_dates_df.withColumn('HourEnding', hour_ending)

if debug:
    display(new_data_dates_df)

In [0]:
# Aggregate to hourly data: one output row per combination of the grouping keys below.
hourly_keys = [
    'MeterNumber', 'UnitOfMeasure', 'FlowDirection', 'Channel',
    'LocalYear', 'LocalMonth', 'LocalDay', 'HourEnding',
]
hourly_aggregates = [
    sum("AMIValue").alias("HourlyAMIValue"),
    sum("VEEValue").alias("HourlyVEEValue"),
    # Highest end sample index within the group; the incremental lookup at the top of the
    # notebook reads the maximum of this column from the hourly table.
    max("EndMeterSampleIndex").alias("EndMeterSampleIndex"),
]
new_data_hourly_df = new_data_dates_df.groupBy(*hourly_keys).agg(*hourly_aggregates)

if debug:
    display(new_data_hourly_df)

In [0]:
# Append the newly aggregated hours to the existing hourly Delta table, allowing the
# table schema to be merged with the incoming columns.
# NOTE(review): append mode adds rows without merging; if the readings for one hour arrive
# across separate runs, that hour may end up with more than one row - confirm that the
# upstream data is delivered in whole hours.
hourly_writer = (
    new_data_hourly_df.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")
)
hourly_writer.save(MDM_HOURLY_PATH)

In [0]:
# Clean up the hourly Delta table with VACUUM, which removes data files the table no longer
# references. No RETAIN clause is given, so the default retention threshold applies.
vacuum_statement = f"VACUUM '{MDM_HOURLY_PATH}'"
spark.sql(vacuum_statement)