# Create Usage Metrics
Creates several aggregates and metrics associated with hourly usage.  

In [0]:
%run ../Utilities/ConfigUtilities

In [0]:
# Set up the environment using a function in ConfigUtilities
# (the utility notebook pulled in by the %run cell above).
set_spark_config()

In [0]:
from pyspark.sql.functions import col, when, sum, mean, percentile_approx, row_number, desc, row_number
from pyspark.sql.window import Window

# Debug switch for the whole notebook: while it is 1, the cells below call display()
# on their intermediate DataFrames. Set it to 0 to skip those previews.
debug = 1

In [0]:
# Load the main-meter-only readings from their Delta location.
main_usage_df = spark.read.load(MDM_MAIN_METER_ONLY_DATA_PATH, format="delta")

# Preview the loaded readings while debugging.
if debug:
    display(main_usage_df)

In [0]:
# Load the indexed calendar, which provides the timestamp breakdown per sample index.
cal_df = spark.read.parquet(INDEXED_CALENDAR_PATH)

# Pair every reading with the calendar row whose index equals the reading's start index.
# Inner join: readings without a matching calendar row are not carried forward.
start_index_matches_calendar = main_usage_df.StartMeterSampleIndex == cal_df.MeterSampleIndex
main_usage_cal_df = main_usage_df.join(cal_df, on=start_index_matches_calendar, how="inner")

if debug:
    display(main_usage_cal_df)

In [0]:
# Derive the hour-ending label from the start hour.
# Start times run from 00:00 to 23:xx, so adding one yields HourEnding values 1 through 24.
hour_ending = col("Hour") + 1
main_usage_cal_df = main_usage_cal_df.withColumn("HourEnding", hour_ending)

if debug:
    display(main_usage_cal_df)

In [0]:
# Remove the columns that the hourly aggregation does not need.
# Every column left after this step becomes a grouping key in the next cell.
columns_to_drop = [
    'StartDateTime',
    'EndDateTime',
    'Hour',
    'Minute',
    'Second',
    'StartMeterSampleIndex',
    'EndMeterSampleIndex',
    'SampleRate',
    'TimeStamp',
    'MeterSampleIndex',
]
main_usage_cal_df = main_usage_cal_df.drop(*columns_to_drop)

if debug:
    display(main_usage_cal_df)

In [0]:
# Sum the AMI and VEE values per hour by grouping on every other remaining column.
# The comprehension variable is deliberately not named `col`, so it cannot be confused
# with the pyspark `col` function imported at the top of the notebook.
# `sum` below is pyspark.sql.functions.sum (imported earlier), not the Python builtin.
value_columns = ['AMIValue', 'VEEValue']
grouping_columns = [name for name in main_usage_cal_df.columns if name not in value_columns]

hourly_usage_df = (
    main_usage_cal_df
    .groupBy(grouping_columns)
    .agg(
        sum('AMIValue').alias('HourlyAMIValue'),
        sum('VEEValue').alias('HourlyVEEValue'),
    )
)

if debug:
    display(hourly_usage_df)

In [0]:
# Persist the hourly per-meter usage as Delta so other scenarios can reuse it.
# Overwrite mode replaces whatever was previously stored at this path.
(
    hourly_usage_df.write
    .format("delta")
    .mode("overwrite")
    .save(MDM_MAIN_METER_ONLY_HOURLY_DATA_PATH)
)

## Create metrics
Now that individual meters are aggregated to hourly, we can start with the metrics.

### Create a couple of useful aggregates first.

In [0]:
# Aggregate all meters to Year, Month, Day and HourEnding.
hour_day_month_df = (
    hourly_usage_df
    .groupBy("Year", "Month", "Day", "HourEnding")
    .agg(
        sum('HourlyAMIValue').alias('AllMetersDayHourAMIValue'),
        sum('HourlyVEEValue').alias('AllMetersDayHourVEEValue'),
    )
)

# Aggregate all meters to Year, Month and HourEnding: for each hour of the month,
# the total, mean and approximate median (50th percentile) of the per-day values.
hour_month_df = (
    hour_day_month_df
    .groupBy("Year", "Month", "HourEnding")
    .agg(
        sum('AllMetersDayHourAMIValue').alias('AllMetersMonthHourTotalAMIValue'),
        mean('AllMetersDayHourAMIValue').alias('AllMetersMonthHourMeanAMIValue'),
        percentile_approx('AllMetersDayHourAMIValue', 0.5).alias('AllMetersMonthHourMedianAMIValue'),
        sum('AllMetersDayHourVEEValue').alias('AllMetersMonthHourTotalVEEValue'),
        mean('AllMetersDayHourVEEValue').alias('AllMetersMonthHourMeanVEEValue'),
        percentile_approx('AllMetersDayHourVEEValue', 0.5).alias('AllMetersMonthHourMedianVEEValue'),
    )
)

# Same truthiness check on the debug flag that the earlier cells use.
if debug:
    display(hour_month_df)

In [0]:
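# TODO(review): unfinished placeholder cell. hour_month_df is not saved on its own here; confirm whether a save was intended.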

### Rank hours by month.

In [0]:
# Rank the hours within each Year/Month from highest to lowest mean VEE value.
# row_number() gives an arbitrary order to rows that tie on the sort key, so HourEnding
# (unique within a Year/Month in hour_month_df) is added as a tie-breaker to keep the
# ranking repeatable from run to run.
windowSpec = (
    Window
    .partitionBy("Year", "Month")
    .orderBy(desc("AllMetersMonthHourMeanVEEValue"), col("HourEnding"))
)

hour_month_rank_df = hour_month_df.withColumn("MonthRank", row_number().over(windowSpec))

# Same truthiness check on the debug flag that the earlier cells use.
if debug:
    display(hour_month_rank_df)
    # Spot check a single month (August 2024).
    display(hour_month_rank_df.filter((col('Year') == 2024) & (col('Month') == 8)))


In [0]:
# Save results - delta
# Import only the name this cell uses instead of everything in delta.tables.
from delta.tables import DeltaTable

hour_month_rank_df.write.mode('overwrite').format("delta").save(MDM_MONTH_HOUR_RANK_PATH)

# Clean up old files. vacuum() is called without a retention argument, so the
# table's default retention threshold applies.
delta_table = DeltaTable.forPath(spark, MDM_MONTH_HOUR_RANK_PATH)
delta_table.vacuum()

### Save the hourly aggregate
This will be used to find the top 100 hours in a time period in Power BI.

In [0]:
# Confirm previously created dataframe meets intent.
# Gated on the debug flag like the other display() calls in this notebook, so a
# non-debug run does not trigger an extra preview of the aggregate.
if debug:
    display(hour_day_month_df)

In [0]:
# Save results - delta
# Import only the name this cell uses instead of everything in delta.tables.
from delta.tables import DeltaTable

hour_day_month_df.write.mode('overwrite').format("delta").save(MDM_HOURLY_TOTALS_PATH)

# Clean up old files. vacuum() is called without a retention argument, so the
# table's default retention threshold applies.
delta_table = DeltaTable.forPath(spark, MDM_HOURLY_TOTALS_PATH)
delta_table.vacuum()

### Rank hours for each day based on usage.

In [0]:
# Use window functions to do the daily rank: within each Year/Month/Day, order the
# hours from highest to lowest all-meter VEE value.
# row_number() gives an arbitrary order to rows that tie on the sort key, so HourEnding
# (unique within a day in hour_day_month_df) is added as a tie-breaker to keep the
# ranking repeatable from run to run.
windowSpec = (
    Window
    .partitionBy("Year", "Month", "Day")
    .orderBy(desc("AllMetersDayHourVEEValue"), col("HourEnding"))
)

hourly_rank_by_day_df = hour_day_month_df.withColumn("DailyRank", row_number().over(windowSpec))

# Same truthiness check on the debug flag that the earlier cells use.
if debug:
    display(hourly_rank_by_day_df)


In [0]:
# Save
# DeltaTable is imported here explicitly so this cell does not depend on an import
# made in another cell.
from delta.tables import DeltaTable

hourly_rank_by_day_df.write.mode('overwrite').format("delta").save(MDM_DAILY_RANK_PATH)

# Clean up old files. vacuum() is called without a retention argument, so the
# table's default retention threshold applies.
delta_table = DeltaTable.forPath(spark, MDM_DAILY_RANK_PATH)
delta_table.vacuum()
