In [0]:
%run ../Utilities/ConfigUtilities

In [0]:
# Set up the environment using a function in ConfigUtilities.
# NOTE(review): set_spark_config is not defined in this notebook; presumably it
# is provided by the ../Utilities/ConfigUtilities notebook executed via %run --
# confirm what it configures before relying on any particular Spark setting.
set_spark_config()

In [0]:
from pyspark.sql.functions import col, when, sum, mean, percentile_approx, row_number, desc, row_number, expr, abs
from pyspark.sql.window import Window

# Truthy switch checked by the `if debug:` guards in this notebook: while it is
# set, the intermediate main_usage_df is display()ed after each transformation.
# Set it to 0 to skip those previews.
debug = 1

In [0]:
# Load the main-meter-only hourly dataset (stored as a Delta table) from the
# configured path.
main_usage_df = spark.read.load(
    MDM_MAIN_METER_ONLY_HOURLY_DATA_PATH,
    format="delta",
)

In [0]:
# Add two columns to every reading:
#   VEE_AMI_DIFF - absolute gap between HourlyAMIValue and HourlyVEEValue
#   DIFF_COUNT   - 1 when that gap is non-zero, otherwise 0 (summed later to
#                  count differing readings)
# `abs` here is pyspark.sql.functions.abs (imported above), not the builtin.
# NOTE(review): if either value is null the gap is presumably null as well and
# the reading lands in otherwise(0), i.e. it is not counted as different --
# confirm that is the intended treatment of missing values.
main_usage_df = (
    main_usage_df
    .withColumn("VEE_AMI_DIFF", abs(col("HourlyAMIValue") - col("HourlyVEEValue")))
    .withColumn("DIFF_COUNT", when(col("VEE_AMI_DIFF") != 0, 1).otherwise(0))
)

if debug:
    display(main_usage_df)

In [0]:
# Combine the YEAR, MONTH and DAY columns into a single DATE column using the
# Spark SQL make_date function.
main_usage_df = main_usage_df.withColumn(
    "DATE",
    expr("make_date(YEAR, MONTH, DAY)"),
)

if debug:
    display(main_usage_df)

In [0]:
# Explore by hour: for each HourEnding, total the number of differing readings
# (DIFF_COUNT_SUM) and the absolute VEE/AMI difference (VEE_AMI_DIFF_SUM).
# `sum` here is pyspark.sql.functions.sum (imported above), not the builtin.
hour_df = main_usage_df.groupBy("HourEnding").agg(
    sum("DIFF_COUNT").alias("DIFF_COUNT_SUM"),
    sum("VEE_AMI_DIFF").alias("VEE_AMI_DIFF_SUM"),
)

# Despite its name, DIFF_COUNT_AVG is the average absolute difference per
# differing reading (VEE_AMI_DIFF_SUM / DIFF_COUNT_SUM), not an average of
# counts. The column name is kept so anything reading it keeps working.
#
# An hour with no differing readings has DIFF_COUNT_SUM == 0. Dividing by it
# yields null with ANSI mode off but raises a divide-by-zero error with ANSI
# mode on, so the division is guarded: such hours get a null average in both
# modes (when() without otherwise() yields null for unmatched rows).
hour_df = hour_df.withColumn(
    "DIFF_COUNT_AVG",
    when(
        col("DIFF_COUNT_SUM") != 0,
        col("VEE_AMI_DIFF_SUM") / col("DIFF_COUNT_SUM"),
    ),
)

# Hours with the most differing readings come first.
display(hour_df.orderBy(desc("DIFF_COUNT_SUM")))

In [0]:
# Explore by date: for each DATE, total the number of differing readings
# (DIFF_COUNT_SUM) and the absolute VEE/AMI difference (VEE_AMI_DIFF_SUM).
date_df = (
    main_usage_df
    .groupBy("DATE")
    .agg(
        sum("DIFF_COUNT").alias("DIFF_COUNT_SUM"),
        sum("VEE_AMI_DIFF").alias("VEE_AMI_DIFF_SUM"),
    )
)

# Dates with the most differing readings come first.
display(date_df.orderBy(desc("DIFF_COUNT_SUM")))

In [0]:
# Explore by month: for each (Year, Month) pair, total the number of differing
# readings (DIFF_COUNT_SUM) and the absolute VEE/AMI difference
# (VEE_AMI_DIFF_SUM).
month_df = (
    main_usage_df
    .groupBy("Year", "Month")
    .agg(
        sum("DIFF_COUNT").alias("DIFF_COUNT_SUM"),
        sum("VEE_AMI_DIFF").alias("VEE_AMI_DIFF_SUM"),
    )
)

# Months with the most differing readings come first.
display(month_df.orderBy(desc("DIFF_COUNT_SUM")))