In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, hour, when, sum as spark_sum

# Step 1: Start Spark Session
spark = SparkSession.builder.appName("DeviceLevelAggregation").getOrCreate()

In [2]:
# Step 2: Load energy usage CSV file
df = spark.read.option("header", True).option("inferSchema", True) \
    .csv("energyusage.csv")

In [3]:
# Step 3: Extract hour from timestamp
df = df.withColumn("hour", hour(col("timestamp")))
df.show()

+---------+-------+-------------------+----------+----+
|device_id|room_id|          timestamp|energy_kwh|hour|
+---------+-------+-------------------+----------+----+
|      109|      1|2025-07-22 08:00:00|       3.3|   8|
|      103|      2|2025-07-22 08:30:00|      1.67|   8|
|      104|      1|2025-07-22 09:00:00|      0.83|   9|
|      102|      1|2025-07-22 09:30:00|      1.52|   9|
|      105|      3|2025-07-22 10:00:00|      3.36|  10|
|      102|      2|2025-07-22 10:30:00|      1.02|  10|
|      107|      3|2025-07-22 11:00:00|      0.96|  11|
|      107|      1|2025-07-22 11:30:00|      2.42|  11|
|      105|      3|2025-07-22 12:00:00|      1.33|  12|
|      110|      2|2025-07-22 12:30:00|      2.94|  12|
|      110|      1|2025-07-22 13:00:00|      2.58|  13|
|      108|      3|2025-07-22 13:30:00|      1.44|  13|
|      103|      2|2025-07-22 14:00:00|      3.43|  14|
|      103|      3|2025-07-22 14:30:00|      3.44|  14|
|      107|      3|2025-07-22 15:00:00|      3.1

In [5]:
# Step 4: Label usage type: peak (18–23) or off-peak
df = df.withColumn(
    "usage_type",
    when((col("hour") >= 18) & (col("hour") <= 23), "peak").otherwise("off_peak")
)
df.show()

+---------+-------+-------------------+----------+----+----------+
|device_id|room_id|          timestamp|energy_kwh|hour|usage_type|
+---------+-------+-------------------+----------+----+----------+
|      109|      1|2025-07-22 08:00:00|       3.3|   8|  off_peak|
|      103|      2|2025-07-22 08:30:00|      1.67|   8|  off_peak|
|      104|      1|2025-07-22 09:00:00|      0.83|   9|  off_peak|
|      102|      1|2025-07-22 09:30:00|      1.52|   9|  off_peak|
|      105|      3|2025-07-22 10:00:00|      3.36|  10|  off_peak|
|      102|      2|2025-07-22 10:30:00|      1.02|  10|  off_peak|
|      107|      3|2025-07-22 11:00:00|      0.96|  11|  off_peak|
|      107|      1|2025-07-22 11:30:00|      2.42|  11|  off_peak|
|      105|      3|2025-07-22 12:00:00|      1.33|  12|  off_peak|
|      110|      2|2025-07-22 12:30:00|      2.94|  12|  off_peak|
|      110|      1|2025-07-22 13:00:00|      2.58|  13|  off_peak|
|      108|      3|2025-07-22 13:30:00|      1.44|  13|  off_p

In [7]:

# Step 5: Group by device and usage_type
grouped_df = df.groupBy("device_id", "usage_type") \
    .agg(spark_sum("energy_kwh").alias("total_energy"))
df.show()

+---------+-------+-------------------+----------+----+----------+
|device_id|room_id|          timestamp|energy_kwh|hour|usage_type|
+---------+-------+-------------------+----------+----+----------+
|      109|      1|2025-07-22 08:00:00|       3.3|   8|  off_peak|
|      103|      2|2025-07-22 08:30:00|      1.67|   8|  off_peak|
|      104|      1|2025-07-22 09:00:00|      0.83|   9|  off_peak|
|      102|      1|2025-07-22 09:30:00|      1.52|   9|  off_peak|
|      105|      3|2025-07-22 10:00:00|      3.36|  10|  off_peak|
|      102|      2|2025-07-22 10:30:00|      1.02|  10|  off_peak|
|      107|      3|2025-07-22 11:00:00|      0.96|  11|  off_peak|
|      107|      1|2025-07-22 11:30:00|      2.42|  11|  off_peak|
|      105|      3|2025-07-22 12:00:00|      1.33|  12|  off_peak|
|      110|      2|2025-07-22 12:30:00|      2.94|  12|  off_peak|
|      110|      1|2025-07-22 13:00:00|      2.58|  13|  off_peak|
|      108|      3|2025-07-22 13:30:00|      1.44|  13|  off_p

In [8]:
# Step 6: Pivot to show peak and off-peak side by side
pivot_df = grouped_df.groupBy("device_id") \
    .pivot("usage_type", ["peak", "off_peak"]) \
    .sum("total_energy") \
    .na.fill(0)
pivot_df.show()

+---------+------------------+------------------+
|device_id|              peak|          off_peak|
+---------+------------------+------------------+
|      108|1.7999999999999998|              6.36|
|      101|              7.57|             13.05|
|      103|              2.92|             19.33|
|      107|              1.91|             20.66|
|      102|               1.0|             16.12|
|      109|              0.77|              8.24|
|      105|              4.18|             17.55|
|      110|             12.26|             20.16|
|      106|              9.39|16.849999999999998|
|      104|              3.48|             15.17|
+---------+------------------+------------------+



In [9]:
# Step 7: Add total energy column
final_df = pivot_df.withColumn("total_usage", col("peak") + col("off_peak"))
final_df.show()

+---------+------------------+------------------+-----------+
|device_id|              peak|          off_peak|total_usage|
+---------+------------------+------------------+-----------+
|      108|1.7999999999999998|              6.36|       8.16|
|      101|              7.57|             13.05|      20.62|
|      103|              2.92|             19.33|      22.25|
|      107|              1.91|             20.66|      22.57|
|      102|               1.0|             16.12|      17.12|
|      109|              0.77|              8.24|       9.01|
|      105|              4.18|             17.55|      21.73|
|      110|             12.26|             20.16|      32.42|
|      106|              9.39|16.849999999999998|      26.24|
|      104|              3.48|             15.17|      18.65|
+---------+------------------+------------------+-----------+



In [10]:
# Step 8: Get top 5 energy-consuming devices
top_devices = final_df.orderBy(col("total_usage").desc()).limit(5)
top_devices.show()

+---------+-----+------------------+-----------+
|device_id| peak|          off_peak|total_usage|
+---------+-----+------------------+-----------+
|      110|12.26|             20.16|      32.42|
|      106| 9.39|16.849999999999998|      26.24|
|      107| 1.91|             20.66|      22.57|
|      103| 2.92|             19.33|      22.25|
|      105| 4.18|             17.55|      21.73|
+---------+-----+------------------+-----------+



In [11]:


# Step 9: Save output
top_devices.write.option("header", True).mode("overwrite") \
    .csv("top_devices_output")
