In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
#######################################
###!@0 START INIT ENVIRONMENT
!ls /content/drive/MyDrive/spark-3.5.2-bin-hadoop3.tgz
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!tar xf /content/drive/MyDrive/spark-3.5.2-bin-hadoop3.tgz
!pip install -q findspark
!pip install -q pyspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.2-bin-hadoop3"
###!@0 END INIT ENVIRONMENT

/content/drive/MyDrive/spark-3.5.2-bin-hadoop3.tgz


In [6]:
!mkdir -p /content/data
!rm -rf /content/data/*.csv
!ln -s /content/drive/MyDrive/DES_Project/DataSet/weather_data/*.csv /content/data/

In [9]:
#Load Weather and consumption Data

csv_path1 = '/content/drive/MyDrive/DES_Project/DataSet/weather_data.csv'
csv_path2 = '/content/drive/MyDrive/DES_Project/DataSet/electricity_consumption_delhi.csv'

In [10]:
#######################################
###!@1 START OF PYSPARK INIT
import findspark
findspark.init()
findspark.find()
from pyspark.sql import SparkSession
input_type = 'sample'
spark = SparkSession.builder\
         .master("local")\
         .appName("Colab")\
         .config('spark.ui.port', '4050')\
         .getOrCreate()
# Spark is ready to go within Colab!
###!@1 END OF PYSPARK INIT

In [11]:
#Load data from csv files

weather_data = spark.read.csv(csv_path1, header=True, inferSchema=True)
electricity_data = spark.read.csv(csv_path2, header=True, inferSchema=True)

In [12]:
#######################################
###!@1 START OF PYSPARK INIT
import findspark
findspark.init()
findspark.find()
from pyspark.sql import SparkSession
input_type = 'sample'
spark = SparkSession.builder\
         .master("local")\
         .appName("Colab")\
         .config('spark.ui.port', '4050')\
         .getOrCreate()
# Spark is ready to go within Colab!
###!@1 END OF PYSPARK INIT

In [13]:
#Import required Libraries

from pyspark.sql.functions import col, count
from pyspark.sql.functions import to_date
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import avg, date_trunc

In [14]:
# Display the first few rows

electricity_data.show(5)
weather_data.show(5)

+-----+-------------------+----------------+
| City|               Date|Consumption (MW)|
+-----+-------------------+----------------+
|Delhi|1979-01-01 00:00:00|              50|
|Delhi|1979-01-01 00:01:00|              61|
|Delhi|1979-01-01 00:02:00|              50|
|Delhi|1979-01-01 00:03:00|              50|
|Delhi|1979-01-01 00:04:00|              50|
+-----+-------------------+----------------+
only showing top 5 rows

+-----+-------------------+---------------+--------------+------------+--------------+-------------------+----------------+--------------+------------+-------------------+-------------------+
| City|               Date|Temperature (C)|Feels Like (C)|Humidity (%)|Pressure (hPa)|Weather Description|Wind Speed (m/s)|Cloudiness (%)|Rain (1h mm)|            Sunrise|             Sunset|
+-----+-------------------+---------------+--------------+------------+--------------+-------------------+----------------+--------------+------------+-------------------+---------------

In [15]:
# Print schema

electricity_data.printSchema()
weather_data.printSchema()

root
 |-- City: string (nullable = true)
 |-- Date: timestamp (nullable = true)
 |-- Consumption (MW): integer (nullable = true)

root
 |-- City: string (nullable = true)
 |-- Date: timestamp (nullable = true)
 |-- Temperature (C): double (nullable = true)
 |-- Feels Like (C): double (nullable = true)
 |-- Humidity (%): integer (nullable = true)
 |-- Pressure (hPa): integer (nullable = true)
 |-- Weather Description: string (nullable = true)
 |-- Wind Speed (m/s): double (nullable = true)
 |-- Cloudiness (%): integer (nullable = true)
 |-- Rain (1h mm): double (nullable = true)
 |-- Sunrise: timestamp (nullable = true)
 |-- Sunset: timestamp (nullable = true)



In [17]:

# Check data statistics

electricity_data.describe().show()
weather_data.describe().show()

+-------+--------+------------------+
|summary|    City|  Consumption (MW)|
+-------+--------+------------------+
|  count|24144480|          24144480|
|   mean|    NULL|3242.0363132277025|
| stddev|    NULL|2824.2361845591695|
|    min|   Delhi|                50|
|    max|   Delhi|              8600|
+-------+--------+------------------+

+-------+--------+------------------+------------------+------------------+----------------+-------------------+------------------+------------------+------------------+
|summary|    City|   Temperature (C)|    Feels Like (C)|      Humidity (%)|  Pressure (hPa)|Weather Description|  Wind Speed (m/s)|    Cloudiness (%)|      Rain (1h mm)|
+-------+--------+------------------+------------------+------------------+----------------+-------------------+------------------+------------------+------------------+
|  count|24111360|          24111360|          24111360|          24111360|        24111360|           24111360|          24111360|          241113

In [18]:
# Check for missing values

def missing_values(df):
    return df.select([(count(col(c)) / df.count()).alias(c + "_missing") for c in df.columns])

missing_values(electricity_data).show()
missing_values(weather_data).show()

+------------+------------+------------------------+
|City_missing|Date_missing|Consumption (MW)_missing|
+------------+------------+------------------------+
|         1.0|         1.0|                     1.0|
+------------+------------+------------------------+

+------------+------------+-----------------------+----------------------+--------------------+----------------------+---------------------------+------------------------+----------------------+--------------------+---------------+--------------+
|City_missing|Date_missing|Temperature (C)_missing|Feels Like (C)_missing|Humidity (%)_missing|Pressure (hPa)_missing|Weather Description_missing|Wind Speed (m/s)_missing|Cloudiness (%)_missing|Rain (1h mm)_missing|Sunrise_missing|Sunset_missing|
+------------+------------+-----------------------+----------------------+--------------------+----------------------+---------------------------+------------------------+----------------------+--------------------+---------------+---------

In [19]:
from pyspark.sql.functions import avg, date_trunc

# Truncate the timestamp to get the day
electricity_data = electricity_data.withColumn("Date", date_trunc("day", col("Date")))

# Group by day and compute the daily average of Consumption (MW)
daily_electricity_data = electricity_data.groupBy("Date").agg(avg("Consumption (MW)").alias("Daily_Consumption"))

daily_electricity_data.show(5)

+-------------------+-----------------+
|               Date|Daily_Consumption|
+-------------------+-----------------+
|1979-10-25 00:00:00|74.61736111111111|
|1980-07-22 00:00:00|         75.28125|
|1981-07-21 00:00:00|74.30069444444445|
|1982-02-16 00:00:00|76.12638888888888|
|1982-05-19 00:00:00|75.54166666666667|
+-------------------+-----------------+
only showing top 5 rows



In [20]:
# Truncate the timestamp to get the day
weather_data = weather_data.withColumn("Date", date_trunc("day", col("Date")))

# Group by day and compute daily averages for relevant columns
daily_weather_data = weather_data.groupBy("Date").agg(
    avg("Temperature (C)").alias("Avg_Temperature"),
    avg("Humidity (%)").alias("Avg_Humidity"),
    avg("Wind Speed (m/s)").alias("Avg_Wind_Speed"),
    avg("Cloudiness (%)").alias("Avg_Cloudiness"),
    avg("Rain (1h mm)").alias("Avg_Rain")
)

daily_weather_data.show(5)

+-------------------+------------------+------------------+------------------+------------------+------------------+
|               Date|   Avg_Temperature|      Avg_Humidity|    Avg_Wind_Speed|    Avg_Cloudiness|          Avg_Rain|
+-------------------+------------------+------------------+------------------+------------------+------------------+
|1979-10-25 00:00:00|30.020722222222183| 50.16180555555555|2.5672430555555574| 39.44444444444444|2.5462638888888853|
|1980-07-22 00:00:00| 30.01484027777782| 49.99861111111111|2.5423055555555516|40.108333333333334|2.4898888888888906|
|1979-01-10 00:00:00| 13.71720833333335|50.421527777777776| 2.571326388888892| 38.93611111111111|2.4883402777777817|
|1979-03-07 00:00:00| 37.47304166666668| 50.33611111111111|2.5971180555555575| 39.43402777777778| 2.461298611111108|
|1979-04-03 00:00:00| 37.51625000000001| 49.94305555555555|2.5208263888888953|39.509027777777774|2.4597291666666665|
+-------------------+------------------+------------------+-----

In [21]:
# Join daily electricity data with daily weather data on Date
daily_data = daily_electricity_data.join(daily_weather_data, on="Date", how="inner")

daily_data.show(5)

+-------------------+-----------------+------------------+------------------+------------------+------------------+------------------+
|               Date|Daily_Consumption|   Avg_Temperature|      Avg_Humidity|    Avg_Wind_Speed|    Avg_Cloudiness|          Avg_Rain|
+-------------------+-----------------+------------------+------------------+------------------+------------------+------------------+
|1979-10-25 00:00:00|74.61736111111111|30.020722222222183| 50.16180555555555|2.5672430555555574| 39.44444444444444|2.5462638888888853|
|1980-07-22 00:00:00|         75.28125| 30.01484027777782| 49.99861111111111|2.5423055555555516|40.108333333333334|2.4898888888888906|
|1979-01-10 00:00:00|73.61805555555556| 13.71720833333335|50.421527777777776| 2.571326388888892| 38.93611111111111|2.4883402777777817|
|1979-03-07 00:00:00|75.41388888888889| 37.47304166666668| 50.33611111111111|2.5971180555555575| 39.43402777777778| 2.461298611111108|
|1979-04-03 00:00:00|74.28055555555555| 37.516250000000

In [23]:
# Select features
feature_columns = ["Avg_Temperature", "Avg_Humidity", "Avg_Wind_Speed", "Avg_Cloudiness", "Avg_Rain"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Transform dataset
final_daily_data = assembler.transform(daily_data).select("features", "Daily_Consumption")

In [24]:
# Split into train and test sets
train_data, test_data = final_daily_data.randomSplit([0.8, 0.2], seed=42)

In [25]:
# Train the model
gbt = GBTRegressor(featuresCol="features", labelCol="Daily_Consumption", maxIter=50)
model = gbt.fit(train_data)

In [26]:
# Evaluate the model
predictions = model.transform(test_data)
evaluator = RegressionEvaluator(labelCol="Daily_Consumption", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")

Root Mean Squared Error (RMSE): 2901.5879574995583


In [21]:
# # Convert date columns to proper format

# weather_data = weather_data.withColumn("Date", to_date("Date"))
# electricity_data = electricity_data.withColumn("Date", to_date("Date"))

# # Join datasets on the date column
# data = electricity_data.join(weather_data, on="Date", how="inner")

# # Drop one of the city columns
# data = data.drop(weather_data["city"])

# data = data.withColumn("Date", to_date(col("Date")))

# data.printSchema()

root
 |-- date: date (nullable = true)
 |-- City: string (nullable = true)
 |-- Consumption (MW): integer (nullable = true)
 |-- Temperature (C): double (nullable = true)
 |-- Feels Like (C): double (nullable = true)
 |-- Humidity (%): integer (nullable = true)
 |-- Pressure (hPa): integer (nullable = true)
 |-- Weather Description: string (nullable = true)
 |-- Wind Speed (m/s): double (nullable = true)
 |-- Cloudiness (%): integer (nullable = true)
 |-- Rain (1h mm): double (nullable = true)
 |-- Sunrise: timestamp (nullable = true)
 |-- Sunset: timestamp (nullable = true)



In [22]:
# # Select features
# feature_columns = ["Temperature (C)", "Humidity (%)", "Wind Speed (m/s)", "Cloudiness (%)", "Rain (1h mm)"]
# assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# # Transform dataset
# final_data = assembler.transform(data).select("features", "Consumption (MW)")

In [23]:
# train_data, test_data = final_data.randomSplit([0.8, 0.2], seed=42)

In [24]:
# # Initialize the model
# gbt = GBTRegressor(featuresCol="features", labelCol="Consumption (MW)", maxIter=50)

# # Train the model
# model = gbt.fit(train_data)

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/content/spark-3.5.2-bin-hadoop3/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/content/spark-3.5.2-bin-hadoop3/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
# # Make predictions
# predictions = model.transform(test_data)

# # Evaluate predictions
# evaluator = RegressionEvaluator(labelCol="Consumption (MW)", predictionCol="prediction", metricName="rmse")
# rmse = evaluator.evaluate(predictions)
# print(f"Root Mean Squared Error (RMSE): {rmse}")