In [0]:

from pyspark.sql.types import *
from pyspark.sql import functions as f
import pandas as pd
import numpy as np

In [0]:
df = spark.read.csv("/FileStore/tables/household_power_consumption.txt",header='True', inferSchema='True',sep=";")

In [0]:
df.show()

+----------+-------------------+-------------------+---------------------+-------+----------------+--------------+--------------+--------------+
|      Date|               Time|Global_active_power|Global_reactive_power|Voltage|Global_intensity|Sub_metering_1|Sub_metering_2|Sub_metering_3|
+----------+-------------------+-------------------+---------------------+-------+----------------+--------------+--------------+--------------+
|16/12/2006|2022-05-04 17:24:00|              4.216|                0.418|234.840|          18.400|         0.000|         1.000|          17.0|
|16/12/2006|2022-05-04 17:25:00|              5.360|                0.436|233.630|          23.000|         0.000|         1.000|          16.0|
|16/12/2006|2022-05-04 17:26:00|              5.374|                0.498|233.290|          23.000|         0.000|         2.000|          17.0|
|16/12/2006|2022-05-04 17:27:00|              5.388|                0.502|233.740|          23.000|         0.000|         1.000| 

In [0]:
df = df.select(
        df.Date.cast("string"),
        df.Global_active_power.cast("int"), 
        df.Global_reactive_power.cast("int"), 
        df.Voltage.cast("int"),
        df.Global_intensity.cast("int"), 
        df.Sub_metering_1.cast("int"),
        df.Sub_metering_2.cast("int"), 
        df.Sub_metering_3.cast("int"))

In [0]:
%python
#Split date and time 
from pyspark.sql.functions import split
 
df = df.withColumn("day", split(df['Date'], '/').getItem(0)) \
    .withColumn("month", split(df['Date'], '/').getItem(1)) \
    .withColumn("year", split(df['Date'], '/').getItem(2))
df.select("Date", "day", "month", "year").show(5)

+----------+---+-----+----+
|      Date|day|month|year|
+----------+---+-----+----+
|16/12/2006| 16|   12|2006|
|16/12/2006| 16|   12|2006|
|16/12/2006| 16|   12|2006|
|16/12/2006| 16|   12|2006|
|16/12/2006| 16|   12|2006|
+----------+---+-----+----+
only showing top 5 rows



In [0]:
%python
df = df.select(
        df.Date.cast("string"), 
        df.Global_active_power.cast("double"), 
        df.Global_reactive_power.cast("double"), 
        df.Voltage.cast("double"),
        df.Global_intensity.cast("double"), 
        df.Sub_metering_1.cast("int"),
        df.Sub_metering_2.cast("int"), 
        df.Sub_metering_3.cast("int"),
        df.day.cast("int"),
        df.month.cast("int"),
        df.year.cast("int"),
        )

In [0]:
df=df.dropna()

In [0]:
%python
#Drop 'Date' , 'Time' and 'Hour' columns
df=df.drop("Date")

In [0]:
%python
#Supervised learning - RandomForest Regression
from pyspark.ml.feature import VectorAssembler
df = df.withColumnRenamed('Global_active_power', "Target_Variable")

In [0]:
%python
#Merge multiple columns into a vector column
assembler = VectorAssembler(inputCols=['Global_reactive_power','Voltage','Global_intensity','Sub_metering_1','Sub_metering_2','Sub_metering_3','day','year'], outputCol="features_vector")
df_assembled_ = assembler.transform(df)
df_assembled_ = df_assembled_.select("features_vector", "Target_Variable")
df_assembled_.show(5, truncate=False)

+-----------------------------------------+---------------+
|features_vector                          |Target_Variable|
+-----------------------------------------+---------------+
|[0.0,234.0,18.0,0.0,1.0,17.0,16.0,2006.0]|4.0            |
|[0.0,233.0,23.0,0.0,1.0,16.0,16.0,2006.0]|5.0            |
|[0.0,233.0,23.0,0.0,2.0,17.0,16.0,2006.0]|5.0            |
|[0.0,233.0,23.0,0.0,1.0,17.0,16.0,2006.0]|5.0            |
|[0.0,235.0,15.0,0.0,1.0,17.0,16.0,2006.0]|3.0            |
+-----------------------------------------+---------------+
only showing top 5 rows



In [0]:
%python
from pyspark.ml.feature import StandardScaler
scaler = StandardScaler(inputCol='features_vector', outputCol="features")
finalize_df = scaler.fit(df_assembled_).transform(df_assembled_)
finalize_df = finalize_df.select('features', 'Target_Variable')
finalize_df.show(5, truncate=False)

+--------------------------------------------------------------------------------------------------------------------------+---------------+
|features                                                                                                                  |Target_Variable|
+--------------------------------------------------------------------------------------------------------------------------+---------------+
|[0.0,71.93634920987235,4.051592026622789,0.0,0.17176149998751689,2.0148974623468727,1.817836852276496,1784.0817587661666] |4.0            |
|[0.0,71.62892891410365,5.177034256240231,0.0,0.17176149998751689,1.8963740822088213,1.817836852276496,1784.0817587661666] |5.0            |
|[0.0,71.62892891410365,5.177034256240231,0.0,0.34352299997503377,2.0148974623468727,1.817836852276496,1784.0817587661666] |5.0            |
|[0.0,71.62892891410365,5.177034256240231,0.0,0.17176149998751689,2.0148974623468727,1.817836852276496,1784.0817587661666] |5.0            |
|[0.0,72.2437

In [0]:
%python
#ML Model
(train, test) = finalize_df.randomSplit([0.75, 0.25])
print("Train count:", train.count())
print("Test count:", test.count())

Train count: 1536874
Test count: 512406


In [0]:
%python
from pyspark.ml.regression import RandomForestRegressor
rf = RandomForestRegressor(featuresCol="features", labelCol="Target_Variable")
model = rf.fit(train)

In [0]:
%python
predictions = model.transform(test)

In [0]:
%python
predictions.select("Target_Variable", "prediction").show(10)

+---------------+-------------------+
|Target_Variable|         prediction|
+---------------+-------------------+
|            5.0| 2.9169997214791654|
|            0.0| 0.0643858652366012|
|            3.0| 2.3256996805595955|
|            4.0|  2.820246295402023|
|            0.0|0.06642958351208979|
|            0.0|0.06642958351208979|
|            0.0| 0.0643858652366012|
|            0.0| 0.0643858652366012|
|            1.0| 0.8693395381210915|
|            4.0| 2.7669588676361636|
+---------------+-------------------+
only showing top 10 rows



In [0]:
%python
from pyspark.ml.evaluation import RegressionEvaluator
 
evaluator = RegressionEvaluator(
    labelCol="Target_Variable", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)

In [0]:
%python
print("RMSE:", rmse)

RMSE: 0.207324251874429
