In [56]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import GBTRegressor

In [57]:
# Start (or reuse) the Spark session that backs every DataFrame in this notebook
spark = (
    SparkSession.builder
    .appName("PowerPlantAnalysis")
    .getOrCreate()
)


In [58]:
# Load the power-plant CSV: the first row supplies column names and
# Spark infers each column's type from the data.
pp_df = spark.read.csv(
    "/content/power-plant.csv",
    header=True,
    inferSchema=True,
)

In [59]:
VectorAssembler=VectorAssembler(inputCols=["AT","V","AP","RH"],outputCol="features")

In [60]:
vpp_df=VectorAssembler.transform(pp_df)

In [61]:
# Fetch a single row to check that the "features" vector column was added
vpp_df.take(1)

[Row(AT=14.96, V=41.76, AP=1024.07, RH=73.17, PE=463.26, features=DenseVector([14.96, 41.76, 1024.07, 73.17]))]

In [62]:
# Linear-regression estimator: predicts the PE column from the "features" vector
lr = LinearRegression(
    featuresCol="features",
    labelCol="PE",
)

In [63]:
# Fit the linear model on the full assembled dataset (no hold-out at this point)
lr_model = lr.fit(dataset=vpp_df)

In [64]:
# Fitted weights, one per assembled input column (AT, V, AP, RH in that order)
lr_model.coefficients

DenseVector([-1.9775, -0.2339, 0.0621, -0.1581])

In [65]:
# Fitted intercept term of the linear model
lr_model.intercept

454.60927445304543

In [66]:
# RMSE from the model's training summary. lr_model was fit on all of vpp_df,
# so this is an in-sample error; it is not directly comparable with the
# test-set RMSE values computed for the tree-based models later on.
lr_model.summary.rootMeanSquaredError

4.557126016749479

In [67]:
# 70/30 random split of the assembled data. A fixed seed makes the split,
# and therefore every metric computed from it, repeatable across runs.
splits = vpp_df.randomSplit([0.7, 0.3], seed=42)


In [68]:
# First split (weight 0.7) is for training, second (weight 0.3) for testing
train_df, test_df = splits

In [69]:
# Number of rows that landed in the training split
train_df.count()

6759

In [70]:
# Number of rows that landed in the test split
test_df.count()

2809

In [71]:
# Decision-tree regressor over the same feature vector and PE label
dt = DecisionTreeRegressor(
    featuresCol="features",
    labelCol="PE",
)

In [72]:
# Train the decision tree on the training split only
dt_model = dt.fit(dataset=train_df)

In [73]:
# Score the held-out rows with the fitted decision tree
dt_prediction = dt_model.transform(dataset=test_df)

In [74]:
# RMSE evaluator comparing the "prediction" column against the PE label
dt_evaluator = RegressionEvaluator(
    labelCol="PE",
    predictionCol="prediction",
    metricName="rmse",
)

In [75]:
# Test-set RMSE of the decision tree
rms = dt_evaluator.evaluate(dataset=dt_prediction)

In [76]:
# Display the decision tree's RMSE on the test split
rms

4.49263478544322

In [77]:
# Gradient-boosted-tree regressor over the same feature vector and PE label
gbt = GBTRegressor(
    featuresCol="features",
    labelCol="PE",
)

In [78]:
# Train the GBT model on the training split only
gbt_model = gbt.fit(dataset=train_df)

In [79]:
# Score the held-out rows with the fitted GBT model
gbt_prediction = gbt_model.transform(dataset=test_df)

In [80]:
# RMSE evaluator for the GBT predictions (same settings as the decision-tree one)
gbt_evaluator = RegressionEvaluator(
    labelCol="PE",
    predictionCol="prediction",
    metricName="rmse",
)

In [81]:
# Test-set RMSE of the GBT model
gbt_rms = gbt_evaluator.evaluate(dataset=gbt_prediction)

In [82]:
# Display the GBT model's RMSE on the test split
gbt_rms

4.0015028612517245