In [None]:
! pip install pyspark findspark

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor, GBTRegressor, LinearRegression
from pyspark.ml import Pipeline
import numpy as np
import findspark


# Application settings: the Spark master URL and the application name.
app_name = "Ensemble-Method"
master_url = "spark://192.168.57.215:7077"

# Hand findspark the Spark home directory (a Spark 3.5.1 / Hadoop 3 build,
# going by the directory name).
spark_home = "C:/spark/spark-3.5.1-bin-hadoop3/spark-3.5.1-bin-hadoop3"
findspark.init(spark_home)

In [None]:
# Start (or reuse) the Spark session, naming it from the `app_name` setting
# defined in the configuration cell instead of a second hard-coded literal.
# NOTE(review): `master_url` is defined next to `app_name` but is not applied
# here. Add `.master(master_url)` only after confirming that the cluster is
# reachable and that its workers can read the dataset path — TODO confirm the
# intended deployment.
spark = (
    SparkSession.builder
    .appName(app_name)
    .getOrCreate()
)

In [None]:
# Load the CSV; the first row supplies column names and Spark infers the types.
df = spark.read.csv("../Dataset/T1.csv", header=True, inferSchema=True)

# Prepare features: every column except the label, restricted to the column
# types VectorAssembler accepts (numeric and boolean). Feeding it a string or
# timestamp column makes `transform` raise instead of producing features.
# NOTE(review): any non-numeric column in the CSV is silently left out of the
# feature vector — confirm that no intended feature is dropped this way.
label_col = "LV ActivePower (kW)"
if label_col not in df.columns:
    # Keep the original fail-fast behaviour when the label column is absent.
    raise ValueError(f"label column {label_col!r} not found in {df.columns}")

assemblable_types = ("tinyint", "smallint", "int", "bigint", "float", "double", "boolean")
feature_cols = [
    name
    for name, dtype in df.dtypes
    if name != label_col
    and (dtype in assemblable_types or dtype.startswith("decimal"))
]
if not feature_cols:
    raise ValueError("no numeric feature columns found in the dataset")

# Pack the selected columns into a single "features" vector column.
vector_assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df = vector_assembler.transform(df)

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
train_data, test_data = df.randomSplit(weights=[0.7, 0.3], seed=42)

In [None]:
# Define models. All three read the assembled "features" vector and predict
# the same label column.
label_col = "LV ActivePower (kW)"

# The tree ensembles are randomised, so they get an explicit seed (the same
# value used for the train/test split) to make reruns reproducible.
rf = RandomForestRegressor(
    featuresCol="features",
    labelCol=label_col,
    numTrees=500,
    maxDepth=4,
    seed=42,
)
gbt = GBTRegressor(
    featuresCol="features",
    labelCol=label_col,
    maxIter=10,
    seed=42,
)
# LinearRegression takes no seed parameter.
lr = LinearRegression(featuresCol="features", labelCol=label_col, maxIter=10)

In [None]:
# Train each base learner on the same training split, in the order
# random forest -> gradient-boosted trees -> linear regression.
rf_model, gbt_model, lr_model = [
    estimator.fit(train_data) for estimator in (rf, gbt, lr)
]

In [None]:
# Score the held-out test split with every fitted model; each result carries
# that model's output in its own "prediction" column.
rf_predictions, gbt_predictions, lr_predictions = [
    fitted.transform(test_data) for fitted in (rf_model, gbt_model, lr_model)
]

In [None]:
# Ensemble predictions: weighted average of the three models' outputs,
# (RF + GBT + 2 * LR) / 4.
#
# A column taken from another DataFrame (e.g. gbt_predictions["prediction"])
# cannot be used inside rf_predictions.withColumn(...): Spark cannot resolve it
# against rf_predictions and raises an AnalysisException. Instead, put all
# three predictions on ONE DataFrame: start from the RF output, rename its
# prediction column, then apply the fitted GBT and LR models in turn, renaming
# each new "prediction" column before the next model adds its own.
scored = rf_predictions.withColumnRenamed("prediction", "rf_prediction")
scored = gbt_model.transform(scored).withColumnRenamed("prediction", "gbt_prediction")
scored = lr_model.transform(scored).withColumnRenamed("prediction", "lr_prediction")

# The blended value is written to "prediction", the column the evaluator reads.
ensemble_predictions = scored.withColumn(
    "prediction",
    (scored["rf_prediction"] + scored["gbt_prediction"] + scored["lr_prediction"] * 2) / 4,
)  # Adjust weights accordingly

In [None]:
# Score the ensemble: RMSE between the true label and the blended prediction.
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="LV ActivePower (kW)",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(ensemble_predictions)

In [None]:
# Display the RMSE that the evaluator computed for the ensemble predictions.
print("RMSE: ", rmse)

In [None]:
# Stop the SparkSession now that the evaluation is finished; any later cell
# that uses `spark` needs the session to be created again first.
spark.stop()