In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.regression import RandomForestRegressor, LinearRegression, DecisionTreeRegressor
from pyspark.ml.tuning import CrossValidatorModel
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [None]:
# Obtain the SparkSession (named "Wind Power Prediction") that all later cells use
spark = (
    SparkSession.builder
    .appName("Wind Power Prediction")
    .getOrCreate()
)

In [None]:
# Read the raw CSV: the first row supplies the column names and
# column types are inferred from the data rather than left as strings
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('../Dataset/T1.csv')
)


In [None]:
# Discard the columns that are neither model features nor the prediction target below
columns_to_drop = ['Date/Time', 'Theoretical_Power_Curve (KWh)']
df = df.drop(*columns_to_drop)

In [None]:
# Normalize features
# Stage 1: pack the raw predictor columns into a single vector column.
feature_columns = ['Wind Direction (°)', 'Wind Speed (m/s)']
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
# Stage 2: min-max rescale that vector. The input column is taken from the
# assembler so the two stages cannot drift apart if the name is ever changed.
scaler = MinMaxScaler(inputCol=assembler.getOutputCol(), outputCol='scaled_features')

In [None]:
# Define models
# Each estimator is pointed explicitly at the target column and at the scaled
# feature vector. Left at the pyspark.ml defaults ("label" / "features") the
# fit fails because the data has no "label" column, and the MinMaxScaler
# output would never be used.
label_column = "LV ActivePower (kW)"
model_columns = {'labelCol': label_column, 'featuresCol': 'scaled_features'}

models = {
    # NOTE(review): this is Spark's gradient-boosted trees, not the XGBoost
    # library; the key is kept as-is because it appears in the printed results.
    'XGBoost': GBTRegressor(seed=42, **model_columns),
    # Seeded so the randomised ensembles give repeatable scores between runs.
    'Random Forest': RandomForestRegressor(seed=42, **model_columns),
    'Linear Regression': LinearRegression(**model_columns),
    'Decision Tree': DecisionTreeRegressor(**model_columns)
}


In [None]:
# Score predictions against the measured active power using R-squared
evaluator = RegressionEvaluator(
    metricName="r2",
    labelCol="LV ActivePower (kW)",
    predictionCol="prediction",
)


In [None]:
# Randomly partition the rows with 0.8 / 0.2 weights (train / test), using a fixed seed
split_weights = [0.8, 0.2]
train_data, test_data = df.randomSplit(split_weights, seed=42)

In [None]:
# Fit and score every candidate model with the same procedure.
# The parameter grid is empty, so cross-validation evaluates a single
# (default) configuration per model; it does not depend on the model and is
# therefore built once, outside the loop.
paramGrid = ParamGridBuilder().build()

for name, model in models.items():
    # assemble -> scale -> regress, rebuilt around each model in turn
    pipeline = Pipeline(stages=[assembler, scaler, model])
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator,
                              numFolds=5,
                              seed=42)  # fixed seed so the fold assignment is repeatable
    cvModel = crossval.fit(train_data)
    # The reported score is computed on the held-out test split
    predictions = cvModel.transform(test_data)
    r2 = evaluator.evaluate(predictions)
    print(f'R2-{name}: {r2}')

In [None]:
# Stop the SparkSession created at the top of the notebook.
# This is the final cell: it runs after every model has been fitted and scored.
spark.stop()