In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.regression import RandomForestRegressor, LinearRegression, DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidatorModel
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import VectorAssembler, MinMaxScaler

import os

In [2]:
# Initialize SparkSession
os.environ['SPARK_HOME'] = 'C:/spark-3.5.0-bin-hadoop3'
os.environ['PATH'] += 'C:/spark-3.5.0-bin-hadoop3/bin'
spark = SparkSession.builder \
    .appName("Wind Power Prediction") \
    .config("spark.executor.instances", "4") \
    .config("spark.executor.memory", "1g") \
    .config("spark.executor.cores", "2") \
    .getOrCreate()



In [3]:
# Load data
df = spark.read.csv('../Dataset/T1.csv', header=True, inferSchema=True)


In [4]:
# Remove irrelevant columns
df = df.drop('Date/Time', 'Theoretical_Power_Curve (KWh)')

In [5]:
print(df)

DataFrame[LV ActivePower (kW): double, Wind Speed (m/s): double, Wind Direction (°): double]


In [6]:
# Normalize features
feature_columns = ['Wind Direction (°)', 'Wind Speed (m/s)']
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
scaler = MinMaxScaler(inputCol='features', outputCol='scaled_features')

In [7]:
# Define models
models = {
    'XGBoost': GBTRegressor(labelCol='LV ActivePower (kW)'),
    'Random Forest': RandomForestRegressor(labelCol='LV ActivePower (kW)'),
    'Linear Regression': LinearRegression(labelCol='LV ActivePower (kW)'),
    'Decision Tree': DecisionTreeRegressor(labelCol='LV ActivePower (kW)')
}

In [8]:
# Define evaluator
evaluator = RegressionEvaluator(labelCol="LV ActivePower (kW)", predictionCol="prediction", metricName="r2")

In [9]:
# Split data into train and test sets
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

In [10]:
for name, model in models.items():
    pipeline = Pipeline(stages=[assembler, scaler, model])
    paramGrid = ParamGridBuilder().build()
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator,
                              numFolds=5)
    cvModel = crossval.fit(train_data)
    predictions = cvModel.transform(test_data)
    r2 = evaluator.evaluate(predictions)
    print(f'R2-{name}: {r2}')

R2-XGBoost: 0.9163568488646847
R2-Random Forest: 0.9087736937295984
R2-Linear Regression: 0.8335548083227226
R2-Decision Tree: 0.9137755646017032


In [8]:
# Stop SparkSession
spark.stop()

In [9]:
! pip install findspark



DEPRECATION: Loading egg at c:\python311\lib\site-packages\vboxapi-1.0-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330


In [64]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import MinMaxScaler, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor, LinearRegression, DecisionTreeRegressor, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline

import findspark


# Specify the Spark home directory and version
findspark.init('C:/spark/spark-3.5.1-bin-hadoop3/spark-3.5.1-bin-hadoop3')

# Configure Spark to use a master URL and set up the application name
master_url = "spark://192.168.106.215:7077"
app_name = "spark-basic"

# Create a SparkSession with the specified master and app name
spark = SparkSession.builder.master(master_url).appName(app_name).getOrCreate()

# Load data from a local CSV file
df = spark.read.csv('../Dataset/T1.csv', header=True, inferSchema=True)

# Remove irrelevant columns
# df = df.drop('Date/Time', 'Theoretical_Power_Curve (KWh)')

# Define feature and target columns
feature_columns = ['Wind Direction (°)', 'Wind Speed (m/s)']
target_col = 'LV ActivePower (kW)'

# Normalize features
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
scaler = MinMaxScaler(inputCol='features', outputCol='scaled_features')

# Define models dictionary
models = {
    'XGBoost': GBTRegressor(labelCol=target_col),
    'Random Forest': RandomForestRegressor(labelCol=target_col),
    'Linear Regression': LinearRegression(labelCol=target_col),
    'Decision Tree': DecisionTreeRegressor(labelCol=target_col)
}

# Define evaluator
evaluator = RegressionEvaluator(labelCol=target_col, predictionCol="prediction", metricName="r2")

# Split data into train and test sets
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

# Parallel processing for model training
def fit_pipeline(model, train_data):
    pipeline = Pipeline(stages=[assembler, scaler, model])
    return pipeline.fit(train_data)



for name, model in models.items():
    pipeline_model = fit_pipeline(model, train_data)
    predictions = pipeline_model.transform(test_data)
    r2 = evaluator.evaluate(predictions)
    print(f'R2-{name}: {r2}')
    predictions.select('LV ActivePower (kW)', 'prediction').show()

   

# Stop SparkSession
# spark.stop()




R2-XGBoost: 0.9155019626655422
+-------------------+------------------+
|LV ActivePower (kW)|        prediction|
+-------------------+------------------+
|        306.3765869| 306.4857866645647|
|        447.6057129| 493.8577009584026|
|        463.6512146|403.58988842417574|
|        655.1942749| 586.5081952614954|
|        787.2462158| 703.0435799927914|
|        1053.771973| 949.4365999335415|
|        1021.458008| 809.4933326208605|
|        1145.536011|1110.6376703839064|
|        1009.533997|  810.746932277874|
|         899.492981| 704.2971796498049|
|        725.1101074| 589.0563373204445|
|        443.9139099|404.84348808118915|
|        644.0377808| 589.0563373204445|
|         408.997406| 322.5775659473099|
|        578.2615967| 496.4058430173516|
|        142.2024994| 60.64344500594865|
|        250.7559052|222.73630789401986|
|        2341.133057|2126.3273042460855|
|        2692.929932|2614.1159285101357|
|        3062.978027| 3143.654176106143|
+-------------------+-----

In [46]:
! pip install requests



DEPRECATION: Loading egg at c:\python311\lib\site-packages\vboxapi-1.0-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330





In [50]:
! pip install arrow

Collecting arrow
  Using cached arrow-1.3.0-py3-none-any.whl.metadata (7.5 kB)
Collecting types-python-dateutil>=2.8.10 (from arrow)
  Using cached types_python_dateutil-2.8.19.20240106-py3-none-any.whl.metadata (1.8 kB)
Using cached arrow-1.3.0-py3-none-any.whl (66 kB)
Using cached types_python_dateutil-2.8.19.20240106-py3-none-any.whl (9.7 kB)
Installing collected packages: types-python-dateutil, arrow
Successfully installed arrow-1.3.0 types-python-dateutil-2.8.19.20240106


DEPRECATION: Loading egg at c:\python311\lib\site-packages\vboxapi-1.0-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330


In [68]:
import requests
import arrow
import pandas as pd

# Specify your custom date range
start_date = arrow.get('2024-03-01T00:00:00')  # Replace with your desired start date and time
end_date = arrow.get('2024-03-30T23:59:59')    # Replace with your desired end date and time

# Make sure to convert the dates to UTC timestamps
start_timestamp = start_date.to('UTC').timestamp()
end_timestamp = end_date.to('UTC').timestamp()

# Specify the API parameters
api_params = {
    'lat': 38.24,
    'lng': 27.5,
    'params': ','.join(['windSpeed', 'windDirection']),
    'start': start_timestamp,
    'end': end_timestamp
}

# Specify your API key
api_key = '086b97d4-dab4-11ee-be1e-0242ac130002-086b982e-dab4-11ee-be1e-0242ac130002'

# Make the API request
response = requests.get(
    'https://api.stormglass.io/v2/weather/point',
    params=api_params,
    headers={'Authorization': api_key}
)

data = response.json()['hours']

# Create a DataFrame
df = pd.DataFrame(data)

# Extract relevant columns with respect to the nested structure
df = df[['time', 'windSpeed', 'windDirection']]

# Extract 'noaa' values from the nested structure
df['windSpeed'] = df['windSpeed'].apply(lambda x: x.get('noaa') if isinstance(x, dict) else None)
df['windDirection'] = df['windDirection'].apply(lambda x: x.get('noaa') if isinstance(x, dict) else None)

# Convert time from string to datetime
df['time'] = pd.to_datetime(df['time'], utc=True)
final_data=test_data





In [47]:
df=df.rename(columns={"windSpeed":"Wind Speed (m/s)","windDirection":"Wind Direction (°)","time":"Date"})
# Display the DataFrame
print(df)

                         Date  Wind Speed (m/s)  Wind Direction (°)
0   2024-03-01 00:00:00+00:00              1.48              125.54
1   2024-03-01 01:00:00+00:00              1.12              136.43
2   2024-03-01 02:00:00+00:00              0.75              147.32
3   2024-03-01 03:00:00+00:00              0.39              158.21
4   2024-03-01 04:00:00+00:00              0.78              140.25
..                        ...               ...                 ...
236 2024-03-10 20:00:00+00:00              2.14              148.65
237 2024-03-10 21:00:00+00:00              2.29              140.02
238 2024-03-10 22:00:00+00:00              2.20              142.32
239 2024-03-10 23:00:00+00:00              2.11              144.62
240 2024-03-11 00:00:00+00:00              2.02              146.92

[241 rows x 3 columns]


In [48]:
py_spark=spark.createDataFrame(df)

In [70]:
for name, model in models.items():
    pipeline_model = fit_pipeline(model, train_data)
    predictions = pipeline_model.transform(final_data)
    predictions.select('Date/Time','Wind Speed (m/s)','Wind Direction (°)','features','scaled_features', 'prediction').show()

+----------------+----------------+------------------+--------------------+--------------------+------------------+
|       Date/Time|Wind Speed (m/s)|Wind Direction (°)|            features|     scaled_features|        prediction|
+----------------+----------------+------------------+--------------------+--------------------+------------------+
|01 01 2018 00:20|     5.216036797|       272.5647888|[272.5647888,5.21...|[0.75712948378742...| 306.4857866645647|
|01 01 2018 01:00|     5.793007851|       266.1636047|[266.1636047,5.79...|[0.73934829776336...| 493.8577009584026|
|01 01 2018 01:20|     5.584629059|       253.4806976|[253.4806976,5.58...|[0.70411776432643...|403.58988842417574|
|01 01 2018 02:10|     6.199746132|       266.7331848|[266.7331848,6.19...|[0.74093047530356...| 586.5081952614954|
|01 01 2018 03:10|     6.437530994|       257.5602112|[257.5602112,6.43...|[0.71544982243882...| 703.0435799927914|
|01 01 2018 03:50|     7.288355827|       255.4445953|[255.4445953,7.28.