## Baseline binary classification model

In [85]:
# packages
import os
from collections import Counter

import numpy as np
import pandas as pd

from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


In [86]:
# Attach Google Drive to this Colab runtime; the CSV files used below are read from it.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Reading the data sets and merging them together**

In [124]:
# Folder on the mounted Drive that holds every input file, spelled out once
# instead of being repeated in each read call.
DATA_DIR = '/content/drive/My Drive/ms_wind_curtailment_prediction'


def _read_semicolon_timeseries(filename):
    """Read a ';'-separated CSV from DATA_DIR and use its first column as a DatetimeIndex.

    Parameters
    ----------
    filename : str
        File name relative to DATA_DIR.

    Returns
    -------
    pandas.DataFrame
        The file contents, indexed by the first column parsed with `pd.to_datetime`.
    """
    frame = pd.read_csv(os.path.join(DATA_DIR, filename), sep=';', index_col=0)
    frame.index = pd.to_datetime(frame.index)
    return frame


# The weather file is ','-separated and keeps its timestamps in a 'date' column
# rather than in the first column, so it is handled separately: parse 'date',
# promote it to the index under the name 'timestamp', and drop the raw column.
df_weather = (
    pd.read_csv(os.path.join(DATA_DIR, 'weather_data.csv'), sep=',', index_col=0)
    .assign(timestamp=lambda frame: pd.to_datetime(frame['date']))
    .set_index('timestamp')
    .drop(columns='date')
)

# The remaining files share one layout (';'-separated, timestamps in column 0).
df_redispatch = _read_semicolon_timeseries('wind_redispatch_2020_24.csv')
df_solar = _read_semicolon_timeseries('solar_feedin_2020_24.csv')
df_demand = _read_semicolon_timeseries('electricity_demand_2020_24.csv')

In [125]:
# Only the target column is taken from the redispatch data.
df_redispatch_subset = df_redispatch[['redispatch']]

# Outer-join target and weather on their timestamp indexes so no row of either side is lost.
df = df_redispatch_subset.merge(df_weather, how='outer', left_index=True, right_index=True)

# Put the joined frame on a regular 15-minute grid; `.first()` keeps, per column,
# the first non-null value inside each bin. '15min' is used instead of '15T'
# because the 'T' alias is deprecated in recent pandas releases.
df = df.resample('15min').first()

# Solar feed-in and demand are joined after the resampling step.
# NOTE(review): these two frames are therefore not resampled themselves — confirm
# they are already on a 15-minute grid, otherwise the outer joins add off-grid rows.
df = df.merge(df_solar, how='outer', left_index=True, right_index=True)
df = df.merge(df_demand, how='outer', left_index=True, right_index=True)

In [126]:
# Restrict the frame to calendar days between the two bounds (both inclusive).
# NOTE(review): the upper bound is 2023-12-30, so 2023-12-31 is dropped — confirm this is intended.
start_date = pd.to_datetime('2020-01-01').date()
end_date = pd.to_datetime('2023-12-30').date()
index_dates = df.index.date
df = df[(index_dates >= start_date) & (index_dates <= end_date)]

In [128]:
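# TODO(review): note on gap filling, not implemented in this notebook —
# presumably forward-fill the actual_* columns and backward-fill the forecast_* columns; confirm.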

In [127]:
# Persist the merged target + feature table as a ';'-separated CSV on Drive.
output_csv_path = '/content/drive/My Drive/ms_wind_curtailment_prediction/curtailment_target_features.csv'
df.to_csv(output_csv_path, sep=';')

## Train a random forest binary classifier while handling the class imbalance of the target variable

**Open question: why does `level` only take the values 0, 30, 50 and 60 %, and what does it actually mean?**

In [129]:
# Preview the first five rows of the merged frame.
df.head(n=5)

Unnamed: 0_level_0,redispatch,wind_speed_m/s,wind_direction_degrees,radiation_global_J/m2,air_temperature_K,humidity_percent,wind_gust_max_m/s,wind_direction_gust_max_degrees,forecast_solar_MW,actual_solar_MW,total_grid_load_MWh,residual_load_MWh,pumped_storage_MWh
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2020-01-01 00:00:00,0.0,2.1,250.0,0.0,278.45,80.7,3.3,250.0,0.0,0.0,730.95,624.18,3.25
2020-01-01 00:15:00,0.0,2.7,265.0,0.0,278.55,79.95,4.25,265.0,0.0,0.0,727.23,620.78,4.35
2020-01-01 00:30:00,0.0,2.4,240.0,0.0,278.45,80.4,3.5,250.0,0.0,0.0,722.2,611.37,6.4
2020-01-01 00:45:00,0.0,2.7,250.0,0.0,278.55,79.45,3.8,245.0,0.0,0.0,719.22,604.33,8.32
2020-01-01 01:00:00,0.0,2.7,260.0,0.0,278.45,80.5,3.9,260.0,0.0,0.0,717.07,600.83,6.32


In [132]:
#weather_columns = ['wind_speed_m/s', 'wind_direction_degrees', 'radiation_global_J/m2',
#                   'air_temperature_K', 'humidity_percent', 'wind_gust_max_m/s',
#                   'wind_direction_gust_max_degrees']


# Chronological split: rows strictly before the cutoff form the training set,
# rows at or after it form the test set.
cutoff_time = "2023-01-01"
train = df.loc[df.index < cutoff_time]
test = df.loc[df.index >= cutoff_time]

# 'redispatch' is the target; every other column is used as a feature.
X_train, y_train = train.drop(columns='redispatch'), train['redispatch']
X_test, y_test = test.drop(columns='redispatch'), test['redispatch']

In [133]:
# Numeric preprocessing: fill missing values with the column median, then
# standardise each column with StandardScaler.
median_imputer = SimpleImputer(strategy='median')
standard_scaler = StandardScaler()

preprocessing_weather = Pipeline(steps=[
    ('imputing', median_imputer),
    ('scaling', standard_scaler),
])

preprocessing_weather

In [134]:
# Top-level pipeline; at the moment its only step is the preprocessing pipeline.
main_steps = [('preprocessor', preprocessing_weather)]
main_pipeline = Pipeline(steps=main_steps)

main_pipeline

In [135]:
# Fit the preprocessing on the training features only and transform them in one go.
X_train_pipe = main_pipeline.fit_transform(X=X_train)

In [136]:
# SMOTE oversampler with a fixed seed so the resampling is reproducible.
SMOTE_SEED = 42
sm = SMOTE(random_state=SMOTE_SEED)

In [137]:
# Oversample with SMOTE on the already-preprocessed training matrix.
# Only the training data is resampled; the test set is left untouched.
resampled = sm.fit_resample(X_train_pipe, y_train)
X_train_resampled, y_train_resampled = resampled

In [138]:
# Fit the random forest on the SMOTE-resampled training data.
rf_params = dict(
    n_estimators=100,
    max_depth=5,
    min_samples_split=5,
    min_samples_leaf=1,
    max_features='sqrt',
    # 'balanced' weights classes inversely to their frequency in the data given to fit().
    # NOTE(review): the resampled training set shown in this notebook is already 50/50,
    # so this setting likely has no effect here — confirm whether both it and SMOTE are wanted.
    class_weight='balanced',
    random_state=42,
)
rf_classifier = RandomForestClassifier(**rf_params)
rf_classifier.fit(X_train_resampled, y_train_resampled)

In [139]:
# Evaluate on the held-out test set: apply the preprocessing fitted on the
# training data (transform only, no refit), predict, and report per-class metrics.
X_test_scaled = main_pipeline.transform(X_test)
y_pred = rf_classifier.predict(X_test_scaled)

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

         0.0       0.94      0.72      0.82     32472
         1.0       0.10      0.42      0.17      2484

    accuracy                           0.70     34956
   macro avg       0.52      0.57      0.49     34956
weighted avg       0.88      0.70      0.77     34956



In [140]:
# Sanity check: count the samples per class in the SMOTE-resampled training target.
# `Counter` is imported in the notebook's top import cell.
Counter(y_train_resampled)

Counter({0.0: 99338, 1.0: 99338})