## Baseline binary classification model

In [85]:
# packages
import os
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [86]:
# Mount Google Drive so the project CSV files under /content/drive are reachable (Colab only)
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Reading the data sets and merging them together**

In [124]:
def _read_time_indexed_csv(csv_path):
    """Read a ';'-separated CSV whose first column is the index and parse that index as datetimes."""
    frame = pd.read_csv(csv_path, sep = ';', index_col=0)
    frame.index = pd.to_datetime(frame.index)
    return frame


# Weather data: the timestamp lives in the 'date' column, so build a
# DatetimeIndex named 'timestamp' from it and drop the original column.
df_weather = (
    pd.read_csv('/content/drive/My Drive/ms_wind_curtailment_prediction/weather_data.csv', sep = ',', index_col=0)
    .assign(timestamp=lambda frame: pd.to_datetime(frame['date']))
    .set_index('timestamp')
    .drop(columns='date')
)

# Redispatch, solar feed-in and demand: the first CSV column is already the timestamp.
df_redispatch = _read_time_indexed_csv('/content/drive/My Drive/ms_wind_curtailment_prediction/wind_redispatch_2020_24.csv')
df_solar = _read_time_indexed_csv('/content/drive/My Drive/ms_wind_curtailment_prediction/solar_feedin_2020_24.csv')
df_demand = _read_time_indexed_csv('/content/drive/My Drive/ms_wind_curtailment_prediction/electricity_demand_2020_24.csv')

In [125]:
# Build the modelling frame: the redispatch target joined with the weather features,
# aligned to a 15-minute grid, then extended with solar feed-in and demand.
df_redispatch_subset = df_redispatch[['redispatch']]

# Outer join keeps every timestamp present in either the target or the weather data.
df = df_redispatch_subset.merge(df_weather, how='outer', left_index=True, right_index=True)

# '15min' replaces the deprecated '15T' alias (FutureWarning since pandas 2.2);
# both denote 15-minute bins. first() keeps the first non-null value per bin and column.
df = df.resample('15min').first()

# NOTE(review): these outer joins happen after the resample, so any solar/demand
# timestamps that are not on the 15-minute grid would add extra rows — confirm the
# source files are already at 15-minute resolution.
df = df.merge(df_solar, how='outer', left_index=True, right_index=True)
df = df.merge(df_demand, how='outer', left_index=True, right_index=True)

In [126]:
# Keep only rows whose calendar date lies within the study window (both bounds inclusive).
first_day = pd.to_datetime('2020-01-01').date()
last_day = pd.to_datetime('2023-12-30').date()

index_days = df.index.date
in_study_window = (index_days >= first_day) & (index_days <= last_day)
df = df[in_study_window]

In [128]:
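# TODO: gap filling — forward-fill for actual values, backward-fill for forecast values
# (presumed meaning of the original note "forward actual / backward forecast"; confirm before implementing)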

In [127]:
# Persist the merged target + feature table as a ';'-separated CSV on Google Drive
curtailment_csv_path = '/content/drive/My Drive/ms_wind_curtailment_prediction/curtailment_target_features.csv'
df.to_csv(curtailment_csv_path, sep = ';')

## Train a random forest binary classifier while handling class imbalance in the target variable

**Open question: why does `level` only take the values 0, 30, 50 and 60 %, and what does it actually mean?**

In [None]:
# Weather features used as model inputs
weather_columns = [
    'wind_speed_m/s',
    'wind_direction_degrees',
    'radiation_global_J/m2',
    'air_temperature_K',
    'humidity_percent',
    'wind_gust_max_m/s',
    'wind_direction_gust_max_degrees',
]

# Chronological split: rows before the cutoff are used for training,
# rows at or after the cutoff are held out for testing.
cutoff_time = "2023-01-01"
is_before_cutoff = df.index < cutoff_time
is_from_cutoff = df.index >= cutoff_time

train, test = df[is_before_cutoff], df[is_from_cutoff]

# Features (weather columns) and target ('redispatch') for each partition
X_train, y_train = train[weather_columns], train['redispatch']
X_test, y_test = test[weather_columns], test['redispatch']

In [None]:
# Preprocessing for the weather features:
# 1. 'imputing' fills missing values with the column median,
# 2. 'scaling' standardizes each column.
weather_steps = [
    ('imputing', SimpleImputer(strategy='median')),
    ('scaling', StandardScaler()),
]
preprocessing_weather = Pipeline(steps=weather_steps)

preprocessing_weather

In [None]:
# Main pipeline: a single 'preprocessor' stage that wraps the weather preprocessing
main_steps = [('preprocessor', preprocessing_weather)]
main_pipeline = Pipeline(steps=main_steps)

main_pipeline

In [None]:
# Fit the preprocessing on the training features, then transform those same features
X_train_pipe = main_pipeline.fit(X_train).transform(X_train)

In [None]:
# SMOTE oversampler with a fixed seed so the resampling is reproducible
smote_seed = 42
sm = SMOTE(random_state=smote_seed)

In [None]:
# Oversample the preprocessed training data with SMOTE (the test set is never resampled)
resampled_training_data = sm.fit_resample(X_train_pipe, y_train)
X_train_resampled, y_train_resampled = resampled_training_data

In [None]:
# Train the random forest on the SMOTE-resampled training data
rf_params = {
    'n_estimators': 100,
    'max_depth': 5,
    'min_samples_split': 5,
    'min_samples_leaf': 1,
    'max_features': 'sqrt',
    # 'balanced' weights classes inversely to their frequency; the resampled
    # training set shown in this notebook is already 1:1, so the weights are equal here.
    'class_weight': 'balanced',
    'random_state': 42,
}
rf_classifier = RandomForestClassifier(**rf_params)
rf_classifier.fit(X_train_resampled, y_train_resampled)

In [None]:
# Evaluate on the held-out test set, reusing the preprocessing fitted on the training data
X_test_scaled = main_pipeline.transform(X_test)
y_pred = rf_classifier.predict(X_test_scaled)

# Per-class precision / recall / F1 as a text table
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.94      0.72      0.82     48823
           1       0.10      0.41      0.16      3744

    accuracy                           0.70     52567
   macro avg       0.52      0.57      0.49     52567
weighted avg       0.88      0.70      0.77     52567



In [None]:
from collections import Counter

# Class distribution of the training target after SMOTE resampling
resampled_class_counts = Counter(y_train_resampled)
resampled_class_counts

Counter({0: 148962, 1: 148962})