In [8]:
# packages
import os
import numpy as np
import pandas as pd
import joblib

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report, confusion_matrix, precision_score, f1_score
from sklearn.model_selection import TimeSeriesSplit
from matplotlib import pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf

In [9]:
# mount your google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
df = pd.read_csv('/content/drive/My Drive/ms_wind_curtailment_prediction/lagged_curtailment_target_features.csv', sep = ';', index_col=0)

Converting index to datetime type


In [24]:
# convert index to datetime type
df.index = pd.to_datetime(df.index)
# drop wind speed to reduce multicollinearity
df.drop(['wind_speed_m/s'], inplace=True, axis=1)

Adding weekday/weekend columns and season columns

In [15]:
'''# Assuming df is your DataFrame with a datetime index
# Create a new column for weekday/weekend
df['weekday'] = df.index.weekday < 5

# Create a new column for season based on month
df['season'] = (df.index.month % 12 + 3) // 3

# Perform one-hot encoding for the 'season' column
df = pd.get_dummies(df, columns=['season'], drop_first=True)

# Perform one-hot encoding for the 'weekday' column
df = pd.get_dummies(df, columns=['weekday'], drop_first=True)

# Convert the 'weekday' and 'season' columns to numerical
df['weekday'] = df['weekday_True'].astype(int)
df['season'] = df[['season_2', 'season_3', 'season_4']].idxmax(axis=1).str.extract(r'(\d)').astype(int)

# Drop the intermediate columns created during one-hot encoding
df.drop(columns=['weekday_True', 'season_2', 'season_3', 'season_4'], inplace=True)

# Now df contains the original data with the new weekday and season columns encoded numerically
'''

Extra trees classifier

In [25]:
# get desired df size
start_date = '2021-01-01'
end_date = '2023-06-30'
df = df.loc[start_date:end_date]

In [26]:
# impute, scale pipeline and smote (for class imbalance)
preprocessor = Pipeline([
    ('scaler', StandardScaler())
])

smote = SMOTE(random_state=13)

In [31]:
# Initialize the Extra Trees Classifier
extra_trees_clf = ExtraTreesClassifier(n_estimators=200, max_features='sqrt', min_samples_leaf=3, min_samples_split=2, max_depth=5, random_state=42)

# define features X and target y
X = df.drop(['redispatch', 'level'], axis = 1)
y = df['redispatch']

# cross-validation
n_splits = 70
test_size = 48 #(48 - 12h with 15 min intervalls)
tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size, gap = 10)

precision_scores = []
f1_scores = []
conf_matrices = []
precision_train_scores = []
f1_train_scores = []
conf_train_matrices = []


for fold, (train_index, test_index) in enumerate(tscv.split(X), 1):
    print(f"Training on fold {fold}/{n_splits}")

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    X_train = X_train.copy()
    X_test = X_test.copy()

    # having at least one instance of redispatch 1 or 0 in test
    if y_test.sum() == 0 or y_test.sum() == 1:
        continue

    # preprocess train and test data
    X_train_scaled = preprocessor.fit_transform(X_train)
    X_train_preprocessed, y_train_preprocessed = smote.fit_resample(X_train_scaled, y_train)
    X_test_preprocessed = preprocessor.transform(X_test)

    print("Training the Extra Trees Classifier...")
    # fit the model
    extra_trees_clf.fit(X_train_preprocessed, y_train_preprocessed)

    print("Making predictions on train and test sets...")
    # Make predictions on the train and test data with threshold
    threshold = 0.65
    y_pred_proba = extra_trees_clf.predict_proba(X_test_preprocessed)[:, 1]
    y_pred = (y_pred_proba >= threshold).astype(int)
    y_train_proba = extra_trees_clf.predict_proba(X_train_preprocessed)[:, 1]
    y_pred_train = (y_train_proba >= threshold).astype(int)

    # evaluate
    print("Calculating evaluation metrics for test set...")
    precision_scores.append(precision_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred))
    conf_matrices.append(confusion_matrix(y_test, y_pred))

    print("Calculating evaluation metrics for train set...")
    precision_train_scores.append(precision_score(y_train_preprocessed, y_pred_train))
    f1_train_scores.append(f1_score(y_train_preprocessed, y_pred_train))
    conf_train_matrices.append(confusion_matrix(y_train_preprocessed, y_pred_train))

# print evaluation results
print("Average Scores:")
print("Precision:", np.array(precision_scores).mean())
print("F1-Scores:", np.array(f1_scores).mean())
print("Confusion Matrix:", sum(conf_matrices)/len(conf_matrices))
print("Precision (Train):", np.array(precision_train_scores).mean())
print("F1-Scores (Train):", np.array(f1_train_scores).mean())
print("Confusion Matrix (Train):", sum(conf_train_matrices)/len(conf_train_matrices))

Training on fold 1/70
Training on fold 2/70
Training on fold 3/70
Training on fold 4/70
Training on fold 5/70
Training on fold 6/70
Training on fold 7/70
Training on fold 8/70
Training on fold 9/70
Training on fold 10/70
Training on fold 11/70
Training on fold 12/70
Training the Extra Trees Classifier...
Making predictions on train and test sets...
Calculating evaluation metrics for test set...
Calculating evaluation metrics for train set...
Training on fold 13/70
Training on fold 14/70
Training on fold 15/70
Training on fold 16/70
Training on fold 17/70
Training on fold 18/70
Training on fold 19/70
Training on fold 20/70
Training on fold 21/70
Training on fold 22/70
Training on fold 23/70
Training on fold 24/70
Training on fold 25/70
Training on fold 26/70
Training on fold 27/70
Training on fold 28/70
Training on fold 29/70
Training on fold 30/70
Training the Extra Trees Classifier...
Making predictions on train and test sets...
Calculating evaluation metrics for test set...
Calculati

In [None]:
Average Scores:
Precision: 0.5194977714585558
F1-Scores: 0.6722744782018751
Confusion Matrix: [[23.5        10.66666667]
 [ 0.         13.83333333]]
Precision (Train): 0.7957944076961746
F1-Scores (Train): 0.5605979337236772
Confusion Matrix (Train): [[70239.66666667  8773.83333333]
 [44824.         34189.5       ]]

In [None]:
# define features X and target y
X = df.drop(['redispatch', 'level'], axis = 1)
y = df['redispatch']

# Initialize TimeSeriesSplit with the desired number of splits
tscv = TimeSeriesSplit(n_splits=5)

# Initialize lists to store train and test indices
train_indices = []
test_indices = []

# Perform Time Series Cross Validation
for train_index, test_index in tscv.split(X):
    train_indices.append(train_index)
    test_indices.append(test_index)

# Choose the desired split for train and test sets
split_index = 4  # choose the last split to have 80% data in train set and 20% in the test set

# Get the train and test indices
train_index = train_indices[split_index]
test_index = test_indices[split_index]

# Split the dataframe into train and test sets
train_X, train_y = X.iloc[train_index], y.iloc[train_index]
test_X, test_y = X.iloc[test_index], y.iloc[test_index]

# Print the lengths of train and test sets
print("Train set length:", len(train_X))
print("Test set length:", len(test_X))

# Initialize the Extra Trees Classifier
extra_trees_clf = ExtraTreesClassifier(n_estimators=100, max_features='sqrt', min_samples_leaf=4, min_samples_split=2, max_depth=20, random_state=42)

# Initialize SMOTE for class imbalance handling
smote = SMOTE(random_state=42)

# Initialize StandardScaler for feature scaling
scaler = StandardScaler()

print("Applying SMOTE for class imbalance handling...")
# Apply SMOTE for class imbalance handling
train_X_resampled, train_y_resampled = smote.fit_resample(train_X, train_y)

print("Scaling the features...")
# Scale the features
train_X_scaled = scaler.fit_transform(train_X_resampled)
test_X_scaled = scaler.transform(test_X)


print("Training the Extra Trees Classifier...")
# Fit the model to the training data
extra_trees_clf.fit(train_X_scaled, train_y_resampled)

print("Making predictions on train and test sets...")
# Make predictions on the train and test data
train_y_pred = extra_trees_clf.predict(train_X_scaled)
test_y_pred = extra_trees_clf.predict(test_X_scaled)

print("Calculating evaluation metrics for train set...")
# Calculate evaluation metrics for train set
train_f1 = f1_score(train_y_resampled, train_y_pred, average='macro')
train_precision = precision_score(train_y_resampled, train_y_pred, average='macro', zero_division='warn')

print("Calculating evaluation metrics for test set...")
# Calculate evaluation metrics for test set
test_f1 = f1_score(test_y, test_y_pred, average='macro')
test_precision = precision_score(test_y, test_y_pred, average='macro', zero_division='warn')

# Print average scores across all folds for train and test sets
print("Train F1 score:", train_f1)
print("Train precision score:", train_precision)
print("Test F1 score:", test_f1)
print("Test precision score:", test_precision)


Train set length: 58340
Test set length: 11668
Applying SMOTE for class imbalance handling...
Scaling the features...
Training the Extra Trees Classifier...
Making predictions on train and test sets...
Calculating evaluation metrics for train set...
Calculating evaluation metrics for test set...
Train F1 score: 0.9796146786370097
Train precision score: 0.9798145915903719
Test F1 score: 0.5018879240540509
Test precision score: 0.5307973523914371


Saving the extra trees model

In [None]:
# Save Extra Trees classifier
joblib.dump(extra_trees_clf, 'extra_trees_classifier.pkl')

['extra_trees_classifier.pkl']

1. first test of extra trees classifier with 100 estimators and with no time lagging (2020 - end of 2023)
  
  Average F1 score: 0.48574702158730904
  Average precision score: 0.5065032114306256

2. second test of extra trees classifier with 500 estimators and with no time lagging (2020 - end of 2023)

  Average F1 score: 0.4860819102848346
  Average precision score: 0.5197146774933564

3. third test of extra trees classifier with 100 estimators and with time lagging - the features are also scaled and class imbalance is solved (2020 - end of 2023)

  Average F1 score: 0.5022292421470077
  Average precision score: 0.5135108579330385

4. fourth test of extra trees classifier with 500 estimators and with time lagging - the features are also scaled and class imbalance is solved (2020 - end of 2023)

  Average F1 score: 0.5022939357149807
  Average precision score: 0.5139316629373413

5. fifth test of extra trees classifier with 500 estimators and with time lagging - the features are also scaled and class imbalance is solved (2020 - end of 2023) - NO WIND SPEED AND ACTUAL SOLAR
  
  Average F1 score: 0.5032077844561853
  Average precision score: 0.5142210693172059

6. sixth test of extra trees classifier with 500 estimators and with time lagging - the features are also scaled and class imbalance is solved (2020 - end of 2023) - NO WIND GUST MAX AND FORECAST SOLAR

  Average F1 score: 0.5038018272452696
  Average precision score: 0.5219906097438674

7. seventh test of extra trees classifier with 100 estimators and with time lagging - the features are also scaled and class imbalance is solved (2020 - end of 2023) - NO WIND GUST MAX AND ACTUAL SOLAR
  
  Average Train F1 score: 0.9877561977704226
  Average Train precision score: 0.9881442108759486
  Average Test F1 score: 0.5060695859875237
  Average Test precision score: 0.5218312260860379

8. eigth test of extra trees classifier with 100 estimators and with time lagging - the features are also scaled and class imbalance is solved (2020 - end of 2023) - NO WIND SPEED AND ACTUAL SOLAR

  Average Train F1 score: 0.987464728765999
  Average Train precision score: 0.9878657689591807
  Average Test F1 score: 0.507415861963602
  Average Test precision score: 0.5253941436935031

9. ninth test of extra trees classifier with 100 estimators and with time lagging - the features are also scaled and class imbalance is solved (2022 - end of 2023) - NO WIND SPEED AND ACTUAL SOLAR

  Train F1 score: 0.9706662734715747
  Train precision score: 0.9713627820195279
  Test F1 score: 0.5074936981415743
  Test precision score: 0.5278601341308067

10. tenth test of extra trees classifier with 100 estimators and with time lagging - the features are also scaled and class imbalance is solved (2022 - end of 2023) - NO WIND SPEED AND ACTUAL SOLAR & WEEKDAY + SEASON COLS ADDED

  Train F1 score: 0.9796146786370097
  Train precision score: 0.9798145915903719
  Test F1 score: 0.5018879240540509
  Test precision score: 0.5307973523914371

Trying extra trees classifier with a grid search to find the best hyperparameter set

In [None]:
start_time = '2021-12-31 23:45:00'

df_lagged = df_lagged[df_lagged.index > start_time]

#data is reduced to only 2 years - 2022 and 2023

In [None]:
# Initialize TimeSeriesSplit with the desired number of splits
tscv = TimeSeriesSplit(n_splits=5)

# Initialize lists to store train and test indices
train_indices = []
test_indices = []

# Perform Time Series Cross Validation
for train_index, test_index in tscv.split(df_lagged.index):
    train_indices.append(train_index)
    test_indices.append(test_index)

# Choose the desired split for train and test sets
split_index = 4  # choose the last split to have 80% data in train set and 20% in the test set

# Get the train and test indices
train_index = train_indices[split_index]
test_index = test_indices[split_index]

# Split the dataframe into train and test sets
train_df = df_lagged.iloc[train_index]
test_df = df_lagged.iloc[test_index]

# Print the lengths of train and test sets
print("Train set length:", len(train_df))
print("Test set length:", len(test_df))


Train set length: 58340
Test set length: 11668


Unnamed: 0_level_0,redispatch,wind_speed_m/s,wind_speed_m/s_lag1,wind_speed_m/s_lag2,wind_speed_m/s_lag3,wind_direction_degrees,wind_direction_degrees_lag1,wind_direction_degrees_lag2,wind_direction_degrees_lag3,radiation_global_J/m2,...,total_grid_load_MWh_lag2,total_grid_load_MWh_lag3,residual_load_MWh,residual_load_MWh_lag1,residual_load_MWh_lag2,residual_load_MWh_lag3,pumped_storage_MWh,pumped_storage_MWh_lag1,pumped_storage_MWh_lag2,pumped_storage_MWh_lag3
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-01 00:00:00,0.0,4.8,5.15,4.8,4.75,270.0,270.0,270.0,270.0,0.0,...,757.32,764.58,212.98,215.08,223.97,230.88,39.98,50.4,46.2,47.45
2022-01-01 00:15:00,0.0,4.8,4.8,5.15,4.8,265.0,270.0,270.0,270.0,0.0,...,747.0,757.32,209.07,212.98,215.08,223.97,39.67,39.98,50.4,46.2
2022-01-01 00:30:00,0.0,5.7,4.8,4.8,5.15,270.0,265.0,270.0,270.0,0.0,...,744.1,747.0,208.6,209.07,212.98,215.08,43.3,39.67,39.98,50.4
2022-01-01 00:45:00,0.0,6.25,5.7,4.8,4.8,270.0,270.0,265.0,270.0,0.0,...,737.25,744.1,199.5,208.6,209.07,212.98,46.33,43.3,39.67,39.98
2022-01-01 01:00:00,0.0,6.2,6.25,5.7,4.8,270.0,270.0,270.0,265.0,0.0,...,728.78,737.25,194.3,199.5,208.6,209.07,39.12,46.33,43.3,39.67


In [None]:
# Prepare the data
X_train = train_df.drop(columns=['redispatch'])  # Extract features
y_train = train_df['redispatch']  # Extract target

X_test = test_df.drop(columns=['redispatch'])  # Extract features
y_test = test_df['redispatch']  # Extract target

In [None]:
# Initialize the Extra Trees Classifier
extra_trees_clf = ExtraTreesClassifier(random_state=42)

# Define the parameter grid to search
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# Initialize GridSearchCV with the classifier and parameter grid
grid_search = GridSearchCV(extra_trees_clf, param_grid, cv=tscv, scoring='f1_macro', verbose=2)

# Perform grid search using the training data
grid_search.fit(X_train, y_train)

# Get the best parameters and the best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best F1 score:", best_score)

# Evaluate the best model on the testing set
best_estimator = grid_search.best_estimator_
y_pred = best_estimator.predict(X_test)
f1_test = f1_score(y_test, y_pred, average='macro')
precision_test = precision_score(y_test, y_pred, average='macro', zero_division='warn')
print("F1 score on the testing set:", f1_test)
print("Precision score on the testing set:", precision_test)


Fitting 5 folds for each of 540 candidates, totalling 2700 fits
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.3s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   5.3s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   4.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=

Best Parameters: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}
Best F1 score: 0.5167404206535176
F1 score on the testing set: 0.5237467789810708
Precision score on the testing set: 0.5584318606693672