In [1]:
# packages
import os
import numpy as np
import pandas as pd
import joblib

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report, confusion_matrix, precision_score, f1_score
from sklearn.model_selection import TimeSeriesSplit
from matplotlib import pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf

In [2]:
# Mount Google Drive at /content/drive so the project files stored under
# 'My Drive' can be accessed through ordinary file paths.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime — confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the lagged curtailment feature table from the mounted drive.
# The file is semicolon-separated and its first column becomes the index.
data_path = '/content/drive/My Drive/ms_wind_curtailment_prediction/lagged_curtailment_target_features.csv'
df = pd.read_csv(data_path, sep=';', index_col=0)

Converting index to datetime type


In [4]:
# convert index to datetime type so the frame can be sliced by date strings
df.index = pd.to_datetime(df.index)

# drop wind speed (and its lags) to reduce multicollinearity
# A non-inplace drop with errors='ignore' keeps this cell safe to re-run:
# the previous inplace drop raised a KeyError on a second execution because
# the columns were already gone.
wind_speed_columns = [
    'wind_speed_m/s',
    'wind_speed_m/s_lag1',
    'wind_speed_m/s_lag2',
    'wind_speed_m/s_lag3',
]
df = df.drop(columns=wind_speed_columns, errors='ignore')

Adding weekday/weekend columns and season columns

In [None]:
def add_calendar_features(frame):
    """Return a copy of ``frame`` with numeric ``weekday`` and ``season`` columns.

    This replaces the code that was previously parked in a string literal.
    That version one-hot encoded both features with ``drop_first=True`` and
    then decoded them again with ``idxmax``; because the dropped season 1 is
    an all-zero dummy row, every winter row was decoded as season 2.
    Computing the codes directly avoids the round trip and the mislabelling.

    The function is only defined here, not called, so the features remain
    disabled exactly as before. To enable them:
    ``df = add_calendar_features(df)``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data with a DatetimeIndex.

    Returns
    -------
    pandas.DataFrame
        New frame (the input is not modified) with two extra columns:
        ``weekday`` is 1 for Monday-Friday and 0 for Saturday/Sunday;
        ``season`` is 1 (Dec-Feb), 2 (Mar-May), 3 (Jun-Aug) or 4 (Sep-Nov).
    """
    # index.weekday is 0 (Monday) .. 6 (Sunday)
    weekday_flag = np.asarray(frame.index.weekday < 5).astype(int)
    # month % 12 maps December to 0 so that Dec/Jan/Feb share season 1
    season_code = np.asarray((frame.index.month % 12 + 3) // 3)
    return frame.assign(weekday=weekday_flag, season=season_code)

Extra trees classifier

In [5]:
# Restrict the data to the period used for cross-validation.
# Both boundary dates are labels on the datetime index.
start_date, end_date = '2021-07-01', '2023-11-30'
cv_period = slice(start_date, end_date)
df_cv = df.loc[cv_period]

In [7]:
# Preprocessing: a pipeline with a single standard-scaling step (no imputer
# is configured here), plus SMOTE oversampling for the class imbalance.
preprocessor = Pipeline(steps=[('scaler', StandardScaler())])
smote = SMOTE(random_state=13)

# Features X are all columns except 'redispatch' and 'level';
# the target y is the 'redispatch' column.
X = df_cv.drop(columns=['redispatch', 'level'])
y = df_cv['redispatch']

In [8]:
# Fraction of rows whose target equals 1 (0 if the label never occurs).
class_counts = y.value_counts()
share_minority = class_counts.get(1, 0) / len(y)
print(share_minority)

0.09733031460780152


In [16]:
# Initialize the Extra Trees Classifier
extra_trees_clf = ExtraTreesClassifier(n_estimators=200, max_features='sqrt', min_samples_leaf=3, min_samples_split=2, max_depth=3, random_state=42)

# cross-validation settings
n_splits = 500  # number of candidate test windows; the loop stops at max_valid_folds valid folds
test_size = 96  # (24 - 6h; 48 - 12h; 96 - 24h with 15 min intervals)
max_valid_folds = 10  # stop once this many folds have been trained and evaluated
max_share_deviation = 0.05  # allowed |positive share in test window - share_minority|
threshold = 0.65  # probability cut-off for predicting class 1
class_labels = [0, 1]  # fixes the confusion-matrix layout to [[TN, FP], [FN, TP]]
tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)


def _print_confusion_matrix(matrix):
    """Print a 2x2 confusion matrix laid out as [[TN, FP], [FN, TP]]."""
    print(f"{'True Negative':<20} {'False Positive':<20}")
    print(f"{matrix[0][0]:<20} {matrix[0][1]:<20}")
    print(f"{'False Negative':<20} {'True Positive':<20}")
    print(f"{matrix[1][0]:<20} {matrix[1][1]:<20}")


precision_scores = []
f1_scores = []
conf_matrices = []
precision_train_scores = []
f1_train_scores = []
conf_train_matrices = []

total_folds = 0
valid_folds = 0
for train_index, test_index in tscv.split(X):
    total_folds += 1  # Increment the total folds counter
    print(f"Fold {total_folds}")

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Only use folds whose test window has a positive-class share close to
    # the overall share; all other folds are skipped without training.
    if not (abs(y_test.sum() / len(y_test) - share_minority) < max_share_deviation):
        continue

    valid_folds += 1  # Increment the valid folds counter
    print(f"Training on valid fold {valid_folds}")

    # preprocess: the scaler is fitted on the training part only and SMOTE is
    # applied to the training part only; the test window is just transformed
    X_train_scaled = preprocessor.fit_transform(X_train)
    X_train_preprocessed, y_train_preprocessed = smote.fit_resample(X_train_scaled, y_train)
    X_test_preprocessed = preprocessor.transform(X_test)

    # fit the model
    extra_trees_clf.fit(X_train_preprocessed, y_train_preprocessed)

    # Make predictions on the train and test data with threshold
    y_pred_proba = extra_trees_clf.predict_proba(X_test_preprocessed)[:, 1]
    y_pred = (y_pred_proba >= threshold).astype(int)
    y_train_proba = extra_trees_clf.predict_proba(X_train_preprocessed)[:, 1]
    y_pred_train = (y_train_proba >= threshold).astype(int)

    # evaluate on the test window; zero_division=0 returns the same value as
    # the default but without a warning when no positive is predicted
    precision_scores.append(precision_score(y_test, y_pred, zero_division=0))
    f1_scores.append(f1_score(y_test, y_pred, zero_division=0))
    conf_matrices.append(confusion_matrix(y_test, y_pred, labels=class_labels))

    # training metrics are computed on the SMOTE-resampled training data
    precision_train_scores.append(precision_score(y_train_preprocessed, y_pred_train, zero_division=0))
    f1_train_scores.append(f1_score(y_train_preprocessed, y_pred_train, zero_division=0))
    conf_train_matrices.append(confusion_matrix(y_train_preprocessed, y_pred_train, labels=class_labels))

    if valid_folds == max_valid_folds:
        break

# Without this guard the averaging below fails with an opaque ZeroDivisionError.
if valid_folds == 0:
    raise RuntimeError(
        "No fold met the class-share condition, so there is nothing to average. "
        "Increase n_splits or max_share_deviation."
    )

# print evaluation
print("Average Scores:")
print("Precision:", np.array(precision_scores).mean())
print("F1-Scores:", np.array(f1_scores).mean())
average_conf_matrix = np.round(sum(conf_matrices) / len(conf_matrices)).astype(int)
print("Average Confusion Matrix:")
_print_confusion_matrix(average_conf_matrix)

print("Precision (Train):", np.array(precision_train_scores).mean())
print("F1-Scores (Train):", np.array(f1_train_scores).mean())
average_conf_matrix_train = np.round(sum(conf_train_matrices) / len(conf_train_matrices)).astype(int)
print("Average Confusion Matrix (Train):")
_print_confusion_matrix(average_conf_matrix_train)

Fold 1
Fold 2
Training on valid fold 1
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Training on valid fold 2
Fold 8
Fold 9
Training on valid fold 3
Fold 10
Training on valid fold 4
Fold 11
Fold 12
Fold 13
Training on valid fold 5
Fold 14
Fold 15
Fold 16
Fold 17
Fold 18
Fold 19
Fold 20
Fold 21
Fold 22
Fold 23
Fold 24
Fold 25
Fold 26
Fold 27
Fold 28
Fold 29
Fold 30
Fold 31
Fold 32
Fold 33
Fold 34
Fold 35
Fold 36
Fold 37
Training on valid fold 6
Fold 38
Training on valid fold 7
Fold 39
Fold 40
Fold 41
Fold 42
Fold 43
Fold 44
Fold 45
Fold 46
Fold 47
Fold 48
Training on valid fold 8
Fold 49
Fold 50
Fold 51
Fold 52
Fold 53
Fold 54
Fold 55
Fold 56
Fold 57
Fold 58
Fold 59
Fold 60
Fold 61
Fold 62
Fold 63
Fold 64
Fold 65
Fold 66
Fold 67
Fold 68
Fold 69
Fold 70
Fold 71
Fold 72
Fold 73
Fold 74
Fold 75
Fold 76
Fold 77
Fold 78
Fold 79
Fold 80
Fold 81
Fold 82
Training on valid fold 9
Fold 83
Fold 84
Training on valid fold 10
Average Scores:
Precision: 0.20918192918192915
F1-Scores: 0.28905931170157506
Average 

**Train the model with the best hyperparameters and save it**

In [17]:
# select train dataset
start_date = '2021-07-01'
end_date = '2023-11-30'
df_model_train = df.loc[start_date:end_date]
# The hold-out period starts the day after training ends. Date-string slices
# on a datetime index include the whole end day, so starting the hold-out at
# '2023-11-30' put that day's rows into both the training and the hold-out set.
df_model_pred = df.loc['2023-12-01':'2023-12-31']

# define features X and target y
X_train = df_model_train.drop(['redispatch', 'level'], axis = 1)
X_test = df_model_pred.drop(['redispatch', 'level'], axis = 1)
y_train = df_model_train['redispatch']
y_test = df_model_pred['redispatch']

# preprocess train and test data: the scaler is fitted on the training data
# only and SMOTE resamples the training data only
X_train_scaled = preprocessor.fit_transform(X_train)
X_train_preprocessed, y_train_preprocessed = smote.fit_resample(X_train_scaled, y_train)
X_test_preprocessed = preprocessor.transform(X_test)

# fit model (re-fits the classifier configured in the cross-validation cell)
extra_trees_clf.fit(X_train_preprocessed, y_train_preprocessed)

In [26]:
# Persist the fitted Extra Trees classifier to the mounted drive.
# NOTE(review): the model was fitted on features scaled by `preprocessor`,
# but only the classifier is written here — confirm the fitted scaler is
# saved or re-created wherever this file is loaded.
model_path = '/content/drive/My Drive/ms_wind_curtailment_prediction/extra_trees_classifier.pkl'
joblib.dump(extra_trees_clf, model_path)

['/content/drive/My Drive/ms_wind_curtailment_prediction/extra_trees_classifier.pkl']

Trying extra trees classifier with a grid search to find the best hyperparameter set

In [None]:
# Keep only the rows whose timestamp is strictly later than start_time.
# NOTE(review): df_lagged is not created in any cell visible here — confirm
# where it is loaded before running this section on a fresh kernel.
start_time = '2021-12-31 23:45:00'
is_after_start = df_lagged.index > start_time
df_lagged = df_lagged.loc[is_after_start]

# data is reduced to only 2 years - 2022 and 2023

In [None]:
# Time-ordered splitter with five splits
tscv = TimeSeriesSplit(n_splits=5)

# Materialise every (train, test) index pair produced by the splitter
all_splits = list(tscv.split(df_lagged.index))
train_indices = [train_part for train_part, _ in all_splits]
test_indices = [test_part for _, test_part in all_splits]

# Use the last split: it has the largest training window. With five splits
# this is roughly 5/6 of the rows for training and 1/6 for testing.
split_index = 4
train_index, test_index = all_splits[split_index]

# Positional selection of the two parts of the dataframe
train_df, test_df = df_lagged.iloc[train_index], df_lagged.iloc[test_index]

# Report the sizes of both parts
print("Train set length:", len(train_df))
print("Test set length:", len(test_df))


Train set length: 58340
Test set length: 11668


Unnamed: 0_level_0,redispatch,wind_speed_m/s,wind_speed_m/s_lag1,wind_speed_m/s_lag2,wind_speed_m/s_lag3,wind_direction_degrees,wind_direction_degrees_lag1,wind_direction_degrees_lag2,wind_direction_degrees_lag3,radiation_global_J/m2,...,total_grid_load_MWh_lag2,total_grid_load_MWh_lag3,residual_load_MWh,residual_load_MWh_lag1,residual_load_MWh_lag2,residual_load_MWh_lag3,pumped_storage_MWh,pumped_storage_MWh_lag1,pumped_storage_MWh_lag2,pumped_storage_MWh_lag3
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-01 00:00:00,0.0,4.8,5.15,4.8,4.75,270.0,270.0,270.0,270.0,0.0,...,757.32,764.58,212.98,215.08,223.97,230.88,39.98,50.4,46.2,47.45
2022-01-01 00:15:00,0.0,4.8,4.8,5.15,4.8,265.0,270.0,270.0,270.0,0.0,...,747.0,757.32,209.07,212.98,215.08,223.97,39.67,39.98,50.4,46.2
2022-01-01 00:30:00,0.0,5.7,4.8,4.8,5.15,270.0,265.0,270.0,270.0,0.0,...,744.1,747.0,208.6,209.07,212.98,215.08,43.3,39.67,39.98,50.4
2022-01-01 00:45:00,0.0,6.25,5.7,4.8,4.8,270.0,270.0,265.0,270.0,0.0,...,737.25,744.1,199.5,208.6,209.07,212.98,46.33,43.3,39.67,39.98
2022-01-01 01:00:00,0.0,6.2,6.25,5.7,4.8,270.0,270.0,270.0,265.0,0.0,...,728.78,737.25,194.3,199.5,208.6,209.07,39.12,46.33,43.3,39.67


In [None]:
# Separate features and target for the train and test parts.
# NOTE(review): the earlier cells also drop a 'level' column from the
# features — confirm df_lagged does not contain it.
target_column = 'redispatch'

X_train, y_train = train_df.drop(columns=[target_column]), train_df[target_column]
X_test, y_test = test_df.drop(columns=[target_column]), test_df[target_column]

In [None]:
# Base estimator for the search; only the random seed is fixed here.
# NOTE(review): this rebinds `extra_trees_clf`, replacing the classifier
# fitted in the earlier cells under the same name.
extra_trees_clf = ExtraTreesClassifier(random_state=42)

# Hyperparameter values to combine exhaustively
param_grid = dict(
    n_estimators=[100, 200, 300, 400, 500],
    max_depth=[None, 10, 20, 30, 40, 50],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

# Exhaustive search scored by macro-averaged F1, using the time-series
# splitter `tscv` for the inner cross-validation
grid_search = GridSearchCV(
    estimator=extra_trees_clf,
    param_grid=param_grid,
    cv=tscv,
    scoring='f1_macro',
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Best parameter combination and its mean cross-validated score
best_params, best_score = grid_search.best_params_, grid_search.best_score_
print("Best Parameters:", best_params)
print("Best F1 score:", best_score)

# Score the refitted best estimator on the held-out test part
best_estimator = grid_search.best_estimator_
y_pred = best_estimator.predict(X_test)
f1_test = f1_score(y_test, y_pred, average='macro')
precision_test = precision_score(y_test, y_pred, average='macro', zero_division='warn')
print("F1 score on the testing set:", f1_test)
print("Precision score on the testing set:", precision_test)


Fitting 5 folds for each of 540 candidates, totalling 2700 fits
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.3s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   5.3s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   4.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=

Best Parameters: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}
Best F1 score: 0.5167404206535176
F1 score on the testing set: 0.5237467789810708
Precision score on the testing set: 0.5584318606693672