In [1]:
# packages
import os
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import TimeSeriesSplit

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, recall_score, confusion_matrix

In [7]:
# Attach Google Drive to the Colab filesystem (Colab-only step).
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
# Load the feature table: semicolon-separated file, first column used as the index.
CSV_PATH = '/content/drive/My Drive/ms_wind_curtailment_prediction/lagged_curtailment_target_features.csv'
df = pd.read_csv(CSV_PATH, sep=';', index_col=0)

In [12]:
# Preprocessing: replace missing values with the constant 0, then standardise
# every feature. The pipeline is (re)fitted by whoever calls fit_transform on it.
preprocessing_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler()),
]
preprocessor = Pipeline(steps=preprocessing_steps)

# Oversampler (seeded for reproducibility).
smote = SMOTE(random_state=13)

# Random forest hyper-parameters, collected in one place for easy tuning.
rf_params = dict(
    n_estimators=100,
    max_depth=5,
    min_samples_split=5,
    min_samples_leaf=1,
    max_features='sqrt',
    class_weight='balanced',  # class weights inversely proportional to class frequency
    random_state=42,
)
rf_classifier = RandomForestClassifier(**rf_params)

# Features X are every column except the target and 'level'; target y is 'redispatch'.
target_column = 'redispatch'
non_feature_columns = [target_column, 'level']
X = df.drop(columns=non_feature_columns)
y = df[target_column]

In [16]:
# cross-validation
#
# Walk-forward evaluation: for every TimeSeriesSplit fold the preprocessor,
# SMOTE and the random forest are refitted on the training part only, and
# recall / confusion matrix are collected on the following test window.
n_splits = 70
test_size = 24  # 24 samples x 15 min intervals = 6 h per test fold
tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)

window_size = 2  # look-back of the lagged redispatch feature: 2 samples = last 30 min
decision_threshold = 0.3  # predict redispatch when P(class 1) exceeds this value

recall_scores = []
conf_matrices = []

for train_index, test_index in tscv.split(X):

    # copies, because a feature column is added to both frames below
    X_train, X_test = X.iloc[train_index].copy(), X.iloc[test_index].copy()
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # skip folds whose test window contains no redispatch event (status 1):
    # recall is undefined without positives
    if y_test.sum() == 0:
        continue

    # importance of the last redispatch status for the training data:
    # sum of the target over the `window_size` rows *preceding* each row
    # (the row's own label is excluded by the shift, so nothing leaks).
    # Rows without history get 0, like an empty-window sum.
    # TODO: also incorporate the level
    last_redispatch_importance_train = (
        y_train.astype(float)
        .shift(1)
        .rolling(window_size, min_periods=1)
        .sum()
        .fillna(0)
    )
    # assign positionally so a non-unique index cannot trigger re-alignment
    X_train['last_redispatch_importance'] = last_redispatch_importance_train.to_numpy()

    # the future target is unknown at prediction time, so the last training
    # value is carried into the test window and decayed linearly to 0.
    # NOTE(review): because the window excludes the current row, this start
    # value does not include the very last training label - confirm intended.
    last_train_importance = last_redispatch_importance_train.iloc[-1]
    last_redispatch_importance_test = np.linspace(last_train_importance, 0, len(X_test))
    X_test['last_redispatch_importance'] = last_redispatch_importance_test

    # Preprocess and oversample only the training data
    X_train_preprocessed = preprocessor.fit_transform(X_train)
    X_train_preprocessed, y_train_preprocessed = smote.fit_resample(X_train_preprocessed, y_train)

    # Preprocess the test data with the statistics fitted on the training data
    X_test_preprocessed = preprocessor.transform(X_test)

    # Fit the model on the preprocessed training data
    rf_classifier.fit(X_train_preprocessed, y_train_preprocessed)

    # Predicted probability of the positive class. `predict` would return hard
    # 0/1 labels, which makes any threshold other than 0.5 a no-op.
    positive_column = list(rf_classifier.classes_).index(1)
    y_proba = rf_classifier.predict_proba(X_test_preprocessed)[:, positive_column]

    # Convert predicted probabilities to binary predictions
    y_pred_binary = (y_proba > decision_threshold).astype(int)

    print("printing last redispatch importance ", X_test['last_redispatch_importance'])

    recall = recall_score(y_test, y_pred_binary)
    recall_scores.append(recall)
    # fix the label set so every fold yields a 2x2 matrix; otherwise a fold with
    # a single class present returns 1x1 and silently broadcasts in the sum below
    conf_matrix = confusion_matrix(y_test, y_pred_binary, labels=[0, 1])
    conf_matrices.append(conf_matrix)

# evaluate
if recall_scores:
    print("Average Recall Score:", sum(recall_scores) / len(recall_scores))
    print("Average Confusion Matrix:", sum(conf_matrices) / len(conf_matrices))
else:
    # every fold was skipped, so there is nothing to average
    print("No test fold contained a redispatch event; nothing to evaluate.")

printing last redispatch importance  timestamp
2023-12-14 06:00:00    0.0
2023-12-14 06:15:00    0.0
2023-12-14 06:30:00    0.0
2023-12-14 06:45:00    0.0
2023-12-14 07:00:00    0.0
2023-12-14 07:15:00    0.0
2023-12-14 07:30:00    0.0
2023-12-14 07:45:00    0.0
2023-12-14 08:00:00    0.0
2023-12-14 08:15:00    0.0
2023-12-14 08:30:00    0.0
2023-12-14 08:45:00    0.0
2023-12-14 09:00:00    0.0
2023-12-14 09:15:00    0.0
2023-12-14 09:30:00    0.0
2023-12-14 09:45:00    0.0
2023-12-14 10:00:00    0.0
2023-12-14 10:15:00    0.0
2023-12-14 10:30:00    0.0
2023-12-14 10:45:00    0.0
2023-12-14 11:00:00    0.0
2023-12-14 11:15:00    0.0
2023-12-14 11:30:00    0.0
2023-12-14 11:45:00    0.0
Name: last_redispatch_importance, dtype: float64
printing last redispatch importance  timestamp
2023-12-14 12:00:00    2.000000
2023-12-14 12:15:00    1.913043
2023-12-14 12:30:00    1.826087
2023-12-14 12:45:00    1.739130
2023-12-14 13:00:00    1.652174
2023-12-14 13:15:00    1.565217
2023-12-14 13:30: