## Baseline binary classification model

In [58]:
# packages
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import recall_score, confusion_matrix, accuracy_score, precision_score

In [2]:
# Mount Google Drive at /content/drive (requires the google.colab package) so that
# files stored on Drive can be read through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**EDA**

In [42]:
# Load the semicolon-separated target/feature table from the mounted Drive;
# the first CSV column becomes the DataFrame index.
df = pd.read_csv(
    '/content/drive/My Drive/ms_wind_curtailment_prediction/curtailment_target_features.csv',
    sep=';',
    index_col=0,
)

In [None]:
# Pairwise plots of the columns of df, with kernel density estimates on the diagonal.
# The original index is discarded first (drop=True) so plotting runs on a default integer index.
df_reset_index = df.reset_index(drop=True)
sns.pairplot(df_reset_index, diag_kind='kde')

**The baseline model simply assigns a positive redispatch status whenever the wind speed exceeds a fixed threshold (6 m/s)**

In [45]:
# Keep only the target and the wind-speed feature. The explicit .copy() makes the
# result an independent frame, so the imputation below is guaranteed to write to it
# (no SettingWithCopyWarning, and correct under pandas Copy-on-Write).
df = df[['redispatch', 'wind_speed_m/s']].copy()

# Replace missing wind speeds with the column mean. Assigning the filled column back
# (instead of calling fillna(..., inplace=True) on the selected column) avoids chained
# assignment, which pandas may apply to a temporary object and silently discard.
mean_value = df['wind_speed_m/s'].mean()
df['wind_speed_m/s'] = df['wind_speed_m/s'].fillna(mean_value)

In [47]:
# Wind-speed summary statistics for each redispatch class.
# The baseline treats rows with wind speed above 6 m/s as redispatch.
df.groupby('redispatch').describe()

Unnamed: 0_level_0,wind_speed_m/s,wind_speed_m/s,wind_speed_m/s,wind_speed_m/s,wind_speed_m/s,wind_speed_m/s,wind_speed_m/s,wind_speed_m/s
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
redispatch,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0.0,131810.0,4.516849,2.62367,0.0,2.5,4.1,6.0,19.75
1.0,8398.0,6.202194,2.703884,0.2,4.2,6.106122,7.8,18.55


In [48]:
# Remove the last rows of the frame (reserved as test data, per the original note).
# Slicing by position removes exactly the trailing rows even if index labels repeat,
# whereas a label-based drop would also remove earlier rows sharing those labels.
# It also avoids mutating a derived frame in place.
# NOTE(review): 4416 rows would be 46 days at 15-minute resolution - confirm the intended span.
N_HOLDOUT_ROWS = 4416
df = df.iloc[:-N_HOLDOUT_ROWS].copy()

**Cross validation in Time Series**

In [60]:
# Time-series cross-validation of the wind-speed threshold baseline.
n_splits = 16
test_size = 24  # rows per test fold (6 h at 15-minute intervals)
tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)

# Wind speed (m/s) above which the baseline predicts redispatch.
wind_speed_threshold = 6

# features X and target y
X = df['wind_speed_m/s']
y = df['redispatch']

recall_scores = []
precision_scores = []
accuracy_scores = []
conf_matrices = []

for train_index, test_index in tscv.split(X):
    # The baseline has no fitting step, so only the test slice of each fold is used.
    X_test = X.iloc[test_index]
    y_test = y.iloc[test_index]

    # Skip folds without a single positive redispatch sample in the test data.
    if y_test.sum() == 0:
        continue

    # Vectorized threshold rule: 1 where wind speed exceeds the threshold, else 0.
    y_pred = (X_test > wind_speed_threshold).astype(int)

    recall_scores.append(recall_score(y_test, y_pred))
    # zero_division=0 keeps the score at 0 for folds without positive predictions,
    # but without emitting an undefined-metric warning.
    precision_scores.append(precision_score(y_test, y_pred, zero_division=0))
    accuracy_scores.append(accuracy_score(y_test, y_pred))
    # Fixing the labels guarantees a 2x2 matrix for every fold, so the matrices can be
    # summed element-wise even when a fold contains only one class.
    conf_matrices.append(confusion_matrix(y_test, y_pred, labels=[0, 1]))

# evaluate
n_evaluated_folds = len(recall_scores)
if n_evaluated_folds == 0:
    # Every fold was skipped; averaging would divide by zero.
    print("No test fold contained a positive redispatch sample; nothing to evaluate.")
else:
    print("Average Recall:", sum(recall_scores) / n_evaluated_folds)
    print("Average Precision:", sum(precision_scores) / n_evaluated_folds)
    print("Average Accuracy:", sum(accuracy_scores) / n_evaluated_folds)
    print("Average Confusion Matrix:", sum(conf_matrices) / n_evaluated_folds)

Average Recall: 0.5454545454545454
Average Precision: 0.13627450980392156
Average Accuracy: 0.5138888888888888
Average Confusion Matrix: [[9.66666667 9.66666667]
 [2.         2.66666667]]


  _warn_prf(average, modifier, msg_start, len(result))
