## Baseline binary classification model

In [9]:
# packages
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import confusion_matrix, precision_score, f1_score

In [3]:
# mount your google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**EDA**

In [4]:
df = pd.read_csv('/content/drive/My Drive/ms_wind_curtailment_prediction/curtailment_target_features.csv', sep = ';', index_col=0)

In [6]:
df_reset_index = df.reset_index(drop=True)
#sns.pairplot(df_reset_index, diag_kind='kde')

**The baseline model simply assigns a positive redispatch status when the wind speed exceeds a specific value**

In [7]:
# selects only the feature wind speed and replace NaN values with the mean
df = df[['redispatch', 'wind_gust_max_m/s']]
mean_value = df['wind_gust_max_m/s'].mean()
df['wind_gust_max_m/s'].fillna(value=mean_value, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['wind_gust_max_m/s'].fillna(value=mean_value, inplace=True)


In [8]:
df.groupby('redispatch').describe() # rows with wind speed above 9 m/s are considered as redispatch

Unnamed: 0_level_0,wind_gust_max_m/s,wind_gust_max_m/s,wind_gust_max_m/s,wind_gust_max_m/s,wind_gust_max_m/s,wind_gust_max_m/s,wind_gust_max_m/s,wind_gust_max_m/s
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
redispatch,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0.0,131810.0,6.583403,3.87832,0.0,3.6,6.0,8.8,30.2
1.0,8398.0,9.233684,3.969521,0.3,6.35,9.2,11.65,26.1


In [None]:
# Drop the rows to get appropriate test data
# df.drop(df.index[-4416:], inplace=True)

**Cross validation in Time Series**

In [11]:
# get desired df size
start_date = '2021-07-01'
end_date = '2023-11-30'
df_cv = df.loc[start_date:end_date]

# features X and target y
X = df['wind_gust_max_m/s']
y = df['redispatch']

share_minority = y.value_counts().get(1, 0)/len(y)

# cross-validation
n_splits = 500 #stops at 10 valid folds
test_size = 96 #(24 - 6h; 48 - 12h; 96 - 24h with 15 min intervalls)
tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)

precision_scores = []
f1_scores = []
conf_matrices = []
precision_train_scores = []
f1_train_scores = []
conf_train_matrices = []

total_folds = 0
valid_folds = 0
for train_index, test_index in tscv.split(X):
    total_folds += 1
    print(f"Fold {total_folds}")

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Check if stratification condition is met and fold is within the first 10
    if abs(y_test.sum()/len(y_test) - share_minority) < 0.05:
      valid_folds += 1  # Increment the valid folds counter
      print(f"Training on valid fold {valid_folds}")

      # Make predictions on test data
      y_pred = [1 if wind_gust_max > 9 else 0 for wind_gust_max in X_test]
      y_pred_train = [1 if wind_gust_max > 9 else 0 for wind_gust_max in X_train]

      # evaluate
      precision_scores.append(precision_score(y_test, y_pred))
      f1_scores.append(f1_score(y_test, y_pred))
      conf_matrices.append(confusion_matrix(y_test, y_pred))
      precision_train_scores.append(precision_score(y_train, y_pred_train))
      f1_train_scores.append(f1_score(y_train, y_pred_train))
      conf_train_matrices.append(confusion_matrix(y_train, y_pred_train))

      if valid_folds == 10:
        break

# print evaluation
print("Average Scores:")
print("Precision:", np.array(precision_scores).mean())
print("F1-Scores:", np.array(f1_scores).mean())
average_conf_matrix = np.round(sum(conf_matrices) / len(conf_matrices)).astype(int)
print("Average Confusion Matrix:")
print(f"{'True Negative':<20} {'False Positive':<20}")
print(f"{average_conf_matrix[0][0]:<20} {average_conf_matrix[0][1]:<20}")
print(f"{'False Negative':<20} {'True Positive':<20}")
print(f"{average_conf_matrix[1][0]:<20} {average_conf_matrix[1][1]:<20}")

print("Precision (Train):", np.array(precision_train_scores).mean())
print("F1-Scores (Train):", np.array(f1_train_scores).mean())
average_conf_matrix_train = np.round(sum(conf_train_matrices) / len(conf_train_matrices)).astype(int)
print("Average Confusion Matrix (Train):")
print(f"{'True Negative':<20} {'False Positive':<20}")
print(f"{average_conf_matrix_train[0][0]:<20} {average_conf_matrix_train[0][1]:<20}")
print(f"{'False Negative':<20} {'True Positive':<20}")
print(f"{average_conf_matrix_train[1][0]:<20} {average_conf_matrix_train[1][1]:<20}")

Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Training on valid fold 1
Fold 8
Fold 9
Fold 10
Fold 11
Fold 12
Fold 13
Fold 14
Fold 15
Fold 16
Fold 17
Fold 18
Training on valid fold 2
Fold 19
Fold 20
Fold 21
Fold 22
Fold 23
Fold 24
Fold 25
Fold 26
Fold 27
Fold 28
Fold 29
Fold 30
Fold 31
Fold 32
Fold 33
Fold 34
Fold 35
Fold 36
Fold 37
Fold 38
Fold 39
Training on valid fold 3


  _warn_prf(average, modifier, msg_start, len(result))


Fold 40
Fold 41
Fold 42
Fold 43
Fold 44
Fold 45
Fold 46
Fold 47
Fold 48
Fold 49
Fold 50
Training on valid fold 4
Fold 51
Fold 52
Fold 53
Fold 54
Training on valid fold 5
Fold 55
Fold 56
Fold 57
Fold 58
Fold 59
Training on valid fold 6
Fold 60
Fold 61
Fold 62
Fold 63
Fold 64
Fold 65
Fold 66
Fold 67
Fold 68
Fold 69
Fold 70
Fold 71
Fold 72
Fold 73
Fold 74
Fold 75
Fold 76
Fold 77
Fold 78
Fold 79
Fold 80
Fold 81
Fold 82
Fold 83
Fold 84
Fold 85
Fold 86
Fold 87
Fold 88
Fold 89
Fold 90
Fold 91
Fold 92
Fold 93
Fold 94
Fold 95
Fold 96
Fold 97
Fold 98
Fold 99
Fold 100
Fold 101
Fold 102
Fold 103
Fold 104
Fold 105
Fold 106
Fold 107
Fold 108
Fold 109
Fold 110
Fold 111
Fold 112
Fold 113
Fold 114
Fold 115
Fold 116
Fold 117
Fold 118
Fold 119
Fold 120
Fold 121
Fold 122
Fold 123
Fold 124
Fold 125
Fold 126
Fold 127
Training on valid fold 7
Fold 128
Fold 129
Fold 130
Fold 131
Fold 132
Training on valid fold 8
Fold 133
Training on valid fold 9
Fold 134
Training on valid fold 10
Average Scores:
Precision: 0.