## Ensemble Method

In [1]:
# packages
import os
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import StackingClassifier

from sklearn.metrics import recall_score, confusion_matrix, accuracy_score, precision_score
from matplotlib import pyplot as plt

In [2]:
# Attach Google Drive to the Colab runtime; the dataset is read from this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Load the semicolon-separated feature table; the first CSV column becomes the index.
data_path = '/content/drive/My Drive/ms_wind_curtailment_prediction/lagged_curtailment_target_features.csv'
df = pd.read_csv(data_path, sep=';', index_col=0)

In [4]:
# Keep only the rows whose index label lies inside the study period.
# NOTE(review): this is a label-based slice; it presumably relies on the index
# holding sortable timestamps -- confirm (the CSV is loaded without date parsing).
start_date, end_date = '2021-01-01', '2023-11-14'
df = df.loc[slice(start_date, end_date)]

In [None]:
# Separate the target column from the features; 'level' is excluded from the features as well.
target_column = 'redispatch'
y = df[target_column]
X = df.drop(columns=[target_column, 'level'])

In [5]:
# Preprocessing: replace missing values with 0, then standardize every feature.
preprocessing_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler()),
    # ('feature_selection', SelectKBest(score_func=f_classif, k=20)),  # currently disabled
]
preprocessor = Pipeline(steps=preprocessing_steps)

# Oversampler used on the training data; seeded so resampling is repeatable.
smote = SMOTE(random_state=13)

In [6]:
# importance of the last redispatch status for the training data
def last_redispatch(y_train, X_test, window_size=2, decay_factor=0.9):
    """
    Build the 'last redispatch importance' feature for one train/test split.

    For training row i the feature is the sum of the ``window_size`` target values
    that precede it (rows ``i - window_size`` .. ``i - 1``). Row i itself is not
    included, and the window is shorter at the very start of the series, so the
    first row always gets 0. For the test rows the last training value is carried
    forward and damped: the k-th test row (k = 0, 1, ...) receives
    ``last_train_value * decay_factor ** k``.

    Parameters
    ----------
    y_train : pandas.Series or array-like
        Training targets in chronological order. NaN entries contribute 0.
    X_test : sized object
        Test data; only ``len(X_test)`` is used.
    window_size : int, default 2
        Number of preceding rows to sum (2 rows = the last 30 min at 15-min steps).
    decay_factor : float, default 0.9
        Multiplicative decay applied per test row.

    Returns
    -------
    tuple[list, list]
        ``(importance_train, importance_test)`` with ``len(y_train)`` and
        ``len(X_test)`` entries. With an empty ``y_train`` the test list is all zeros.

    Raises
    ------
    ValueError
        If ``window_size`` is negative.
    """
    if window_size < 0:
        raise ValueError("window_size must be non-negative")

    values = np.asarray(y_train)
    if values.dtype.kind == 'f':
        # mirror pandas' skip-NaN summation: missing targets count as 0
        values = np.where(np.isnan(values), 0, values)

    # prefix sums: prefix[i] == sum(values[:i]); every window sum is then a difference
    # of two prefix entries instead of a separate slice-and-sum per row
    prefix = np.concatenate(([0], np.cumsum(values)))
    row_positions = np.arange(len(values))
    window_starts = np.maximum(0, row_positions - window_size)
    last_redispatch_importance_train = (prefix[row_positions] - prefix[window_starts]).tolist()

    # carry the last training value into the test period with exponentially decreasing weight;
    # without any training rows there is nothing to carry forward, so fall back to 0
    last_value = last_redispatch_importance_train[-1] if last_redispatch_importance_train else 0
    last_redispatch_importance_test = [last_value * (decay_factor ** i) for i in range(len(X_test))]

    return last_redispatch_importance_train, last_redispatch_importance_test

In [8]:
# models
logistic_reg = LogisticRegression(max_iter=1000, C=0.1)  # C is the inverse regularization strength
knn_classifier = KNeighborsClassifier(n_neighbors=1)
#random_forest = RandomForestClassifier(max_depth=1, n_estimators=10, random_state=9)
# NOTE(review): the stacking model is trained on SMOTE-resampled data AND the
# meta-learner uses scale_pos_weight=14, so the positive class is up-weighted twice.
# The recorded run predicted class 1 for every sample (no predicted negatives) --
# confirm that this double correction is intended.
xgb_classifier = XGBClassifier(booster='gbtree', reg_alpha=6, eval_metric='logloss', gamma=5, min_child_weight = 3,
                               n_estimators=100, max_depth=2, learning_rate=0.01, objective='binary:logistic',
                               random_state=13, scale_pos_weight=14)

# stacking of heterogeneous weak learners with XGBoost as the final estimator;
# built once, because the configuration does not change between folds and
# fit() retrains the whole ensemble on every call
estimators_stacking = [
    ('lg', logistic_reg),
    ('knn', knn_classifier),
    ('rf', logistic_reg)  # placeholder: a second logistic regression while random_forest is disabled
]
final_estimator = xgb_classifier
stacking_classifier = StackingClassifier(estimators=estimators_stacking, final_estimator=final_estimator)

# cross-validation
n_splits = 70
test_size = 48  # (12h with 15 min intervals)
tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)

# Fix the label order so every fold yields a 2x2 confusion matrix. Without it, a fold
# in which only one class occurs produces a 1x1 matrix that silently broadcasts when
# the matrices are summed below.
class_labels = [0, 1]

recall_scores = []
accuracy_scores = []
conf_matrices = []
recall_train_scores = []
accuracy_train_scores = []
conf_train_matrices = []

for train_index, test_index in tscv.split(X):

    # copies, so that adding columns to a fold never touches X itself
    X_train, X_test = X.iloc[train_index].copy(), X.iloc[test_index].copy()
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # skip folds whose test window contains no redispatch with status 1
    if y_test.sum() == 0:
        continue

    # Optional lagged-target feature (currently disabled):
    # last_redispatch_importance_train, last_redispatch_importance_test = last_redispatch(y_train, X_test)
    # X_train['last_redispatch_importance'] = last_redispatch_importance_train
    # X_test['last_redispatch_importance'] = last_redispatch_importance_test

    # preprocess data: imputer/scaler are fitted on the training fold only,
    # and only the training fold is oversampled
    X_train_scaled = preprocessor.fit_transform(X_train, y_train)
    X_train_preprocessed, y_train_preprocessed = smote.fit_resample(X_train_scaled, y_train)
    X_test_preprocessed = preprocessor.transform(X_test)

    # fit model
    stacking_classifier.fit(X_train_preprocessed, y_train_preprocessed)

    # make predictions; the train predictions use the scaled but NOT resampled
    # training data so they can be compared against the original y_train
    y_pred = stacking_classifier.predict(X_test_preprocessed)
    y_pred_train = stacking_classifier.predict(X_train_scaled)

    # evaluate
    recall_scores.append(recall_score(y_test, y_pred))
    accuracy_scores.append(accuracy_score(y_test, y_pred))
    conf_matrices.append(confusion_matrix(y_test, y_pred, labels=class_labels))

    recall_train_scores.append(recall_score(y_train, y_pred_train))
    accuracy_train_scores.append(accuracy_score(y_train, y_pred_train))
    conf_train_matrices.append(confusion_matrix(y_train, y_pred_train, labels=class_labels))


# print evaluation results (averaged over the folds that were actually evaluated)
n_evaluated_folds = len(recall_scores)
if n_evaluated_folds == 0:
    # every fold was skipped; averaging would divide by zero
    print("No fold contained a positive redispatch sample; nothing to evaluate.")
else:
    print("Average Recall:", sum(recall_scores) / n_evaluated_folds)
    print("Average Accuracy:", sum(accuracy_scores) / n_evaluated_folds)
    print("Average Confusion Matrix:", sum(conf_matrices) / n_evaluated_folds)
    print("Average Recall (Train):", sum(recall_train_scores) / n_evaluated_folds)
    print("Average Accuracy (Train):", sum(accuracy_train_scores) / n_evaluated_folds)
    print("Average Confusion Matrix (Train):", sum(conf_train_matrices) / n_evaluated_folds)

Average Recall: 1.0
Average Accuracy: 0.3072916666666667
Average Confusion Matrix: [[ 0.   33.25]
 [ 0.   14.75]]
Average Recall (Train): 1.0
Average Accuracy (Train): 0.08297119759437158
Average Confusion Matrix (Train): [[    0.   90218.25]
 [    0.    8162.75]]


In [24]:
'''
# next: GRID SEARCH
- Decrease max_depth: Reduce the maximum depth of the trees. Smaller trees are less likely to overfit the training data.
- Increase min_child_weight (currently 3): this parameter specifies the minimum sum of instance weight (hessian) needed in a child. Increasing it makes the algorithm more conservative.
- Increase gamma: Gamma specifies the minimum loss reduction required to make a further partition on a leaf node of the tree. Increasing it makes the algorithm more conservative.
- Add more training data: If possible, adding more diverse training data can help the model generalize better.


### with two logistic regressions, no feature selection, learning rate 0.001 and the lagged target feature
Average Recall: 1.0
Average Accuracy: 0.3072916666666667
Average Confusion Matrix: [[ 0.   33.25]
 [ 0.   14.75]]
Average Recall (Train): 1.0
Average Accuracy (Train): 0.08297119759437158
Average Confusion Matrix (Train): [[    0.   90218.25]
 [    0.    8162.75]]


### with two logistic regressions, no feature selection and learning rate 0.001
Average Recall: 1.0
Average Accuracy: 0.3072916666666667
Average Confusion Matrix: [[ 0.   33.25]
 [ 0.   14.75]]
Average Recall (Train): 1.0
Average Accuracy (Train): 0.08297119759437158
Average Confusion Matrix (Train): [[    0.   90218.25]
 [    0.    8162.75]]

### with three logistic regressions, no feature selection and learning rate 0.001 (no stacking but bagging) (MAYBE AGAIN WITH THE LAGGED TARGET FEATURE?)
To shrink the effect of each tree in gradient boosting and reduce overfitting,
you can decrease the learning rate. The learning rate controls the contribution
of each tree to the final ensemble model. A lower learning rate means that each
tree makes a smaller adjustment to the predictions, which can help prevent
overfitting by allowing more trees to be added to the ensemble without
over-emphasizing the training data.
Average Recall: 1.0
Average Accuracy: 0.3072916666666667
Average Confusion Matrix: [[ 0.   33.25]
 [ 0.   14.75]]
Average Recall (Train): 1.0
Average Accuracy (Train): 0.08297119759437158
Average Confusion Matrix (Train): [[    0.   90218.25]
 [    0.    8162.75]]


### with two logistic regressions, no feature selection and learning rate 0.01

Average Recall: 0.39403953101708517
Average Accuracy: 0.6788194444444443
Average Confusion Matrix: [[24.5         8.75      ]
 [ 6.66666667  8.08333333]]

Average Recall (Train): 1.0
Average Accuracy (Train): 1.0
Average Confusion Matrix (Train): [[90197.25     0.  ]
 [    0.    8162.75]]


### with random forest and feature selection and learning rate 0.1
Average Recall: 0.29788155028093105
Average Accuracy: 0.6041666666666666
Average Confusion Matrix: [[25.25  8.  ]
 [11.    3.75]]
Average Recall (Train): 1.0
Average Accuracy (Train): 1.0
Average Confusion Matrix (Train): [[90197.25     0.  ]
 [    0.    8162.75]]
'''

' \n### with three lin reg, no feature selection and learning rate 0.001 (no stacking anymore)\nTo shrink the effect of each tree in gradient boosting and reduce overfitting, \nyou can decrease the learning rate. The learning rate controls the contribution \nof each tree to the final ensemble model. A lower learning rate means that each \ntree makes a smaller adjustment to the predictions, which can help prevent \noverfitting by allowing more trees to be added to the ensemble without \nover-emphasizing the training data\n\n\n\n### with twice lin reg, no feature selection and learning rate 0.01\nAverage Recall: 0.39403953101708517\nAverage Accuracy: 0.6788194444444443\nAverage Confusion Matrix: [[24.5         8.75      ]\n [ 6.66666667  8.08333333]]\nAverage Recall (Train): 1.0\nAverage Accuracy (Train): 1.0\nAverage Confusion Matrix (Train): [[90197.25     0.  ]\n [    0.    8162.75]]\n\n\n### with random forest and feature selection and learning rate 0.1\nAverage Recall: 0.29788155028

## Grid Search

In [24]:
from sklearn.model_selection import GridSearchCV

# NOTE: this cell reuses X_train, y_train and X_test exactly as the last iteration of
# the cross-validation loop left them; that cell has to be run first.

# preprocess data
X_train_scaled = preprocessor.fit_transform(X_train, y_train)
X_train_preprocessed, y_train_preprocessed = smote.fit_resample(X_train_scaled, y_train)
X_test_preprocessed = preprocessor.transform(X_test)

# models (the XGBoost hyperparameters under search are left at their defaults here)
logistic_reg = LogisticRegression(max_iter=1000, C=0.1)  # C is the inverse regularization strength
knn_classifier = KNeighborsClassifier(n_neighbors=1)
xgb_classifier = XGBClassifier(booster='gbtree', eval_metric='logloss', objective='binary:logistic',
                               random_state=13)

# create stacking instance
estimators_stacking = [
    ('lg', logistic_reg),
    ('knn', knn_classifier),
    ('rf', logistic_reg)  # placeholder: a second logistic regression while random_forest is disabled
]
final_estimator = xgb_classifier
stacking_classifier = StackingClassifier(estimators=estimators_stacking, final_estimator=final_estimator)

# Define the parameter grid.
# The searched estimator is the StackingClassifier, so the XGBoost hyperparameters
# must be addressed through the nested "final_estimator__<param>" names; the bare
# names are not parameters of StackingClassifier and are rejected by GridSearchCV.
# NOTE(review): 3**7 = 2187 combinations x 5 folds, each one fitting a full stacking
# ensemble -- consider a smaller grid or a randomized search.
param_grid = {
    'final_estimator__n_estimators': [50, 100, 150],
    'final_estimator__max_depth': [1, 2, 3],
    'final_estimator__learning_rate': [0.1, 0.01, 0.001],
    'final_estimator__gamma': [0, 2, 5],
    'final_estimator__reg_alpha': [0, 0.2, 0.5],
    'final_estimator__min_child_weight': [1, 2, 3],
    'final_estimator__scale_pos_weight': [10, 12, 14]
}

# Create GridSearchCV instance
# NOTE(review): SMOTE has already been applied to the data before GridSearchCV splits
# it, so synthetic samples derived from a fold's training rows can end up in its
# validation part, and cv=5 does not respect the time order. The reported recall is
# therefore likely optimistic -- confirm before relying on it.
grid_search = GridSearchCV(estimator=stacking_classifier, param_grid=param_grid, cv=5, scoring='recall', n_jobs=-1)

# Fit GridSearchCV to the data
grid_search.fit(X_train_preprocessed, y_train_preprocessed)

# Get the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)