##Ensemble Method

In [10]:
# packages
import os
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import StackingClassifier

from sklearn.metrics import confusion_matrix, precision_score, f1_score
from matplotlib import pyplot as plt

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

In [2]:
# mount your google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
df = pd.read_csv('/content/drive/My Drive/ms_wind_curtailment_prediction/lagged_curtailment_target_features.csv', sep = ';', index_col=0)

In [4]:
# Drop the rows to get appropriate test data
start_date = '2022-01-01'
end_date = '2022-12-31'
df = df.loc[start_date:end_date]

In [5]:
# features X and target y
X = df.drop(['redispatch', 'level'], axis = 1)
y = df['redispatch']

In [6]:
# preprocessing pipelines
preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler())#,
   # ('feature_selection', SelectKBest(score_func=f_classif, k = 20))
])

smote = SMOTE(random_state=13)

In [7]:
# importance of the last redispatch status for the training data
def last_redispatch(y_train, X_test):
    """
    tbd.
    """
    window_size = 2  #last 30 min
    last_redispatch_importance_train = []
    for i in range(len(y_train)):
        window_start = max(0, i - window_size)
        window_end = i
        importance = y_train.iloc[window_start:window_end].sum() # also incorporate the level ???
        last_redispatch_importance_train.append(importance)
    # forward fill the last value of 'last_redispatch_importance' for the test data with exponentially decreasing values
    decay_factor = 0.9
    last_redispatch_importance_test = [last_redispatch_importance_train[-1] * (decay_factor ** i) for i in range(len(X_test))]

    return last_redispatch_importance_train, last_redispatch_importance_test

In [24]:
# models
logistic_reg = LogisticRegression(max_iter=1000, C=0.1)
knn_classifier = KNeighborsClassifier(n_neighbors=1)
random_forest = RandomForestClassifier(max_depth=1, n_estimators=10, random_state=9)
xgb_classifier = XGBClassifier(booster='gbtree', reg_alpha=7, eval_metric='logloss', gamma = 5,
                              n_estimators=200, max_depth=6, learning_rate=0.1, objective='binary:logistic', random_state = 13, scale_pos_weight=20)


# cross-validation
n_splits = 70
test_size = 48 #(12h with 15 min intervalls)
tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)

precision_scores = []
f1_scores = []
conf_matrices = []
precision_train_scores = []
f1_train_scores = []
conf_train_matrices = []

for train_index, test_index in tscv.split(X):

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    X_train = X_train.copy()
    X_test = X_test.copy()

    # having at least one instance of redispatch 1 or 0 in test to avoid ill-defined precision/f1 scores
    if y_test.sum() == 0 or y_test.sum() == 1:
        continue

    # add importance of the last redispatch status
    last_redispatch_importance_train, last_redispatch_importance_test = last_redispatch(y_train, X_test)
    X_train['last_redispatch_importance'] = last_redispatch_importance_train
    X_test['last_redispatch_importance'] = last_redispatch_importance_test

    # preprocess data
    X_train_scaled = preprocessor.fit_transform(X_train, y_train)
    X_train_preprocessed, y_train_preprocessed = smote.fit_resample(X_train_scaled, y_train)
    X_test_preprocessed = preprocessor.transform(X_test)

    # stacking of heterogenous weak learners and xgboost
    estimators_stacking = [
        ('lg', logistic_reg),
        ('knn', knn_classifier),
        ('rf', random_forest)
    ]

    # Final estimator with XGBoost
    final_estimator = xgb_classifier

    stacking_classifier = StackingClassifier(estimators=estimators_stacking, final_estimator=final_estimator)

    # fit model
    stacking_classifier.fit(X_train_preprocessed, y_train_preprocessed)

    # Custom threshold
    custom_threshold = 0.3

    # Make predictions with probability estimates
    y_prob = stacking_classifier.predict_proba(X_test_preprocessed)
    y_prob_train = stacking_classifier.predict_proba(X_train_scaled)

    # Convert probability estimates to binary predictions based on the threshold
    y_pred = (y_prob[:, 1] > custom_threshold).astype(int)
    y_pred_train = (y_prob_train[:, 1] > custom_threshold).astype(int)

    #make predictions
    #y_pred = stacking_classifier.predict(X_test_preprocessed)
    #y_pred_train = stacking_classifier.predict(X_train_scaled)

    #print(y_test.value_counts())
    #print("last redispatch importance: ", X_test['last_redispatch_importance'])

    # evaluate
    precision_scores.append(precision_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred))
    conf_matrices.append(confusion_matrix(y_test, y_pred))

    precision_train_scores.append(precision_score(y_train, y_pred_train))
    f1_train_scores.append(f1_score(y_train, y_pred_train))
    conf_train_matrices.append(confusion_matrix(y_train, y_pred_train))


# evaluation results
print("Average Scores:")
print("Precision:", sum(precision_scores) / len(precision_scores))
print("F1-Scores:", sum(f1_scores) / len(f1_scores))
print("Confusion Matrix:", sum(conf_matrices) / len(conf_matrices))
print("Precision (Train):", sum(precision_train_scores) / len(precision_train_scores))
print("F1-Scores (Train):", sum(f1_train_scores) / len(f1_train_scores))
print("Confusion Matrix (Train):", sum(conf_train_matrices) / len(conf_train_matrices))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Average Scores:
Precision: 0.5856481481481481
F1-Scores: 0.23217916261198646
Confusion Matrix: [[17.875       0.5       ]
 [25.33333333  4.29166667]]
Precision (Train): 0.9554011778720337
F1-Scores (Train): 0.9762928118500356
Confusion Matrix (Train): [[2.75178333e+04 2.54000000e+02]
 [9.00000000e+00 5.38516667e+03]]


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
'''
xgb_classifier = XGBClassifier(n_estimators=200, scale_pos_weight=20) and random forest
Average Scores:
Precision: 0.5856481481481481
F1-Scores: 0.23217916261198646
Confusion Matrix: [[17.875       0.5       ]
 [25.33333333  4.29166667]]
Precision (Train): 0.9554011778720337
F1-Scores (Train): 0.9762928118500356
Confusion Matrix (Train): [[2.75178333e+04 2.54000000e+02]
 [9.00000000e+00 5.38516667e+03]]

xgb_classifier = XGBClassifier(n_estimators=200, scale_pos_weight=20) AND threshold 0.3
Average Scores:
Precision: 0.5972222222222222
F1-Scores: 0.21781384973677995
Confusion Matrix: [[18.     0.375]
 [26.     3.625]]
Precision (Train): 0.9624964642630119
F1-Scores (Train): 0.9808142947701771
Confusion Matrix (Train): [[2.75613750e+04 2.10458333e+02]
 [6.66666667e-01 5.39350000e+03]]


xgb_classifier = XGBClassifier(n_estimators=200, scale_pos_weight=20)
Average Scores:
Precision: 0.5972222222222222
F1-Scores: 0.21781384973677995
Confusion Matrix: [[18.     0.375]
 [26.     3.625]]
Precision (Train): 0.9647228404390352
F1-Scores (Train): 0.9815330893810447
Confusion Matrix (Train): [[2.75747917e+04 1.97041667e+02]
 [5.50000000e+00 5.38866667e+03]]

xgb_classifier = XGBClassifier(scale_pos_weight=13)
Average Scores:
Precision: 0.5972222222222222
F1-Scores: 0.21641622569764649
Confusion Matrix: [[18.          0.375]
 [26.04166667  3.58333333]]
Precision (Train): 0.9669119531330703
F1-Scores (Train): 0.9821783677321881
Confusion Matrix (Train): [[2.75878333e+04 1.84000000e+02]
 [1.09583333e+01 5.38320833e+03]]


xgb_classifier = XGBClassifier(reg_alpha=7, gamma = 5, max_depth=6, scale_pos_weight=3)
Average Scores:
Precision: 0.5995370370370371
F1-Scores: 0.20899141419020648
Confusion Matrix: [[18.16666667  0.20833333]
 [26.33333333  3.29166667]]
Precision (Train): 0.9736545539046618
F1-Scores (Train): 0.9717621896797972
Confusion Matrix (Train): [[27630.70833333   141.125     ]
 [  162.125       5232.04166667]]


xgb_classifier = XGBClassifier(booster='gbtree', reg_alpha=5, eval_metric='logloss', gamma = 3,
                              n_estimators=100, max_depth=6, learning_rate=0.1, objective='binary:logistic')
Average Scores:
Precision: 0.5995370370370371
F1-Scores: 0.20899141419020648
Confusion Matrix: [[18.16666667  0.20833333]
 [26.33333333  3.29166667]]
Precision (Train): 0.9736545539046618
F1-Scores (Train): 0.9717621896797972
Confusion Matrix (Train): [[27630.70833333   141.125     ]
 [  162.125       5232.04166667]]
'''

## Random Grid Search

In [None]:
'''
# train/test data
cutoff_time = '2023-07-01'  # Define your cutoff time
train = df[df.index < cutoff_time]
test = df[df.index >= cutoff_time]
X_train = train.drop(columns=['redispatch'])
y_train = train['redispatch']
X_test = test.drop(columns=['redispatch'])
y_test = test['redispatch']

# preprocess data
X_train_scaled = preprocessor.fit_transform(X_train, y_train)
X_train_preprocessed, y_train_preprocessed = smote.fit_resample(X_train_scaled, y_train)
X_test_preprocessed = preprocessor.transform(X_test)
'''

X = df.drop(['redispatch', 'level'], axis = 1)
y = df['redispatch']

# Define base estimators
logistic_reg = LogisticRegression(max_iter=1000, C=0.1)
knn_classifier = KNeighborsClassifier(n_neighbors=1)
random_forest = RandomForestClassifier(max_depth=1, n_estimators=10, random_state=9)

# Define final estimator
xgb_classifier = XGBClassifier(booster='gbtree', eval_metric='logloss', objective='binary:logistic',
                               random_state=13)

# Define the stacking classifier
estimators_stacking = [
    ('lg', logistic_reg),
    ('knn', knn_classifier),
    ('rf', logistic_reg)
]

stacking_classifier = StackingClassifier(estimators=estimators_stacking, final_estimator=xgb_classifier)

stacking_pipeline = Pipeline([
    ('stacking', stacking_classifier)
])

# Define the parameter distributions for random search
param_distributions = {
    'stacking__final_estimator__n_estimators': randint(50, 100, 150),
    'stacking__final_estimator__max_depth': randint(1, 2, 3),
    'stacking__final_estimator__learning_rate': [0.1, 0.01, 0.001],
    'stacking__final_estimator__gamma': [0, 1, 3],
#    'stacking__final_estimator__reg_alpha': [0, 0.2, 0.5],
    'stacking__final_estimator__scale_pos_weight': [10, 12, 14]
}

# Create RandomizedSearchCV instance
random_search = RandomizedSearchCV(estimator=stacking_pipeline, param_distributions=param_distributions,
                                   n_iter=100, cv=5, scoring='precision', n_jobs=-1, random_state=42)

# Fit RandomizedSearchCV to the data
random_search.fit(X, y)

# Get the best parameters and best score
print("Best Parameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

KeyboardInterrupt: 

In [None]:
'''
# next: GRID SEARCH
- Decrease max_depth: Reduce the maximum depth of the trees. Smaller trees are less likely to overfit the training data.
- Increase min_child_weight = 3: This parameter specifies the minimum sum of instance weight (hessian) needed in a child. Increasing it makes the algorithm more conservative.
- Increase gamma: Gamma specifies the minimum loss reduction required to make a further partition on a leaf node of the tree. Increasing it makes the algorithm more conservative.
- Add more training data: If possible, adding more diverse training data can help the model generalize better.


### with two lin reg, no feature selection and learning rate 0.001 and target feature thing
Average Recall: 1.0
Average Accuracy: 0.3072916666666667
Average Confusion Matrix: [[ 0.   33.25]
 [ 0.   14.75]]
Average Recall (Train): 1.0
Average Accuracy (Train): 0.08297119759437158
Average Confusion Matrix (Train): [[    0.   90218.25]
 [    0.    8162.75]]


### with two lin reg, no feature selection and learning rate 0.001
Average Recall: 1.0
Average Accuracy: 0.3072916666666667
Average Confusion Matrix: [[ 0.   33.25]
 [ 0.   14.75]]
Average Recall (Train): 1.0
Average Accuracy (Train): 0.08297119759437158
Average Confusion Matrix (Train): [[    0.   90218.25]
 [    0.    8162.75]]

### with three lin reg, no feature selection and learning rate 0.001 (no stacking but bagging) (MAYBE AGAIN WITH FEATURE LAGGED TARGET?)
To shrink the effect of each tree in gradient boosting and reduce overfitting,
you can decrease the learning rate. The learning rate controls the contribution
of each tree to the final ensemble model. A lower learning rate means that each
tree makes a smaller adjustment to the predictions, which can help prevent
overfitting by allowing more trees to be added to the ensemble without
over-emphasizing the training data.
Average Recall: 1.0
Average Accuracy: 0.3072916666666667
Average Confusion Matrix: [[ 0.   33.25]
 [ 0.   14.75]]
Average Recall (Train): 1.0
Average Accuracy (Train): 0.08297119759437158
Average Confusion Matrix (Train): [[    0.   90218.25]
 [    0.    8162.75]]


### with twice lin reg, no feature selection and learning rate 0.01

Average Recall: 0.39403953101708517
Average Accuracy: 0.6788194444444443
Average Confusion Matrix: [[24.5         8.75      ]
 [ 6.66666667  8.08333333]]

Average Recall (Train): 1.0
Average Accuracy (Train): 1.0
Average Confusion Matrix (Train): [[90197.25     0.  ]
 [    0.    8162.75]]


### with random forest and feature selection and learning rate 0.1
Average Recall: 0.29788155028093105
Average Accuracy: 0.6041666666666666
Average Confusion Matrix: [[25.25  8.  ]
 [11.    3.75]]
Average Recall (Train): 1.0
Average Accuracy (Train): 1.0
Average Confusion Matrix (Train): [[90197.25     0.  ]
 [    0.    8162.75]]
'''

' \n### with three lin reg, no feature selection and learning rate 0.001 (no stacking anymore)\nTo shrink the effect of each tree in gradient boosting and reduce overfitting, \nyou can decrease the learning rate. The learning rate controls the contribution \nof each tree to the final ensemble model. A lower learning rate means that each \ntree makes a smaller adjustment to the predictions, which can help prevent \noverfitting by allowing more trees to be added to the ensemble without \nover-emphasizing the training data\n\n\n\n### with twice lin reg, no feature selection and learning rate 0.01\nAverage Recall: 0.39403953101708517\nAverage Accuracy: 0.6788194444444443\nAverage Confusion Matrix: [[24.5         8.75      ]\n [ 6.66666667  8.08333333]]\nAverage Recall (Train): 1.0\nAverage Accuracy (Train): 1.0\nAverage Confusion Matrix (Train): [[90197.25     0.  ]\n [    0.    8162.75]]\n\n\n### with random forest and feature selection and learning rate 0.1\nAverage Recall: 0.29788155028