##**Extra Trees Classifier**

In [1]:
# packages
import os
import numpy as np
import pandas as pd
import joblib

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report, confusion_matrix, precision_score, f1_score
from sklearn.model_selection import TimeSeriesSplit
from matplotlib import pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf

In [2]:
# mount your google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# searching for files, load data and convert index to datetime type
def search_file(directory, filename):
    for root, dirs, files in os.walk(directory):
        if filename in files:
            return os.path.join(root, filename)
    return None

search_directory = '/content/drive/My Drive'
file_name = 'lagged_curtailment_target_features_etc.csv'
file_path = search_file(search_directory, file_name)

df_lagged = pd.read_csv(file_path, sep = ';', index_col=0)
df_lagged.index = pd.to_datetime(df_lagged.index)

In [5]:
# get desired df size
start_date = '2022-01-01'
end_date = '2023-06-30'
df_lagged = df_lagged.loc[start_date:end_date]

In [6]:
smote = SMOTE(k_neighbors=1, random_state=42)

# define features X and target y
X = df_lagged.drop(['redispatch', 'level'], axis = 1)
y = df_lagged['redispatch']

# hyperparameters
params = {
    'max_depth': 10,
    'min_samples_split': 5,
    'min_samples_leaf': 5,
    'max_features': 'log2',
    'n_estimators': 500,
    'random_state': 42,
    'class_weight': 'balanced',
    'bootstrap': True
}

# time series cross-validation
n_splits = 10
gap = 48  # 12 hour difference between train and test sets
tscv = TimeSeriesSplit(n_splits=n_splits, gap=gap)
threshold = 0.5

train_f1_scores = []
train_precision_scores = []
test_f1_scores = []
test_precision_scores = []

# iterate over each fold
for fold, (train_index, test_index) in enumerate(tscv.split(X), 1):
    print(f"Training on fold {fold}/{n_splits}")
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

    model = ExtraTreesClassifier(**params)
    model.fit(X_train_balanced, y_train_balanced)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # evaluation
    train_f1 = f1_score(y_train, y_train_pred, average='binary', zero_division=1)
    train_precision = precision_score(y_train, y_train_pred, average='binary', zero_division=1)
    test_f1 = f1_score(y_test, y_test_pred, average='binary', zero_division=1)
    test_precision = precision_score(y_test, y_test_pred, average='binary', zero_division=1)

    train_f1_scores.append(train_f1)
    train_precision_scores.append(train_precision)
    test_f1_scores.append(test_f1)
    test_precision_scores.append(test_precision)

    confusion_matrix = False
    if confusion_matrix:
      cm = confusion_matrix(y_test, y_test_pred)
      print(f"Confusion Matrix (Fold {fold}):")
      print(cm)
      print("\n")

avg_train_f1 = np.mean(train_f1_scores)
avg_train_precision = np.mean(train_precision_scores)
avg_test_f1 = np.mean(test_f1_scores)
avg_test_precision = np.mean(test_precision_scores)

# Print the results
print("Average Train F1 Score:", avg_train_f1)
print("Average Train Precision:", avg_train_precision)
print("\nAverage Test F1 Score:", avg_test_f1)
print("Average Test Precision:", avg_test_precision)

Training on fold 1/10
Training on fold 2/10
Training on fold 3/10
Training on fold 4/10
Training on fold 5/10
Training on fold 6/10
Training on fold 7/10
Training on fold 8/10
Training on fold 9/10
Training on fold 10/10
Average Train F1 Score: 0.6811096847995011
Average Train Precision: 0.5594604793765078

Average Test F1 Score: 0.3739929844817952
Average Test Precision: 0.3512463757589768


In [24]:
folder_path = '/content/drive/My Drive/wind_curtailment_prediction'

if not os.path.exists(folder_path):
    os.makedirs(folder_path)
    print("Folder created successfully.")
else:
    print("Folder already exists.")

Folder already exists.


In [25]:
# safe XGBoost classifier
joblib.dump(model, '/content/drive/My Drive/wind_curtailment_prediction/extra_trees_classifier.pkl')

['/content/drive/My Drive/wind_curtailment_prediction/extra_trees_classifier.pkl']

**Extra: Grid Search**

In [None]:
# preprocess data
X_scaled = preprocessor.fit_transform(X)
X_preprocessed, y_preprocessed = smote.fit_resample(X_scaled, y)

# parameter grid
param_grid = {
    'n_estimators': [200, 250, 300],
    'max_depth': [None, 1, 2],
    'min_samples_leaf': [2, 3, 4],
    'min_samples_split': [1, 2, 3]
}

# timeseries split
test_size = 96
tscv = TimeSeriesSplit(test_size=test_size)

# XGBClassifier and GridSearchCV
extra_trees_clf = ExtraTreesClassifier(max_features='sqrt', random_state=42)
grid_search = GridSearchCV(estimator=extra_trees_clf, param_grid=param_grid, cv=tscv, scoring='precision', n_jobs=-1)

# fit
grid_search.fit(X_preprocessed, y_preprocessed)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

135 fits failed out of a total of 405.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
135 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py", line 340, in fit
    self._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 97, in validate_parameter_constraints
    raise InvalidParameterError(


Best Parameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
Best Score: 1.0
