##**Binary Classification with XGBoost**



In [1]:
# load packages
import os
import numpy as np
import pandas as pd
import joblib

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

from sklearn.metrics import confusion_matrix, precision_score, f1_score
from matplotlib import pyplot as plt
from sklearn.model_selection import GridSearchCV

In [2]:
# mount your google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [17]:
# searching for files, load data and convert index to datetime type
def search_file(directory, filename):
    for root, dirs, files in os.walk(directory):
        if filename in files:
            return os.path.join(root, filename)
    return None

search_directory = '/content/drive/My Drive'
file_name = 'lagged_curtailment_target_features_extended.csv'
file_path = search_file(search_directory, file_name)

df_lagged = pd.read_csv(file_path, sep = ';', index_col=0)
df_lagged.index = pd.to_datetime(df_lagged.index)

In [18]:
# get desired df size
start_date = '2021-07-01'
end_date = '2023-11-30'
df_lagged = df_lagged.loc[start_date:end_date]

In [11]:
# impute, scale pipeline and smote (for class imbalance)
preprocessor = Pipeline([
    ('scaler', StandardScaler())
])

smote = SMOTE(k_neighbors=1, random_state=42)

X = df_lagged.drop(['redispatch', 'level'], axis = 1)
y = df_lagged['redispatch']

In [12]:
share_minority = y.value_counts().get(1, 0)/len(y)
print(share_minority)

0.1367589837491417


**K-fold cross-validation smoting train data**

In [23]:
# hyperparameters
params = {
    'max_depth': 3,
    'min_child_weight': 10,
    'gamma': 0.2,
    'subsample': 0.5,
    'colsample_bytree': 0.5,
    'booster': 'gbtree',
    'reg_alpha': 4,
    'reg_lambda': 4,
    'n_estimators': 100,
    'learning_rate': 0.1,
    'objective': 'binary:logistic',
    'random_state': 42,
    'verbosity': 0
}

# cross-validation
n_splits = 10
gap = 48  # 12 hour difference between train and test sets
tscv = TimeSeriesSplit(n_splits=n_splits, gap=gap)

precision_scores = []
f1_scores = []
conf_matrices = []
precision_train_scores = []
f1_train_scores = []
conf_train_matrices = []

for fold, (train_index, test_index) in enumerate(tscv.split(X), 1):
    print(f"Training on fold {fold}/{n_splits}")

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    xgboost_class = XGBClassifier(**params)
    xgboost_class.fit(X_train_resampled, y_train_resampled)

    threshold = 0.5
    y_pred_proba = xgboost_class.predict_proba(X_test)[:, 1]
    y_pred = (y_pred_proba >= threshold).astype(int)
    y_train_proba = xgboost_class.predict_proba(X_train)[:, 1]
    y_pred_train = (y_train_proba >= threshold).astype(int)

    # evaluate
    precision_scores.append(precision_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred))
    conf_matrices.append(confusion_matrix(y_test, y_pred))
    precision_train_scores.append(precision_score(y_train, y_pred_train))
    f1_train_scores.append(f1_score(y_train, y_pred_train))
    conf_train_matrices.append(confusion_matrix(y_train, y_pred_train))


# print evaluation
print("Average Scores:")
print("Precision (Test):", np.array(precision_scores).mean())
print("F1-Scores (Test):", np.array(f1_scores).mean())

confusion_matrix_test = False
if confusion_matrix_test:
  average_conf_matrix = np.round(sum(conf_matrices) / len(conf_matrices)).astype(int)
  print("Average Confusion Matrix:")
  print(f"{'True Negative':<20} {'False Positive':<20}")
  print(f"{average_conf_matrix[0][0]:<20} {average_conf_matrix[0][1]:<20}")
  print(f"{'False Negative':<20} {'True Positive':<20}")
  print(f"{average_conf_matrix[1][0]:<20} {average_conf_matrix[1][1]:<20}")

print("Precision (Train):", np.array(precision_train_scores).mean())
print("F1-Scores (Train):", np.array(f1_train_scores).mean())

confusion_matrix_train = False
if confusion_matrix_train:
  average_conf_matrix_train = np.round(sum(conf_train_matrices) / len(conf_train_matrices)).astype(int)
  print("Average Confusion Matrix (Train):")
  print(f"{'True Negative':<20} {'False Positive':<20}")
  print(f"{average_conf_matrix_train[0][0]:<20} {average_conf_matrix_train[0][1]:<20}")
  print(f"{'False Negative':<20} {'True Positive':<20}")
  print(f"{average_conf_matrix_train[1][0]:<20} {average_conf_matrix_train[1][1]:<20}")

Training on fold 1/10
Training on fold 2/10
Training on fold 3/10
Training on fold 4/10
Training on fold 5/10
Training on fold 6/10
Training on fold 7/10
Training on fold 8/10
Training on fold 9/10
Training on fold 10/10
Average Scores:
Precision (Test): 0.33399479408470956
F1-Scores (Test): 0.3611525674440109
Precision (Train): 0.5235199507774875
F1-Scores (Train): 0.6257198818993992


In [20]:
folder_path = '/content/drive/My Drive/wind_curtailment_prediction'

if not os.path.exists(folder_path):
    os.makedirs(folder_path)
    print("Folder created successfully.")
else:
    print("Folder already exists.")

Folder already exists.


In [21]:
# safe XGBoost classifier
joblib.dump(xgboost_class, '/content/drive/My Drive/wind_curtailment_prediction/xgboost_class.pkl')

['/content/drive/My Drive/wind_curtailment_prediction/xgboost_class.pkl']

**Extra: Grid Search**


In [None]:
# preprocess data
X_scaled = preprocessor.fit_transform(X)
X_preprocessed, y_preprocessed = smote.fit_resample(X_scaled, y)

# parameter grid
param_grid = {
    'n_estimators': [250, 300, 350],
    'max_depth': [None, 1, 2],
    'reg_alpha': [1, 2, 3],
    'reg_lambda': [1, 2, 3],
    'scale_pos_weight': [1, 2, 3]
}

# timeseries split
test_size = 96
tscv = TimeSeriesSplit(test_size=test_size)

# XGBClassifier and GridSearchCV
xgboost_class = XGBClassifier(booster='gbtree', eval_metric='logloss', objective='binary:logistic', random_state=13)
grid_search = GridSearchCV(estimator=xgboost_class, param_grid=param_grid, cv=tscv, scoring='precision', n_jobs=-1)

# fit
grid_search.fit(X_preprocessed, y_preprocessed)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Parameters: {'max_depth': None, 'n_estimators': 250, 'reg_alpha': 1, 'reg_lambda': 1, 'scale_pos_weight': 1}
Best Score: 1.0
