Inspiration: 
https://medium.com/mlearning-ai/time-series-forecasting-with-xgboost-and-lightgbm-predicting-energy-consumption-460b675a9cee

Vorgehen:
https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_quantile.html


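10000 Beobachtungen (die letzten 10000 Zeilen): 
- Bei mehr Daten wäre HistGradientBoostingRegressor besser (schneller); Quantile Forecasts unterstützt er allerdings erst ab scikit-learn 1.1 (loss="quantile")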

In [37]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
import statsmodels.api as sm

from energy_consumption.feature_selection.extract import extract_energy_data, extract_all_features
from energy_consumption.help_functions import get_forecast_timestamps, create_submission_frame

# Load the historical consumption data from the local CSV export.
energydata = pd.read_csv(
    'c:\\Users\\Maria\\Documents\\Studium\\Pyhton Projekte\\PTSFC\\energy_consumption\\feature_selection\\data\\historical_data.csv')

# Parse the timestamp column, make it the index and keep only the
# 10000 most recent rows.
energydata = energydata.assign(
    date_time=pd.to_datetime(
        energydata['date_time'], format='%Y-%m-%d %H:%M:%S')
).set_index("date_time").iloc[-10000:]

# Feature frame produced by the project helper (called with knn=True).
energydata_xgb = extract_all_features.get_energy_and_standardized_features(
    energydata, knn=True)

2022-10-01 21:00:00
2023-11-23 12:00:00


Hyperparameter Tuning: 
* To find the best hyperparameters for your GradientBoostingRegressor, you can use a hyperparameter tuning approach
* One commonly used method is GridSearchCV or RandomizedSearchCV 
* scikit-learn's current version doesn't directly support quantile regression as a loss function in its grid search

--> create custom scorer for quantile loss and use it with GridSearchCV or RandomizedSearchCV

In [31]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import TimeSeriesSplit
import numpy as np

# Feature matrix: every column except the target.
X = energydata.drop(columns='energy_consumption')
# Target, kept as a one-column DataFrame (flattened when fitting).
y = energydata.loc[:, ['energy_consumption']]

# Define the quantile loss function as a scorer
def pinball_loss_scorer(y_true, y_pred, alpha):
    """Return the mean pinball (quantile) loss of ``y_pred`` at level ``alpha``.

    For an error ``e = y_true - y_pred`` the per-sample loss is
    ``alpha * e`` when the model under-predicts (``e >= 0``) and
    ``(1 - alpha) * (-e)`` when it over-predicts (``e < 0``). The result is
    non-negative and lower is better, so it has to be wrapped with
    ``make_scorer(..., greater_is_better=False)``.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted quantile values, same length as ``y_true``.
    alpha : float
        Quantile level in (0, 1).

    Returns
    -------
    float
        Mean pinball loss over all samples.
    """
    # Flatten both inputs so a column vector cannot broadcast against a
    # 1-D prediction array into an (n, n) matrix.
    errors = np.ravel(np.asarray(y_true, dtype=float)) - \
        np.ravel(np.asarray(y_pred, dtype=float))
    # max(alpha * e, (alpha - 1) * e) selects the correct branch per sample:
    # alpha * e for e >= 0 and (1 - alpha) * |e| for e < 0.
    return float(np.mean(np.maximum(alpha * errors, (alpha - 1) * errors)))

# Candidate values sampled by the randomized hyperparameter search
param_grid = dict(
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    n_estimators=[100, 200, 300],
    max_depth=[2, 3, 4],
    min_samples_leaf=[*range(5, 15)],
    min_samples_split=[*range(5, 15)],
)

# Time-ordered cross-validation: 5 splits with 100 test rows each
tscv = TimeSeriesSplit(test_size=100, n_splits=5)

# Maps each quantile level to the hyperparameters chosen for it
best_parameters = {}

for alpha in (0.025, 0.25, 0.5, 0.75, 0.975):

    # Scorer built from the custom pinball loss; lower loss is better
    quantile_scorer = make_scorer(
        pinball_loss_scorer,
        greater_is_better=False,
        alpha=alpha,
    )

    # Quantile-loss gradient boosting model for the current level
    gbr = GradientBoostingRegressor(alpha=alpha, loss="quantile")

    # Randomized search over param_grid, evaluated on the time series folds
    random_search = RandomizedSearchCV(
        estimator=gbr,
        param_distributions=param_grid,
        n_iter=5,  # Adjust the number of iterations based on your computational resources
        scoring=quantile_scorer,
        cv=tscv,
        verbose=1,
        random_state=42,
    )

    # The target is flattened because fit expects a 1-D array
    random_search.fit(X, y.values.ravel())

    # Report and store the winning configuration for this quantile
    best_params = random_search.best_params_
    print(f"Best Hyperparameters for {alpha}", best_params)
    best_parameters[alpha] = best_params

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Hyperparameters for 0.025 {'n_estimators': 300, 'min_samples_split': 11, 'min_samples_leaf': 13, 'max_depth': 4, 'learning_rate': 0.01}
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Hyperparameters for 0.25 {'n_estimators': 300, 'min_samples_split': 11, 'min_samples_leaf': 13, 'max_depth': 4, 'learning_rate': 0.01}
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Hyperparameters for 0.5 {'n_estimators': 300, 'min_samples_split': 11, 'min_samples_leaf': 13, 'max_depth': 4, 'learning_rate': 0.01}
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Hyperparameters for 0.75 {'n_estimators': 300, 'min_samples_split': 11, 'min_samples_leaf': 13, 'max_depth': 4, 'learning_rate': 0.01}
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Hyperparameters for 0.975 {'n_estimators': 300, 'min_samples_split': 11, 'min_samples_leaf': 13, 'max_depth': 4, 'learning_rate': 0.01}

## Model with random parameters

In [32]:
from sklearn.metrics import mean_pinball_loss
from sklearn.ensemble import GradientBoostingRegressor

# Hold out the last 100 rows as the test window; everything before is training data.
y_train, y_test = (energydata[['energy_consumption']][:-100],
                   energydata[['energy_consumption']][-100:])
X_train, X_test = (energydata.drop(columns=['energy_consumption'])[:-100],
                   energydata.drop(columns=['energy_consumption'])[-100:])

# Hand-picked baseline hyperparameters shared by all quantile models
common_params = dict(
    learning_rate=0.05,
    n_estimators=200,
    max_depth=2,
    min_samples_leaf=9,
    min_samples_split=9,
)

# One column of test-window predictions per quantile, plus its pinball loss
predictions = pd.DataFrame()
pinball_losses = {}
for alpha in [0.025, 0.25, 0.5, 0.75, 0.975]:
    name = f'q{alpha}'
    gbr = GradientBoostingRegressor(
        loss="quantile", alpha=alpha, **common_params)
    # y_train is a one-column DataFrame; flatten it so fit() receives the
    # 1-D target it expects instead of emitting a column_or_1d warning.
    quantile_model = gbr.fit(X_train, y_train.values.ravel())
    y_pred = quantile_model.predict(X_test)

    predictions[name] = y_pred
    pinball_losses[name] = mean_pinball_loss(y_test, y_pred, alpha=alpha)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [33]:
# Show the mean pinball loss per quantile computed in the loop above
pinball_losses

{'q0.025': 0.30979839619128274,
 'q0.25': 1.859079794576308,
 'q0.5': 2.263016476483537,
 'q0.75': 1.6353804454129328,
 'q0.975': 0.33482661538517966}

## Model with selected parameters 
* Light comparison to be sure that my scoring function works

In [34]:
# Model with selected parameters
from sklearn.metrics import mean_pinball_loss
from sklearn.ensemble import GradientBoostingRegressor

# Hold out the last 100 rows as the test window; everything before is training data.
y_train, y_test = (energydata[['energy_consumption']][:-100],
                   energydata[['energy_consumption']][-100:])
X_train, X_test = (energydata.drop(columns=['energy_consumption'])[:-100],
                   energydata.drop(columns=['energy_consumption'])[-100:])

# Hyperparameters reported by the randomized search
optimized_params = dict(
    learning_rate=0.01,
    n_estimators=300,
    max_depth=4,
    min_samples_leaf=13,
    min_samples_split=11,
)

# One column of test-window predictions per quantile, plus its pinball loss
predictions = pd.DataFrame()
pinball_losses = {}
for alpha in [0.025, 0.25, 0.5, 0.75, 0.975]:
    name = f'q{alpha}'
    gbr = GradientBoostingRegressor(
        loss="quantile", alpha=alpha, **optimized_params)
    # y_train is a one-column DataFrame; flatten it so fit() receives the
    # 1-D target it expects instead of emitting a column_or_1d warning.
    quantile_model = gbr.fit(X_train, y_train.values.ravel())
    y_pred = quantile_model.predict(X_test)

    predictions[name] = y_pred
    pinball_losses[name] = mean_pinball_loss(y_test, y_pred, alpha=alpha)

pinball_losses

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


{'q0.025': 0.31567073467041057,
 'q0.25': 1.8421321500418597,
 'q0.5': 2.276598038980722,
 'q0.75': 1.672949624905699,
 'q0.975': 0.34219537731819444}

In [42]:
import pandas as pd
import numpy as np

from sklearn.ensemble import GradientBoostingRegressor

from energy_consumption.feature_selection.extract import extract_energy_data, extract_all_features
from energy_consumption.help_functions.drop_years import drop_years
from energy_consumption.help_functions import get_forecast_timestamps, create_submission_frame

# Hyperparameters passed to every GradientBoostingRegressor in the forecast function
optimized_params = {
    "learning_rate": 0.01,
    "n_estimators": 300,
    "max_depth": 4,
    "min_samples_leaf": 13,
    "min_samples_split": 11,
}


def get_XGBoost_forecasts(energydata=np.nan, indexes=None, quantiles=None, periods=100):
    """Fit one gradient-boosting quantile model per level and return forecasts.

    Despite the name, the models are scikit-learn ``GradientBoostingRegressor``
    instances with ``loss="quantile"``, configured with the module-level
    ``optimized_params``.

    Parameters
    ----------
    energydata : DataFrame, optional
        Historical data containing an ``energy_consumption`` column. When
        omitted (``None`` or the float NaN default) it is loaded with
        ``extract_energy_data.get_data(num_years=7)``.
    indexes : list of int, optional
        Row positions of the prediction frame to keep; also used to look up
        the matching timestamps in the hourly horizon.
        Defaults to ``[47, 51, 55, 71, 75, 79]``.
    quantiles : list of float, optional
        Quantile levels to fit, one model each.
        Defaults to ``[0.025, 0.25, 0.5, 0.75, 0.975]``.
    periods : int
        Number of hourly steps in the horizon the timestamps are taken from.

    Returns
    -------
    DataFrame
        If ``len(quantiles) != 5``: the selected quantile predictions with a
        leading ``date_time`` column. Otherwise the frame built by
        ``create_submission_frame.get_frame`` with its ``index`` column
        dropped and a leading ``date_time`` column.
    """
    # Resolve defaults here rather than using mutable default arguments.
    indexes = [47, 51, 55, 71, 75, 79] if indexes is None else list(indexes)
    quantiles = ([0.025, 0.25, 0.5, 0.75, 0.975]
                 if quantiles is None else list(quantiles))

    if energydata is None or isinstance(energydata, float):
        # use derived optimum for number of years
        energydata = extract_energy_data.get_data(num_years=7)

    energydata = extract_all_features.get_energy_and_standardized_features(
        energydata, knn=True)

    X = energydata.drop(columns=['energy_consumption'])
    y = energydata['energy_consumption']

    # Build the feature frame for the forecast timestamps that follow the
    # last observation.
    energyforecast = get_forecast_timestamps.forecast_timestamps(
        energydata.index[-1])
    X_pred = extract_all_features.get_energy_and_standardized_features(
        energyforecast, knn=True)

    X, X_pred = drop_years(X, X_pred)

    # One model per requested quantile; each fills one column.
    quantile_df = pd.DataFrame()
    for alpha in quantiles:
        gbr = GradientBoostingRegressor(
            loss="quantile", alpha=alpha, **optimized_params)
        quantile_df[f'q{alpha}'] = gbr.fit(X, y).predict(X_pred)

    quantile_df = quantile_df.iloc[indexes]

    # Timestamps of the selected rows, looked up in an hourly range starting
    # one hour after the last observation.
    horizon = pd.date_range(start=energydata.index[-1] + pd.DateOffset(
        hours=1), periods=periods, freq='H')
    timestamps = [horizon[i] for i in indexes]

    # return quantile forecasts in terms of absolute evaluation
    if len(quantiles) != 5:
        quantile_df.insert(0, 'date_time', timestamps)
        return quantile_df

    # else: create submission frame
    forecast_frame = create_submission_frame.get_frame(quantile_df, indexes)
    forecast_frame = forecast_frame.drop(columns=['index'])
    forecast_frame.insert(0, 'date_time', timestamps)
    return forecast_frame

In [43]:
# Run the forecast function on the energydata loaded earlier and display the result
forecasts = get_XGBoost_forecasts(energydata)
forecasts

2022-10-01 21:00:00
2023-11-23 12:00:00
2023-11-22 13:00:00
2023-11-27 16:00:00


Unnamed: 0,date_time,forecast_date,target,horizon,q0.025,q0.25,q0.5,q0.75,q0.975
0,2023-11-24 12:00:00,2024-01-15,energy,36 hour,40.140204,50.116355,58.44634,62.239184,64.565509
1,2023-11-24 16:00:00,2024-01-15,energy,40 hour,40.140204,50.116355,56.350162,59.042676,63.085466
2,2023-11-24 20:00:00,2024-01-15,energy,44 hour,40.140204,50.116355,56.350162,59.042676,63.085466
3,2023-11-25 12:00:00,2024-01-15,energy,60 hour,42.342401,48.282368,54.01518,55.7961,67.074725
4,2023-11-25 16:00:00,2024-01-15,energy,64 hour,42.342401,48.282368,52.76569,53.970208,65.702731
5,2023-11-25 20:00:00,2024-01-15,energy,68 hour,42.342401,48.282368,52.76569,53.970208,65.702731


In [39]:
# Display the current forecast frame
forecasts

Unnamed: 0,date_time,q0.025,q0.25,q0.5,q0.75,q0.975
47,2023-11-24 12:00:00,40.185449,50.057719,58.344628,62.222855,64.500873
51,2023-11-24 16:00:00,40.185449,50.057719,56.241971,59.034353,63.082008
55,2023-11-24 20:00:00,40.185449,50.057719,56.241971,59.034353,63.082008
71,2023-11-25 12:00:00,42.418319,48.213743,54.111461,55.889315,67.02533
75,2023-11-25 16:00:00,42.418319,48.213743,52.794317,54.021648,65.750325
79,2023-11-25 20:00:00,42.418319,48.213743,52.794317,54.021648,65.750325


Maybe: Try out different parameters

In [None]:
# Alternative hyperparameter candidates to try out
parameters = dict(
    max_depth=[1, 2, 3, 4, 5, 6],
    learning_rate=[0.01, 0.05, 0.1, 0.2, 0.3],
    n_estimators=[100, 300, 500],
)