In [1]:
import xgboost as xgb
from xgboost.callback import TrainingCallback
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, ParameterGrid
from sklearn.base import clone
# import wandb
import matplotlib.pyplot as plt

In [2]:
class CFG:
    """Run-level settings for the XGBoost demand-forecasting experiment."""

    # Number of lagged time steps handed to series_to_supervised as `n_in`.
    n_in: int = 6
    # Number of trailing rows held out as the test set.
    n_test: int = 336
    # Weights & Biases identifiers; only referenced by the commented-out
    # wandb calls in this notebook.
    wandb_project_name: str = 'electricity_demand_forecasting'
    wandb_run_name: str = 'xgboost'

In [3]:
class WandbCallback(TrainingCallback):
    """XGBoost training callback that samples the training MAE every ``period`` rounds.

    The actual ``wandb.log`` call is commented out, so the callback currently
    only records the most recent sampled value on ``self.last_train_mae``.

    Parameters
    ----------
    period : int, default 1
        Sample the metric on every ``period``-th boosting round. Must be >= 1.

    Raises
    ------
    ValueError
        If ``period`` is smaller than 1 (it is used as a modulus).
    """

    def __init__(self, period=1):
        super().__init__()
        if period < 1:
            raise ValueError(f"period must be a positive integer, got {period!r}")
        self.period = period
        # Most recent training MAE seen by after_iteration; None until sampled.
        self.last_train_mae = None

    def after_iteration(self, model, epoch, evals_log):
        """Record the latest training MAE; never requests early termination.

        Expects ``evals_log['train']['mae']`` to be the per-round metric
        history, i.e. the training set must be passed to ``evals`` under the
        name 'train' with 'mae' as the evaluation metric. If that entry is
        absent the round is skipped instead of aborting training.

        Returns
        -------
        bool
            Always False, so training continues.
        """
        if epoch % self.period == 0:
            history = evals_log.get('train', {}).get('mae')
            if history:
                self.last_train_mae = history[-1]
                # wandb.log({'train-mae': self.last_train_mae})
        return False

In [4]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True, target_var='TOTALDEMAND'):
    """Reframe a multivariate time series as a supervised-learning table.

    Every column except ``target_var`` is treated as a predictor. The result
    holds, in order: the predictors lagged by ``n_in`` ... 1 steps, the
    predictors shifted forward by 1 ... ``n_out - 1`` steps, and finally the
    unshifted target as the last column. The target itself is never lagged.

    Parameters
    ----------
    data : DataFrame or anything accepted by ``pd.DataFrame``
        Input series; must yield a column named ``target_var``.
    n_in : int, default 1
        Number of lag steps to include.
    n_out : int, default 1
        Forecast horizon; ``n_out - 1`` forward-shifted predictor blocks are added.
    dropnan : bool, default True
        Drop every row containing a NaN (including those created by shifting).
    target_var : str, default 'TOTALDEMAND'
        Name of the target column.

    Returns
    -------
    DataFrame
        Columns named ``'<col>(t-i)'``, ``'<col>(t+i)'`` and ``'<target_var>(t)'``.

    Raises
    ------
    KeyError
        If ``target_var`` is not a column of the input.
    """
    df = pd.DataFrame(data)
    if target_var not in df.columns:
        raise KeyError(
            f"target_var {target_var!r} not found in columns {list(df.columns)}"
        )

    # Computed once: the same predictor frame is shifted for every lag/lead.
    # Labels are taken from this frame's own columns so they always match the
    # shifted blocks, whatever container `data` arrived in.
    predictors = df.drop(columns=target_var)
    cols, names = [], []

    # Input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(predictors.shift(i))
        names += ['%s(t-%d)' % (col, i) for col in predictors.columns]

    # Forecast sequence (t+1, ... t+n_out-1), predictors only; the target is
    # contributed separately at step t below.
    for i in range(1, n_out):
        cols.append(predictors.shift(-i))
        names += ['%s(t+%d)' % (col, i) for col in predictors.columns]

    # Target variable at the current time step, always the last column.
    cols.append(df[[target_var]])
    names.append('%s(t)' % target_var)

    agg = pd.concat(cols, axis=1)
    agg.columns = names

    if dropnan:
        agg = agg.dropna()

    return agg

In [5]:
def train_test_split(data, n_test):
    """Split a 2-D array chronologically into (train, test).

    The last ``n_test`` rows become the test set and everything before them
    the training set; row order is preserved.

    Parameters
    ----------
    data : 2-D array
        Rows are samples; indexed as ``data[rows, :]``.
    n_test : int
        Number of trailing rows to hold out. 0 yields an empty test set; a
        value larger than ``len(data)`` puts every row in the test set.

    Raises
    ------
    ValueError
        If ``n_test`` is negative.
    """
    if n_test < 0:
        raise ValueError(f"n_test must be non-negative, got {n_test!r}")
    # Use an explicit split position: slicing with -n_test breaks for
    # n_test == 0, because data[:-0] is empty and data[-0:] is everything.
    split = max(len(data) - n_test, 0)
    return data[:split, :], data[split:, :]

In [6]:
def _take_rows(container, positions):
    """Select rows by integer position from a pandas object or a NumPy array."""
    if hasattr(container, 'iloc'):
        return container.iloc[positions]
    return container[positions]


def diy_cv(model, param_grid, splits, trainX, trainy):
    """Exhaustive grid search scored by mean absolute error over CV folds.

    For every parameter combination in ``param_grid`` a fresh clone of
    ``model`` is fitted on each training fold and scored on the matching
    validation fold; the combination with the lowest mean MAE wins. One
    summary line is printed per combination.

    Parameters
    ----------
    model : estimator
        Must support ``clone``, ``set_params``, ``fit`` and ``predict``.
    param_grid : dict or list of dicts
        Passed to ``ParameterGrid``.
    splits : splitter
        Object whose ``split(trainX)`` yields ``(train_idx, val_idx)`` pairs.
    trainX, trainy : pandas objects or NumPy arrays
        Features and targets, indexed by row position.

    Returns
    -------
    (best_params, best_score)
        ``best_params`` is None if no combination produced a finite score
        below infinity (e.g. an empty grid).
    """
    # Materialise the folds once so every parameter combination is scored on
    # exactly the same splits.
    folds = list(splits.split(trainX))

    best_score = float("inf")
    best_params = None
    for params in ParameterGrid(param_grid):
        scores = []
        for train_idx, val_idx in folds:
            clone_model = clone(model)
            clone_model.set_params(**params)
            # Positional selection that works for DataFrames/Series and ndarrays.
            X_train_fold, y_train_fold = _take_rows(trainX, train_idx), _take_rows(trainy, train_idx)
            X_val_fold, y_val_fold = _take_rows(trainX, val_idx), _take_rows(trainy, val_idx)
            clone_model.fit(X_train_fold, y_train_fold)
            predictions = clone_model.predict(X_val_fold)
            scores.append(mean_absolute_error(y_val_fold, predictions))
        avg_score = np.mean(scores)
        if avg_score < best_score:
            best_score = avg_score
            best_params = params
        print(f"Params: {params}, Avg MAE: {avg_score}")
    return best_params, best_score

In [7]:
# def log_evaluation(period=1, show_stdv=True):
#     def callback(env):
#         if env.iteration % period == 0:
#             # wandb.log({"Training MAE": env.evaluation_result_list[0][1], "Validation MAE": env.evaluation_result_list[1][1]})
#     return callback

In [8]:
# def wandb_callback():
#     def callback(env):
#         for i, eval_result in enumerate(env.evaluation_result_list):# wandb.log({f"{eval_result[0]}-{eval_result[1]}": eval_result[2]})
#     return callback

In [9]:
# Instantiate the settings object. The name CFG is rebound from the class to
# an instance here, so only call it while it is still the class; otherwise
# re-running this cell raises "TypeError: 'CFG' object is not callable".
CFG = CFG() if isinstance(CFG, type) else CFG

In [10]:
# Configuration snapshot for experiment tracking. Run-level settings are read
# from CFG instead of being repeated as literals, so the logged configuration
# always matches the values the notebook actually uses (e.g. n_test).
config_dict = {
    "n_in": CFG.n_in,
    "n_test": CFG.n_test,
    "wandb_project_name": CFG.wandb_project_name,
    "wandb_run_name": CFG.wandb_run_name,
    # Hyper-parameter search space.
    "param_grid": {
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.3],
        'n_estimators': [100, 500, 1000],
    },
}

In [11]:
# initialize W&B
# wandb.init(
#     project=CFG.wandb_project_name,
#     name=CFG.wandb_run_name,
#     config=config_dict
# )

In [12]:
# load data
# The first CSV column becomes the index and is parsed as datetimes, since the
# plotting cells rely on a datetime index.
# NOTE(review): assumes that first column holds timestamps - confirm against the file.
df = pd.read_csv('./../data/NSW/final_df.csv', index_col=0, parse_dates=True)

In [13]:
df

In [14]:
# Reframe the series as a supervised-learning table: CFG.n_in lagged copies of
# each predictor, followed by TOTALDEMAND at the current step as the last column.
data = series_to_supervised(
    df,
    n_in=CFG.n_in,
    n_out=1,
    dropnan=True,
    target_var='TOTALDEMAND',
)
data

In [15]:
# Sanity check on the output of series_to_supervised: the split further down
# treats the last column of `data` as the target.
last_column = data.columns[-1]

print(
    "The target variable is in the right position."
    if last_column == 'TOTALDEMAND(t)'
    else f"The target variable is not in the right position, the last column is: {last_column}"
)

In [16]:
# Number of lagged feature columns. series_to_supervised lags every column
# except the target, so the target column is excluded from the count.
n_obs = CFG.n_in * (len(df.columns) - 1)
n_obs

In [17]:
# Hold out the final CFG.n_test rows as the test set, then separate the
# feature columns (all but the last) from the target (the last column).
train, test = train_test_split(data.values, CFG.n_test)
(trainX, trainy), (testX, testy) = (
    (part[:, :-1], part[:, -1]) for part in (train, test)
)

In [18]:
train

In [19]:
test

In [20]:
# Carve a validation set for early stopping out of the tail of the training
# rows (nominally the last 10%).
# The slices are taken from `train`, not from trainX/trainy themselves, so
# re-running this cell does not shrink the training set a second time. The
# explicit split position also keeps n_val == 0 from emptying the training set
# (a [:-0] slice is empty).
n_val = int(len(train) * 0.1)
val_start = len(train) - n_val
trainX, valX = train[:val_start, :-1], train[val_start:, :-1]
trainy, valy = train[:val_start, -1], train[val_start:, -1]

In [21]:
n_val

In [22]:
trainX

In [23]:
valX

In [24]:
trainy

In [25]:
valy

In [26]:
# Wrap each split in XGBoost's DMatrix container. The test matrix carries no
# label; its predictions are scored against testy separately.
dtrain = xgb.DMatrix(data=trainX, label=trainy)
dval = xgb.DMatrix(data=valX, label=valy)  # validation set
dtest = xgb.DMatrix(data=testX)  # test set for final evaluation

In [27]:
dtrain

In [28]:
dval

In [29]:
dtest

In [30]:
# Booster parameters for xgb.train.
# The number of boosting rounds is deliberately not set here: `n_estimators`
# belongs to the scikit-learn wrapper, while the native xgb.train API takes
# the round count through its `num_boost_round` argument.
params = {
    'max_depth': 3,
    'learning_rate': 0.01,
    'objective': 'reg:squarederror',
    'eval_metric': 'mae',
}  # example params, adjust based on best_params if you're using them

In [31]:
# Fit the booster for up to 1000 rounds, evaluating on both the training and
# the validation matrix each round. Per the XGBoost docs, early stopping
# monitors the last entry of `evals` (here the validation set) and stops after
# 50 rounds without improvement.
bst = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=1000,
    evals=[(dtrain, 'train'), (dval, 'eval')],
    early_stopping_rounds=50,
    callbacks=[WandbCallback()],
)

In [32]:
# evaluate the model on the held-out test set
# Restrict prediction to the trees up to and including the best
# early-stopping round, so the rounds trained after the validation score
# stopped improving are not used.
yhat = bst.predict(dtest, iteration_range=(0, bst.best_iteration + 1))
error = mean_absolute_error(testy, yhat)
# Display the test MAE; nothing else in the notebook reports it while the
# wandb logging is commented out.
error

In [33]:
# Plot-friendly aliases: `actual` is the test-set target (testy) and
# `predicted` is the model output for the same rows (yhat).
actual, predicted = testy, yhat

In [34]:
# generate a time index for plotting.
# since we have 30-minute intervals, the test period can be described by its
# first timestamp plus a fixed step.
# The test rows are the last len(testy) rows of `data`, so the start label is
# read from data.index; df.index can be longer because series_to_supervised
# drops rows that contain NaN.
# this requires the index to hold datetimes (or parseable date strings)
test_start_date = data.index[-len(testy)]  # get the start date for test set

In [35]:
# Build one timestamp per test row, spaced 30 minutes apart, beginning at
# test_start_date.
test_dates = pd.date_range(start=test_start_date, periods=len(testy), freq='30min')

In [36]:
# Overlay actual and predicted demand across the test period.
fig, ax = plt.subplots(figsize=(15, 7))
ax.plot(test_dates, actual, label='Actual', marker='.', linestyle='-', linewidth=1.0)
ax.plot(test_dates, predicted, label='Predicted', marker='.', linestyle='--', linewidth=1.0)
ax.set_title('Test Set: Actual vs Predicted Demand')
ax.set_xlabel('Date Time')
ax.set_ylabel('Total Demand')
ax.legend()
fig.tight_layout()

In [37]:
# can focus on a smaller time frame for a more detailed view
# plt.xlim([pd.Timestamp('2010-08-05'), pd.Timestamp('2010-08-12')])

In [38]:
plt.show()

In [39]:
# close the plot to free up memory
plt.close()

In [40]:
# log the test MAE
# wandb.log({"Test MAE": error})

In [41]:
# finish the W&B run
# wandb.finish()