In [1]:
import numpy as np
import pandas as pd

import xgboost as xgb
from sklearn.metrics import mean_absolute_error

from tqdm.notebook import tqdm

In [2]:
from pathlib import Path  
df_true= pd.read_csv("./data/cleaned_actual.csv")
df_pred= pd.read_csv("./data/cleaned_forecasts.csv")
df_pred
df_true = df_true.rename(columns={"Time": 'time', 
                                 "Load (kW)": "load_kw_true", 
                                 "Pressure_kpa": "pres_kpa_true",
                                 'Cloud Cover (%)': 'cld_pct_true',
                                 'Humidity (%)': 'hmd_pct_true',
                                 'Temperature (C)': 'temp_c_true',
                                 'Wind Direction (deg)': 'wd_deg_true',
                                 'Wind Speed (kmh)':'ws_kmh_true'})
df_pred = df_pred.rename(columns={"Time": 'time', 
                                 "Pressure (kpa)": "pres_kpa_pred",
                                 'Cloud Cover (%)': 'cld_pct_pred',
                                 'Temperature (C)': 'temp_c_pred',
                                 'Wind Direction (deg)': 'wd_deg_pred',
                                 'Wind Speed (kmh)':'ws_kmh_pred'})
result = pd.merge(df_true, df_pred, on="time")
result['time']= pd.to_datetime(result['time'])

#filepath = Path('./data/data_cleaned.csv')  
#filepath.parent.mkdir(parents=True, exist_ok=True)  
#result.to_csv(filepath)  

#df = pd.read_csv("./data/data_cleaned.csv")
df = result 
df = df.set_index('time')

In [3]:
df_true = df_true.rename(columns={"Time": 'time', 
                                 "Load (kW)": "load_kw", 
                                 "Pressure_kpa": "pres_kpa_true",
                                 'Cloud Cover (%)': 'cld_pct_true',
                                 'Humidity (%)': 'hmd_pct_true',
                                 'Temperature (C)': 'temp_c_true',
                                 'Wind Direction (deg)': 'wd_deg_true',
                                 'Wind Speed (kmh)':'ws_kmh_true'})
#df_true = df_true.set_index('time')

In [4]:
df_true_lag = pd.DataFrame([])

#create lag
for col in df_true.columns:
    df_true_lag[str(col)+'_lag168'] = df_true[str(col)].shift(168)
df_true_lag['time'] = pd.to_datetime(df_true['time'])

#retrieve time
df_true_lag = df_true_lag.drop('time_lag168', axis=1)

#retrieve load_kw
df_true_lag['load_kw'] = df_true['load_kw'] 

df_true_lag

Unnamed: 0,load_kw_lag168,pressure_kpa_lag168,cloud_cover_pct_lag168,humidity_pct_lag168,temperature_c_lag168,wind_direction_deg_lag168,wind_speed_kmh_lag168,time,load_kw
0,,,,,,,,2017-03-18 00:00:00+00:00,1.031472e+06
1,,,,,,,,2017-03-18 01:00:00+00:00,1.007206e+06
2,,,,,,,,2017-03-18 02:00:00+00:00,9.861084e+05
3,,,,,,,,2017-03-18 03:00:00+00:00,9.707610e+05
4,,,,,,,,2017-03-18 04:00:00+00:00,9.622584e+05
...,...,...,...,...,...,...,...,...,...
33963,968907.4321,1011.175,10.25,57.333333,9.446886,47.0,14.0675,2021-01-31 03:00:00+00:00,9.774149e+05
33964,965731.4321,1011.375,9.75,58.500000,7.721886,43.0,11.0125,2021-01-31 04:00:00+00:00,9.717784e+05
33965,967659.9321,1011.900,12.50,58.666667,6.201886,46.5,14.2775,2021-01-31 05:00:00+00:00,9.738789e+05
33966,968280.9321,1012.700,10.50,58.666667,6.196886,54.0,12.3000,2021-01-31 06:00:00+00:00,9.792199e+05


In [9]:

def train_best_model(train_data, val_data, param_grid):
    """
    This function trains an XGBoost model using a grid search over the parameter grid 
    and returns the model with the smallest mean absolute error (MAE) on the validation data.
    
    Args:
    train_data (pd.DataFrame): The training data.
    val_data (pd.DataFrame): The validation data.
    param_grid (dict): The grid of parameters to search over.
    
    Returns:
    best_model (xgb.Booster): The model with the smallest MAE on the validation data.
    best_params (dict): The parameters of the best model.
    best_mae (float): The MAE of the best model on the validation data.
    """
    
    best_model = None
    best_params = None
    best_mae = float('inf')
    
    # Extract features and labelsx
    X_train = train_data.loc[:, ~train_data.columns.isin(['load_kw'])]
    y_train = train_data['load_kw']
    X_val = val_data.loc[:, ~val_data.columns.isin(['load_kw'])]
    y_val = val_data['load_kw']
    print(X_train.info(), y_train.info())
    
    # Create DMatrix
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)
    
    # Iterate over all combinations of parameters
    for params in tqdm(param_grid):
        model = xgb.train(params, dtrain, 
                          num_boost_round=100, 
                          evals=[(dtrain, 'train'), (dval, 'val')], 
                          early_stopping_rounds=10, verbose_eval=False)

        # Predict on validation set and calculate MAE
        val_preds = model.predict(dval)
        mae = mean_absolute_error(y_val, val_preds)

        # Update best model if current model has lower MAE
        if mae < best_mae:
            best_model = model
            best_params = params
            best_mae = mae

    return best_model, best_params, best_mae


In [10]:
def forecast_xgboost(model, test_data):
    """
    This function forecasts using an XGBoost model.
    
    Args:
    model (xgb.Booster): The XGBoost model.
    test_data (pd.DataFrame): The test data.
    
    Returns:
    preds (numpy array): The forecasts.
    """
    
    X_test = test_data.filter(like='lag168', axis=1)
    dtest = xgb.DMatrix(X_test)
    preds = model.predict(dtest)
    
    return preds 

In [11]:
#train and validation data
end_date = pd.to_datetime(df_true_lag['time'].iloc[-1])
start_date = end_date - pd.DateOffset(years=3)

#extract 3 years of data
train_val_data = df_true_lag.loc[df_true_lag['time'].between(start_date, end_date)]

#drop time axis
train_val_data.index = train_val_data['time']
train_val_data = train_val_data.drop('time', axis=1)

n = len(train_val_data)
train_data = train_val_data[:int(n*0.9)]
val_data= train_val_data[int(n*0.9):]

In [12]:
# Define the grid of parameters to search over
param_grid = [
    {'max_depth': depth, 'eta': eta, 'subsample': subsample, 'colsample_bytree': colsample_bytree,
     'objective': 'reg:absoluteerror', 'eval_metric': 'mae'}
    for depth in range(2, 11, 2)
    for eta in [i / 10 for i in range(1, 4)]
    for subsample in [i / 10 for i in range(5, 11)]
    for colsample_bytree in [i / 10 for i in range(5, 11)]
]

train_best_model(train_data=train_data, val_data=val_data, param_grid=param_grid)

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 23674 entries, 2018-01-31 07:00:00+00:00 to 2020-10-13 16:00:00+00:00
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   load_kw_lag168             23674 non-null  float64
 1   pressure_kpa_lag168        23674 non-null  float64
 2   cloud_cover_pct_lag168     23674 non-null  float64
 3   humidity_pct_lag168        23674 non-null  float64
 4   temperature_c_lag168       23674 non-null  float64
 5   wind_direction_deg_lag168  23674 non-null  float64
 6   wind_speed_kmh_lag168      23674 non-null  float64
dtypes: float64(7)
memory usage: 1.4 MB
<class 'pandas.core.series.Series'>
DatetimeIndex: 23674 entries, 2018-01-31 07:00:00+00:00 to 2020-10-13 16:00:00+00:00
Series name: load_kw
Non-Null Count  Dtype  
--------------  -----  
23674 non-null  float64
dtypes: float64(1)
memory usage: 369.9 KB
None None


  0%|          | 0/540 [00:00<?, ?it/s]

(<xgboost.core.Booster at 0x13eb4b9d0>,
 {'max_depth': 4,
  'eta': 0.1,
  'subsample': 0.8,
  'colsample_bytree': 0.9,
  'objective': 'reg:absoluteerror',
  'eval_metric': 'mae'},
 25847.04336769289)