In [1]:
import os, sys
import plotly.express as px
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import darts
from darts.dataprocessing.transformers.boxcox import BoxCox
from darts.models import LightGBMModel, XGBModel, LinearRegressionModel, TFTModel
from darts.metrics import smape, mape, mase, mse, rmse, r2_score, mae
from darts.dataprocessing.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, RobustScaler   
from darts.dataprocessing.transformers.scaler import Scaler
from darts.utils.missing_values import extract_subseries

from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from torch.optim.lr_scheduler import ReduceLROnPlateau
from pytorch_lightning.callbacks import ModelCheckpoint
import torch
from wandb.xgboost import WandbCallback

import wandb
wandb.login()


import warnings
warnings.filterwarnings('ignore')

# Set seed
np.random.seed(42)

# Set working directory
os.chdir(r"..") # should be the git repo root directory
print("Current working directory: " + os.getcwd())
repo_name = 'net-load-forecasting'
assert os.getcwd()[-len(repo_name):] == "net-load-forecasting", "Working directory is not the git repo root directory"


from utils.utils import *

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mnikolaushouben[0m ([33mwattcast[0m). Use [1m`wandb login --relogin`[0m to force relogin


Current working directory: c:\Users\nik\Desktop\Berkeley_Projects\net-load-forecasting


In [2]:
clean_data_path = os.path.join(os.getcwd(),'data','clean_data')
model_data_path = os.path.join(os.getcwd(),'data','model_data')

In [12]:
# run parameters

model_types = ['integrated', 'additive', 'direct']

model_type = model_types[0]

config_dataset = {
    'model_type': model_type,
    'METER': '2',
    'META': '1',
    'train_end': '2015-10-01',
    'val_end' : '2016-04-01',
    'lookback_in_hours' : 24,
    'liklihood': None,
    'holiday': True,
    'use_datetime_encoding': False,
    'boxcox': False,
    'config.eval_metrics' : [mae, mse, rmse]
}

# derived parameters



config = build_config(config_dataset)

In [8]:


def load_data(config):

    '''

    Function to load the data for the different model setups.

    Parameters
    ----------
    config : Config
        Config object with the model setup parameters.

    Returns
    -------


    '''

    df = pd.read_hdf(os.path.join(clean_data_path, "data_net_load_forecasting.h5"), key=f"{config.temp_resolution}min/netload")
    df_irr = pd.read_hdf(os.path.join(clean_data_path, "data_net_load_forecasting.h5"), key=f"{config.temp_resolution}min/weather")
    df_irr.rename({'temperature': 'temp_air'}, axis=1, inplace=True)
    df_pv_forecast = pd.read_hdf(os.path.join(model_data_path, "pv_model_results.h5"), key=f"{config.temp_resolution}min/pv_forecast_META-{config.META}")

    df_train_int = df.loc[:config.train_end]
    df_val_int = df.loc[config.train_end:config.val_end]
    df_test_int = df.loc[config.val_end:]

    df_cov_dir_train = df_irr.loc[:config.train_end]
    df_cov_dir_val = df_irr.loc[config.train_end:config.val_end]
    df_cov_dir_test = df_irr.loc[config.val_end:]

    df_cov_pv_train = df_pv_forecast.loc[:config.train_end]
    df_cov_pv_val = df_pv_forecast.loc[config.train_end:config.val_end]
    df_cov_pv_test = df_pv_forecast.loc[config.val_end:]

    df_train_add = df_train_int + df_cov_pv_train.values
    df_train_add[df_train_add < 0] = 0
    df_val_add = df_val_int + df_cov_pv_val.values
    df_val_add[df_val_add < 0] = 0
    df_test_add = df_test_int + df_cov_pv_test.values
    df_test_add[df_test_add < 0] = 0

    # In this study we are comparing three different model setups: integrated, additive and direct for net load forecasting
    model_setups = {
                    'integrated': {'target': (df_train_int, df_val_int, df_test_int), 'covs': (df_cov_pv_train, df_cov_pv_val, df_cov_pv_test)},
                    'additive': {'target': (df_train_add, df_val_add, df_test_add), 'covs': (None, None, None)},
                    'direct': {'target': (df_train_int, df_val_int, df_test_int), 'covs': (df_cov_dir_train, df_cov_dir_val, df_cov_dir_test)}
                    }
    data = model_setups[config.model_type] 
    
    return data


In [10]:
def darts_data_pipeline(config, data):

    '''

    Function to transform the data into darts.TimeSeries format and apply the data pipeline.

    Parameters
    ----------
    config : Config

    data : dict
        Dictionary with the data for the different model setups.

    Returns
    -------
    piped_data : dict
        Dictionary with the transformed data for the different model setups.

    pipeline : darts.dataprocessing.pipeline.Pipeline
        Pipeline object with the data pipeline.


    '''


    
    df_train, df_val, df_test = data['target']
    df_cov_train, df_cov_val, df_cov_test = data['covs']
    

    # Into Darts format
    ts_train = darts.TimeSeries.from_dataframe(df_train, freq=str(config.temp_resolution) + 'min')
    ts_train = extract_subseries(ts_train)
    ts_val = darts.TimeSeries.from_dataframe(df_val, freq=str(config.temp_resolution) + 'min')
    ts_val = extract_subseries(ts_val)
    ts_test = darts.TimeSeries.from_dataframe(df_test, freq=str(config.temp_resolution) + 'min')
    ts_test = extract_subseries(ts_test)


    if config.model_setup == 'additive':
        ts_cov_train = None
        ts_cov_val = None
        ts_cov_test = None
    else:
        ts_cov_train = darts.TimeSeries.from_dataframe(df_cov_train, freq=str(config.temp_resolution) + 'min')
        ts_cov_val = darts.TimeSeries.from_dataframe(df_cov_val, freq=str(config.temp_resolution) + 'min')
        ts_cov_test = darts.TimeSeries.from_dataframe(df_cov_test, freq=str(config.temp_resolution) + 'min')

    # Reviewing subseries to make sure they are long enough

    min_len = config.n_lags + config.n_ahead

    ts_train, ts_cov_train = review_subseries(ts_train, min_len, ts_cov_train)
    ts_val, ts_cov_val = review_subseries(ts_val,  min_len, ts_cov_val)
    ts_test, ts_cov_test = review_subseries(ts_test, min_len, ts_cov_test)


    # getting the index of the longest subseries, to be used for evaluation later
    longest_ts_val_idx = get_longest_subseries_idx(ts_val)
    longest_ts_test_idx = get_longest_subseries_idx(ts_test)


    # Load pipeline
    pipeline = Pipeline( # missing values have been filled in the 'data_prep.ipynb'
                        [
                        #BoxCox() if config_modeldesign['boxcox'] else None,
                        Scaler(MinMaxScaler()),
                        ]
                        )

    ts_train_piped = pipeline.fit_transform(ts_train)
    ts_val_piped = pipeline.transform(ts_val)[longest_ts_val_idx]
    ts_test_piped = pipeline.transform(ts_test)[longest_ts_test_idx]
    trg_train_inversed = pipeline.inverse_transform(ts_train_piped, partial=True) # inverse transform the target, we need the original values for the evaluation
    trg_val_inversed = pipeline.inverse_transform(ts_val_piped, partial=True)# inverse transform the target, we need the original values for the evaluation
    trg_test_inversed = pipeline.inverse_transform(ts_test_piped, partial=True) # inverse transform the target, we need the original values for the evaluation

    if config.model_setup == 'additive':
        ts_cov_train_piped = None
        ts_cov_val_piped = None
        ts_cov_test_piped = None

    else:  
        # Future Covariate Pipeline
        pipeline_weather = Pipeline([Scaler(MinMaxScaler())])
        ts_cov_train_piped = pipeline_weather.fit_transform(ts_cov_train)
        ts_cov_val_piped = pipeline_weather.transform(ts_cov_val)[longest_ts_val_idx]
        ts_cov_test_piped = pipeline_weather.transform(ts_cov_test)[longest_ts_test_idx]

    piped_data = {'target': (ts_train_piped, ts_val_piped, ts_test_piped), 'covs': (ts_cov_train_piped, ts_cov_val_piped, ts_cov_test_piped), 'target_inversed': (trg_train_inversed, trg_val_inversed, trg_test_inversed)}

    return piped_data, pipeline



In [193]:
predictions_per_model = {}
scores_per_model = {}

for model_setup, data in model_setups.items():

    print(f"Model setup: {model_setup}")


    model = LightGBMModel(lags=config.n_lags,
                    lags_future_covariates= None if model_setup == 'additive' else [0],
                    add_encoders=config.datetime_encoding   , 
                    output_chunk_length=config.n_ahead, 
                    likelihood=None,
                    random_state=42
                    )

    model.fit(ts_train_piped, future_covariates = ts_cov_train_piped)

    print('Evaluating on validation set')
    predictions, scores = predict_testset(config, model, 
                                    ts_val_piped, 
                                    ts_cov_val_piped,
                                    pipeline,
                                    )
    
    predictions.columns = ['prediction_'+model_setup]

    # subtracting the covariates from the predictions, since we are predicting the net load
    if model_setup == 'additive':
        predictions -= df_cov_pv_val.reindex(predictions.index).values

    predictions_per_model[model_setup] = predictions
    scores_per_model[model_setup] = scores





Model setup: integrated
Evaluating on validation set
Model setup: additive
Evaluating on validation set
Model setup: direct
Evaluating on validation set


In [194]:
df_metrics = pd.DataFrame.from_dict(scores_per_model).T
df_metrics

Unnamed: 0,mae,mse,rmse,r2_score
integrated,0.0637,0.009905,0.097028,0.158798
additive,0.067353,0.011362,0.103556,0.260561
direct,0.063358,0.010274,0.097645,0.313132


In [195]:
df_predictions = pd.concat(predictions_per_model.values(), axis=1)




In [197]:
print("Plotting predictions...")
df_compare = pd.merge(trg_val_inversed.pd_dataframe(), df_predictions, left_index=True, right_index=True, how = 'left')

px.line(df_compare)

Plotting predictions...
