In [35]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

In [30]:
def format_data(df):
    """
    Formats the raw load dataframe, removes duplicates and fills NaN values.

    The input is expected to hold exactly three columns: a time-interval
    column of the form '<start> - <end>', the day-ahead forecast column
    (discarded) and the load column.

    Parameters
    ----------
    df : pd.DataFrame
        Raw frame as read from the source file. It is NOT modified, so the
        function can be called repeatedly on the same frame.

    Returns
    -------
    pd.DataFrame
        New frame with a single 'Load' column, indexed by the interval start
        (DatetimeIndex named 'Date'), sorted, de-duplicated (first occurrence
        wins) and forward-filled.
    """
    # drop() returns a new object, so the caller's frame is left untouched.
    formatted = df.drop(columns='Day-ahead Total Load Forecast [MW] - BZN|NL')
    formatted.columns = ['Time', 'Load']

    # Only the start of each '<start> - <end>' interval is used as the timestamp.
    interval_start = formatted['Time'].str.split(' - ', expand=True)[0]
    formatted.index = pd.DatetimeIndex(
        pd.to_datetime(interval_start, format='%d.%m.%Y %H:%M'), name='Date')

    # A stable sort keeps equal timestamps in file order, which makes
    # keep='first' below deterministic.
    formatted = formatted[['Load']].sort_index(kind='mergesort')
    formatted = formatted[~formatted.index.duplicated(keep='first')]

    return formatted.ffill()

In [34]:
def import_weather_data(start_year, end_year, scaler, weather_fields, training_data=True):
    """
    Imports and formats weather data for specified period (inclusive).
    If the data is validation data, set training_data to False.
    """
    weather_dict = {}

    for data_year in range(start_year, end_year+1):
        weather_dict[data_year] = pd.read_csv(f'data/weather_{data_year}.csv', low_memory=False)

    weather_full = pd.concat(weather_dict.values())
    weather_full_columns = [column.strip() for column in weather_full.columns]
    weather_full.columns = weather_full_columns

    weather_full = weather_full.map(lambda x: x.strip() if isinstance(x, str) else x)
    weather_full.replace('', np.NaN, inplace=True)

    weather_full[weather_full.select_dtypes(include='object').columns] = (
    weather_full[weather_full.select_dtypes(include='object').columns].astype(float))

    weather_full.ffill(inplace=True)

    weather_full_grouped = weather_full.groupby(['YYYYMMDD', 'HH']).mean()

    weather_full_grouped = pd.concat([weather_full_grouped] * 4)
    weather_full_grouped.sort_values(by=['YYYYMMDD', 'HH'], axis=0, inplace=True)

    temp_index = [pd.to_datetime(f'{start_year}-01-01 00:00:00')]
    for i in range(len(weather_full_grouped)-1):
        temp_index.append(temp_index[i] + pd.Timedelta('00:15:00'))

    weather_full_grouped.index = pd.to_datetime(temp_index)

    std_weather = StandardScaler()

    if training_data:
        weather_full_transformed = pd.DataFrame(scaler.fit_transform(weather_full_grouped),
                                           columns=weather_full_grouped.columns,
                                           index=weather_full_grouped.index)
    else:
        weather_full_transformed = pd.DataFrame(scaler.transform(weather_full_grouped),
                                           columns=weather_full_grouped.columns,
                                           index=weather_full_grouped.index)

    weather_full_transformed = weather_full_transformed[weather_fields]

    return weather_full_transformed

In [1]:
def add_weekday_holiday(df, holidays, add_weekday=False):
    """
    Adds holiday days to a DataFrame. If specified, adds weekdays as well.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a DatetimeIndex.
    holidays : list-like
        Holiday dates, in any form accepted by pd.to_datetime.
    add_weekday : bool, default False
        If True, six float indicator columns 'Tue'..'Sun' are added (Monday is
        the reference level, i.e. all six are 0.0).

    Returns
    -------
    pd.DataFrame
        Frame with a float 'Holiday' column (1.0 on holiday dates) and, if
        requested, the weekday indicators. With add_weekday=True a new frame
        is returned and the input is left untouched; with add_weekday=False
        the 'Holiday' column is written onto the input frame itself, which is
        also returned.
    """
    if add_weekday:
        # Copy so the caller's frame does not gain helper columns.
        df = df.copy()
        weekday = df.index.weekday  # Monday == 0 ... Sunday == 6

        # Build every indicator explicitly. Deriving them from the weekdays
        # that happen to be present would mislabel columns whenever the index
        # does not cover all seven days.
        for day_number, day_name in enumerate(['Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'], start=1):
            df[day_name] = (weekday == day_number).astype(float)

    df['Holiday'] = pd.to_datetime(df.index.date).isin(pd.to_datetime(holidays)).astype(float)

    return df

In [39]:
def add_weather_data(df_load, df_weather, weather_fields):
    """
    Joins the selected weather columns onto the load DataFrame.

    Weather rows are taken from the first index label of df_load onwards and
    matched to the load rows on the index; only labels present in both frames
    are kept.
    """
    first_label = df_load.iloc[0].name
    weather_subset = df_weather.loc[first_label:, weather_fields]

    merged = df_load.merge(weather_subset, left_index=True, right_index=True)
    return merged

In [42]:
def add_lagging_data(df, lag_weeks=0, lag_days=0, lag_intervals=0, ref_column='Load',
                     intervals_per_day=96):
    """
    Returns a dataframe with lagging feature data equal to specified number of prior periods.

    Columns are appended in this order:
      1. 'Load_1_day_lag'            -- ref_column shifted by one day;
      2. 'Load_24_hour_lag+<k>'      -- one day plus k intervals, k = 1..lag_intervals;
      3. 'Load_<d>_day_lag'          -- d = 2..7*lag_weeks;
      4. 'Load_<7*lag_weeks+d>_day_lag' -- d = 1..lag_days.
    The 'Load_' prefix is fixed regardless of ref_column.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.
    lag_weeks, lag_days, lag_intervals : int
        Number of whole weeks, extra days and extra intervals to lag.
    ref_column : str, default 'Load'
        Column whose values are shifted.
    intervals_per_day : int, default 96
        Rows per day (96 for 15-minute data).

    Returns
    -------
    pd.DataFrame
        New frame with the lag columns appended and the leading rows removed
        for which any lag is undefined, i.e. as many rows as the largest shift.
        With lag_weeks == 0 and lag_days >= 1 the day-1 lag of step 4 equals
        'Load_1_day_lag' and is emitted only once.
    """
    reference = df[ref_column]

    # (column name, shift in rows) in output order.
    lag_specs = [('Load_1_day_lag', intervals_per_day)]
    lag_specs += [(f'Load_24_hour_lag+{interval}', intervals_per_day + interval)
                  for interval in range(1, lag_intervals + 1)]
    lag_specs += [(f'Load_{day}_day_lag', intervals_per_day * day)
                  for day in range(2, lag_weeks * 7 + 1)]
    lag_specs += [(f'Load_{lag_weeks * 7 + day}_day_lag',
                   intervals_per_day * (lag_weeks * 7 + day))
                  for day in range(1, lag_days + 1)]

    # Raw arrays avoid index alignment; setdefault skips a repeated name.
    lag_columns = {}
    for column_name, shift in lag_specs:
        lag_columns.setdefault(column_name, reference.shift(shift).to_numpy())

    lagged = pd.concat([df, pd.DataFrame(lag_columns, index=df.index)], axis=1)

    # Every row before the largest shift has at least one NaN lag. The interval
    # lags can exceed the day lags, so take the maximum over all of them.
    rows_to_drop = max(shift for _, shift in lag_specs)

    return lagged.iloc[rows_to_drop:].copy()

In [57]:
# Helper function for splitting X_train, X_valid
def prep_X_y(X_train, X_valid=None, ref_column='Load'):
    """
    Splits X_train, X_valid into X_train, y_train, X_valid, y_valid.

    Parameters
    ----------
    X_train : pd.DataFrame
        Training frame that still contains the target column.
    X_valid : pd.DataFrame, optional
        Validation frame that still contains the target column.
    ref_column : str, default 'Load'
        Name of the target column.

    Returns
    -------
    tuple
        (X_train, y_train) when X_valid is None, otherwise
        (X_train, y_train, X_valid, y_valid). The returned X frames are new
        objects without ref_column; the frames passed in are not modified, so
        the function can be called again on the same inputs.
    """
    y_train = X_train[ref_column]
    # drop() without inplace returns a new frame and leaves the input intact.
    X_train = X_train.drop(columns=ref_column)

    if X_valid is None:
        return X_train, y_train

    y_valid = X_valid[ref_column]
    X_valid = X_valid.drop(columns=ref_column)
    return X_train, y_train, X_valid, y_valid

In [32]:
def plot_results(y_true, y_pred, figsize=(12, 6), title=None):
    """
    Plots y_true vs y_pred, and delta graph.

    The upper panel shows both series, the lower panel shows y_pred - y_true.
    The y-limits of both panels are rounded outwards to multiples of 1000.
    The mean absolute error is printed.

    Parameters
    ----------
    y_true : pd.Series
        Actual values.
    y_pred : pd.Series or array-like
        Predicted values. A plain array-like (for example a NumPy array) is
        wrapped in a Series that reuses y_true's index.
    figsize : tuple, default (12, 6)
        Figure size passed to plt.subplots.
    title : str, optional
        Title of the upper panel.
    """
    # Arrays have no .plot and no index to align on; give them y_true's index.
    if not isinstance(y_pred, (pd.Series, pd.DataFrame)):
        y_pred = pd.Series(np.asarray(y_pred).ravel(), index=y_true.index)

    fig, (ax_load, ax_delta) = plt.subplots(2, 1, figsize=figsize, sharex=True)

    y_true.plot(ax=ax_load, label='Actual')
    y_pred.plot(ax=ax_load, label='Predicted')
    ax_load.legend()
    ax_load.set_ylabel('Load (MW)')
    ax_load.grid()
    # Round the limits outwards to the enclosing multiples of 1000.
    ax_load.set_ylim(y_true.min()//1000*1000, (y_true.max()//1000+1)*1000)
    ax_load.set_title(title)

    delta = y_pred - y_true
    delta.plot(ax=ax_delta)
    ax_delta.axhline(0)
    ax_delta.set_ylabel('Delta (MW)')
    ax_delta.grid()
    ax_delta.set_ylim(delta.min()//1000*1000, (delta.max()//1000+1)*1000)
    # Clear the x-axis label on the bottom panel.
    ax_delta.set_xlabel(None)

    print(f'MAE: {mean_absolute_error(y_true, y_pred):.2f}')