In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

In [2]:
def format_data(df):
    """
    Formats the raw load dataframe, removes duplicates and fills NaN values.

    The 'Day-ahead Total Load Forecast [MW] - BZN|NL' column is dropped and the
    two remaining columns are taken to be, in order, the time-interval string
    and the load. The start of each interval ('dd.mm.YYYY HH:MM - ...') becomes
    a DatetimeIndex named 'Date'. Rows are sorted by time, duplicated
    timestamps keep their first occurrence (in input order) and NaN values are
    forward-filled.

    The input frame is not modified, so the function can safely be re-run on
    the same raw data.

    Parameters
    ----------
    df : pd.DataFrame
        Raw frame containing the forecast column plus exactly two other
        columns (time interval, load).

    Returns
    -------
    pd.DataFrame
        New frame with a single 'Load' column indexed by interval start.

    Raises
    ------
    KeyError
        If the forecast column is missing.
    ValueError
        If other than two columns remain after dropping the forecast, or a
        timestamp does not match '%d.%m.%Y %H:%M'.
    """
    # drop() returns a new frame, so nothing below touches the caller's data.
    formatted = df.drop(columns='Day-ahead Total Load Forecast [MW] - BZN|NL')
    formatted.columns = ['Time', 'Load']

    # 'Time' holds "start - end"; only the interval start is used as timestamp.
    interval_start = formatted['Time'].str.split(' - ', expand=True)[0]
    timestamps = pd.to_datetime(interval_start, format='%d.%m.%Y %H:%M')

    formatted = formatted.drop(columns='Time')
    formatted.index = pd.DatetimeIndex(timestamps, name='Date')

    # A stable sort keeps rows that share a timestamp in input order, which
    # makes keep='first' below deterministic.
    formatted = formatted.sort_index(kind='mergesort')
    formatted = formatted[~formatted.index.duplicated(keep='first')]

    return formatted.ffill()

In [3]:
def add_weekday_holiday(df, holidays, add_weekday=False):
    """
    Adds a holiday indicator to a DataFrame. If specified, adds weekdays as well.

    Works on a copy of ``df``; the input frame is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a DatetimeIndex.
    holidays : list-like
        Holiday dates, in any form accepted by ``pd.to_datetime``.
    add_weekday : bool, default False
        When True, adds float 0/1 columns 'Tue' ... 'Sun'. Monday is the
        reference level and has no column of its own.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with the optional weekday columns followed by a float
        0/1 'Holiday' column.
    """
    result = df.copy()

    if add_weekday:
        # index.weekday numbers days Monday=0 ... Sunday=6. Each indicator is
        # built from its own day number, so the columns are labelled correctly
        # (and all six exist) even when some weekdays are absent from the data.
        weekday = result.index.weekday
        for number, name in enumerate(['Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'], start=1):
            result[name] = np.asarray(weekday == number, dtype=float)

    # Compare calendar dates only, so every timestamp within a holiday matches.
    result['Holiday'] = pd.to_datetime(result.index.date).isin(pd.to_datetime(holidays)).astype(float)

    return result

In [4]:
def plot_results(y_true, y_pred, figsize=(12, 6), title=None):
    """
    Plots y_true vs y_pred, and delta graph.

    Draws two stacked panels sharing the x axis: actual vs predicted load on
    top, and the difference (predicted - actual) below. Also prints the mean
    absolute error between the two series.

    Parameters
    ----------
    y_true, y_pred : pd.Series
        Actual and predicted values; both must support ``.plot(ax=...)``.
    figsize : tuple, default (12, 6)
        Figure size passed to ``plt.subplots``.
    title : str, optional
        Title of the top panel.
    """
    fig, (ax_values, ax_delta) = plt.subplots(2, 1, figsize=figsize, sharex=True)

    # Top panel: actual vs predicted.
    y_true.plot(ax=ax_values, label='Actual')
    y_pred.plot(ax=ax_values, label='Predicted')
    ax_values.legend()
    ax_values.set_ylabel('Load (MW)')
    ax_values.grid()
    # y-limits follow the actual series, widened outwards to multiples of 1000.
    ax_values.set_ylim(y_true.min() // 1000 * 1000, (y_true.max() // 1000 + 1) * 1000)
    ax_values.set_title(title)

    # Bottom panel: prediction error with a zero reference line.
    delta = y_pred - y_true
    delta.plot(ax=ax_delta)
    ax_delta.axhline(0)
    ax_delta.set_ylabel('Delta (MW)')
    ax_delta.grid()
    ax_delta.set_ylim(delta.min() // 1000 * 1000, (delta.max() // 1000 + 1) * 1000)
    # Clear the x label on the bottom axes explicitly rather than relying on
    # pyplot's notion of the "current" axes.
    ax_delta.set_xlabel('')

    print(f'MAE: {mean_absolute_error(y_true, y_pred):.2f}')

In [5]:
def clean_anomalies(df, ref_column='Load', threshold=0.2):
    """
    Cleans DataFrame by finding values with a relative decrease of more than
    `threshold` (default 20%) from the prior row, and setting them to the
    prior row's value.

    The prior value is taken by row position rather than by subtracting a
    fixed 15 minutes from the timestamp, so gaps in the index (or a different
    sampling frequency) do not raise a KeyError. Replacements are applied in
    order, so a run of consecutive flagged rows all receive the last
    unflagged value before the run.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.
    ref_column : str, default 'Load'
        Column to inspect and repair.
    threshold : float, default 0.2
        Minimum relative drop (as a fraction) for a row to be flagged.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with the flagged values of ``ref_column`` replaced.
    """
    df_copy = df.copy()
    series = df_copy[ref_column]

    # (current - previous) / previous for every row; the first row has no
    # predecessor, evaluates to NaN and is therefore never flagged.
    relative_change = series.diff() / series.shift()
    flagged = np.flatnonzero((relative_change < -threshold).to_numpy())

    values = series.to_numpy(copy=True)
    for pos in flagged:
        # Sequential on purpose: an already-repaired value propagates forward.
        values[pos] = values[pos - 1]
    df_copy[ref_column] = values

    return df_copy