In [368]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import sklearn as sk
import statsmodels.api as sm
import pmdarima as pm
import warnings
from pmdarima.model_selection import train_test_split
from darts import utils
from darts.models import RegressionModel
from darts.models import BlockRNNModel
from darts.models import NaiveSeasonal
from darts.dataprocessing.transformers import Scaler
from darts import TimeSeries
from darts.metrics import mape
from darts.metrics import rmse

In [None]:
#Folder that holds the generated CSV files; edit if necessary.
#Keep the trailing slash: file names are appended by plain string concatenation.
path = "./data/Generated Data/"

#Strategy-index aggregated files
gas_strat = 'Strat_competitor_market_prices_gas.csv'
power_strat = 'Strat_competitor_market_prices_power.csv'

#Master files
master_gas = 'master_competitor_market_prices_gas.csv'
master_power = 'master_competitor_market_prices_power.csv'

#Price files
power_prices = 'power_prices.csv'
gas_prices = 'gas_prices.csv'


In [None]:
#Reading in relevant datasets, one DataFrame per CSV file
def _load_generated_csv(file_name):
    """Read one CSV file located in the folder given by `path`."""
    return pd.read_csv(path + file_name)

#Master tables
df_power = _load_generated_csv(master_power)
df_gas = _load_generated_csv(master_gas)

#Strategy-index aggregates
df_power_strat = _load_generated_csv(power_strat)
df_gas_strat = _load_generated_csv(gas_strat)

#Price series
df_power_prices = _load_generated_csv(power_prices)
df_gas_prices = _load_generated_csv(gas_prices)

#Last expression of the cell: shows the power strategy table as cell output
df_power_strat

In [None]:
#Setting right indices for price data.
#The index-name check makes this cell safe to run more than once: after the
#first run the date column lives in the index, so an unconditional set_index
#would raise a KeyError. A missing column on the first run still fails loudly.
if df_power_prices.index.name != 'Dates':
    df_power_prices = df_power_prices.set_index('Dates', drop = True)
if df_gas_prices.index.name != 'date':
    df_gas_prices = df_gas_prices.set_index('date', drop = True)

#Datetime conversion (a no-op when the index already holds datetimes)
df_power_prices.index = pd.to_datetime(df_power_prices.index)
df_gas_prices.index = pd.to_datetime(df_gas_prices.index)


In [None]:
def querySingleTariffRange(rank = (1,5), post_code = 81737, consumption_range = 2500, df_power = df_power) -> pd.DataFrame:
    """Return the mean `price_kwh` per day for one tariff segment.

    Rows of `df_power` are kept when their `rank` lies between `rank[0]` and
    `rank[1]` (both bounds included) and their `post_code` and
    `consumption_range_kwh` equal the given values. Every kept row is expanded
    to one row per date produced by `pd.date_range(date_valid_from, date_valid_to)`
    and `price_kwh` is then averaged per date.

    Parameters
    ----------
    rank : sequence
        Lower and upper rank bound, read as `rank[0]` and `rank[1]`.
    post_code
        Value compared against the `post_code` column.
    consumption_range
        Value compared against the `consumption_range_kwh` column.
    df_power : pd.DataFrame
        Tariff table to query. The default is the module-level frame as it
        existed when this function was defined.

    Returns
    -------
    pd.DataFrame
        Frame indexed by `valid_range` with the single column `price_kwh`.
        When no row matches, the empty selection (with the columns of
        `df_power`) is returned as is.
    """
    rank_low, rank_high = rank[0], rank[1]
    within_rank = (df_power['rank'] >= rank_low) & (df_power['rank'] <= rank_high)
    same_postcode = df_power.post_code == post_code
    same_consumption = df_power.consumption_range_kwh == consumption_range
    selection = df_power.loc[within_rank & same_postcode & same_consumption].copy()

    #Nothing matched: hand back the empty selection untouched
    if selection.shape[0] == 0:
        return selection

    def _valid_days(row):
        #All dates on which this tariff row was valid
        return pd.date_range(row["date_valid_from"], row["date_valid_to"])

    #One row per (tariff, date)
    selection["valid_range"] = selection.apply(_valid_days, axis=1)
    per_day = selection.explode("valid_range").copy()

    #Mean household price per date
    return per_day.groupby('valid_range').agg({'price_kwh':'mean'})

In [None]:
def eval_linear_model(model, target, covariates, days = 1, train_size = 0.8):
    """Backtest `model` on `target` and return the resulting MAPE.

    The model is first fitted on the part of `target` that lies before
    `train_size` (as split by `target.split_before`). Historical forecasts are
    then produced from `train_size` onwards with `retrain=True` and a forecast
    horizon of `days`, and scored against `target`.

    Parameters
    ----------
    model
        Object offering `fit` and `historical_forecasts`.
    target
        Series to forecast.
    covariates
        Series passed to the model as `past_covariates`.
    days : int
        Forecast horizon handed to `historical_forecasts`.
    train_size
        Split point, passed to `split_before` and as backtest `start`.

    Returns
    -------
    The value of `mape(target, backtest)`.
    """
    #Initial fit on the leading part of the series only
    train_part = target.split_before(train_size)[0]
    model.fit(train_part, past_covariates=covariates)

    #Backtest on the remaining part of the series
    backtest_options = dict(
        series=target,
        past_covariates=covariates,
        start=train_size,
        retrain=True,
        verbose=False,
        forecast_horizon=days,
    )
    backtest = model.historical_forecasts(**backtest_options)
    return mape(target, backtest)


def evaluate_models(target, covariates, lags, lags_covs, days = 1, train_size = 0.80):
    """Grid-search the lag settings of a `RegressionModel` by backtest MAPE.

    Every combination of `lags` x `lags_covs` is evaluated with
    `eval_linear_model`; the combination with the lowest score wins. A
    combination whose evaluation raises is reported and skipped, so a single
    bad configuration does not abort the search.

    Parameters
    ----------
    target
        Series to forecast.
    covariates
        Series used as past covariates.
    lags : iterable
        Candidate values for the `lags` argument of `RegressionModel`.
    lags_covs : iterable
        Candidate values for `lags_past_covariates`.
    days : int
        Forecast horizon / output chunk length; values below 1 are raised to 1.
    train_size
        Split point forwarded to `eval_linear_model`.

    Returns
    -------
    tuple or None
        `(lags, lags_past_covariates)` of the best configuration, or None when
        the grid is empty or no configuration could be evaluated.
    """
    days = max(days, 1)
    best_score, best_cfg = float("inf"), None
    for l in lags:
        for c in lags_covs:
            try:
                reg_model = RegressionModel(lags = l, lags_past_covariates= c, output_chunk_length=days)
                #Named `score` so the imported `mape` metric is not shadowed
                score = eval_linear_model(reg_model, target, covariates, days, train_size)
            except Exception as exc:
                #`Exception` (not a bare except) lets KeyboardInterrupt stop the search;
                #the reason is printed so failing configurations can be diagnosed
                print('error in model eval (lags=%s, lags_past_covariates=%s): %r' % (l, c, exc))
                continue
            if score < best_score:
                best_score, best_cfg = score, (l, c)
    print('Best ModelA%s MAPE=%.6f' % (best_cfg, best_score))
    return best_cfg

def scaleTimeSeries(timeseries):
    """Return `timeseries` transformed by a freshly fitted `Scaler`.

    The scaler is created inside the function and is not returned, so the
    caller has no way to invert the transformation afterwards.
    """
    return Scaler().fit_transform(timeseries)

In [378]:
def provideForecasts(postcode, consumption_range, rank_range, household_data, price_data, days):
    """Forecast the mean household price for the last `days` days of a tariff segment.

    The daily mean price of the selected tariffs is joined with `price_data`,
    the last `days` observations are held out, and three forecasters are fitted
    on the remaining data: a naive benchmark, a lagged linear regression (lags
    chosen by `evaluate_models`) and an LSTM. The forecasts for the held-out
    window are plotted on a figure of their own and scored with MAPE.

    Only the LSTM is trained on scaled data. The scalers are fitted on the
    training window alone and the LSTM forecast is mapped back, so every
    plotted, scored and returned value is in the units of `price_kwh`.

    Parameters
    ----------
    postcode, consumption_range, rank_range
        Segment selection, forwarded to `querySingleTariffRange`.
    household_data : pd.DataFrame
        Tariff table, forwarded to `querySingleTariffRange`.
    price_data : pd.DataFrame
        Frame joined on the date index; must provide an `avg_price` column.
    days : int
        Length of the hold-out window and forecast horizon (at least 1).

    Returns
    -------
    np.ndarray or None
        1-D array of length `2 * days`: the regression forecasts followed by
        the LSTM forecasts. None when the query matches no rows.

    Raises
    ------
    ValueError
        If `days` is below 1, if the series is not longer than `days`, or if
        no regression configuration could be evaluated.
    """
    if days < 1:
        raise ValueError('days must be at least 1')

    #Querying dataset
    df = querySingleTariffRange(rank_range, postcode, consumption_range, household_data)

    if not df.shape[0]:
        print('No data found.')
        return None

    #NOTE(review): a former "shift by 1 to avoid look-ahead bias" step was removed.
    #It shifted the frame and then restored price_kwh while price_kwh was still the
    #only column, so it changed nothing. The models receive the market prices as
    #past covariates with lags; confirm whether an additional one-day shift of the
    #market prices is wanted.

    #Merging price data
    df = df.join(price_data)

    #Adding exponentially weighted moving average as additional covariate
    df['moving_average'] = df.avg_price.ewm(alpha=0.1, adjust=False).mean()

    #Dropping rows with missing values (e.g. dates without price data)
    df = df.dropna()

    ##Building TimeSeries objects, and filling in missing date indices
    past_covs = utils.missing_values.fill_missing_values(TimeSeries.from_dataframe(df[['avg_price', 'moving_average']], fill_missing_dates= True))
    prices = utils.missing_values.fill_missing_values(TimeSeries.from_dataframe(df[['price_kwh']], fill_missing_dates= True))

    if len(prices) <= days:
        raise ValueError('series of length %d is too short for a %d day hold-out' % (len(prices), days))

    #Hold-out split, in original price units
    past_covs_train_raw = past_covs[:-days]
    prices_train_raw = prices[:-days]
    prices_valid = prices[-days:]

    #Scaled copies for the LSTM. The scalers are fitted on the training window only,
    #so the hold-out does not leak into training, and the price scaler is kept to
    #map the LSTM forecast back to price units.
    price_scaler, cov_scaler = Scaler(), Scaler()
    prices_train = price_scaler.fit_transform(prices_train_raw)
    past_covs_train = cov_scaler.fit_transform(past_covs_train_raw)

    #Defining hyperparam grid for linear regression model
    lags = [1,2,3,4,5]
    cov_lags = [1,2,3,4,5]

    #Finding optimal lags on unscaled data: MAPE divides by the actual values and
    #a min-max scaled series contains an exact zero
    best_cfg = evaluate_models(prices_train_raw, past_covs_train_raw, lags, cov_lags, days)
    if best_cfg is None:
        raise ValueError('no regression configuration could be evaluated')
    l, c = best_cfg

    #Defining Benchmark
    benchmark = NaiveSeasonal(K=1)
    benchmark.fit(prices_train_raw)
    bm = benchmark.predict(days)

    #Lagged regression, fitted in price units
    reg_model = RegressionModel(lags=l, lags_past_covariates=c, output_chunk_length=days)
    reg_model.fit(prices_train_raw, past_covariates=past_covs_train_raw)
    preds_reg = reg_model.predict(days, series = prices_train_raw, past_covariates= past_covs_train_raw)

    #LSTM, fitted on scaled data; its forecast is mapped back to price units
    rnn_model = BlockRNNModel(model="LSTM", n_epochs=100, random_state=42, input_chunk_length= 1, output_chunk_length=days)
    rnn_model.fit(prices_train, past_covariates=past_covs_train, verbose = False)
    preds_rnn_scaled = rnn_model.predict(days, series = prices_train, past_covariates= past_covs_train)
    preds_rnn = price_scaler.inverse_transform(preds_rnn_scaled)

    #One figure per call, otherwise repeated calls draw onto the same axes
    plt.figure()
    prices_valid.plot(label = 'timeseries')
    bm.plot(label = 'benchmark')
    preds_rnn.plot(label = 'LSTM predictions')
    preds_reg.plot(label = 'Regression predictions')
    plt.title('Postcode %s, consumption %s, ranks %s' % (postcode, consumption_range, rank_range))
    plt.show()

    #mape expects the actual series first and the prediction second
    print('LSTM MAPE: ', mape(prices_valid, preds_rnn))
    print('Regression MAPE: ', mape(prices_valid, preds_reg))
    print('Benchmark MAPE: ', mape(prices_valid, bm))

    #Regression forecasts followed by LSTM forecasts, flattened to one dimension
    return np.concatenate([preds_reg.values().ravel(), preds_rnn.values().ravel()])

In [None]:
#Silences every warning from here on (this also affects all later cells)
warnings.filterwarnings("ignore")

#Forecast horizon in days; it also fixes the number of forecast columns
forecast_days = 14

#Column layout of the forecast table: regression forecasts first, then the LSTM forecasts
forecast_columns = (
    ['%d_linreg' % day for day in range(1, forecast_days + 1)]
    + ['%d_RNN' % day for day in range(1, forecast_days + 1)]
)

#Defining postcodes
gas_postcodes = df_gas.post_code.unique()[:10]
pow_postcodes = df_power.post_code.unique()[:10]

#Consumption ranges
consumption_ranges = [2500, 3000, 10000]

#Rank ranges
ranks = [(1,5), (10, 15), (20, 25)]

#Rows are collected in plain lists and turned into frames once at the end.
#A segment and its forecasts are always appended together, so the two tables
#cannot get out of step.
master_rows = []
forecast_rows = []

for postcode in gas_postcodes:
    for consumption in consumption_ranges:
        for rank_range in ranks:
            print(postcode, consumption, rank_range)
            try:
                forecast = provideForecasts(postcode, consumption, rank_range, df_gas, df_gas_prices, forecast_days)
            except Exception as exc:
                #Best effort: report the failing segment and carry on with the next one.
                #KeyboardInterrupt is not an Exception, so the loop can still be interrupted.
                print('Forecast failed for', postcode, consumption, rank_range, '->', repr(exc))
                continue

            if forecast is None:
                #No data for this segment: keep the segment with empty forecasts
                forecast = np.full(len(forecast_columns), np.nan)
            elif len(forecast) != len(forecast_columns):
                print('Unexpected forecast length for', postcode, consumption, rank_range)
                continue

            master_rows.append({'Postcodes': postcode, 'Consumption Range': consumption, 'Rank range': rank_range})
            forecast_rows.append(forecast)

df_master = pd.DataFrame(master_rows, columns=['Postcodes', 'Consumption Range', 'Rank range'])
df_res = pd.DataFrame(forecast_rows, columns=forecast_columns)

predictions = pd.concat([df_master, df_res], axis = 1)


In [345]:
#Writing the combined table into the data folder, without the row index
predictions_file = path + 'Predictions.csv'
predictions.to_csv(predictions_file, index=False)