In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import time
from datetime import datetime

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

import scipy.stats as stats

import properscoring as prscore

from sklearn.utils import resample
from sklearn.tree import DecisionTreeRegressor

## Read and preprocess the dataset

In [None]:
# Load the raw measurements. Every column other than 'date' and 'Power'
# (e.g. weather variables) is later used as an input feature.
df = pd.read_csv('power_weather_data.csv')

# The CSV MUST contain the 'date' and 'Power' fields; weather data is optional.
# Check it here so a malformed file fails with a clear message instead of a
# KeyError several cells further down.
assert {'date', 'Power'} <= set(df.columns), \
    "power_weather_data.csv must contain 'date' and 'Power' columns"

In [None]:
# Turn the 'date' strings (month/day/year hour:minute) into pandas timestamps.
df = df.assign(date=pd.to_datetime(df['date'], format='%m/%d/%Y %H:%M'))

In [None]:
# Calendar features derived from the timestamp: hour of day and month of year.
# The vectorised `.dt` accessor replaces a per-row Python lambda; it requires
# 'date' to already be a datetime column.
df['hour'] = df['date'].dt.hour
df['month'] = df['date'].dt.month

In [None]:
# Autoregressive inputs: the three previous 'Power' readings of every row.
# Column names suggest a 15-minute sampling step (t-15 = one row back).
P = df['Power']

PowerData = pd.concat([P.shift(3), P.shift(2), P.shift(1)], axis=1)
PowerData.columns = ['t-45', 't-30', 't-15']

# Drop lag columns left behind by a previous execution of this cell before
# attaching the fresh ones; without this, re-running the cell would append
# duplicate 't-45'/'t-30'/'t-15' columns. The lag columns stay the last three
# columns of the frame, which the recursive forecast relies on.
df = pd.concat(
    [df.drop(columns=PowerData.columns, errors='ignore'), PowerData],
    axis=1,
)

# Every missing value becomes 0: this covers the lags of the first three rows
# (which have no history) and also any gap in the other columns.
df = df.fillna(0)

## Time horizons

In [None]:
# Evaluation windows as [first_day, last_day] pairs in 'YYYY-MM-DD' form.
# One data set (train + test) is built per entry.
weeks = [
    ['2018-03-01', '2019-03-15'],
]

# Number of days at the end of each window that are held out as the test set.
val_days = 14

# Samples per day: four per hour over 24 hours.
n_points_day = 24 * 4

## Set the dataframes

In [None]:
# Cut the full table into one DataFrame per [first_day, last_day] entry of `weeks`.
dfs = []

for w in weeks:

    # Whole calendar days: 00:00 of the first day up to 23:59 of the last day.
    w_start = datetime.strptime(w[0] + " 00:00", '%Y-%m-%d %H:%M')
    w_end = datetime.strptime(w[1] + " 23:59", '%Y-%m-%d %H:%M')

    # Inclusive bounds: a strict '>' would silently drop the 00:00 sample of
    # the first day.
    in_window = (df['date'] >= w_start) & (df['date'] <= w_end)

    # An empty window would only surface later as an obscure scaler error.
    if not in_window.any():
        raise ValueError(f"No rows of df fall between {w[0]} and {w[1]}")

    dfs.append(df[in_window])

n_sets = len(dfs)

## Train Test Split

In [None]:
# Per-window containers: scaled features/targets, the fitted scalers (needed
# later to map predictions back to original units) and the matching timestamps.
X_train_, X_test_ = [], []
y_train_, y_test_ = [], []

x_scaler, y_scaler = [], []

t_train, t_test = [], []

# Length of the held-out tail of every window, in samples.
n_holdout = int(n_points_day * val_days)

for frame in dfs:

    # Chronological split: everything but the last `n_holdout` rows trains,
    # the last `n_holdout` rows test.
    train = frame.iloc[:-n_holdout]
    test = frame.iloc[-n_holdout:]

    # Features are all columns except the target and the timestamp.
    feats_train = train.drop(columns=['Power', 'date']).values
    feats_test = test.drop(columns=['Power', 'date']).values

    # Scalers expect 2-D input, hence the single-column reshape of the target.
    target_train = train['Power'].values.reshape(-1, 1)
    target_test = test['Power'].values.reshape(-1, 1)

    # Both scalers learn their ranges from the training part only.
    x_sc = MinMaxScaler()
    y_sc = MinMaxScaler()

    X_train_.append(x_sc.fit_transform(feats_train))
    X_test_.append(x_sc.transform(feats_test))
    y_train_.append(y_sc.fit_transform(target_train))
    y_test_.append(y_sc.transform(target_test))

    x_scaler.append(x_sc)
    y_scaler.append(y_sc)

    t_train.append(train['date'].values)
    t_test.append(test['date'].values)

## Bootstrapping

In [None]:
# Nominal coverage of the central interval extracted from the bootstrap ensemble.
alpha = 0.95

# Percentile levels (0-100 scale) passed to np.percentile.
# p_mean is the 50th percentile, i.e. the ensemble median used as point forecast.
p_mean = 50
# The mass outside the interval, (1 - alpha), is split evenly between both tails.
p_lower = 100 * ((1.0 - alpha) / 2.0)
p_upper = 100 * (alpha + ((1.0 - alpha) / 2.0))

In [None]:
# Bootstrap ensemble of shallow regression trees.
# For every data set, `n_iterations` trees are each fitted on a random resample
# (with replacement, half the training size) and used for a recursive
# multi-step forecast over the test period. The spread of the ensemble at each
# test step gives the point forecast (p_mean percentile) and the interval
# bounds (p_lower / p_upper percentiles).

n_iterations = 1000  # bootstrap replicates per data set

# One seeded generator drives both the resampling and the trees, so that
# Restart & Run All reproduces the same numbers.
bootstrap_rng = np.random.RandomState(0)

mean_all = []
lower_all = []
upper_all = []

start = time.time()

for i in range(n_sets):

    X_train_i = X_train_[i]
    # 1-D target: avoids column-vector handling inside the estimator.
    y_train_i = y_train_[i].ravel()
    # Work on a copy: the recursion below overwrites the lag columns and must
    # not corrupt the shared X_test_[i].
    X_test_i = X_test_[i].copy()

    n_train_examples = X_train_i.shape[0]
    n_test_examples = X_test_i.shape[0]
    n_size = int(n_train_examples * 0.50)

    # Predictions in original units: [n_test_points x bootstrap_iterations].
    predictions_all = np.zeros((n_test_examples, n_iterations))

    for b in range(n_iterations):

        # Bootstrap resample of the training set
        x_train_res, y_train_res = resample(
            X_train_i, y_train_i, n_samples=n_size, random_state=bootstrap_rng
        )

        # Fit model
        model = DecisionTreeRegressor(max_depth=3, random_state=bootstrap_rng)
        model.fit(x_train_res, y_train_res)

        # Recursive multi-step forecast. The first three test rows still carry
        # observed lags; from row 3 on, the last three feature columns
        # ('t-45', 't-30', 't-15') are replaced by the model's own three
        # previous predictions.
        # NOTE(review): the fed-back values are in y_scaler's space while the
        # lag columns were scaled by x_scaler; this is only consistent if both
        # learned (nearly) the same min/max for power - confirm.
        scaled_preds = np.empty(n_test_examples)
        scaled_preds[:3] = model.predict(X_test_i[:3])
        for j in range(3, n_test_examples):
            X_test_i[j, -3:] = scaled_preds[j - 3:j]
            # predict() returns a length-1 array; take the scalar explicitly.
            scaled_preds[j] = model.predict(X_test_i[j].reshape(1, -1))[0]
        # end of multi-step ahead

        # Back to original units; the scaler was fitted on a single column.
        predictions_all[:, b] = (
            y_scaler[i].inverse_transform(scaled_preds.reshape(-1, 1)).ravel()
        )

    # Ensemble percentiles for each time step, kept as column vectors.
    mean, lower, upper = (
        np.percentile(predictions_all, q, axis=1).reshape(-1, 1)
        for q in (p_mean, p_lower, p_upper)
    )

    mean_all.append(mean)
    lower_all.append(lower)
    upper_all.append(upper)

end = time.time()
# Total wall-clock time of the bootstrap, in seconds.
print((end-start))

## Evaluation

In [None]:
def PICP_func(y, lower, upper):
    """Prediction Interval Coverage Probability (PICP).

    Fraction of observations that lie inside their interval, with both
    bounds counted as inside.

    Parameters
    ----------
    y, lower, upper : array-like
        Observations and the matching lower/upper interval bounds. Each is
        flattened, so 1-D sequences and column vectors are both accepted;
        all three must hold the same number of values.

    Returns
    -------
    float
        Coverage in [0, 1].

    Raises
    ------
    ValueError
        If the inputs are empty or differ in size.
    """
    y = np.asarray(y).ravel()
    lower = np.asarray(lower).ravel()
    upper = np.asarray(upper).ravel()

    if y.size == 0:
        raise ValueError("PICP is undefined for empty input")
    if not (y.size == lower.size == upper.size):
        raise ValueError("y, lower and upper must have the same number of elements")

    # Vectorised inclusive test; NaNs compare False and count as not covered.
    covered = (lower <= y) & (y <= upper)
    return float(covered.mean())

def PINAW_func(y, lower, upper):
    """Prediction Interval Normalized Average Width (PINAW).

    Mean interval width divided by the range of the observations.

    Parameters
    ----------
    y : array-like
        Observations; only their max and min are used.
    lower, upper : array-like
        Interval bounds, shapes compatible for element-wise subtraction.

    Returns
    -------
    numpy.float64
        mean(upper - lower) / (max(y) - min(y)). A constant `y` gives a
        zero range; that case is not guarded and follows NumPy's
        division-by-zero behaviour.
    """
    # Float arrays: plain lists become subtractable and unsigned integer
    # inputs cannot wrap around in the width.
    y = np.asarray(y, dtype=float)
    lower = np.asarray(lower, dtype=float)
    upper = np.asarray(upper, dtype=float)

    PIAW = np.mean(upper - lower)
    R = np.max(y) - np.min(y)
    PINAW = PIAW / R

    return PINAW

In [None]:
# Per-data-set series in original units, kept for plotting.
y = []
y_hat = []
upper_hat = []
lower_hat = []

# Per-data-set scores.
MAE_all = []
RMSE_all = []
MBE_all = []
CRPS_all = []

for i in range(n_sets):

    # Scaled test targets come from the split cell's `y_test_` list.
    y_test_i = y_test_[i]

    # Bootstrap outputs are column vectors; the metrics need 1-D arrays.
    mean = mean_all[i].flatten()
    lower = lower_all[i].flatten()
    upper = upper_all[i].flatten()

    # Observations back in original units (forecasts already are).
    real_y_test = y_scaler[i].inverse_transform(y_test_i).flatten()

    y_hat.append(mean)
    y.append(real_y_test)
    lower_hat.append(lower)
    upper_hat.append(upper)

    # Deterministic metrics
    MAE = mean_absolute_error(real_y_test, mean)
    # sqrt of the MSE instead of `squared=False`, which recent scikit-learn
    # releases deprecated and then removed.
    RMSE = np.sqrt(mean_squared_error(real_y_test, mean))
    MBE = np.mean(mean - real_y_test)  # positive = over-forecast on average
    print(f'MAE: {MAE:.3f}')
    print(f'RMSE: {RMSE:.3f}')
    print(f'MBE: {MBE:.3f}')

    # Probabilistic metrics
    PICP = PICP_func(real_y_test, lower, upper)
    PINAW = PINAW_func(real_y_test, lower, upper)
    # Gaussian CRPS with sigma = interval width / 4 (presumably a ~2-sigma
    # half-width approximation); the 0.001 offset keeps sigma strictly
    # positive where the interval has zero width.
    C = prscore.crps_gaussian(real_y_test, mu=mean, sig=(((upper-lower)/4)+0.001))
    CRPS = C.mean()
    print(f'PICP: {PICP:.3f}')
    print(f'PINAW: {PINAW:.3f}')
    print(f'CRPS: {CRPS:.3f}')
    print('\n')

    # Record the scores so they can be compared across data sets afterwards.
    MAE_all.append(MAE)
    RMSE_all.append(RMSE)
    MBE_all.append(MBE)
    CRPS_all.append(CRPS)

## Plotting

In [None]:
# One week of the test period: observations, point forecast and interval band.
fig, ax = plt.subplots(figsize=(20, 6))

i = 0  # index of the data set to plot

k = 0  # first day of the plotted week, counted from the start of the test period
# A slice (rather than an explicit index list) simply yields fewer points if
# less than a full week of test data is available.
j = slice(n_points_day * k, n_points_day * (k + 7))  # indices for one week

x = list(range(len(y[i][j])))

# Observations as a black line, point forecast as a blue line.
ax.plot(x, y[i][j], 'k', label='Observations')
ax.plot(x, y_hat[i][j], 'b-', label='Forecast')

# Shaded band between the lower and upper bootstrap percentiles.
ax.fill_between(x, lower_hat[i][j], upper_hat[i][j], alpha=0.5, fc='b', ec='None',
                label='Prediction interval')

ax.set_xlabel('Time')
ax.set_ylabel('Power')
ax.legend();