In [28]:
import pandas as pd
import numpy as np

import statsmodels.api as sm
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Lasso
from scipy.stats import norm

from energy_consumption.help_functions import get_energy_data, dummy_mapping, handle_outstanding_dp, get_forecast_timestamps, create_submission_frame

In [4]:
# Load the historical data and pass it through the project's dummy mapping.
energydata = dummy_mapping.get_mappings(get_energy_data.get_data())

# Regression inputs: the target column on one side, every remaining column
# plus an explicitly added constant column on the other.
y_ec = energydata['energy_consumption']
X_ec = sm.add_constant(
    energydata.drop(columns=['energy_consumption']),
    has_constant="add",
)

# Expand the regressors with interaction-only polynomial terms
# (no bias column is generated by the transformer itself).
poly_input = PolynomialFeatures(interaction_only=True, include_bias=False)
X_interaction = poly_input.fit_transform(X_ec)

100%|██████████| 102/102 [01:05<00:00,  1.56it/s]


In [6]:
# Fit a Lasso regression (alpha=0.001) on the interaction-expanded regressors.
# `fit` returns the estimator itself, so construction and fitting are chained.
lasso = Lasso(alpha=0.001).fit(X_interaction, y_ec)

In [10]:
# Build the forecast design matrix: timestamps following the last observed
# index entry, the same dummy mapping as the training data, and an added
# constant column.
energyforecast = dummy_mapping.get_mappings(
    get_forecast_timestamps.forecast_timestamps(energydata.index[-1])
)
X_fc = sm.add_constant(energyforecast, has_constant='add')

# Same interaction-only expansion that was applied to the training regressors.
poly_forecast = PolynomialFeatures(interaction_only=True, include_bias=False)
X_fc_interaction = poly_forecast.fit_transform(X_fc)

# Predict every forecast row, then keep only the rows that get reported.
# NOTE(review): positions 47..79 look like the horizons 48..80 minus one
# (zero-based rows) - confirm against the submission spec.
y_predictions = lasso.predict(X_fc_interaction)
indexes = [47, 51, 55, 71, 75, 79]
selected_forecasts = y_predictions[indexes]
selected_forecasts

array([60.6483951 , 57.62034883, 55.18695189, 60.6483951 , 57.62034883,
       55.18695189])

In [40]:
# Estimate forecast standard deviations

from sklearn.metrics import mean_squared_error
from math import sqrt

# Mean squared error of the fitted model on the data it was trained on,
# used below as the model variance estimate.
# NOTE(review): this is an in-sample error, which may understate the
# out-of-sample forecast uncertainty - confirm this is intended.
historical_values = y_ec
predicitions_historical = lasso.predict(X_interaction)
model_variance_est = mean_squared_error(
    y_true=historical_values,
    y_pred=predicitions_historical,
)

def estimate_forecast_std(model_variance, horizon, period=24, ndigits=1):
    """Scale a model standard deviation with the square root of the horizon.

    Computes ``sqrt(model_variance) * sqrt(horizon / period)``, where the
    ratio ``horizon / period`` is rounded to ``ndigits`` decimals first.
    With the defaults this reproduces the original behaviour (division by 24,
    rounding to one decimal place).

    Parameters
    ----------
    model_variance : float
        Variance estimate of the model error (must be non-negative).
    horizon : float
        Forecast horizon, in the same unit as ``period``.
    period : float, default 24
        Length of one scaling step; the horizon is divided by this value.
    ndigits : int or None, default 1
        Decimals the ratio is rounded to before taking the square root.
        Pass ``None`` to skip rounding and avoid the loss of precision
        (e.g. 52 / 24 is otherwise treated as 2.2 instead of 2.1667).

    Returns
    -------
    float
        The scaled standard deviation.

    Raises
    ------
    ValueError
        If ``period`` is not positive, or (raised by ``math.sqrt``) if
        ``model_variance`` is negative.
    """
    if period <= 0:
        raise ValueError("period must be positive")

    steps = horizon / period
    if ndigits is not None:
        # round(x, None) would return an int, so rounding is applied only
        # when a number of decimals was actually requested.
        steps = round(steps, ndigits)

    return sqrt(model_variance) * sqrt(steps)

# One standard deviation per reported horizon, collected into a single array.
forecast_std = np.array(
    [
        estimate_forecast_std(model_variance_est, h)
        for h in (48, 52, 56, 72, 76, 80)
    ]
)
forecast_std

array([ 8.94702537,  9.38371938,  9.59461581, 10.95782344, 11.31719138,
       11.49266218])

In [41]:
# Quantile levels to report; each column is labelled "q<level>".
quantiles = [0.025, 0.25, 0.5, 0.75, 0.975]
column_names = [f'q{level}' for level in quantiles]

# Empty (all-NaN) five-row frame with one column per quantile level.
quantile_df = pd.DataFrame(
    data=np.nan,
    index=range(5),
    columns=column_names,
)
quantile_df

Unnamed: 0,q0.025,q0.25,q0.5,q0.75,q0.975
0,,,,,
1,,,,,
2,,,,,
3,,,,,
4,,,,,


In [44]:
def get_quantiles(mean_est, std_est, quantiles=(0.025, 0.25, 0.5, 0.75, 0.975)):
    """Turn point forecasts and standard deviations into Gaussian quantiles.

    Entry ``(i, q)`` of the result is
    ``mean_est[i] + std_est[i] * norm.ppf(q)``, i.e. the ``q``-quantile of a
    normal distribution centred on the i-th point forecast.

    The number of rows follows the length of the inputs; it is no longer
    fixed at six, so shorter inputs do not raise ``IndexError`` and longer
    inputs are not silently truncated.

    Parameters
    ----------
    mean_est : array-like of float, 1-D
        Point forecasts, one per row of the result.
    std_est : array-like of float, 1-D
        Forecast standard deviations; must have the same length as
        ``mean_est``.
    quantiles : sequence of float, optional
        Quantile levels; each becomes a column named ``f'q{level}'``.

    Returns
    -------
    pandas.DataFrame
        One row per forecast (default integer index starting at 0) and one
        float column per quantile level.

    Raises
    ------
    ValueError
        If the inputs are not one-dimensional or differ in length.
    """
    means = np.asarray(mean_est, dtype=float)
    stds = np.asarray(std_est, dtype=float)
    if means.ndim != 1 or stds.ndim != 1:
        raise ValueError("mean_est and std_est must be one-dimensional")
    if means.shape != stds.shape:
        raise ValueError(
            "mean_est and std_est must have the same length "
            f"(got {means.shape[0]} and {stds.shape[0]})"
        )

    column_names = [f'q{q}' for q in quantiles]
    # Standard-normal quantiles are identical for every row: compute once.
    z_scores = norm.ppf(quantiles, loc=0)

    # Broadcast (n, 1) against (k,) to get the full (n, k) table in one step.
    values = means[:, np.newaxis] + stds[:, np.newaxis] * z_scores
    return pd.DataFrame(values, columns=column_names)

# Compute the quantile table for the selected forecasts and hand it to the
# project's submission-frame helper.
forecast_quantiles = get_quantiles(selected_forecasts, forecast_std)
forecast_frame = create_submission_frame.get_frame(forecast_quantiles)

# Display only: show the frame without its 'index' column
# (forecast_frame itself is left unchanged).
forecast_frame.drop(columns=['index'])

Unnamed: 0,forecast_date,target,horizon,q0.025,q0.25,q0.5,q0.75,q0.975
0,2023-11-28,energy,36 hour,43.112548,54.613718,60.648395,66.683072,78.184243
1,2023-11-28,energy,40 hour,39.228597,51.291126,57.620349,63.949571,76.012101
2,2023-11-28,energy,44 hour,36.38185,48.715482,55.186952,61.658422,73.992053
3,2023-11-28,energy,60 hour,39.171456,53.257456,60.648395,68.039335,82.125334
4,2023-11-28,energy,64 hour,35.439061,49.987019,57.620349,65.253678,79.801636
5,2023-11-28,energy,68 hour,32.661748,47.435269,55.186952,62.938635,77.712156
