In [None]:
!pip install darts

In [None]:
# import packages
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from darts import TimeSeries, concatenate

In [None]:
# Load the input table; later cells read its 'ds', 'y' and 'temperature_2m_rom' columns.
Y_df = pd.read_csv('df_load.csv')

In [None]:
# Parse the 'ds' timestamps and use them as the index (named 'datetime').
Y_df = (
    Y_df
    .assign(datetime=pd.to_datetime(Y_df['ds']))
    .drop(columns='ds')
    .set_index('datetime')
)

# Y_df is a DataFrame, so build the darts series with the DataFrame constructor.
series = TimeSeries.from_dataframe(Y_df)

# create calendar variables features
calendar_attributes = ('hour', 'dayofweek', 'month', 'quarter', 'day', 'year')
for attribute in calendar_attributes:
    series = series.add_datetime_attribute(attribute)
series = series.add_holidays(country_code='ITA')  # holidays

# The feature-engineering cell reads Y_df['hour'], Y_df['dayofweek'] and Y_df['month'],
# but the attributes above only live on the darts series. Mirror them onto Y_df from its
# DatetimeIndex, leaving any column that the CSV already provides untouched.
for attribute in calendar_attributes:
    if attribute not in Y_df.columns:
        Y_df[attribute] = getattr(Y_df.index, attribute)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Linear trend: 1, 2, ..., n in row order.
Y_df['linear_trend'] = range(1, len(Y_df) + 1)

# week*hour variable: product of the hour and day-of-week codes (one-hot encoded below).
Y_df['weho'] = Y_df['hour'] * Y_df['dayofweek']

# Perform one-hot encoding; drop_first removes the first level of each encoded variable.
data_encoded = pd.get_dummies(Y_df, columns=['dayofweek', 'hour', 'month', 'weho'], drop_first=True)

# polynomial terms for temperatures
temperature = data_encoded['temperature_2m_rom']
data_encoded['temperature_squared'] = temperature ** 2
data_encoded['temperature_cubed'] = temperature ** 3

# Interaction terms: every month dummy and every hour dummy multiplied by temperature.
# The dummy columns are collected BEFORE any interaction is added, so each interaction is
# created exactly once. (A second, copy-pasted hour loop used to re-match the freshly
# created 'hour_*_temperature' columns and silently add 'hour_*_temperature_temperature'
# regressors to the model.)
dummy_cols = (
    data_encoded.filter(like='month_').columns.tolist()
    + data_encoded.filter(like='hour_').columns.tolist()
)
# Build all interactions in one step instead of inserting columns one at a time.
interactions = data_encoded[dummy_cols].mul(temperature, axis=0).add_suffix('_temperature')
data_encoded = pd.concat([data_encoded, interactions], axis=1)

# Concatenate the features. The 'month_' / 'hour_' filters pick up both the dummies and
# their temperature interactions, because the interaction names contain the same prefix.
X = data_encoded[['linear_trend', 'temperature_2m_rom', 'temperature_squared', 'temperature_cubed'] +
                 data_encoded.filter(like='month_').columns.tolist() +
                 data_encoded.filter(like='hour_').columns.tolist() +
                 data_encoded.filter(like='weho_').columns.tolist()]

# Target first, regressors after: the layout the split cells rely on.
y_df = pd.concat([Y_df['y'], X], axis=1)

In [None]:
# Size of each hold-out window in rows (one year if the data is hourly -- TODO confirm).
holdout_rows = 365 * 24

# Row positions where the validation and test windows start, counted from the end.
test_index = len(y_df) - holdout_rows
validation_index = test_index - holdout_rows

# Chronological split: train | validation | test, plus train+validation for the final refit.
Y_train_df, Y_val_df, Y_test_df = (
    y_df.iloc[:validation_index],
    y_df.iloc[validation_index:test_index],
    y_df.iloc[test_index:],
)
Y_trainval_df = y_df.iloc[:test_index]

In [None]:
# Regressors: every column except the target 'y', for each split.
X_train, X_val, X_test, X_trainval = (
    split_df.drop(columns='y')
    for split_df in (Y_train_df, Y_val_df, Y_test_df, Y_trainval_df)
)

# Targets: the 'y' column of each split.
y_train, y_val, y_test, y_trainval = (
    split_df['y']
    for split_df in (Y_train_df, Y_val_df, Y_test_df, Y_trainval_df)
)

In [None]:
# Fit the regression model on the training set only.
reg_model = LinearRegression()
reg_model.fit(X_train, y_train)

# Predict the validation set and keep its index so predictions sit next to the actuals.
preds_val = reg_model.predict(X_val)
preds_df_val = pd.DataFrame({'prediction': preds_val}, index=Y_val_df.index)

# Two columns side by side: 'actual' (observed y) and 'prediction'.
validation_df = pd.concat([Y_val_df['y'].rename('actual'), preds_df_val], axis=1)

In [None]:
# Refit the regression model on training + validation data before scoring the test set.
reg_model = LinearRegression()
reg_model.fit(X_trainval, y_trainval)

# Predict the test set.
preds_test = reg_model.predict(X_test)

# The point forecast is stored as 'MLR_q2': that is the column the bootstrap cell reads
# (test_df['MLR_q2']) and it matches the 'MLR_q1' / 'MLR_q3' bounds added to test_df there.
# The previous name 'pred_q2' made that lookup fail with a KeyError.
preds_df_test = pd.DataFrame(preds_test, columns=['MLR_q2'], index=Y_test_df.index)
test_df = pd.concat([Y_test_df['y'], preds_df_test], axis=1)
test_df = test_df.rename(columns={'y': 'actual'})

In [None]:
# Function to calculate residuals
def calculate_residuals(predictions, true_values):
    """Return the residuals ``true_values - predictions``.

    A positive residual means the prediction was below the true value.
    Works on anything that supports subtraction (scalars, arrays, Series).
    """
    residuals = true_values - predictions
    return residuals

# Function to perform empirical bootstrap
def empirical_bootstrap(validation_df, test_predictions, num_bootstrap_samples, confidence_level, seed=42):
    """Build a central prediction interval by resampling validation residuals.

    Residuals (actual - prediction) from the validation set are drawn with
    replacement and added to the test predictions; the interval bounds are the
    per-position percentiles across all bootstrap draws.

    Parameters
    ----------
    validation_df : DataFrame with 'prediction' and 'actual' columns.
    test_predictions : 1-D array-like of point forecasts.
    num_bootstrap_samples : number of resampled residual vectors (>= 1).
    confidence_level : coverage of the central interval, in [0, 1].
    seed : seed of the private random generator (default 42).

    Returns
    -------
    (lower_bound, upper_bound) : two NumPy arrays, one value per test prediction.

    Raises
    ------
    ValueError
        If confidence_level is outside [0, 1], num_bootstrap_samples < 1,
        or the validation set yields no residuals.
    """
    if not 0 <= confidence_level <= 1:
        raise ValueError("confidence_level must be between 0 and 1")
    if num_bootstrap_samples < 1:
        raise ValueError("num_bootstrap_samples must be at least 1")

    # A private generator gives the same draws as seeding the global one,
    # without resetting NumPy's global random state for the rest of the notebook.
    rng = np.random.RandomState(seed)

    # compute residuals distribution on validation set
    validation_residuals = np.asarray(
        calculate_residuals(validation_df['prediction'], validation_df['actual'])
    )
    if validation_residuals.size == 0:
        raise ValueError("validation_df must contain at least one row")

    point_forecasts = np.asarray(test_predictions)
    num_samples = len(point_forecasts)

    # One row per bootstrap draw, one column per test prediction.
    bootstrap_predictions = np.empty((num_bootstrap_samples, num_samples), dtype=float)
    for draw in range(num_bootstrap_samples):
        bootstrap_residuals = rng.choice(validation_residuals, size=num_samples, replace=True)
        bootstrap_predictions[draw] = point_forecasts + bootstrap_residuals

    # Calculate confidence intervals: symmetric tails around the central mass.
    lower_quantile = (1 - confidence_level) / 2
    upper_quantile = 1 - lower_quantile
    lower_bound, upper_bound = np.percentile(
        bootstrap_predictions, [lower_quantile * 100, upper_quantile * 100], axis=0
    )

    return lower_bound, upper_bound

# Bootstrap settings: how many residual resamples to draw and the coverage of the interval.
num_bootstrap_samples = 1000
confidence_level = 0.80

# Point forecasts on the test set; the resampled validation residuals are added to these.
test_predictions = test_df['MLR_q2']

# Call the empirical_bootstrap function
lower_bound, upper_bound = empirical_bootstrap(
    validation_df,
    test_predictions,
    num_bootstrap_samples,
    confidence_level,
)

# With an 80% central interval the bounds are the 10th and 90th percentiles.
test_df['MLR_q1'], test_df['MLR_q3'] = lower_bound, upper_bound

In [None]:
# save predictions: writes test_df (index included, pandas' default) to a CSV in the working directory
test_df.to_csv('MLR_predictions.csv')