In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the training table (weather readings, electricity price and the
# `consumption` target) and preview its first rows.
train = pd.read_csv(filepath_or_buffer='data/train.csv')
train.head(n=5)

Unnamed: 0,time,temp,dwpt,rhum,prcp,snow,wdir,wspd,wpgt,pres,coco,el_price,consumption
0,2021-09-01 00:00:00+03:00,11.2,10.3,94.0,,,320.0,7.2,16.7,1012.6,2.0,0.09016,0.577
1,2021-09-01 01:00:00+03:00,10.7,9.6,93.0,,,320.0,7.2,13.0,1012.6,2.0,0.09251,0.594
2,2021-09-01 02:00:00+03:00,9.9,9.0,94.0,,,320.0,7.2,13.0,1012.2,2.0,0.0889,0.685
3,2021-09-01 03:00:00+03:00,10.0,8.4,90.0,,,330.0,7.2,13.0,1011.9,1.0,0.08735,1.016
4,2021-09-01 04:00:00+03:00,9.0,8.1,94.0,,,300.0,3.6,13.0,1011.4,2.0,0.08688,0.677


In [3]:
# Load the test table (same feature columns as the training table, without
# `consumption`) and preview its first rows.
test = pd.read_csv(filepath_or_buffer='data/test.csv')
test.head(n=5)

Unnamed: 0,time,temp,dwpt,rhum,prcp,snow,wdir,wspd,wpgt,pres,coco,el_price
0,2022-08-25 00:00:00+03:00,21.1,15.2,69.0,0.0,,340.0,9.0,9.3,1022.0,2.0,0.25533
1,2022-08-25 01:00:00+03:00,20.1,15.1,73.0,0.0,,30.0,6.0,14.8,1022.0,2.0,0.19492
2,2022-08-25 02:00:00+03:00,20.1,15.1,73.0,0.0,,320.0,7.0,13.0,1022.0,2.0,0.18853
3,2022-08-25 03:00:00+03:00,18.7,17.0,90.0,0.0,,0.0,4.0,11.1,1022.4,4.0,0.19947
4,2022-08-25 04:00:00+03:00,18.1,17.1,94.0,0.0,,280.0,7.0,11.1,1022.0,3.0,0.21192


In [4]:
# Per-column tally of missing (NaN) entries in the training frame.
train.isnull().sum(axis=0)

time              0
temp              0
dwpt              0
rhum              0
prcp           6433
snow           8473
wdir              0
wspd              0
wpgt              0
pres              0
coco            196
el_price          0
consumption       2
dtype: int64

In [5]:
def set_time_index(df):
  """Return a copy of ``df`` indexed by its ``time`` column in Europe/Tallinn time.

  The ``time`` column is parsed with ``pd.to_datetime(..., utc=True)``, converted
  to the ``Europe/Tallinn`` time zone and moved into the index (the column itself
  is dropped). The frame passed in is not modified; a new frame is returned.

  Parameters
  ----------
  df : pandas.DataFrame
    Frame with a ``time`` column holding values ``pd.to_datetime`` can parse.

  Returns
  -------
  pandas.DataFrame
    New frame whose index is a tz-aware ``DatetimeIndex`` named ``time``.

  Raises
  ------
  KeyError
    If ``df`` has no ``time`` column (e.g. it was already indexed by this
    function).
  """
  # [1444, 4972] - iloc, where time is changing
  # (presumably the positions of the local DST switches - TODO confirm)
  local_time = pd.to_datetime(df['time'], utc=True).dt.tz_convert('Europe/Tallinn')
  # `assign` builds a new frame, so the caller's object keeps its `time` column.
  return df.assign(time=local_time).set_index('time', drop=True)

In [6]:
# Swap the raw `time` column for a Europe/Tallinn DatetimeIndex, then preview.
train = train.pipe(set_time_index)
train.head(n=5)

Unnamed: 0_level_0,temp,dwpt,rhum,prcp,snow,wdir,wspd,wpgt,pres,coco,el_price,consumption
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2021-09-01 00:00:00+03:00,11.2,10.3,94.0,,,320.0,7.2,16.7,1012.6,2.0,0.09016,0.577
2021-09-01 01:00:00+03:00,10.7,9.6,93.0,,,320.0,7.2,13.0,1012.6,2.0,0.09251,0.594
2021-09-01 02:00:00+03:00,9.9,9.0,94.0,,,320.0,7.2,13.0,1012.2,2.0,0.0889,0.685
2021-09-01 03:00:00+03:00,10.0,8.4,90.0,,,330.0,7.2,13.0,1011.9,1.0,0.08735,1.016
2021-09-01 04:00:00+03:00,9.0,8.1,94.0,,,300.0,3.6,13.0,1011.4,2.0,0.08688,0.677


In [7]:
# Same re-indexing for the test frame, then preview.
test = test.pipe(set_time_index)
test.head(n=5)

Unnamed: 0_level_0,temp,dwpt,rhum,prcp,snow,wdir,wspd,wpgt,pres,coco,el_price
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2022-08-25 00:00:00+03:00,21.1,15.2,69.0,0.0,,340.0,9.0,9.3,1022.0,2.0,0.25533
2022-08-25 01:00:00+03:00,20.1,15.1,73.0,0.0,,30.0,6.0,14.8,1022.0,2.0,0.19492
2022-08-25 02:00:00+03:00,20.1,15.1,73.0,0.0,,320.0,7.0,13.0,1022.0,2.0,0.18853
2022-08-25 03:00:00+03:00,18.7,17.0,90.0,0.0,,0.0,4.0,11.1,1022.4,4.0,0.19947
2022-08-25 04:00:00+03:00,18.1,17.1,94.0,0.0,,280.0,7.0,11.1,1022.0,3.0,0.21192


In [8]:
def get_corr_consumption(df):
  """Rank the columns of ``df`` by their Pearson correlation with ``consumption``.

  Prints the shape of the full correlation matrix and a heading, then returns
  a one-column frame (column ``consumption``) sorted from the highest to the
  lowest coefficient.
  """
  corr_matrix = df.corr(method="pearson")
  print(corr_matrix.shape)
  print("Correlation with consumption:")
  ranked = corr_matrix["consumption"].sort_values(ascending=False)
  return ranked.to_frame()

In [9]:
# Rank every training column by its Pearson correlation with `consumption`.
df_corr_train_consumption = get_corr_consumption(df=train)
df_corr_train_consumption

(12, 12)
Correlation with consumption:


Unnamed: 0,consumption
consumption,1.0
coco,0.133539
rhum,0.094408
wpgt,0.072206
wspd,0.04884
snow,0.004163
wdir,-0.005009
prcp,-0.010668
pres,-0.06894
el_price,-0.12474


In [10]:
# train_lags = train.copy()
# columns_for_lags = ['temp', 'dwpt', 'el_price']
# lags = [1, 24, 48, 72]
# for col in columns_for_lags:
#   for lag in lags:
#     train_lags[f'{col}_lag_{lag}'] = train_lags[col].shift(lag)
# train_lags.drop(index=train_lags.index[:max(lags)], axis=1, inplace=True)
# train_lags.head()

In [11]:
from darts import TimeSeries

In [12]:
# Remove time zone
# Drop the time-zone info from the index; `tz_localize(None)` keeps the local
# wall-clock timestamps and only makes them tz-naive.
train.index = train.index.tz_localize(tz=None)

In [13]:
# Positional offset of the first training row handed to darts; earlier rows are
# skipped. 5000 lies past both positions (1444, 4972) that the comment in
# `set_time_index` marks as "time is changing" - presumably chosen so the kept
# slice has evenly spaced timestamps (TODO confirm).
iloc_start_time = 5_000

In [14]:
# Target series: the `consumption` column from row `iloc_start_time` onwards.
# NOTE(review): the missing-value count reports 2 NaNs in `consumption`; verify
# that none of them falls inside this slice.
ts_C = TimeSeries.from_series(train['consumption'].iloc[iloc_start_time:])
ts_C

In [16]:
# Covariate series: every column except the target, from row `iloc_start_time`
# onwards. A multi-column frame belongs in `from_dataframe`; `from_series` is
# the constructor for a single pandas Series.
# NOTE(review): `prcp`, `snow` and `coco` contain NaNs according to the
# missing-value count - presumably they need imputing (or dropping) before they
# are fed to a neural network; confirm.
ts_feat = TimeSeries.from_dataframe(train.iloc[iloc_start_time:].drop(columns='consumption'))

In [20]:
# Preview of the feature frame behind `ts_feat` (target column removed).
train.drop(columns='consumption').iloc[iloc_start_time:]

Unnamed: 0_level_0,temp,dwpt,rhum,prcp,snow,wdir,wspd,wpgt,pres,coco,el_price
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2022-03-28 08:00:00,3.4,1.1,85.0,,,230.0,10.8,33.3,997.8,18.0,0.11845
2022-03-28 09:00:00,4.5,1.0,78.0,,,260.0,14.4,37.0,997.0,4.0,0.11844
2022-03-28 10:00:00,5.4,1.0,73.0,,,260.0,18.0,40.8,996.3,7.0,0.09548
2022-03-28 11:00:00,7.6,1.0,63.0,,,260.0,18.0,42.6,995.2,4.0,0.08712
2022-03-28 12:00:00,8.3,0.5,58.0,,,270.0,14.4,44.5,994.3,3.0,0.01435
...,...,...,...,...,...,...,...,...,...,...,...
2022-08-24 19:00:00,27.1,17.0,54.0,0.0,,180.0,4.0,11.1,1020.0,2.0,0.53494
2022-08-24 20:00:00,25.1,17.1,61.0,0.0,,220.0,6.0,11.1,1021.0,1.0,0.49990
2022-08-24 21:00:00,24.9,18.1,66.0,0.0,,150.0,4.0,9.3,1020.9,4.0,0.43149
2022-08-24 22:00:00,22.1,17.0,73.0,0.0,,160.0,6.0,9.3,1021.0,2.0,0.55203


In [21]:
# Strip the time zone the same way it was stripped from `train`, so the test
# covariates sit on the same tz-naive local clock as the training covariates
# and the two can be joined end to end. `DataFrame.tz_localize(None)` returns a
# new frame, so `test` itself is left unchanged and the cell can be re-run.
ts_X = TimeSeries.from_dataframe(test.tz_localize(None))

In [17]:
# Basic structural properties of the target series.
print(f"components (columns) of consumption time series: {ts_C.components!s}")
print(f"duration: {ts_C.duration!s}")
print(f"frequency: {ts_C.freq!s}")
print(f"frequency: {ts_C.freq_str!s}")
print(f"has date time index? (or else, it must have an integer index): {ts_C.has_datetime_index!s}")
print(f"deterministic: {ts_C.is_deterministic!s}")
print(f"univariate: {ts_C.is_univariate!s}")

components (columns) of consumption time series: Index(['consumption'], dtype='object', name='component')
duration: 149 days 15:00:00
frequency: <Hour>
frequency: H
has date time index? (or else, it must have an integer index): True
deterministic: True
univariate: True


In [18]:
# Basic structural properties of the training covariate series.
print(f"components (columns) of feature time series: {ts_feat.components!s}")
print(f"duration: {ts_feat.duration!s}")
print(f"frequency: {ts_feat.freq!s}")
print(f"frequency: {ts_feat.freq_str!s}")
print(f"has date time index? (or else, it must have an integer index): {ts_feat.has_datetime_index!s}")
print(f"deterministic: {ts_feat.is_deterministic!s}")
print(f"univariate: {ts_feat.is_univariate!s}")

components (columns) of feature time series: Index(['temp', 'dwpt', 'rhum', 'prcp', 'snow', 'wdir', 'wspd', 'wpgt', 'pres',
       'coco', 'el_price'],
      dtype='object', name='component')
duration: 149 days 15:00:00
frequency: <Hour>
frequency: H
has date time index? (or else, it must have an integer index): True
deterministic: True
univariate: False


In [22]:
# Basic structural properties of the test covariate series. The first label
# used to say "consumption", but `ts_X` holds the test features.
print("components (columns) of test feature time series:", ts_X.components)
print("duration:", ts_X.duration)
print("frequency:", ts_X.freq)
print("frequency:", ts_X.freq_str)
print("has date time index? (or else, it must have an integer index):", ts_X.has_datetime_index)
print("deterministic:", ts_X.is_deterministic)
print("univariate:", ts_X.is_univariate)

components (columns) of consumption time series: Index(['temp', 'dwpt', 'rhum', 'prcp', 'snow', 'wdir', 'wspd', 'wpgt', 'pres',
       'coco', 'el_price'],
      dtype='object', name='component')
duration: 6 days 23:00:00
frequency: <Hour>
frequency: H
has date time index? (or else, it must have an integer index): True
deterministic: True
univariate: False


In [23]:
from darts.dataprocessing.transformers import Scaler

In [24]:
# Scale the target with its own scaler so forecasts can be mapped back later
# through `scaler_consumption.inverse_transform`. `fit_transform` already
# returns the scaled series, so its result is kept instead of being discarded
# and recomputed with a second `transform` call.
scaler_consumption = Scaler()
ts_C_train_trf = scaler_consumption.fit_transform(ts_C)

# Fit the covariate scaler on the training covariates only, then reuse that
# fitted scaler for the test covariates so both share the same scaling.
scaler_feat = Scaler()
ts_feat_train_trf = scaler_feat.fit_transform(ts_feat)
ts_X_trf = scaler_feat.transform(ts_X)

In [None]:
from darts.models import TransformerModel

In [None]:
# Hyper-parameters and run settings for the Transformer forecaster.
LOAD = False         # True = load previously saved model from disk?  False = (re)train the model
# The backslash is written as an explicit `\\`: the former `"\_"` is an invalid
# escape sequence (a warning today, an error in future Python versions). The
# runtime value of the string is unchanged.
# NOTE(review): a leading backslash looks like a Windows path separator -
# presumably this is appended to a directory; prefer os.path.join if so.
SAVE = "\\_TForm_model.pth.tar"   # file name to save the model under

EPOCHS = 200
INLEN = 32          # input size
FEAT = 32           # d_model = number of expected features in the inputs, up to 512
HEADS = 4           # default 8
ENCODE = 4          # encoder layers
DECODE = 4          # decoder layers
DIM_FF = 128        # dimensions of the feedforward network, default 2048
BATCH = 32          # batch size
ACTF = "relu"       # activation function, relu (default) or gelu
SCHLEARN = None     # a PyTorch learning rate scheduler; None = constant rate
LEARN = 1e-3        # learning rate
VALWAIT = 1         # epochs to wait before evaluating the loss on the test/validation set
DROPOUT = 0.1       # dropout rate
N_FC = 7 * 24       # output size

RAND = 42           # random seed
N_SAMPLES = 100     # number of times a prediction is sampled from a probabilistic model
N_JOBS = 3          # parallel processors to use;  -1 = all processors


In [None]:
# Transformer forecaster configured from the constants of the settings cell:
# it reads INLEN past steps and emits N_FC steps per forward pass.
# NOTE(review): the model is named "Transformer_price" although the series it
# is fitted on is consumption - rename if the checkpoint folder name matters.
model = TransformerModel(
    input_chunk_length=INLEN,
    output_chunk_length=N_FC,
    batch_size=BATCH,
    n_epochs=EPOCHS,
    model_name="Transformer_price",
    nr_epochs_val_period=VALWAIT,
    d_model=FEAT,
    nhead=HEADS,
    num_encoder_layers=ENCODE,
    num_decoder_layers=DECODE,
    dim_feedforward=DIM_FF,
    dropout=DROPOUT,
    activation=ACTF,
    random_state=RAND,
    optimizer_kwargs={'lr': LEARN},
    # This notebook feeds the model `past_covariates` only, and TransformerModel
    # has no future-covariate input, so the calendar encodings are requested as
    # "past" encoders. Declared under "future" they would not reach the network
    # (see the darts `add_encoders` documentation).
    add_encoders={"cyclic": {"past": ["hour", "dayofweek", "month"]}},
    save_checkpoints=True,
    force_reset=True,
    # Train on the first GPU; this requires a CUDA-capable device to be present.
    pl_trainer_kwargs={
        "accelerator": "gpu",
        "devices": [0],
    },
)

In [None]:
# Train on the scaled consumption series, using the scaled feature series as
# past covariates.
model.fit(
    series=ts_C_train_trf,
    past_covariates=ts_feat_train_trf,
    verbose=True,
)

In [None]:
# `concatenate` is not imported by any earlier cell shown in this notebook, so
# it is brought into scope here; without it this cell raises a NameError.
from darts import concatenate

# Join the scaled training and test covariates along the time axis (axis=0) to
# obtain one continuous covariate series for prediction.
ts_feat_full_trf = concatenate([ts_feat_train_trf, ts_X_trf], axis=0)

In [None]:
# Forecast the N_FC steps that follow the end of the training series, as a
# single deterministic sample.
ts_C_pred = model.predict(
    n=N_FC,
    past_covariates=ts_feat_full_trf,
    num_samples=1,
    n_jobs=N_JOBS,
    verbose=True,
)

In [None]:
# Display the forecast. The model was fitted on the scaled target
# (`ts_C_train_trf`), so these values are in scaled space.
# NOTE(review): presumably `scaler_consumption.inverse_transform(ts_C_pred)` is
# needed to get consumption in its original units - confirm it happens later.
ts_C_pred