In [1]:
from torch.utils.data import Dataset
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import r2_score

import os
import sys
import datetime
import json

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

with open("../../_ynyt/prediction/config.env") as f:
    for line in f.readlines():
        if len(line) > 2:
            k, v = line[:-2].split('=')
            os.environ[k] = v
    
from ynyt.data import Preprocessor
from ynyt.features import BaseFeatures, FeatureCombiner
from ynyt.utils import json_read

import warnings
warnings.filterwarnings('ignore')

In [55]:
config = {
    "zero_hour": ["2019-01-01 00:00:00", "%Y-%m-%d %H:%M:%S"],
    "base_features": {
        "rolling":[
            [
                "y",
                {
                    "ar_d": 1, 'ar_D': 15,
                }
            ]
        ],
        "target": ["target", "y", 6],
        "time_based": {"features_on": {"weekday": True, "weekday_plus": False, "weekhours": False}}
        },
    'harmonics': {'K': {'week': 6, 'year': 0, 'day': 6}},
    "add_features": ["distance", "duration", "passengers", "cost", "tips", "vendor", "dol"], 
    "normalizer": {"path": "../artifacts/transformers", 
                   "to_normalize": ["h", "distance", "duration", "passengers", "cost", "tips", "vendor", "velocity"],
                   "target": "y"},
    "combinations": [],
    "D_combinations": []
}

with open('configs/config.json', 'w') as f:
    json.dump(config, f)


In [3]:
path_preprocessed = './../_ynyt/data/preprocessed'

In [5]:
period = [datetime.datetime(2022, 4, 17), datetime.datetime(2022, 5, 31)]
setting = {'features': True, 'D': False, 'hours': True, 'weekday': True}
dataset = YNYT(period, mode='train', path_preprocessed=path_preprocessed, setting=setting)
x, r, y = dataset[10]

print(dataset.X.shape)
print(x.shape, r.shape, y.shape)

assert len(dataset) == dataset.bf.data.t.max() + 1 - dataset.horizon - dataset.seq_len + 1

(39, 714, 55)
torch.Size([39, 4, 55, 1]) torch.Size([1, 660]) torch.Size([6, 55])


In [34]:
# test
x, y = dataset[0]
print(x.shape, y.shape)

for k in [0, 10, 337]:
    test = dataset.bf.data[dataset.bf.data.t==dataset.seq_len+k-1]
    x, y = dataset[k]
    assert (test.y_1.values - x[:55, -1]).sum() == 0
    assert (test.loc[:, dataset.bf.config['add_features'][0]].values - x[55:110, -1]).sum() == 0
    assert (test.loc[:, dataset.bf.feature_groups['target']].values - y).sum() == 0

ValueError: too many values to unpack (expected 2)

In [50]:
#dataset.normalize_back(y)

<img src="../transformers/seerte.jpeg"/>

## Dimensions
### original paper:

**temporal embedding**

element embedding: $X \in \mathbb{R}^{b\times m\times l} \to X^e \in \mathbb{R}^{b\times ml \times d}$, where $b$ - batch size, $l$ - length on the sequence, $m$ - number of zones, $d$ embedding dim

positional embedding: $p \in \mathbb{R}^l \to P \in \mathbb{R}^{b \times ml \times d}$

$X^t = X^{e_t} + P$

**spatio embedding**

var embedding: $s \in \mathbb{R}^m \to S \in \mathbb{R}^{b \times ml \times d}$

$X^s = X^{e_s} + S$

**spatio-temporal embedding**

$X^{st} = X^{e_{st}} + P$


### changes:

element embedding: $X \in \mathbb{R}^{b\times mq\times l} \to X^e \in \mathbb{R}^{b\times ml \times d}$, where $q$ - number of features

## Self-attentions
<img src="../transformers/heads.jpeg"/>

**temporal encoder**: $m$ heads

**spatio encoder**: $l$ heads

**spatio-temporal encoder**: $h$ heads

In [5]:
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
from torch.nn import functional as F
from torch import nn

from torchmetrics import MeanAbsolutePercentageError, MeanAbsoluteError, R2Score

from torch.utils.data import Dataset
import numpy as np
import pandas as pd

import sys
import os

from tqdm import tqdm
from functools import partialmethod

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from sttre import *

In [45]:
class Transformer(nn.Module):
    def __init__(self, input_shape,
                 embed_size, num_layers, forward_expansion, heads, device, dropout, regression_head, horizon=6):

        super(Transformer, self).__init__()
        self.device = device

        self.batch_size, self.num_features, self.seq_len, self.num_var, _ = input_shape
        self.embed_size = embed_size

        self.element_embedding_temporal = nn.Linear(self.seq_len*self.num_features, embed_size*self.seq_len)
        self.element_embedding_spatial = nn.Linear(self.num_var*self.num_features, embed_size*self.num_var)
        
        self.pos_embedding = nn.Embedding(self.seq_len, embed_size)
        self.variable_embedding = nn.Embedding(self.num_var, embed_size)

        self.temporal = Encoder(seq_len=self.seq_len,
                                embed_size=embed_size,
                                num_layers=num_layers,
                                heads=self.num_var,
                                device=self.device,
                                forward_expansion=forward_expansion,
                                module='temporal',
                                rel_emb=True)

        self.spatial = Encoder(seq_len=self.num_var,
                               embed_size=embed_size,
                               num_layers=num_layers,
                               heads=self.seq_len,
                               device=self.device,
                               forward_expansion=forward_expansion,
                               module = 'spatial',
                               rel_emb=True)

        self.spatiotemporal = Encoder(seq_len=self.seq_len*self.num_var,
                                      embed_size=embed_size,
                                      num_layers=num_layers,
                                      heads=heads,
                                      device=self.device,
                                      forward_expansion=forward_expansion,
                                      module = 'spatiotemporal',
                                      rel_emb=True)
        
        factor = regression_head['flatt_factor']
        
        self.flatter = nn.Sequential(nn.Linear(embed_size, embed_size // factor),
                                     nn.LeakyReLU(),
                                     nn.Dropout(dropout),
                                     nn.Flatten()
                                    )

        # additional features
        self.add_dim = horizon * 55 * regression_head['add_features']
        d_out = (embed_size // factor) * self.seq_len * 3 * self.num_var + self.add_dim
        
        if regression_head['heads'] == 1:
            output_size = self.num_var * horizon
            self.head = []
            for i, l in enumerate(regression_head['layers']):
                l1 = self.num_var * horizon * l
                if i == 0:
                    self.head.append(nn.BatchNorm1d(d_out))
                    self.head.append(nn.Linear(d_out, l1))
                else:
                    l0 = self.num_var * horizon * regression_head['layers'][i - 1]
                    self.head.append(nn.BatchNorm1d(l0))
                    self.head.append(nn.Linear(l0, l1))
                self.head.append(nn.LeakyReLU())
                self.head.append(nn.Dropout(regression_head['dropout_head']))
            self.head.append(nn.BatchNorm1d(l1))
            self.head.append(nn.Linear(l1, self.num_var * horizon))
            self.head = nn.Sequential(*self.head)
        
    def forward(self, x, regressors, dropout):
        batch_size = len(x)

        #process/embed input for spatio-temporal module
        positions = torch.arange(0, self.seq_len
                                ).expand(batch_size, self.num_var, self.seq_len
                                        ).reshape(batch_size, self.num_var * self.seq_len
                                                 ).to(self.device)
        x_spatio_temporal = x.reshape(batch_size, self.num_var, self.seq_len*self.num_features)
        x_spatio_temporal = self.element_embedding_temporal(x_spatio_temporal
                                    ).reshape(batch_size, self.num_var * self.seq_len, self.embed_size)
        x_spatio_temporal = F.dropout(self.pos_embedding(positions) + x_spatio_temporal, dropout)
        x_spatio_temporal = torch.unsqueeze(x_spatio_temporal, -1)

        #process/embed input for temporal module
        positions = torch.arange(0, self.seq_len
                                ).expand(batch_size, self.num_var, self.seq_len
                                        ).reshape(batch_size, self.num_var * self.seq_len
                                                 ).to(self.device)
        
        x_temporal = x.view(batch_size, self.num_var, self.seq_len*self.num_features)
        x_temporal = self.element_embedding_temporal(x_temporal
                                    ).reshape(batch_size, self.num_var * self.seq_len, self.embed_size)
        x_temporal = F.dropout(self.pos_embedding(positions) + x_temporal, dropout)
        x_temporal = torch.unsqueeze(x_temporal, -1)
        
        #process/embed input for spatial module
        vars = torch.arange(0, self.num_var).expand(batch_size, self.seq_len, self.num_var).reshape(batch_size, self.num_var*self.seq_len).to(self.device)
        
        x_spatial = x.view(batch_size, self.seq_len, self.num_features*self.num_var)
        x_spatial = self.element_embedding_spatial(x_spatial).reshape(batch_size, self.num_var * self.seq_len, self.embed_size)
        x_spatial = F.dropout(self.variable_embedding(vars) + x_spatial, dropout)
        x_spatial = torch.unsqueeze(x_spatial, -1)
        
        out1 = self.temporal(x_temporal)
        out2 = self.spatial(x_spatial)
        out3 = self.spatiotemporal(x_spatio_temporal)
        
        out = torch.cat((out1, out2, out3), 1)
        
        out = self.flatter(out)

        if self.add_dim > 0:
            out = torch.cat([out.unsqueeze(1), regressors.squeeze(-1)], dim=2).squeeze(1)

        out = self.head(out)

        return out


In [1]:
from sttre.sttre import train_val

regression_head = {'heads': 1, 'dropout_head': 0.1, 'layers': [16, 8], 'add_features': 2, 'flatt_factor': 2}
data_setting = {'features': True, 'D': True, 'hours': True, 'weekday': True}
period = {'train': [datetime.datetime(2020, 10, 1), datetime.datetime(2022, 4, 30)], 
          'val': [datetime.datetime(2022, 5, 1), datetime.datetime(2022, 7, 31)]}
params = {'embed_size': 64, 'heads': 4, 'num_layers': 2, 'dropout': 0.1, 
          'forward_expansion': 1, 'lr': 0.0005, 'batch_size': 128, 'seq_len': 4, 'epoches': 100,
          'device': 'cpu',
         }


model = train_val(period=period, 
                  regression_head=regression_head, data_setting=data_setting, 
                  verbose=True, horizon=horizon, **params)

ImportError: attempted relative import with no known parent package

In [47]:
regression_head = {'heads': 1, 'dropout_head': 0.1, 'layers': [16, 8], 'add_features': 2, 'flatt_factor': 2}
data_setting = {'features': True, 'D': True, 'hours': True, 'weekday': True}
period = {'train': [datetime.datetime(2020, 10, 1), datetime.datetime(2022, 4, 30)], 
          'val': [datetime.datetime(2022, 5, 1), datetime.datetime(2022, 7, 31)]}
params = {'embed_size': 64, 'heads': 4, 'num_layers': 2, 'dropout': 0.1, 
          'forward_expansion': 1, 'lr': 0.0005, 'batch_size': 128, 'seq_len': 4, 'epoches': 100}

    device = torch.device("mps")
    device = 'cpu'

NUM_EPOCHS = 100



horizon = 6
d = 64
h = 4
seq_len = 4
num_layers = 2
forward_expansion = 1
dropout = 0.15
lr = 0.0005
batch_size = 128

model = train_val(period, epoches, 
                  embed_size, heads, num_layers, dropout, forward_expansion, lr, batch_size, seq_len, 
                  batch_size, seq_len, regression_head, data_setting, device, 
                  verbose=True, horizon=horizon)

cpu!
Transformer(
  (element_embedding_temporal): Linear(in_features=196, out_features=256, bias=True)
  (element_embedding_spatial): Linear(in_features=2695, out_features=3520, bias=True)
  (pos_embedding): Embedding(4, 64)
  (variable_embedding): Embedding(55, 64)
  (temporal): Encoder(
    (fc_out): Linear(in_features=64, out_features=64, bias=True)
    (layers): ModuleList(
      (0-1): 2 x EncoderBlock(
        (attention): SelfAttention(
          (values): Linear(in_features=64, out_features=64, bias=True)
          (keys): Linear(in_features=64, out_features=64, bias=True)
          (queries): Linear(in_features=64, out_features=64, bias=True)
          (fc_out): Linear(in_features=64, out_features=64, bias=True)
        )
        (norm1): BatchNorm1d(220, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (norm2): BatchNorm1d(220, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (feed_forward): Sequential(
          (0): Linear(in_fe

epoch: 49, train loss: 0.010059, val loss: 0.011535, val mae: 0.082538, val r2: 0.716899
epoch: 50, train loss: 0.009742, val loss: 0.010453, val mae: 0.077748, val r2: 0.741511
epoch: 51, train loss: 0.009753, val loss: 0.010351, val mae: 0.076525, val r2: 0.744086
epoch: 52, train loss: 0.010197, val loss: 0.010851, val mae: 0.078961, val r2: 0.732728
epoch: 53, train loss: 0.009908, val loss: 0.010725, val mae: 0.078920, val r2: 0.735369
epoch: 54, train loss: 0.010133, val loss: 0.011195, val mae: 0.080099, val r2: 0.724843
epoch: 55, train loss: 0.010111, val loss: 0.010936, val mae: 0.079295, val r2: 0.729916
epoch: 56, train loss: 0.010458, val loss: 0.011487, val mae: 0.081481, val r2: 0.715116
epoch: 57, train loss: 0.010113, val loss: 0.010858, val mae: 0.078875, val r2: 0.730412
epoch: 58, train loss: 0.010490, val loss: 0.010890, val mae: 0.079062, val r2: 0.730974


KeyboardInterrupt: 

In [None]:
epoch: 0, train loss: 0.022740, val loss: 0.010971, val mae: 0.078174, val r2: 0.715576
epoch: 1, train loss: 0.011180, val loss: 0.017958, val mae: 0.099584, val r2: 0.532309

In [14]:
# **

NUM_EPOCHS = 100

d = 32
h = 4
seq_len = 4
num_layers = 2
forward_expansion = 1
dropout = 0.1
lr = 0.00005
batch_size = 128

model_2 = train_val(d, h, num_layers, dropout, forward_expansion, lr, batch_size, seq_len, verbose=True)

epoch: 0, train loss: 0.022521, val loss: 0.016710, val mae: 0.102509, val r2: 0.599304
epoch: 1, train loss: 0.013898, val loss: 0.014178, val mae: 0.092142, val r2: 0.658461
epoch: 2, train loss: 0.010964, val loss: 0.012187, val mae: 0.084001, val r2: 0.706509
epoch: 3, train loss: 0.009008, val loss: 0.011310, val mae: 0.080319, val r2: 0.727264
epoch: 4, train loss: 0.007942, val loss: 0.010689, val mae: 0.077674, val r2: 0.742635
epoch: 5, train loss: 0.007223, val loss: 0.010010, val mae: 0.074908, val r2: 0.759120
epoch: 6, train loss: 0.006706, val loss: 0.009710, val mae: 0.073383, val r2: 0.766474
epoch: 7, train loss: 0.006305, val loss: 0.009212, val mae: 0.071253, val r2: 0.778462
epoch: 8, train loss: 0.005955, val loss: 0.008944, val mae: 0.069936, val r2: 0.785152
epoch: 9, train loss: 0.005673, val loss: 0.008569, val mae: 0.068158, val r2: 0.794209
epoch: 10, train loss: 0.005436, val loss: 0.008323, val mae: 0.067021, val r2: 0.800280
epoch: 11, train loss: 0.005232

epoch: 93, train loss: 0.002907, val loss: 0.005766, val mae: 0.053165, val r2: 0.862702
epoch: 94, train loss: 0.002899, val loss: 0.005544, val mae: 0.052021, val r2: 0.867944
epoch: 95, train loss: 0.002891, val loss: 0.005473, val mae: 0.051588, val r2: 0.869322
epoch: 96, train loss: 0.002874, val loss: 0.005394, val mae: 0.051219, val r2: 0.871194
epoch: 97, train loss: 0.002888, val loss: 0.005462, val mae: 0.051709, val r2: 0.869722
epoch: 98, train loss: 0.002895, val loss: 0.005748, val mae: 0.053049, val r2: 0.862968
epoch: 99, train loss: 0.002895, val loss: 0.006060, val mae: 0.054606, val r2: 0.855958


In [20]:
# *

NUM_EPOCHS = 100

d = 64
h = 4
seq_len = 6
num_layers = 2
forward_expansion = 1
dropout = 0.15
lr = 0.00005
batch_size = 128

model_1 = train_val(d, h, num_layers, dropout, forward_expansion, lr, batch_size, seq_len, verbose=True)

epoch: 0, train loss: 0.019370, val loss: 0.015780, val mae: 0.098585, val r2: 0.623975
epoch: 1, train loss: 0.013873, val loss: 0.013926, val mae: 0.090102, val r2: 0.668033
epoch: 2, train loss: 0.009853, val loss: 0.012343, val mae: 0.084003, val r2: 0.704577
epoch: 3, train loss: 0.007697, val loss: 0.011234, val mae: 0.079773, val r2: 0.731614
epoch: 4, train loss: 0.006733, val loss: 0.010664, val mae: 0.077258, val r2: 0.745611
epoch: 5, train loss: 0.006147, val loss: 0.009851, val mae: 0.073878, val r2: 0.765137
epoch: 6, train loss: 0.005718, val loss: 0.009531, val mae: 0.072521, val r2: 0.772791
epoch: 7, train loss: 0.005411, val loss: 0.009172, val mae: 0.070858, val r2: 0.781374
epoch: 8, train loss: 0.005157, val loss: 0.008953, val mae: 0.069775, val r2: 0.786838
epoch: 9, train loss: 0.004950, val loss: 0.008513, val mae: 0.067860, val r2: 0.797126
epoch: 10, train loss: 0.004793, val loss: 0.008561, val mae: 0.067897, val r2: 0.796141
epoch: 11, train loss: 0.004647

epoch: 93, train loss: 0.003005, val loss: 0.007921, val mae: 0.063606, val r2: 0.812466
epoch: 94, train loss: 0.003074, val loss: 0.006166, val mae: 0.055542, val r2: 0.852889
epoch: 95, train loss: 0.002945, val loss: 0.005486, val mae: 0.052181, val r2: 0.868622
epoch: 96, train loss: 0.002779, val loss: 0.005450, val mae: 0.051982, val r2: 0.869284
epoch: 97, train loss: 0.002677, val loss: 0.005635, val mae: 0.052778, val r2: 0.865175
epoch: 98, train loss: 0.002650, val loss: 0.006048, val mae: 0.054723, val r2: 0.855591
epoch: 99, train loss: 0.002625, val loss: 0.006194, val mae: 0.055484, val r2: 0.852408


In [None]:
NUM_EPOCHS = 100

d = 64
h = 4
seq_len = 6
num_layers = 2
forward_expansion = 1
dropout = 0.15
lr = 0.00005
batch_size = 256

model_1 = train_val(d, h, num_layers, dropout, forward_expansion, lr, batch_size, seq_len, verbose=True)

In [None]:
NUM_EPOCHS = 100

d = 32
h = 4
seq_len = 4
num_layers = 2
forward_expansion = 1
dropout = 0.1
lr = 0.00005
batch_size = 64

model_1 = train_val(d, h, num_layers, dropout, forward_expansion, lr, batch_size, seq_len, verbose=True)

In [None]:
worse:
    

NUM_EPOCHS = 100

d = 64
seq_len = 6
batch_size = 64

d = 16
seq_len = 4
batch_size = 128

d = 96
h = 4
batch_size = 128

d = 64
seq_len = 4
dropout = 0.15
lr = 0.00005
batch_size = 256

In [10]:
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [17]:
period = [datetime.datetime(2022, 5, 1), datetime.datetime(2022, 5, 31)]
test_data = YNYT(period, seq_len=seq_len, mode='test')

device = torch.device("mps")

def prdict(data, model, device=device):
    device = device
    inference_dataloader = torch.utils.data.DataLoader(data, batch_size=len(test_data))
    loss_fn = torch.nn.MSELoss()
    res = []
    fact = []
    with torch.no_grad():
        for i, data in enumerate(inference_dataloader):
            inputs, regressors, labels = data
            regressors = regressors.unsqueeze(-1)
            labels = torch.flatten(labels, start_dim=1)
            output = model(inputs.to(device), regressors.to(device), 0)                
            loss = loss_fn(output, labels.to(device))
            labels = labels.view(-1, 6).detach().cpu().numpy()
            output = output.view(-1, 6).detach().cpu().numpy()
            res.append(output)
            fact.append(labels)
    return res, fact


In [18]:
pred, fact = prdict(test_data, model_2)

for h in range(1, 7):
    y = fact[0][:, h-1]
    y_pred = pred[0][:, h-1]
    r2 = round(r2_score(y, y_pred), 5)
    mae = round(mean_absolute_error(y, y_pred), 5)
    mse = round(mean_squared_error(y, y_pred), 5)
    print(f'h={h}, mse: {mse}, mae: {mae}, r2: {r2}')

h=1, mse: 0.005289999768137932, mae: 0.05127999931573868, r2: 0.8736
h=2, mse: 0.00535999983549118, mae: 0.051589999347925186, r2: 0.87221
h=3, mse: 0.0053900000639259815, mae: 0.051669999957084656, r2: 0.87162
h=4, mse: 0.0053400001488626, mae: 0.05149000138044357, r2: 0.87252
h=5, mse: 0.005350000225007534, mae: 0.05172999948263168, r2: 0.87213
h=6, mse: 0.005289999768137932, mae: 0.051430001854896545, r2: 0.87361


In [181]:
# old 

pred, fact = prdict(test_data, model_3)

for h in range(1, 7):
    y = fact[0][:, h-1]
    y_pred = pred[0][:, h-1]
    r2 = round(r2_score(y, y_pred), 5)
    mae = round(mean_absolute_error(y, y_pred), 5)
    mse = round(mean_squared_error(y, y_pred), 5)
    print(f'h={h}, mse: {mse}, mae: {mae}, r2: {r2}')

h=1, mse: 0.005549999885261059, mae: 0.052730001509189606, r2: 0.87241
h=2, mse: 0.00558000011369586, mae: 0.0528700016438961, r2: 0.87173
h=3, mse: 0.005619999952614307, mae: 0.05291999876499176, r2: 0.87082
h=4, mse: 0.0056500001810491085, mae: 0.05307000130414963, r2: 0.87019
h=5, mse: 0.005630000028759241, mae: 0.053040001541376114, r2: 0.87059
h=6, mse: 0.0055599999614059925, mae: 0.05274999886751175, r2: 0.87214


In [None]:
0.90793
0.87887