In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

import time
import math
from IPython import display

# Data

In [2]:
df = pd.read_csv('../data/ticker_data_preprocessed.csv', index_col=0)
print(df.shape)
df.head()

(482, 1196)


Unnamed: 0,2018-01-03,2018-01-04,2018-01-05,2018-01-08,2018-01-09,2018-01-10,2018-01-11,2018-01-12,2018-01-16,2018-01-17,...,2022-09-20,2022-09-21,2022-09-22,2022-09-23,2022-09-26,2022-09-27,2022-09-28,2022-09-29,2022-09-30,sector
A,0.025444,-0.007501,0.015988,0.002146,0.024554,-0.013655,0.000141,0.013136,-0.006971,0.011652,...,-0.019737,-0.012955,-0.016524,-0.007316,-0.009475,-0.005723,0.017351,-0.007921,-0.009695,Healthcare
AAL,-0.012266,0.006305,-0.00038,-0.009877,-0.000959,0.032642,0.049089,0.036335,-0.00838,0.003105,...,-0.016889,-0.052971,-0.039305,-0.039339,-0.028665,0.03457,0.03912,-0.039216,-0.017143,Industrials
AAP,0.009049,0.036899,0.010631,-0.007042,-0.00808,0.000905,0.02134,0.026472,-0.017595,0.01273,...,-0.013735,-0.002231,-0.008399,-0.021997,-0.01757,0.010337,0.025171,-0.02241,-0.020794,Consumer Cyclical
AAPL,-0.000174,0.004645,0.011385,-0.003714,-0.000115,-0.000229,0.00568,0.010326,-0.005082,0.016516,...,0.015665,-0.020268,-0.006375,-0.015124,0.00226,0.006566,-0.012652,-0.049119,-0.030039,Technology
ABBV,0.015649,-0.005703,0.017408,-0.016022,0.007538,-0.005487,-0.004213,0.010779,0.021427,0.018246,...,-0.006239,-0.010298,0.019243,0.00035,-0.012932,0.003612,0.020322,-0.013001,-0.059627,Healthcare


In [3]:
df_pct = df.drop(['sector'], axis=1).T
df_pct.index = pd.to_datetime(df_pct.index)

df_pct.head()

Unnamed: 0,A,AAL,AAP,AAPL,ABBV,ABC,ABMD,ABT,ACN,ADI,...,WYNN,XEL,XOM,XRAY,XYL,YUM,ZBH,ZBRA,ZION,ZTS
2018-01-03,0.025444,-0.012266,0.009049,-0.000174,0.015649,0.003722,0.0173,0.002211,0.004615,0.012406,...,-0.010834,-0.006693,0.01964,-0.003426,0.012193,-0.000858,0.006932,0.019863,-0.001183,0.004598
2018-01-04,-0.007501,0.006305,0.036899,0.004645,-0.005703,-0.002225,0.017516,-0.001697,0.011841,-0.001094,...,0.005415,-0.007791,0.001384,-0.000149,0.006676,0.01018,-0.001441,0.01976,0.004147,0.005964
2018-01-05,0.015988,-0.00038,0.010631,0.011385,0.017408,0.012104,0.015408,0.00289,0.008249,0.004053,...,0.006671,-0.007003,-0.000806,0.014051,-0.001874,0.005828,0.009941,0.015576,0.000393,0.011444
2018-01-08,0.002146,-0.009877,-0.007042,-0.003714,-0.016022,0.016576,0.027086,-0.002882,0.007991,0.001745,...,-0.013314,0.00748,0.004496,0.006781,0.003611,0.00169,0.001905,0.009951,-0.004914,0.011996
2018-01-09,0.024554,-0.000959,-0.00808,-0.000115,0.007538,0.006398,0.009432,0.0017,0.003335,-0.002069,...,0.006778,-0.011667,-0.004246,-0.041728,0.000288,-0.002651,-0.016083,0.030643,0.023509,0.011719


In [4]:
df_train = df_pct[df_pct.index < '2020-05-19']
df_test = df_pct[(df_pct.index >= '2020-05-19')&(df_pct.index != max(df_pct.index))]

df_train = df_train.T.reset_index(drop=True)
df_test = df_test.T.reset_index(drop=True)

In [5]:
df_train.shape, df_test.shape

((482, 597), (482, 597))

In [6]:
class TSDataset(Dataset):
    def __init__(self, df, threshold):
        self.df = df
        self.threshold = threshold

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        row = self.df.loc[idx]
        
        past_values = row[row.index < self.threshold].values
        past_dates = row[row.index < self.threshold].index
        past_years = pd.Series(past_dates).dt.year.values - 2018
        past_months = pd.Series(past_dates).dt.month.values - 1
        past_days = pd.Series(past_dates).dt.day.values - 1
        
        future_values = row[row.index >= self.threshold].values
        future_dates = row[row.index >= self.threshold].index
        future_years = pd.Series(future_dates).dt.year.values - 2018
        future_months = pd.Series(future_dates).dt.month.values - 1
        future_days = pd.Series(future_dates).dt.day.values - 1
        
        return {'past_values': torch.tensor(past_values, dtype=torch.float32),
                'past_years': torch.tensor(past_years, dtype=int),
                'past_months': torch.tensor(past_months, dtype=int),
                'past_days': torch.tensor(past_days, dtype=int),
                'future_values': torch.tensor(future_values, dtype=torch.float32),
                'future_years': torch.tensor(future_years, dtype=int),
                'future_months': torch.tensor(future_months, dtype=int),
                'future_days': torch.tensor(future_days, dtype=int),}

In [7]:
train_dataset = TSDataset(df_train, '2019-01-01')
test_dataset = TSDataset(df_test, '2021-05-15')

In [32]:
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=True)

In [33]:
for batch in train_dataloader:
    print(batch['past_values'].shape)
    break

for batch in test_dataloader:
    print(batch['past_values'].shape)
    break

torch.Size([64, 250])
torch.Size([64, 250])


# Model

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [11]:
device

device(type='cuda')

In [12]:
class MultiHeadAttentionLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, dropout, device):
        super().__init__()
        
        assert hid_dim % n_heads == 0
        
        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = hid_dim // n_heads
        
        self.fc_q = nn.Linear(hid_dim, hid_dim)
        self.fc_k = nn.Linear(hid_dim, hid_dim)
        self.fc_v = nn.Linear(hid_dim, hid_dim)
        
        self.fc_o = nn.Linear(hid_dim, hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)
        
    def forward(self, query, key, value, mask = None):
        
        batch_size = query.shape[0]
        
        #query = [batch size, query len, hid dim]
        #key = [batch size, key len, hid dim]
        #value = [batch size, value len, hid dim]
                
        Q = self.fc_q(query)
        K = self.fc_k(key)
        V = self.fc_v(value)
        
        #Q = [batch size, query len, hid dim]
        #K = [batch size, key len, hid dim]
        #V = [batch size, value len, hid dim]
                
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        
        #Q = [batch size, n heads, query len, head dim]
        #K = [batch size, n heads, key len, head dim]
        #V = [batch size, n heads, value len, head dim]
                
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale
        
        #energy = [batch size, n heads, query len, key len]
        
        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)
        
        attention = torch.softmax(energy, dim = -1)
                
        #attention = [batch size, n heads, query len, key len]
                
        x = torch.matmul(self.dropout(attention), V)
        
        #x = [batch size, n heads, query len, head dim]
        
        x = x.permute(0, 2, 1, 3).contiguous()
        
        #x = [batch size, query len, n heads, head dim]
        
        x = x.view(batch_size, -1, self.hid_dim)
        
        #x = [batch size, query len, hid dim]
        
        x = self.fc_o(x)
        
        #x = [batch size, query len, hid dim]
        
        return x, attention

In [13]:
class PositionwiseFeedforwardLayer(nn.Module):
    def __init__(self, hid_dim, pf_dim, dropout):
        super().__init__()
        
        self.fc_1 = nn.Linear(hid_dim, pf_dim)
        self.fc_2 = nn.Linear(pf_dim, hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        
        #x = [batch size, seq len, hid dim]
        
        x = self.dropout(torch.relu(self.fc_1(x)))
        
        #x = [batch size, seq len, pf dim]
        
        x = self.fc_2(x)
        
        #x = [batch size, seq len, hid dim]
        
        return x

In [14]:
class EncoderLayer(nn.Module):
    def __init__(self, 
                 hid_dim, 
                 n_heads, 
                 pf_dim,  
                 dropout, 
                 device):
        super().__init__()
        
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, 
                                                                     pf_dim, 
                                                                     dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        
        #src = [batch size, src len, hid dim]
        #src_mask = [batch size, 1, 1, src len] 
                
        #self attention
        _src, _ = self.self_attention(src, src, src)
        
        #dropout, residual connection and layer norm
        src = self.self_attn_layer_norm(src + self.dropout(_src))
        
        #src = [batch size, src len, hid dim]
        
        #positionwise feedforward
        _src = self.positionwise_feedforward(src)
        
        #dropout, residual and layer norm
        src = self.ff_layer_norm(src + self.dropout(_src))
        
        #src = [batch size, src len, hid dim]
        
        return src

In [15]:
class Encoder(nn.Module):
    def __init__(self, 
                 input_dim, 
                 hid_dim, 
                 n_layers, 
                 n_heads, 
                 pf_dim,
                 dropout, 
                 device):
        super().__init__()

        self.device = device
        self.input_dim = input_dim
        
        self.tok_embedding = nn.Linear(input_dim, hid_dim)
        self.year_embedding = nn.Embedding(5, hid_dim)
        self.month_embedding = nn.Embedding(12, hid_dim)
        self.day_embedding = nn.Embedding(31, hid_dim)
        
        self.layers = nn.ModuleList([EncoderLayer(hid_dim, 
                                                  n_heads, 
                                                  pf_dim,
                                                  dropout, 
                                                  device) 
                                     for _ in range(n_layers)])
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        
    def forward(self, src, past_dates):
        
        #src = [batch size, src len]
        #src_mask = [batch size, 1, 1, src len]
        
        batch_size = src.shape[0]
        src_len = src.shape[1]
        
#         pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        years, months, days = past_dates
        pos_embedding = self.year_embedding(years.to(self.device))\
                        + self.month_embedding(months.to(self.device))\
                        + self.day_embedding(days.to(self.device))
        
        #pos = [batch size, src len]
        noise = torch.normal(0, 0.01, size=src.shape).to(self.device) if self.training else 0
        tok_embedding = self.tok_embedding(src + noise).squeeze(0).repeat(self.input_dim,1)

        src = self.dropout(tok_embedding.view(batch_size, src_len, -1) * self.scale + pos_embedding)
        
        #src = [batch size, src len, hid dim]
        
        for layer in self.layers:
            src = layer(src)
            
        #src = [batch size, src len, hid dim]
            
        return src

In [16]:
class DecoderLayer(nn.Module):
    def __init__(self, 
                 hid_dim, 
                 n_heads, 
                 pf_dim, 
                 dropout, 
                 device):
        super().__init__()
        
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.enc_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.encoder_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, 
                                                                     pf_dim, 
                                                                     dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, trg, enc_src, trg_mask):
        
        #trg = [batch size, trg len, hid dim]
        #enc_src = [batch size, src len, hid dim]
        #trg_mask = [batch size, 1, trg len, trg len]
        #src_mask = [batch size, 1, 1, src len]
        
        #self attention
        _trg, _ = self.self_attention(trg, trg, trg, trg_mask)
        
        #dropout, residual connection and layer norm
        trg = self.self_attn_layer_norm(trg + self.dropout(_trg))
            
        #trg = [batch size, trg len, hid dim]
            
        #encoder attention
        _trg, attention = self.encoder_attention(trg, enc_src, enc_src)
        
        #dropout, residual connection and layer norm
        trg = self.enc_attn_layer_norm(trg + self.dropout(_trg))
                    
        #trg = [batch size, trg len, hid dim]
        
        #positionwise feedforward
        _trg = self.positionwise_feedforward(trg)
        
        #dropout, residual and layer norm
        trg = self.ff_layer_norm(trg + self.dropout(_trg))
        
        #trg = [batch size, trg len, hid dim]
        #attention = [batch size, n heads, trg len, src len]
        
        return trg, attention

In [17]:
class Decoder(nn.Module):
    def __init__(self, 
                 output_dim, 
                 hid_dim, 
                 n_layers, 
                 n_heads, 
                 pf_dim, 
                 dropout, 
                 device):
        super().__init__()
        
        self.device = device
        self.output_dim = output_dim
        
        self.tok_embedding = nn.Linear(output_dim, hid_dim)
        self.year_embedding = nn.Embedding(5, hid_dim)
        self.month_embedding = nn.Embedding(12, hid_dim)
        self.day_embedding = nn.Embedding(31, hid_dim)
        
        self.layers = nn.ModuleList([DecoderLayer(hid_dim, 
                                                  n_heads, 
                                                  pf_dim, 
                                                  dropout, 
                                                  device)
                                     for _ in range(n_layers)])
        
        self.fc_out = nn.Linear(hid_dim, 1)
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        
    def forward(self, trg, future_dates, enc_src, trg_mask):
        
        #trg = [batch size, trg len]
        #enc_src = [batch size, src len, hid dim]
        #trg_mask = [batch size, 1, trg len, trg len]
        #src_mask = [batch size, 1, 1, src len]
                
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        
#         pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        years, months, days = future_dates
        pos_embedding = self.year_embedding(years.to(self.device))\
                        + self.month_embedding(months.to(self.device))\
                        + self.day_embedding(days.to(self.device))
        
        #pos = [batch size, trg len]
        tok_embedding = self.tok_embedding(trg).squeeze(0).repeat(self.output_dim, 1)

        trg = self.dropout(tok_embedding.view(batch_size, trg_len, -1) * self.scale + pos_embedding)
                
        #trg = [batch size, trg len, hid dim]
        
        for layer in self.layers:
            trg, attention = layer(trg, enc_src, trg_mask)
        
        #trg = [batch size, trg len, hid dim]
        #attention = [batch size, n heads, trg len, src len]
        
        output = self.fc_out(trg)
        
        #output = [batch size, trg len, output dim]
            
        return output, attention

In [18]:
class Seq2Seq(nn.Module):
    def __init__(self, 
                 encoder, 
                 decoder, 
                 device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
    
    def make_trg_mask(self, trg):
        
        #trg = [batch size, trg len]
        
        trg_len = trg.shape[1]
        
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device = self.device)).bool()
        
        #trg_sub_mask = [trg len, trg len]
            
        trg_mask = trg_sub_mask
        
        #trg_mask = [batch size, 1, trg len, trg len]
        
        return trg_mask

    def forward(self, src, past_dates, trg, future_dates):
        
        #src = [batch size, src len]
        #trg = [batch size, trg len]
                
        trg_mask = self.make_trg_mask(trg)
        
        #src_mask = [batch size, 1, 1, src len]
        #trg_mask = [batch size, 1, trg len, trg len]
        
        enc_src = self.encoder(src, past_dates)
        
        #enc_src = [batch size, src len, hid dim]
                
        output, attention = self.decoder(trg, future_dates, enc_src, trg_mask)
        
        #output = [batch size, trg len, output dim]
        #attention = [batch size, n heads, trg len, src len]
        
        return output, attention

In [19]:
def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)

In [20]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in tqdm(enumerate(iterator), desc='TRAIN', total=len(iterator)):
        
        src = batch['past_values'].to(device)
        past_dates = batch['past_years'].to(device), batch['past_months'].to(device), batch['past_days'].to(device)
        trg = batch['future_values'].to(device)
        future_dates = batch['future_years'][:,:-1].to(device), batch['future_months'][:,:-1].to(device), batch['future_days'][:,:-1].to(device)
        
        optimizer.zero_grad()
        
        output, _ = model(src, past_dates, trg[:,:-1], future_dates)
                
        #output = [batch size, trg len - 1, output dim]
        #trg = [batch size, trg len]
            
        output_dim = output.shape[-1]
        
        output = output.contiguous().view(-1, output_dim)
        trg = trg[:,1:].contiguous().view(-1)
                
        #output = [batch size * trg len - 1, output dim]
        #trg = [batch size * trg len - 1]
            
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [21]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in tqdm(enumerate(iterator), desc='EVAL', total=len(iterator)):

            src = batch['past_values'].to(device)
            past_dates = batch['past_years'].to(device), batch['past_months'].to(device), batch['past_days'].to(device)
            trg = batch['future_values'].to(device)
            future_dates = batch['future_years'][:,:-1].to(device), batch['future_months'][:,:-1].to(device), batch['future_days'][:,:-1].to(device)

            output, _ = model(src, past_dates, trg[:,:-1], future_dates)
            
            #output = [batch size, trg len - 1, output dim]
            #trg = [batch size, trg len]
            
            output_dim = output.shape[-1]
            
            output = output.contiguous().view(-1, output_dim)
            trg = trg[:,1:].contiguous().view(-1)
            
            #output = [batch size * trg len - 1, output dim]
            #trg = [batch size * trg len - 1]
            
            loss = criterion(output, trg)

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [22]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

# Training

In [23]:
INPUT_DIM = 250
OUTPUT_DIM = 346
HID_DIM = 100

In [24]:
ENC_LAYERS = 2
ENC_HEADS = 4
ENC_PF_DIM = 100
ENC_DROPOUT = 0.2

DEC_LAYERS = 2
DEC_HEADS = 4
DEC_PF_DIM = 100
DEC_DROPOUT = 0.2

In [25]:
seed = 8
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7f0011ed7150>

In [26]:
enc = Encoder(INPUT_DIM, 
              HID_DIM, 
              ENC_LAYERS, 
              ENC_HEADS, 
              ENC_PF_DIM, 
              ENC_DROPOUT, 
              device)

dec = Decoder(OUTPUT_DIM, 
              HID_DIM, 
              DEC_LAYERS, 
              DEC_HEADS, 
              DEC_PF_DIM, 
              DEC_DROPOUT, 
              device)

In [27]:
model = Seq2Seq(enc, dec, device).to(device)

In [28]:
model.apply(initialize_weights);

In [29]:
LEARNING_RATE = 0.0005

optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)

In [30]:
criterion = nn.MSELoss()

In [34]:
N_EPOCHS = 20
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_dataloader, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, test_dataloader, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        # torch.save(model.state_dict(), 'tut6-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

TRAIN:   0%|          | 0/8 [00:00<?, ?it/s]

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


EVAL:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 01 | Time: 0m 4s
	Train Loss: 2.421 | Train PPL:  11.253
	 Val. Loss: 0.201 |  Val. PPL:   1.222


TRAIN:   0%|          | 0/8 [00:00<?, ?it/s]

EVAL:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 02 | Time: 0m 4s
	Train Loss: 0.386 | Train PPL:   1.471
	 Val. Loss: 0.084 |  Val. PPL:   1.088


TRAIN:   0%|          | 0/8 [00:00<?, ?it/s]

EVAL:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 03 | Time: 0m 4s
	Train Loss: 0.272 | Train PPL:   1.312
	 Val. Loss: 0.029 |  Val. PPL:   1.030


TRAIN:   0%|          | 0/8 [00:00<?, ?it/s]

EVAL:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 04 | Time: 0m 4s
	Train Loss: 0.180 | Train PPL:   1.197
	 Val. Loss: 0.002 |  Val. PPL:   1.002


TRAIN:   0%|          | 0/8 [00:00<?, ?it/s]

EVAL:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 05 | Time: 0m 4s
	Train Loss: 0.125 | Train PPL:   1.133
	 Val. Loss: 0.021 |  Val. PPL:   1.021


TRAIN:   0%|          | 0/8 [00:00<?, ?it/s]

EVAL:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 06 | Time: 0m 4s
	Train Loss: 0.088 | Train PPL:   1.092
	 Val. Loss: 0.010 |  Val. PPL:   1.010


TRAIN:   0%|          | 0/8 [00:00<?, ?it/s]

EVAL:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 07 | Time: 0m 4s
	Train Loss: 0.065 | Train PPL:   1.067
	 Val. Loss: 0.021 |  Val. PPL:   1.021


TRAIN:   0%|          | 0/8 [00:00<?, ?it/s]

EVAL:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 08 | Time: 0m 4s
	Train Loss: 0.053 | Train PPL:   1.055
	 Val. Loss: 0.028 |  Val. PPL:   1.028


TRAIN:   0%|          | 0/8 [00:00<?, ?it/s]

EVAL:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 09 | Time: 0m 4s
	Train Loss: 0.045 | Train PPL:   1.046
	 Val. Loss: 0.039 |  Val. PPL:   1.040


TRAIN:   0%|          | 0/8 [00:00<?, ?it/s]

EVAL:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 10 | Time: 0m 4s
	Train Loss: 0.039 | Train PPL:   1.040
	 Val. Loss: 0.002 |  Val. PPL:   1.002


TRAIN:   0%|          | 0/8 [00:00<?, ?it/s]

EVAL:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 11 | Time: 0m 4s
	Train Loss: 0.035 | Train PPL:   1.036
	 Val. Loss: 0.012 |  Val. PPL:   1.012


TRAIN:   0%|          | 0/8 [00:00<?, ?it/s]

EVAL:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 12 | Time: 0m 4s
	Train Loss: 0.031 | Train PPL:   1.031
	 Val. Loss: 0.017 |  Val. PPL:   1.017


TRAIN:   0%|          | 0/8 [00:00<?, ?it/s]

EVAL:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 13 | Time: 0m 4s
	Train Loss: 0.027 | Train PPL:   1.028
	 Val. Loss: 0.012 |  Val. PPL:   1.012


TRAIN:   0%|          | 0/8 [00:00<?, ?it/s]

EVAL:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 14 | Time: 0m 4s
	Train Loss: 0.025 | Train PPL:   1.025
	 Val. Loss: 0.006 |  Val. PPL:   1.006


TRAIN:   0%|          | 0/8 [00:00<?, ?it/s]

EVAL:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 15 | Time: 0m 4s
	Train Loss: 0.022 | Train PPL:   1.022
	 Val. Loss: 0.007 |  Val. PPL:   1.007


TRAIN:   0%|          | 0/8 [00:00<?, ?it/s]

EVAL:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 16 | Time: 0m 4s
	Train Loss: 0.021 | Train PPL:   1.021
	 Val. Loss: 0.002 |  Val. PPL:   1.002


TRAIN:   0%|          | 0/8 [00:00<?, ?it/s]

EVAL:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 17 | Time: 0m 4s
	Train Loss: 0.018 | Train PPL:   1.019
	 Val. Loss: 0.003 |  Val. PPL:   1.003


TRAIN:   0%|          | 0/8 [00:00<?, ?it/s]

EVAL:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 18 | Time: 0m 4s
	Train Loss: 0.017 | Train PPL:   1.017
	 Val. Loss: 0.002 |  Val. PPL:   1.002


TRAIN:   0%|          | 0/8 [00:00<?, ?it/s]

EVAL:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 19 | Time: 0m 4s
	Train Loss: 0.016 | Train PPL:   1.016
	 Val. Loss: 0.001 |  Val. PPL:   1.001


TRAIN:   0%|          | 0/8 [00:00<?, ?it/s]

EVAL:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch: 20 | Time: 0m 4s
	Train Loss: 0.014 | Train PPL:   1.015
	 Val. Loss: 0.003 |  Val. PPL:   1.003


In [35]:
test_loss = evaluate(model, test_dataloader, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

EVAL:   0%|          | 0/8 [00:00<?, ?it/s]

| Test Loss: 0.003 | Test PPL:   1.003 |


## Embeddings

In [37]:
past_values = torch.tensor(df_train.T[df_train.T.index < '2019-01-01'].T.values, dtype=torch.float32)
past_dates = df_train.T[df_train.T.index < '2019-01-01'].index
past_years = torch.tensor(pd.Series(past_dates).dt.year.values - 2018, dtype=int)
past_months = torch.tensor(pd.Series(past_dates).dt.month.values - 1, dtype=int)
past_days = torch.tensor(pd.Series(past_dates).dt.day.values - 1, dtype=int)

In [38]:
src = past_values.to(device)
past_dates = past_years.to(device), past_months.to(device), past_days.to(device)

In [39]:
# "embedding" layer
embds1 = model.encoder.tok_embedding(src)

In [40]:
# first encoder layer
tok_emb = model.encoder.tok_embedding(src).repeat(INPUT_DIM,1).view(src.shape[0], src.shape[1], -1)
scale = torch.sqrt(torch.FloatTensor([100])).to(device)
years, months, days = past_dates
pos_emb = model.encoder.year_embedding(years.to(device))\
                + model.encoder.month_embedding(months.to(device))\
                + model.encoder.day_embedding(days.to(device))
src_scaled = tok_emb * scale + pos_emb
embds2 = model.encoder.layers[0](src_scaled).reshape(src_scaled.shape[0], src_scaled.shape[1], -1).mean(dim=1)

In [41]:
# whole encoder
embds3 = model.encoder(src, past_dates).mean(dim=1)

In [42]:
# sum of hidden layers
embds4 = embds1 + embds2 + embds3

In [43]:
# sum of hidden layers except first
embds5 = embds2 + embds3

In [44]:
for i, embds in enumerate([embds1, embds2, embds3, embds4, embds5]):
    embds = pd.DataFrame(embds.cpu().detach().numpy())
    embds.index = df_pct.columns
    embds.to_csv(f"../results/transformer_embds{i+1}.csv")