# Toy Problem - Transformer Application to Regression

https://github.com/oliverguhr/transformer-time-series-prediction/tree/master

Description: This example is from the repo above. It contains 2 PyTorch models for transformer-based time series prediction. The dataset is stored in ./daily-min-temperatures.csv.

In [1]:
import torch 
import torch.nn as nn
import numpy as np 
import time 
import math
from matplotlib import pyplot

In [2]:
# Seed PyTorch first and NumPy second with the same fixed value (0) so that
# every run of the notebook draws identical random numbers.
for _seed_fn in (torch.manual_seed, np.random.seed):
    _seed_fn(0)

In [3]:
# Global switch read elsewhere in the notebook (its use is not visible in this cell).
# NOTE(review): presumably selects whether the loss is computed over every predicted
# value or only over the final output_window values - confirm against the training code.
calculate_loss_over_all_values = False

In [4]:
# S = source sequence length
# T = target sequence length 
# N = batch size 
# E = feature number

In [5]:
# Global hyperparameters for the windowed time-series setup.
# NOTE(review): presumably input_window is the number of past steps fed to the model
# and output_window the number of steps to predict - confirm where they are consumed.
input_window = 100
output_window = 5
batch_size = 10

# Run on the GPU when one is available, otherwise fall back to the CPU.
# The device type must be spelled "cuda": the previous "curda" typo made
# torch.device() raise on exactly the machines that do have a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
class PositionalEncoding(nn.Module):  # a pytorch module that inherits from nn.Module
    """
    Fixed sinusoidal positional encoding layer for a transformer model.

    The layer injects info about the relative or absolute position of each element
    of the sequence, without adding learnable parameters.

    Uses sin and cos fcns of different frequencies to encode position info, as in the
    "Attention Is All You Need" paper:

        PE(pos, 2i)     = sin(pos / 10000^(2i / d_model))
        PE(pos, 2i + 1) = cos(pos / 10000^(2i / d_model))

    Args:
        d_model (int): dimension of the embedding space (may be even or odd)
        max_len (int, optional): max sequence length supported. Default is 5000

    Attributes:
        pe (Tensor): fixed positional encoding matrix of shape (max_len, 1, d_model)
    """

    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()

        # Empty matrix of shape [max_len, d_model] that will hold the encodings.
        # row: position in the sequence, i.e. 0, 1, 2, ...
        # column: dimension of the embedding
        pe = torch.zeros(max_len, d_model)

        # Column vector of positions [0, 1, ..., max_len - 1]; unsqueeze reshapes it
        # from [max_len] to [max_len, 1] so it broadcasts against div_term below.
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)

        # One frequency per sin/cos pair: exp(2i * -ln(10000) / d_model) = 1 / 10000^(2i / d_model).
        # torch.arange(0, d_model, 2) yields ceil(d_model / 2) values.
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        # Even indices: sin(position * frequency).
        pe[:, 0::2] = torch.sin(position * div_term)

        # Odd indices: cos(position * frequency). There are only floor(d_model / 2) odd
        # columns, so trim div_term to that length; without the trim an odd d_model
        # fails with a shape mismatch. For even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])

        # unsqueeze(0) adds a leading dimension -> [1, max_len, d_model];
        # transpose(0, 1) swaps the first two dimensions -> [max_len, 1, d_model],
        # so the encoding broadcasts over the batch dimension in forward().
        pe = pe.unsqueeze(0).transpose(0, 1)

        # Register pe as a buffer: it is part of the module's state (moved along with
        # the module by .to()/.cuda() and saved in the state_dict) but it is NOT a
        # parameter, so it is not updated during backprop.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Add the positional encoding to the input.

        Args:
            x (Tensor): input of shape [sequence length, batch_size, d_model]

        Returns:
            Tensor: x plus the encodings of its first x.size(0) positions, same shape as x
        """
        # Keep only the encodings for the positions actually present in x.
        return x + self.pe[:x.size(0), :]

In [7]:
# EXAMPLE USE OF POSITIONAL ENCODING
# Stand-in input: values drawn from a standard normal distribution, shape [100, 32, 512].
sample_x = torch.randn((100, 32, 512))
# Build the encoder for a 512-dimensional embedding and apply it to the sample.
positional_encoder = PositionalEncoding(d_model=512)
sample_encode = positional_encoder(sample_x)

In [8]:
# Show the first 3 entries along each axis of the raw (un-encoded) input.
print(sample_x[0:3, 0:3, 0:3])

tensor([[[-1.1258, -1.1524, -0.2506],
         [-0.5461, -0.6302, -0.6347],
         [-1.0841, -0.1287, -0.6811]],

        [[-0.5518,  1.5398,  1.0036],
         [-0.4424,  0.2087,  0.0160],
         [ 1.2970, -0.4725,  0.3149]],

        [[-0.9780,  0.6038, -1.7178],
         [-0.3399, -0.2990,  1.8007],
         [ 0.6786,  0.5225, -0.0246]]])


In [9]:
# Show the same 3x3x3 slice of the positional encoder's output.
print(sample_encode[0:3, 0:3, 0:3])

tensor([[[-1.1258, -0.1524, -0.2506],
         [-0.5461,  0.3698, -0.6347],
         [-1.0841,  0.8713, -0.6811]],

        [[ 0.2896,  2.0801,  1.8254],
         [ 0.3990,  0.7490,  0.8379],
         [ 2.1385,  0.0678,  1.1368]],

        [[-0.0687,  0.1877, -0.7814],
         [ 0.5694, -0.7151,  2.7371],
         [ 1.5879,  0.1063,  0.9118]]])


In [None]:
class TransAm(nn.Module): 
    def 