# Model Playground

Sources:
- Time-series Transformer guide: <https://towardsdatascience.com/the-time-series-transformer-2a521a0efad3>
- Time2Vec embedding: <https://arxiv.org/pdf/1907.05321.pdf>

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from typing import List

## Model Definition

In [2]:
import torch
import torch.nn

### Transformer for Time-Series Forecasting

In [3]:
def create_attn_mask(length: int, device: str = None):
    """Generate mask used for attention mechanisms.

    Masks are a lower-triangular matrix of zeros
    with the other entries taking value "-inf".

    Args:
        length (int): Length of square-matrix dimension.
        device (str, optional): PyTorch device.

    Examples:

        >>> create_mask(3)
        tensor([[0., -inf, -inf],
                [0., 0., -inf],
                [0., 0., 0.]])
    """
    # Get lower-triangular matrix of ones.
    mask = torch.tril(torch.ones(length, length, device=device))

    # Replace 0 -> "-inf" and 1 -> 0.0
    mask = (
        mask
        .masked_fill(mask == 0, float("-inf"))
        .masked_fill(mask == 1, float(0.0))
    )
    return mask

In [4]:
class TimeSeriesTransformer(torch.nn.Module):

    def __init__(self,
        n_encoder_inputs: int,
        n_decoder_inputs: int,
        n_outputs: int = 1,
        d_model: int = 512,
        dropout: float = 0.1,
        batch_first: bool = False,
        ):
        super().__init__()

        self.batch_first = batch_first

        # Linear transformation from input-feature space into arbitrary n-dimension space.
        # This is similar to a word embedding used in NLP tasks.
        self.encoder_projection = torch.nn.Linear(in_features=n_encoder_inputs, out_features=d_model)
        self.decoder_projection = torch.nn.Linear(in_features=n_decoder_inputs, out_features=d_model)

        # Transformer encoder/decoder layers.
        encoder_layer = torch.nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=8, # Number of multihead-attention models.
            dropout=dropout,
            dim_feedforward=4*d_model,
            batch_first=batch_first,
        )
        decoder_layer = torch.nn.TransformerDecoderLayer(
            d_model=d_model,
            nhead=8, # Number of multihead-attention models.
            dropout=dropout,
            dim_feedforward=4*d_model,
            batch_first=batch_first,
        )
        self.encoder = torch.nn.TransformerEncoder(encoder_layer=encoder_layer, num_layers=8)
        self.decoder = torch.nn.TransformerDecoder(decoder_layer=decoder_layer, num_layers=8)

        # Linear output layer.
        # We typically only predict a single data point at a time, so output features is typically 1.
        self.linear = torch.nn.Linear(in_features=d_model, out_features=n_outputs)


    def encode(self, src):
        # Transform source into arbitrary feature space.
        x = self.encoder_projection(src)

        # # Create source mask.
        # if self.batch_first:
        #     src_length, batch_size = src.size(1), src.size(0)
        # else:
        #     src_length, batch_size = src.size(0), src.size(1)
        # src_mask = create_attn_mask(length=src_length, device=src.device)

        # # Pass the linear transformation through the encoder layers.
        # x = self.encoder(x, mask=src_mask)
        x = self.encoder(x)

        return x


    def decode(self, tgt, memory):
        # Transform target into arbitrary feature space.
        x = self.decoder_projection(tgt)

        # Create target attention mask.
        if self.batch_first:
            tgt_length, batch_size = tgt.size(1), tgt.size(0)
        else:
            tgt_length, batch_size = tgt.size(0), tgt.size(1)
        tgt_mask = create_attn_mask(length=tgt_length, device=tgt.device)

        # Pass the linear transformation through the decoder layers.
        x = self.decoder(tgt=x, memory=memory, tgt_mask=tgt_mask)

        # Pass the output of the decoder through the linear prediction layer.
        x = self.linear(x)

        return x


    def forward(self, x):
        src, tgt = x
        y = self.encode(src)
        y = self.decode(tgt=tgt, memory=y)
        return y

    
    def step(self, batch):
        src, tgt_int, tgt_out = batch

## Load Dataset: Beijing PM2.5 

In [5]:
import torch.utils.data

In [6]:
try:
    import google.colab
    IN_COLAB = True
except:
    IN_COLAB = False

In [7]:
if IN_COLAB:
    from google.colab import drive
    drive.mount("/content/gdrive")
    dataset_root = "/content/gdrive/My Drive/Virginia Tech/graduate/research/makassar/repos/makassar-ml/datasets/"
else:
    dataset_root = "../datasets/"

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [8]:
class BeijingPM25Dataset(torch.utils.data.Dataset):

    def __init__(self, path: str):

        # Read the input file.
        fields = ['year','month','day','hour','DEWP','TEMP','PRES','Is','Ir'] # Specific columns to use.
        self.df = pd.read_csv(path, usecols=fields)

        # # Create single date column from independent year/month/day columns.
        # self.df = self.df.assign(date=pd.to_datetime(df[['year','month','day','hour']]))

        # Add health scores to the dataset for specific plants.
        # These scores are normalized between [0,1].
        features = ['tomato', 'sunflower', 'cucumber']
        self.df = self.df.assign(**{feat:np.random.uniform(0.0, 1.0, size=self.df.shape[0]) for feat in features})

        # Separate dataset into source (input) and target (output).
        # self.src = df[['date', 'DEWP', 'TEMP', 'PRES', 'Is', 'Ir']].to_numpy()
        # self.src = self.df[['year','month','day','hour', 'DEWP', 'TEMP', 'PRES', 'Is', 'Ir']].to_numpy()
        #
        # self.src = self.df[['DEWP', 'TEMP', 'PRES', 'Is', 'Ir']].to_numpy()
        # self.tgt = self.df[['tomato', 'sunflower', 'cucumber']].to_numpy()
        #
        # self.data = self.df[['DEWP', 'TEMP', 'PRES', 'Is', 'Ir','tomato', 'sunflower', 'cucumber']].to_numpy()
        self.src = self.df[['DEWP']].to_numpy()
        self.tgt = self.df[['tomato']].to_numpy()

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, index):
        src = torch.tensor(self.src[index], dtype=torch.float)
        tgt = torch.tensor(self.tgt[index], dtype=torch.float)
        return src, tgt
        # return torch.tensor(self.data[index], dtype=torch.float)

In [9]:
class BeijingSlidingWindowDataset(torch.utils.data.Dataset):
    def __init__(self, dataset: torch.utils.data.Dataset, window_size: int, horizon_size: int):
        self.window_size = window_size
        self.horizon_size = horizon_size

        # Split original dataset into windows.
        self.src_in, self.src_out = [], []
        self.tgt_in, self.tgt_out = [], []
        for i in range(0, len(dataset), horizon_size):
            src_in, tgt_in = dataset[i:i+window_size]
            src_out, tgt_out = dataset[i+window_size:i+window_size+horizon_size]
            self.src_in.append(src_in)
            self.src_out.append(src_out)
            self.tgt_in.append(tgt_in)
            self.tgt_out.append(tgt_out)
        # windows = [
        #     (
        #         dataset[i:i+window_size],
        #         dataset[i+window_size:i+window_size+horizon_size],
        #     ) 
        #     for i in range(0, len(dataset), horizon_size)
        # ]
        # self.n_windows = len(windows)
        # self.src, self.tgt = tuple(zip(*windows))
        #
        # self.src_in = torch.stack(self.src_in, dim=0)
        # self.src_out = torch.stack(self.src_out, dim=0)
        # self.tgt_in = torch.stack(self.tgt_in, dim=0)
        # self.tgt_out = torch.stack(self.tgt_out, dim=0)

        # Pad any partial sequences.
        self.src_in = torch.nn.utils.rnn.pad_sequence(self.src_in, batch_first=True, padding_value=0.0)
        self.src_out = torch.nn.utils.rnn.pad_sequence(self.src_out, batch_first=True, padding_value=0.0)
        self.tgt_in = torch.nn.utils.rnn.pad_sequence(self.tgt_in, batch_first=True, padding_value=0.0)
        self.tgt_out = torch.nn.utils.rnn.pad_sequence(self.tgt_out, batch_first=True, padding_value=0.0)

    def __len__(self):
        return self.src_in.shape[0]
    
    def __getitem__(self, index):
        return self.src_in[index], self.tgt_in[index], self.src_out[index], self.tgt_out[index]

In [10]:
class NewBeijingSlidingWindowDataset(torch.utils.data.Dataset):
    def __init__(self, dataset: torch.utils.data.Dataset, window_size: int):
        self.window_size = window_size

        # Split original dataset into windows.
        self.src, self.tgt = [], []
        for i in range(0, len(dataset)):
            src, tgt = dataset[i:i+window_size]
            self.src.append(src)
            self.tgt.append(tgt)

        # Pad any partial sequences.
        self.src = torch.nn.utils.rnn.pad_sequence(self.src, batch_first=True, padding_value=0.0)
        self.tgt = torch.nn.utils.rnn.pad_sequence(self.tgt, batch_first=True, padding_value=0.0)

    def __len__(self):
        return self.src.shape[0]
    
    def __getitem__(self, index):
        return self.src[index], self.tgt[index]

In [11]:
# Load the dataset from file.
csvfile = os.path.join(dataset_root, "beijing_pm2.5", "PRSA_data_2010.1.1-2014.12.31.csv")
dataset = BeijingPM25Dataset(csvfile)

In [12]:
# Create train/test split.
trainsplit = 0.75
n_records = len(dataset)
split_idx = int(n_records*trainsplit)
train_dataset = torch.utils.data.Subset(dataset, list(range(split_idx)))
test_dataset = torch.utils.data.Subset(dataset, list(range(split_idx, n_records)))

In [13]:
# Create sliding window dataset.
window_size = 24 # Number of historic data points.
horizon_size = 5 # Number of prediction points in the future.
# swdataset = BeijingSlidingWindowDataset(dataset, window_size=window_size, horizon_size=horizon_size)
train_windows = NewBeijingSlidingWindowDataset(train_dataset, window_size=window_size)
test_windows = NewBeijingSlidingWindowDataset(test_dataset, window_size=window_size)

In [14]:
# Create a dataset loader to assist with batching.
batch_size = 32
train_loader = torch.utils.data.DataLoader(train_windows, batch_size=batch_size, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_windows, batch_size=batch_size, shuffle=False)

In [15]:
# Create start-of-sequence (SOS) vector.
sos_vector = torch.ones(dataset.tgt.shape[-1])*-1.0

In [16]:
# # Test the data loader.
# for b,batch in enumerate(loader):
#     src_in,tgt_in, src_out,tgt_out = batch
#     print(src_out[0,0],tgt_out[0,0])
#     print(dataset[24])
#     print(f'[{b}] src_in.shape',src_in.shape)
#     print(f'[{b}] tgt_in.shape',tgt_in.shape)
#     print(f'[{b}] src_out.shape',src_out.shape)
#     print(f'[{b}] tgt_out.shape',tgt_out.shape)
#     break

In [17]:
# # Test the data loader.
# for b,batch in enumerate(train_loader):
#     src,tgt = batch
#     print(src[0,0],tgt[0,0])
#     print(dataset[0])
#     print(f'[{b}] src.shape',src.shape)
#     print(f'[{b}] tgt.shape',tgt.shape)
#     break

In [18]:
# a = torch.ones(23,8)
# b = torch.ones(24,8)
# pad = (0,0,1,1)
# # print(a.view((24,8)).shape)
# print(torch.nn.functional.pad(a, pad).shape)
# print(torch.nn.functional.pad(b, pad).shape)
# print(torch.nn.utils.rnn.pad_sequence([a,b], batch_first=True, padding_value=0.0)[0,-1])

## Training

In [19]:
import time
from contextlib import contextmanager
@contextmanager
def timing(description='Elapsed time'):
    """Context manager to print elapsed time from call."""
    start_time = time.time()
    yield
    stop_time = time.time()
    print(f"{description}: {stop_time - start_time} seconds")

In [20]:
def smape_loss(y: torch.Tensor, y_pred: torch.Tensor):
    return 2*(y - y_pred).abs() / (y.abs() + y_pred.abs())

In [21]:
def train(model, loader, optimizer, criterion, epochs, device='cpu') -> List[float]:
    model.train() # Turn on training mode.

    losses = []
    for e in range(epochs):
        running_loss = 0.0
        for i, batch in enumerate(loader):
            # inputs, targets = batch # Unpack the batch tuple.
            # src_in,tgt_in,_,tgt_out = batch
            src,tgt = batch
            
            # Send data to device.
            # src_in = src_in.to(device)
            # tgt_in = tgt_in.to(device)
            # tgt_out = tgt_out.to(device)
            src = src.to(device)
            tgt = tgt.to(device)

            # Shift target sequences so that SOS vector can be inserted.
            sos_batch = sos_vector.view(1,1,sos_vector.size(0)).repeat(tgt.shape[0],1,1)
            tgt_shifted = torch.cat((sos_batch, tgt[:, :-2]), dim=1)
            tgt_y = tgt[:, 1:]


            # Evaluate the model.
            # tgt_pred = model((src_in,tgt_in))
            tgt_pred = model((src, tgt_shifted))
            # tgt_pred = model((src[:, :-1], tgt_in))

            print('tgt.shape',tgt.shape)
            print('tgt_y.shape',tgt_y.shape)
            print('sos_vector.shape',sos_vector.shape)
            print('sos_batch.shape', sos_batch.shape)
            print('tgt_shifted.shape', tgt_shifted.shape)
            print('tgt_pred.shape', tgt_pred.shape)
            print()
            # print(tgt_pred.view(-1, tgt_out.shape[-1]).shape)
            # print(tgt_out.view(-1, tgt_out.shape[-1]).shape)
            # print('inputs.shape',inputs.shape)
            # print('targets.shape',targets.shape)
            # print('targets_pred.shape',targets_pred.shape)

            # Compute losses.
            loss = criterion(tgt_pred, tgt_y)

            # Zero the gradient, back-propagate, and step the optimizer.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Accumulate the loss for this epoch.
            running_loss += loss.item()
            if i >= 2: break

        # Report epoch results.
        print(f'Epoch {e}: loss {running_loss}')
        losses.append(losses)
    return losses

In [22]:
# Set runtime device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Prediction problem setup.
#
# Given 24 hours of data points, predict the next 1 hour of data points.
n_encoder_inputs = 1 #24 # Number of data points in input sequence.
n_decoder_inputs = 1 #8 # Number of data points in output sequence.
n_outputs = 1 # Number of output data points.

d_model = 512 # Latent dimension.
dropout = 0.1

# Create new model.
model = TimeSeriesTransformer(
    n_encoder_inputs,
    n_decoder_inputs,
    n_outputs,
    d_model,
    dropout,
    batch_first=True,
)

# Train the model.
# Display training time too.
epochs = 1
lr = 1e-3
with timing():
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = torch.nn.MSELoss(reduction='mean')
    train(model, loader=train_loader, optimizer=optimizer, criterion=criterion, epochs=epochs, device=device)

tgt.shape torch.Size([32, 24, 1])
tgt_y.shape torch.Size([32, 23, 1])
sos_vector.shape torch.Size([1])
sos_batch.shape torch.Size([32, 1, 1])
tgt_shifted.shape torch.Size([32, 23, 1])
tgt_pred.shape torch.Size([32, 23, 1])

tgt.shape torch.Size([32, 24, 1])
tgt_y.shape torch.Size([32, 23, 1])
sos_vector.shape torch.Size([1])
sos_batch.shape torch.Size([32, 1, 1])
tgt_shifted.shape torch.Size([32, 23, 1])
tgt_pred.shape torch.Size([32, 23, 1])

tgt.shape torch.Size([32, 24, 1])
tgt_y.shape torch.Size([32, 23, 1])
sos_vector.shape torch.Size([1])
sos_batch.shape torch.Size([32, 1, 1])
tgt_shifted.shape torch.Size([32, 23, 1])
tgt_pred.shape torch.Size([32, 23, 1])

Epoch 0: loss 106.95018434524536
Elapsed time: 17.702310800552368 seconds


In [36]:
def evaluate(model, history, horizon, device='cpu'):
    model.eval()
    model.to(device)

    # sos = sos_vector.view(1,1,sos_vector.size(0))
    sos = sos_vector.view(1,1,sos_vector.size(0)).repeat(history.shape[0],1,1)
    for i in range(horizon):
        out = model((history, sos))
    print('out.shape', out.shape)
    return out

IndentationError: ignored

In [44]:
for batch in test_loader:
    src, tgt = batch
    for i in range(src.shape[0]):
        print(src.shape, tgt.shape, src[i].view(1,*src.shape[1:]).shape)
        out = evaluate(model, src[i].view(1,*src.shape[1:]), 5, device=device)
        print(out.shape)
        print(smape_loss(out, tgt[i]))
        break
    break

torch.Size([32, 24, 1]) torch.Size([32, 24, 1]) torch.Size([1, 24, 1])
out.shape torch.Size([1, 1, 1])
torch.Size([1, 1, 1])
tensor([[[1.1928],
         [1.8505],
         [1.5840],
         [1.7564],
         [1.8933],
         [1.6984],
         [1.8085],
         [1.5899],
         [1.1856],
         [1.4727],
         [1.5721],
         [1.9533],
         [1.9637],
         [1.4715],
         [1.8771],
         [1.1999],
         [1.8688],
         [1.6952],
         [1.5914],
         [1.2477],
         [1.2189],
         [1.7047],
         [1.9770],
         [1.9383]]], grad_fn=<DivBackward0>)
