# Model Playground

Sources:
- Time-series Transformer guide: <https://towardsdatascience.com/the-time-series-transformer-2a521a0efad3>
- Time2Vec embedding: <https://arxiv.org/pdf/1907.05321.pdf>

In [12]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd

## Load Datasets

In [13]:
# Detect whether this notebook is running inside Google Colab by probing for
# the `google.colab` package.
# Only ImportError is caught: a bare `except:` would also swallow
# KeyboardInterrupt/SystemExit and hide unrelated failures.
try:
    import google.colab  # noqa: F401  (imported only to test availability)
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True

In [14]:
# Choose the directory that holds the datasets, depending on where the
# notebook is executing.
if not IN_COLAB:
    # Not in Colab: use the datasets directory relative to the working directory.
    dataset_root = "../datasets/"
else:
    from google.colab import drive

    # In Colab: mount Google Drive first, then point at the datasets folder
    # stored inside it.
    drive.mount("/content/gdrive")
    dataset_root = "/content/gdrive/My Drive/Virginia Tech/graduate/research/makassar/repos/makassar-ml/datasets/"

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


### Dataset: Beijing PM2.5

In [15]:
import torch.utils.data

In [16]:
class BeijingPM25Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing CSV weather rows with synthetic plant scores.

    Indexing yields a ``(src, tgt)`` tuple of NumPy rows: ``src`` holds the
    timestamp and weather columns read from the CSV, and ``tgt`` holds one
    randomly generated health score per plant.
    """

    def __init__(self, path: str) -> None:
        """Read the CSV at ``path`` and build the source/target arrays.

        Args:
            path: Location of the CSV file. Only the columns listed in
                ``source_columns`` are read from it.
        """
        # Columns read from the file; together they form the model input.
        # TODO: optionally collapse year/month/day/hour into a single
        # datetime column.
        source_columns = ['year', 'month', 'day', 'hour', 'DEWP', 'TEMP', 'PRES', 'Is', 'Ir']
        frame = pd.read_csv(path, usecols=source_columns)

        # Placeholder targets: one health score per plant, drawn uniformly
        # from [0, 1) for every row. Scores come from NumPy's global RNG, so
        # they differ between runs unless that RNG is seeded beforehand.
        plant_columns = ['tomato', 'sunflower', 'cucumber']
        n_rows = frame.shape[0]
        scores = {}
        for plant in plant_columns:
            scores[plant] = np.random.uniform(0.0, 1.0, size=n_rows)
        self.df = frame.assign(**scores)

        # Split the table into model input (source) and model output (target).
        self.src = self.df[source_columns].to_numpy()
        self.tgt = self.df[plant_columns].to_numpy()

    def __len__(self) -> int:
        """Return the number of rows (samples) in the dataset."""
        return self.df.shape[0]

    def __getitem__(self, index):
        """Return the ``(source, target)`` rows stored at ``index``."""
        return self.src[index], self.tgt[index]

In [17]:
# Build the path to the Beijing PM2.5 CSV under the dataset root, then load
# it through the dataset class.
csvfile = os.path.join(
    dataset_root,
    "beijing_pm2.5",
    "PRSA_data_2010.1.1-2014.12.31.csv",
)
dataset = BeijingPM25Dataset(path=csvfile)

In [18]:
# Wrap the dataset in a loader that serves batches of 32 samples, visiting
# samples in dataset index order (shuffling disabled).
loader = torch.utils.data.DataLoader(
    dataset=dataset,
    batch_size=32,
    shuffle=False,
)

## Model Definition

In [19]:
import torch.nn

#### Transformer for Time-Series Forecasting

In [20]:
class TimeSeriesTransformer(torch.nn.Module):
    """Encoder-decoder Transformer for time-series forecasting.

    Source and target feature vectors are linearly projected into a shared
    ``d_model``-dimensional space, passed through a Transformer encoder and
    decoder, and reduced to one predicted value per target time step.

    No positional or time embedding is added to the projected inputs.

    Args:
        n_encoder_inputs: Size of the last (feature) dimension of ``src``.
        n_decoder_inputs: Size of the last (feature) dimension of ``tgt``.
        d_model: Width of the latent space used by every Transformer layer.
            Must be divisible by ``nhead``.
        dropout: Dropout probability used inside the Transformer layers.
        batch_first: If ``True`` tensors are ``(batch, seq, feature)``,
            otherwise ``(seq, batch, feature)``.
        nhead: Number of attention heads in each layer.
        num_layers: Number of stacked layers in both encoder and decoder.
    """

    def __init__(self,
        n_encoder_inputs: int,
        n_decoder_inputs: int,
        d_model: int = 512,
        dropout: float = 0.1,
        batch_first: bool = False,
        nhead: int = 8,
        num_layers: int = 8,
        ):
        super().__init__()

        # Remembered so decode() knows which axis holds the sequence.
        self.batch_first = batch_first

        # Linear transformation from input-feature space into arbitrary n-dimension space.
        # This is similar to a word embedding used in NLP tasks.
        self.encoder_projection = torch.nn.Linear(in_features=n_encoder_inputs, out_features=d_model)
        self.decoder_projection = torch.nn.Linear(in_features=n_decoder_inputs, out_features=d_model)

        # Transformer encoder/decoder layers.
        encoder_layer = torch.nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead, # Number of multihead-attention models.
            dropout=dropout,
            dim_feedforward=4*d_model,
            batch_first=batch_first,
        )
        decoder_layer = torch.nn.TransformerDecoderLayer(
            d_model=d_model,
            nhead=nhead, # Number of multihead-attention models.
            dropout=dropout,
            dim_feedforward=4*d_model,
            batch_first=batch_first,
        )
        self.encoder = torch.nn.TransformerEncoder(encoder_layer=encoder_layer, num_layers=num_layers)
        self.decoder = torch.nn.TransformerDecoder(decoder_layer=decoder_layer, num_layers=num_layers)

        # Linear output layer.
        # We only predict a single data point at a time, so output features is 1.
        self.linear = torch.nn.Linear(in_features=d_model, out_features=1)


    def encode(self, src: torch.Tensor) -> torch.Tensor:
        """Project ``src`` into the latent space and run the encoder stack.

        Args:
            src: Source tensor whose last dimension is ``n_encoder_inputs``.

        Returns:
            Encoder memory with last dimension ``d_model``.
        """
        projected = self.encoder_projection(src)
        return self.encoder(projected)


    def decode(self, tgt: torch.Tensor, memory: torch.Tensor) -> torch.Tensor:
        """Decode ``tgt`` against the encoder ``memory``.

        Args:
            tgt: Target tensor whose last dimension is ``n_decoder_inputs``.
            memory: Output of :meth:`encode`.

        Returns:
            Tensor shaped like ``tgt`` but with a last dimension of 1.
        """
        projected = self.decoder_projection(tgt)

        # The sequence axis is 1 only for batched, batch-first input;
        # otherwise (seq-first or unbatched) it is axis 0.
        seq_dim = 1 if (self.batch_first and projected.dim() == 3) else 0
        seq_len = projected.size(seq_dim)

        # Causal mask: -inf above the diagonal so that step i can only attend
        # to target steps <= i and never sees future values.
        causal_mask = torch.triu(
            torch.full(
                (seq_len, seq_len),
                float("-inf"),
                dtype=projected.dtype,
                device=projected.device,
            ),
            diagonal=1,
        )

        decoded = self.decoder(tgt=projected, memory=memory, tgt_mask=causal_mask)
        return self.linear(decoded)


    def forward(self, src: torch.Tensor, tgt: torch.Tensor) -> torch.Tensor:
        """Encode ``src``, then decode ``tgt`` against it.

        Returns:
            One predicted value per target time step (last dimension 1).
        """
        x = self.encode(src)
        x = self.decode(tgt=tgt, memory=x)
        return x

In [21]:
# Prediction problem setup.
#
# Goal: from 24 hours of data points, predict the following 1 hour.
# NOTE(review): TimeSeriesTransformer passes these two values to
# torch.nn.Linear(in_features=...), so they act as per-time-step feature
# counts rather than sequence lengths — confirm the intended values.
n_encoder_inputs = 24  # Number of data points in input sequence.
n_decoder_inputs = 1  # Number of data points in output sequence.

d_model = 512  # Latent dimension.
dropout = 0.1

# Instantiate the model, naming every argument explicitly.
model = TimeSeriesTransformer(
    n_encoder_inputs=n_encoder_inputs,
    n_decoder_inputs=n_decoder_inputs,
    d_model=d_model,
    dropout=dropout,
)