In [1]:
from models.classifier import TransformerEncoder
import torch

In [2]:
# Seed PyTorch's global RNG before the model is constructed so that any
# torch-based random initialisation inside TransformerEncoder, and the random
# tensors drawn in later cells, repeat on a fresh "Restart & Run All".
SEED = 0
torch.manual_seed(SEED)

# Hyperparameters handed to TransformerEncoder as a single dict.
hparams = {
    "hidden_size": 96,  # size of the hidden layers and embeddings
    "hidden_ff": 128,  # size of the position-wise feed-forward layer
    "n_encoders": 4,  # number of encoder blocks
    "n_heads": 8,  # number of attention heads in the multiheadattention module
    "n_local": 2,  # number of local attention heads
    "local_window_size": 4,  # size of the window for local attention
    "batch_size": 12,  # number of sequences per batch
    "max_length": 500,  # maximum length of the input sequence
    "vocab_size": 1000,  # size of the vocabulary
    "learning_rate": 0.001,
    "num_epochs": 30,
    "attention_type": "performer",
    "norm_type": "rezero",
    "num_random_features": 32,  # number of random features for the Attention module (Performer uses this)
    "emb_dropout": 0.1,  # dropout for the embedding block
    "fw_dropout": 0.1,  # dropout for the position-wise feed-forward layer
    "att_dropout": 0.1,  # dropout for the multiheadattention module
    "dc_dropout": 0.1,  # dropout for the decoder block
    "hidden_act": "swish",  # activation function for the hidden layers (attention layers use ReLU)
    "epsilon": 1e-8,
    "weight_decay": 0.01,
    "beta1": 0.9,
    "beta2": 0.999,
}
model = TransformerEncoder(hparams)

In [3]:
# Check that a key of the hparams dict can be read back as an attribute of model.hparams.
model.hparams.batch_size 

12

In [12]:
# Synthetic smoke-test batch whose tensor sizes come from the model's hparams.
# A dedicated, seeded generator keeps the random fields identical every time
# this cell is (re-)run, independent of the global RNG state.
batch_size = model.hparams.batch_size
max_length = model.hparams.max_length
batch_rng = torch.Generator().manual_seed(0)

batch = {
    # random token ids in [0, vocab_size), shape (batch_size, max_length)
    'tokens': torch.randint(0, model.hparams.vocab_size, (batch_size, max_length), generator=batch_rng),
    # positions 0..max_length-1, copied for every row
    'abspos': torch.arange(0, max_length).unsqueeze(0).repeat(batch_size, 1),
    # one random integer in [0, 100) per row
    'age': torch.randint(0, 100, (batch_size, 1), generator=batch_rng),
    # all zeros; presumably means "no padded positions" — confirm against the model's mask convention
    'padding_mask': torch.zeros(batch_size, max_length),
    # random binary labels stored as float, one per row
    'targets': torch.randint(0, 2, (batch_size, 1), generator=batch_rng).float(),
}

In [13]:
# Forward pass on the synthetic batch. The recorded output is a dict with
# 'logits' and 'preds' (one value per batch row); 'preds' carries a sigmoid grad_fn.
model(batch)

{'logits': tensor([[-1.3514],
         [-1.0725],
         [-0.8076],
         [-1.1908],
         [-1.4738],
         [-1.1028],
         [-1.2330],
         [-1.1628],
         [-0.8098],
         [-1.0998],
         [-1.1409],
         [-1.2202]], grad_fn=<AddmmBackward0>),
 'preds': tensor([[0.2056],
         [0.2549],
         [0.3084],
         [0.2331],
         [0.1864],
         [0.2492],
         [0.2257],
         [0.2382],
         [0.3079],
         [0.2498],
         [0.2422],
         [0.2279]], grad_fn=<SigmoidBackward0>)}

In [14]:
# Run one training step by hand; the recorded result is a scalar loss whose
# grad_fn is binary cross-entropy with logits. The second argument (0) is
# presumably the batch index — confirm against TransformerEncoder.training_step.
model.training_step(batch, 0)

tensor(1.1722, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)

In [15]:
# Invoke the end-of-epoch hook by hand; the recorded output prints loss, accuracy and MCC.
# NOTE(review): the "Train Metrics" header appears twice in the recorded output — worth
# checking whether the hook prints it twice or the cell output captured a repeat.
model.on_train_epoch_end()

Train Metrics
Train Metrics
Loss: tensor(1.0116)
Accuracy: tensor(0.3750)
MCC: tensor(-0.1054)
