In [1]:
import torch, os
import numpy as np
from torch.nn import TransformerEncoder, TransformerEncoderLayer

# Expose only physical GPU 3 to this process (it then appears as cuda:0).
# NOTE(review): the index is hard-coded for the author's machine — adjust it
# for yours; it has to be set before the first CUDA call to take effect.
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

In [2]:
# Hyperparameters of the encoder that is benchmarked below.
n_layers, n_heads = 2, 3    # passed as num_layers / nhead
d_model, n_hid = 9, 49      # passed as d_model / dim_feedforward

def get_num_params(model):
    """Return the total number of scalar elements over all of ``model``'s parameters.

    Every tensor yielded by ``model.parameters()`` is counted, regardless of
    whether it requires gradients.
    """
    total = 0
    for param in model.parameters():
        total += param.numel()
    return total


In [4]:
# Baseline model: a stack of `n_layers` identical standard (softmax-attention)
# encoder layers from torch.nn.
# NOTE(review): the layer is created with the default batch_first=False, so its
# input is read as (seq, batch, feature). The benchmark below feeds a tensor
# built as (batch_size, seq_len, d_model), which means attention runs over the
# length-`batch_size` axis rather than over `seq_len`. Confirm whether
# batch_first=True was intended before relying on the reported timings.
model = TransformerEncoder(
    encoder_layer=TransformerEncoderLayer(d_model, n_heads, dim_feedforward=n_hid),
    num_layers=n_layers,
)


# --- Model size -------------------------------------------------------------
print('Model Size')
print(f"| Standard Attention | Pytorch {torch.__version__} |: ", get_num_params(model))

# --- Inference time ---------------------------------------------------------
# Time one forward pass per simulation with CUDA events and report the
# mean (std) in milliseconds over `num_simulations` runs.
print('Inference Time')
batch_size = 10
seq_len = int(1e5)
num_simulations = 10

# Moving the model to the GPU and switching to eval mode do not depend on the
# loop variable, so do them once instead of on every simulation.
model.cuda()
model.eval()

time_elaspsed = {'pytorch':[]}
for i in range(num_simulations):

    # Re-seed per simulation so each run's input is reproducible.
    torch.manual_seed(i)
    # Construct the dummy input on the CPU, then move it to the GPU
    X = torch.rand(batch_size, seq_len, d_model)
    X = X.cuda()

    # CUDA events measure elapsed time on the GPU stream itself.
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    with torch.no_grad():
        # Warm up once, under the same no-grad conditions as the timed pass,
        # so no autograd graph is built and the same code path is exercised.
        if i == 0: model(X)

        start.record()
        y = model(X)
        end.record()
        # Wait for the recorded work to finish before reading the timer.
        torch.cuda.synchronize()
        time_elaspsed['pytorch'].append(start.elapsed_time(end))

print(f"| Standard Attention | Pytorch {torch.__version__} |: ", f"{np.mean(time_elaspsed['pytorch']):.2f}", f"({np.std(time_elaspsed['pytorch']):.2f})", "ms")

Model Size
| Standard Attention | Pytorch 2.3.0 |:  2672
Inference Time
| Standard Attention | Pytorch 2.3.0 |:  59.62 (0.09) ms
