In [1]:
import torch, os
import numpy as np
from torch.nn import TransformerEncoder, TransformerEncoderLayer

os.environ['CUDA_VISIBLE_DEVICES'] = '2'

torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_math_sdp(True)

In [2]:
n_layers = 2
n_heads = 4
d_model = 64
n_hid = 49

In [3]:
model = TransformerEncoder(
    TransformerEncoderLayer(
        d_model = d_model,
        nhead = n_heads,
        dim_feedforward = n_hid,
    ),
    num_layers = n_layers,
)

In [4]:
# compute number of parameters
def get_num_params(model):
    return sum(p.numel() for p in model.parameters())

print('Model Size')
print("| Standard Attention | Pytorch Implementation |: ", get_num_params(model))

Model Size
| Standard Attention | Pytorch Implementation |:  46562


In [5]:
print('Inference Time')

batch_size = 10
seq_len = int(3e5)
num_simulations = 1
time_elaspsed = {'attention':[]}

for _ in range(num_simulations):
    
    # Construct the dummy input
    X = torch.rand(batch_size, seq_len, d_model)

    # Prepare everythin for CUDA
    X = X.to(device='cuda',dtype=torch.bfloat16)

    model.to(device='cuda',dtype=torch.bfloat16)
    model.eval()

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    with torch.no_grad():
        start.record()
        y = model(X)
        end.record()
        torch.cuda.synchronize()
        time_elaspsed['attention'].append(start.elapsed_time(end))

print("| Standard Attention | Pytorch Implementation |: ", f"{np.mean(time_elaspsed['attention']):.2f}", f"({np.std(time_elaspsed['attention']):.2f})", "ms")


Inference Time
| Standard Attention | Pytorch Implementation |:  663.86 (0.00) ms
