In [1]:
import numpy as np
import torch

In [15]:
# Problem size shared by every benchmark in this notebook:
#   N - number of samples, d - feature dimensionality, C - number of classes.
N, d, C = 50_000, 300, 5

# pure numpy, no vectorization (one matmul per sample in a Python loop)

In [16]:
# Weight matrix: one row per class, one column per feature, stored as float32.
W = np.random.rand(C, d).astype(np.float32)

# The samples are kept as N independent (d, 1) float32 column vectors in a
# plain Python list, so each one has to be multiplied by W on its own.
wl = [
    np.random.rand(d, 1).astype(np.float32)
    for _ in range(N)
]

In [17]:
%%timeit
# Time N separate (C, d) x (d, 1) products: one np.matmul call per list element.
# W and wl are rebound by later setup cells, so run the setup cell directly
# above before timing this one.
r = [np.matmul(W, wli) for wli in wl]

74.9 ms ± 953 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


# pure numpy, with vectorization (all N samples batched into a single matmul)

In [18]:
# Both operands are drawn in one statement; tuple elements are evaluated left
# to right, so W is still sampled before wl.
W, wl = (
    np.random.rand(C, d).astype(np.float32),   # (C, d) weight matrix
    np.random.rand(d, N).astype(np.float32),   # (d, N): every sample is one column
)

In [19]:
%%timeit
# Time a single np.matmul call: (C, d) weights times the (d, N) sample matrix.
# Expects W and wl from the numpy setup cell directly above.
r = np.matmul(W, wl)

1.36 ms ± 86.2 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


# torch, no GPU

In [20]:
# torch tensors with the same shapes and dtype as the batched numpy arrays:
# (C, d) weights and a (d, N) sample matrix, both float32. No device is given,
# so they are created on torch's default device.
W = torch.rand((C, d), dtype=torch.float32)
wl = torch.rand((d, N), dtype=torch.float32)

In [21]:
%%timeit
# Time a single batched torch.matmul on the tensors from the setup cell
# directly above (created without .cuda(), i.e. not moved to the GPU).
r = torch.matmul(W, wl)

18.4 ms ± 59.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# torch, with GPU

In [22]:
# Draw both operands first, then copy them to the current CUDA device so the
# matmul in the next cell runs on the GPU.
W = torch.rand((C, d), dtype=torch.float32).to("cuda")
wl = torch.rand((d, N), dtype=torch.float32).to("cuda")

In [23]:
%%timeit
r = torch.matmul(W, wl)

94 µs ± 13.3 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [24]:
# Show the name of the current CUDA device so the GPU timing above can be
# attributed to specific hardware.
torch.cuda.get_device_name()

'Tesla V100-SXM3-32GB'