In [1]:
from math import ceil

import torch
from torch.utils.cpp_extension import load_inline
import numpy as np

from cuda_utils import load_cuda_inline

%load_ext wurlitzer

## Vector addition
### Python

In [2]:
# Seed the RNG so the random inputs (and every check and timing that uses
# them) are reproducible across "Restart Kernel -> Run All".
torch.manual_seed(0)
a = torch.rand(5000)
b = torch.rand(5000)

In [3]:
def vector_addition_math(blockidx, blockdim, threadidx, a, b, out, n):
    """Emulate a single thread of the element-wise addition kernel.

    The global element index is computed from the block index, the block
    size and the thread index within the block. A thread whose index is at
    or beyond ``n`` does nothing; any other thread stores ``a[i] + b[i]``
    into ``out[i]``. Nothing is returned; ``out`` is modified in place.
    """
    global_idx = blockidx * blockdim + threadidx
    if global_idx >= n:
        # Surplus thread past the end of the data: nothing to compute.
        return
    out[global_idx] = a[global_idx] + b[global_idx]

def grid_1d_kernel_launch(f, blocks, threads, *args):
    """Sequentially emulate launching ``f`` over a 1-D grid.

    ``f`` is invoked once per (block, thread) pair as
    ``f(block_index, threads, thread_index, *args)``. All threads of block 0
    run first (in increasing thread order), then block 1, and so on.
    """
    launch_ids = (
        (block_index, thread_index)
        for block_index in range(blocks)
        for thread_index in range(threads)
    )
    for block_index, thread_index in launch_ids:
        f(block_index, threads, thread_index, *args)

def vector_addition(a, b, threads_per_block=256):
    """Add two equally-shaped tensors using the emulated 1-D kernel launch.

    Args:
        a, b: tensors of identical shape; elements are addressed along
            dimension 0.
        threads_per_block: emulated block size. Defaults to 256, the value
            that was previously hard-coded.

    Returns:
        A new tensor of length ``a.shape[0]`` holding ``a[i] + b[i]``. Its
        dtype is ``torch.result_type(a, b)`` (the dtype ``a + b`` would
        have) rather than always the default float dtype.

    Raises:
        AssertionError: if the shapes of ``a`` and ``b`` differ.
        ValueError: if ``threads_per_block`` is not positive.
    """
    assert a.shape == b.shape
    if threads_per_block <= 0:
        raise ValueError("threads_per_block must be positive")
    n = a.shape[0]
    # Allocate the output in the promoted input dtype so float64 / integer
    # inputs are not silently converted to the default float dtype.
    output = torch.zeros(n, dtype=torch.result_type(a, b))
    # Integer ceiling division: enough blocks to cover all n elements,
    # without going through floating point.
    blocks = (n + threads_per_block - 1) // threads_per_block
    grid_1d_kernel_launch(vector_addition_math, blocks, threads_per_block, a, b, output, n)
    return output

In [4]:
# Time the pure-Python emulation: every emulated thread is a separate Python
# call to `vector_addition_math`, so this measures interpreter overhead.
%time c = vector_addition(a, b)

CPU times: user 85.7 ms, sys: 3 ms, total: 88.7 ms
Wall time: 87.6 ms


In [5]:
# Sanity check: the emulated kernel must agree with PyTorch's built-in addition.
torch.allclose(a + b, c)

True

### CUDA

In [6]:
cuda_src = """
#include <c10/cuda/CUDAException.h>
#include <limits>

// One thread per element: thread i writes out[i] = a[i] + b[i].
// Threads whose index is >= n (the tail of the last block) do nothing.
__global__ void vector_addition_math(const float* a, const float* b, float* out, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = a[i] + b[i];
}

// Host entry point: validates the inputs, allocates the output and launches
// the kernel over every element of the (contiguous) inputs.
torch::Tensor vector_addition(torch::Tensor a, torch::Tensor b) {
    // The kernel dereferences raw float pointers linearly, so anything other
    // than contiguous float32 CUDA tensors would read invalid or wrong memory.
    TORCH_CHECK(a.is_cuda() && b.is_cuda(), "vector_addition: inputs must be CUDA tensors");
    TORCH_CHECK(a.device() == b.device(), "vector_addition: inputs must be on the same device");
    TORCH_CHECK(a.scalar_type() == torch::kFloat32 && b.scalar_type() == torch::kFloat32,
                "vector_addition: inputs must be float32");
    TORCH_CHECK(a.is_contiguous() && b.is_contiguous(), "vector_addition: inputs must be contiguous");
    TORCH_CHECK(a.sizes() == b.sizes(), "vector_addition: inputs must have the same shape");

    const int64_t n = a.numel();
    // The kernel indexes with a 32-bit int.
    TORCH_CHECK(n <= std::numeric_limits<int>::max(), "vector_addition: too many elements");

    auto output = torch::empty(a.sizes(), a.options());
    // A launch with zero blocks is an invalid configuration; nothing to do anyway.
    if (n == 0) return output;

    const int threads_per_block = 512;
    // Ceiling division in 64-bit so n + threads_per_block - 1 cannot overflow.
    const int blocks = static_cast<int>((n + threads_per_block - 1) / threads_per_block);
    vector_addition_math<<<blocks, threads_per_block>>>(
        a.data_ptr<float>(), b.data_ptr<float>(), output.data_ptr<float>(), static_cast<int>(n)
    );
    // Surface launch errors here instead of at some later, unrelated CUDA call.
    C10_CUDA_KERNEL_LAUNCH_CHECK();
    return output;
}
"""

In [7]:
# C++ forward declaration of the host function defined in `cuda_src`; the
# signature here has to match that definition.
cpp_src = "torch::Tensor vector_addition(torch::Tensor a, torch::Tensor b);"
# `load_cuda_inline` comes from the local `cuda_utils` module (not shown in
# this notebook); presumably it compiles both sources and exposes the listed
# function names on the returned module -- TODO confirm against cuda_utils.
cuda_module = load_cuda_inline(cuda_src, cpp_src, ["vector_addition"])

In [8]:
# GPU copies of the host inputs; made contiguous first because the kernel
# indexes the raw data pointers linearly.
a_cuda, b_cuda = (host_tensor.contiguous().cuda() for host_tensor in (a, b))

In [9]:
# Correctness check: the compiled kernel must agree with PyTorch's own CUDA addition.
torch.allclose(a_cuda + b_cuda, cuda_module.vector_addition(a_cuda, b_cuda))

True

In [10]:
%%timeit
ouput_cuda = cuda_module.vector_addition(a_cuda, b_cuda).cpu()

30 µs ± 299 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [11]:
# Baseline: PyTorch's built-in CUDA addition, with the same copy back to the host.
%timeit (a_cuda + b_cuda).cpu()

27.5 µs ± 158 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [12]:
# Larger problem size (10 million elements) for the timing comparison below.
# Note that this rebinds `a`, `b`, `a_cuda` and `b_cuda` from the earlier cells.
a, b = torch.rand(10_000_000), torch.rand(10_000_000)
# NumPy arrays of the same host data, for the CPU baseline.
a_np, b_np = a.numpy(), b.numpy()
# Contiguous GPU copies for the CUDA timings.
a_cuda, b_cuda = a.contiguous().cuda(), b.contiguous().cuda()

In [13]:
# CPU baseline: NumPy addition on the host arrays.
%timeit a_np + b_np

19.1 ms ± 1.75 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [14]:
# Custom kernel on the large inputs. `.cpu()` copies the result to the host,
# so the device-to-host transfer is included in the measured time.
%timeit cuda_module.vector_addition(a_cuda, b_cuda).cpu()

18.5 ms ± 74.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [15]:
# PyTorch's built-in CUDA addition on the large inputs, timed the same way
# (including the `.cpu()` copy of the result).
%timeit (a_cuda + b_cuda).cpu()

18.9 ms ± 359 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
