help diagnosing a GPU performance issue #4647

Open · beckermr opened this issue Oct 1, 2019 · 27 comments
Labels: CUDA (CUDA related issue/PR), performance (performance related issue)

Comments


beckermr commented Oct 1, 2019

Reporting a bug

I am implementing a simple algorithm in Numba and in raw CUDA C via CuPy, and I am seeing a 3x performance difference, which is a lot.

On my local P100, the code below outputs:

$ python demo_cupy_brute_rawkernel.py 1000000
cupy+CUDA events: 11133.681315104166 ms
cupy+CUDA wall  : 15018.668174743652 ms
numba events: 34106.19140625 ms
numba wall  : 45484.97438430786 ms

Here is the code:

import sys
import time

import cupy as cp
import numpy as np
from numba import cuda

source_code = """\
extern "C"{

__global__ void brute_force_pairs_kernel(
    float* x1, float* y1, float* z1, float* w1,
    float* x2, float* y2, float* z2, float* w2,
    float* rbins_squared, float* result,
    int n1, int n2, int nbins) {

    size_t start = threadIdx.x + blockIdx.x * blockDim.x;
    size_t stride = blockDim.x * gridDim.x;

    for (size_t i = start; i < n1; i += stride) {
        float px = x1[i];
        float py = y1[i];
        float pz = z1[i];
        float pw = w1[i];

        for (size_t j = 0; j < n2; j++) {
            float qx = x2[j];
            float qy = y2[j];
            float qz = z2[j];
            float qw = w2[j];

            float dx = px - qx;
            float dy = py - qy;
            float dz = pz - qz;
            float wprod = pw * qw;
            float dsq = dx * dx + dy * dy + dz * dz;

            int k = nbins - 1;
            while (dsq <= rbins_squared[k]) {
                atomicAdd(&(result[k-1]), wprod);
                k -= 1;
                if (k <= 0) break;
            }
        }
    }
}

}
"""

# compile and load CUDA kernel using CuPy
brute_force_pairs_kernel = cp.RawKernel(
    source_code, 'brute_force_pairs_kernel')

# parameters
blocks = 512
threads = 512
if len(sys.argv) > 1:
    npoints = int(sys.argv[1])
else:
    npoints = 100_000
n1 = npoints
n2 = npoints

# make the data
DEFAULT_RBINS_SQUARED = (np.logspace(
    np.log10(0.1/1e3), np.log10(40/1e3), 20)**2).astype(np.float32)
rng = np.random.RandomState(seed=42)
x1, y1, z1, w1 = rng.uniform(size=(4, n1)).astype(np.float32)
x2, y2, z2, w2 = rng.uniform(size=(4, n1)).astype(np.float32)

# array init
result = np.zeros_like(DEFAULT_RBINS_SQUARED)[:-1].astype(np.float32)

d_x1 = cp.asarray(x1, dtype=cp.float32)
d_y1 = cp.asarray(y1, dtype=cp.float32)
d_z1 = cp.asarray(z1, dtype=cp.float32)
d_w1 = cp.asarray(w1, dtype=cp.float32)

d_x2 = cp.asarray(x2, dtype=cp.float32)
d_y2 = cp.asarray(y2, dtype=cp.float32)
d_z2 = cp.asarray(z2, dtype=cp.float32)
d_w2 = cp.asarray(w2, dtype=cp.float32)

d_rbins_squared = cp.asarray(DEFAULT_RBINS_SQUARED, dtype=cp.float32)
d_result_cp = cp.asarray(result, dtype=cp.float32)

# for GPU timing using CuPy
start = cp.cuda.Event()
end = cp.cuda.Event()
timing_cp = 0
timing_cp_wall = 0

# running the kernel using CuPy's functionality
for i in range(4):
    if i > 0:  # warm-up not needed if using RawModule
        start.record()
        _s = time.time()
    brute_force_pairs_kernel(
        (blocks,), (threads,),
        (d_x1, d_y1, d_z1, d_w1,
         d_x2, d_y2, d_z2, d_w2,
         d_rbins_squared, d_result_cp,
         cp.int32(d_x1.shape[0]),
         cp.int32(d_x2.shape[0]),
         cp.int32(d_rbins_squared.shape[0]))
    )
    if i > 0:  # warm-up not needed if using RawModule
        end.record()
        end.synchronize()
        _e = time.time()
        timing_cp += cp.cuda.get_elapsed_time(start, end)
        timing_cp_wall += (_e - _s)

print('cupy+CUDA events:', timing_cp/3, 'ms')
print('cupy+CUDA wall  :', timing_cp_wall/3*1000, 'ms')
d_result_cp = d_result_cp.copy()


# for GPU timing using Numba
@cuda.jit
def count_weighted_pairs_3d_cuda(
        x1, y1, z1, w1, x2, y2, z2, w2, rbins_squared, result):
    start = cuda.grid(1)
    stride = cuda.gridsize(1)

    n1 = x1.shape[0]
    n2 = x2.shape[0]
    nbins = rbins_squared.shape[0]

    for i in range(start, n1, stride):
        px = x1[i]
        py = y1[i]
        pz = z1[i]
        pw = w1[i]
        for j in range(n2):
            qx = x2[j]
            qy = y2[j]
            qz = z2[j]
            qw = w2[j]
            dx = px-qx
            dy = py-qy
            dz = pz-qz
            wprod = pw*qw
            dsq = dx*dx + dy*dy + dz*dz

            k = nbins - 1
            while dsq <= rbins_squared[k]:
                cuda.atomic.add(result, k-1, wprod)
                k -= 1
                if k <= 0:
                    break


start = cuda.event()
end = cuda.event()
timing_nb = 0
timing_nb_wall = 0

d_x1 = cuda.to_device(x1.astype(np.float32))
d_y1 = cuda.to_device(y1.astype(np.float32))
d_z1 = cuda.to_device(z1.astype(np.float32))
d_w1 = cuda.to_device(w1.astype(np.float32))

d_x2 = cuda.to_device(x2.astype(np.float32))
d_y2 = cuda.to_device(y2.astype(np.float32))
d_z2 = cuda.to_device(z2.astype(np.float32))
d_w2 = cuda.to_device(w2.astype(np.float32))

d_rbins_squared = cuda.to_device(
    DEFAULT_RBINS_SQUARED.astype(np.float32))
d_result_nb = cuda.device_array_like(result.astype(np.float32))


# running the Numba jit kernel
# this works because CuPy arrays have the __cuda_array_interface__ attribute,
# which is accepted by Numba kernels, so you don't have to create the arrays
# again using Numba's API
for i in range(4):
    if i > 0:
        start.record()
        _s = time.time()
    count_weighted_pairs_3d_cuda[blocks, threads](
        d_x1, d_y1, d_z1, d_w1,
        d_x2, d_y2, d_z2, d_w2,
        d_rbins_squared, d_result_nb)
    if i > 0:
        end.record()
        end.synchronize()
        _e = time.time()
        timing_nb += cuda.event_elapsed_time(start, end)
        timing_nb_wall += (_e - _s)

print('numba events:', timing_nb/3, 'ms')
print('numba wall  :', timing_nb_wall/3*1000, 'ms')

# print(count_weighted_pairs_3d_cuda.inspect_types())

# check that the CUDA kernel agrees with the Numba kernel
assert cp.allclose(d_result_cp, d_result_nb, rtol=5E-4)

beckermr commented Oct 1, 2019

@leofang for viz


beckermr commented Oct 1, 2019

@leofang also got me started with the CUDA code! :)

@beckermr changed the title from "help diagnosing a performance issue" to "help diagnosing a GPU performance issue" on Oct 1, 2019

beckermr commented Oct 1, 2019

I timed this with nvprof and got the same results. I don't think the events or simple wall clock times are the issue here.
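
For reference, a hedged sketch of the kind of nvprof invocations implied here and in the later profiling comment (the exact flags used are an assumption; the script name is the one from above):

$ nvprof python demo_cupy_brute_rawkernel.py 1000000
$ nvprof --analysis-metrics -o profile.nvvp python demo_cupy_brute_rawkernel.py 1000000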


beckermr commented Oct 1, 2019

The main difference I can see is that the raw CUDA kernel is using far more texture operations.


beckermr commented Oct 1, 2019

I've attached profiles made with nvprof --analysis-metrics. Maybe someone who knows more about GPUs can have a look?

Archive.zip


maxpkatz commented Oct 2, 2019

Can you please make the kernel as simple as you can while still demonstrating a performance difference, e.g. by doing things like removing the innermost while loop, and then attach PTX for each of the two cases?


beckermr commented Oct 2, 2019

Yes! I'll try a few variants to see if I can make it simpler. IDK how to get the PTX out of cupy. Any tips?


leofang commented Oct 2, 2019

Yes! I'll try a few variants to see if I can make it simpler. IDK how to get the PTX out of cupy. Any tips?

There's a function called compile_using_nvrtc() in cupy/cupy/cuda/compiler.py. Try intercepting it. The return value should be a string of PTX code.
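
For example, a minimal sketch of intercepting it by monkey-patching the module-level name (assumptions: that RawKernel's compilation path resolves compile_using_nvrtc through this module attribute and that the source string is its first positional argument; untested):

import cupy.cuda.compiler as compiler

_original_nvrtc = compiler.compile_using_nvrtc

def _dump_ptx(source, *args, **kwargs):
    # call the real helper, then write the PTX string it returns to disk
    ptx = _original_nvrtc(source, *args, **kwargs)
    with open('cupy_ptx.txt', 'w') as fp:
        fp.write(ptx)
    return ptx

compiler.compile_using_nvrtc = _dump_ptx  # patch before constructing the RawKernel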


beckermr commented Oct 2, 2019

Alright. Here is the best I could do. Removing much else made the differences go away.

The timings

(gpu-dev) [mrbecker@cp1-p scripts]$ python perf_testing.py 500000
cupy+CUDA events: 1935.0027262369792 ms
cupy+CUDA wall  : 2632.060448328654 ms
numba events: 5319.1396484375 ms
numba wall  : 7121.022303899129 ms

The code

import sys
import time

import cupy as cp
import numpy as np
from numba import cuda

if len(sys.argv) > 2:
    kind = sys.argv[2]
else:
    kind = 'both'

# parameters
blocks = 512
threads = 512
if len(sys.argv) > 1:
    npoints = int(sys.argv[1])
else:
    npoints = 100_000
n1 = npoints
n2 = npoints

# make the data
DEFAULT_RBINS_SQUARED = (np.logspace(
    np.log10(0.1/1e3), np.log10(40/1e3), 20)**2).astype(np.float32)
rng = np.random.RandomState(seed=42)
x1, y1, z1, w1 = rng.uniform(size=(4, n1)).astype(np.float32)
x2, y2, z2, w2 = rng.uniform(size=(4, n1)).astype(np.float32)

# array init
result = np.zeros_like(DEFAULT_RBINS_SQUARED)[:-1].astype(np.float32)

if kind in ['both', 'cupy']:

    source_code = """\
    extern "C"{

    __global__ void brute_force_pairs_kernel(
        float* x1, float* y1, float* z1, float* w1,
        float* x2, float* y2, float* z2, float* w2,
        float* rbins_squared, float* result,
        int n1, int n2, int nbins) {

        size_t start = threadIdx.x + blockIdx.x * blockDim.x;
        size_t stride = blockDim.x * gridDim.x;
        float g = 0;

        for (size_t i = start; i < n1; i += stride) {
            float px = x1[i];
            float py = y1[i];
            float pz = z1[i];
            float pw = w1[i];

            for (size_t j = 0; j < n2; j++) {
                float qx = x2[j];
                float qy = y2[j];
                float qz = z2[j];
                float qw = w2[j];

                float dx = px - qx;
                float dy = py - qy;
                float dz = pz - qz;
                float wprod = pw * qw;
                float dsq = dx * dx + dy * dy + dz * dz;

                g += (dsq * wprod);
            }
        }

        result[0] = g;
    }

    }
    """

    # get the PTX
    from cupy.cuda.compiler import compile_using_nvrtc
    with open('cupy_ptx.txt', 'w') as fp:
        fp.write(compile_using_nvrtc(source_code))

    # compile and load CUDA kernel using CuPy
    brute_force_pairs_kernel = cp.RawKernel(
        source_code, 'brute_force_pairs_kernel')

    d_x1 = cp.asarray(x1, dtype=cp.float32)
    d_y1 = cp.asarray(y1, dtype=cp.float32)
    d_z1 = cp.asarray(z1, dtype=cp.float32)
    d_w1 = cp.asarray(w1, dtype=cp.float32)

    d_x2 = cp.asarray(x2, dtype=cp.float32)
    d_y2 = cp.asarray(y2, dtype=cp.float32)
    d_z2 = cp.asarray(z2, dtype=cp.float32)
    d_w2 = cp.asarray(w2, dtype=cp.float32)

    d_rbins_squared = cp.asarray(DEFAULT_RBINS_SQUARED, dtype=cp.float32)
    d_result_cp = cp.asarray(result, dtype=cp.float32)

    # for GPU timing using CuPy
    start = cp.cuda.Event()
    end = cp.cuda.Event()
    timing_cp = 0
    timing_cp_wall = 0

    # running the kernel using CuPy's functionality
    for i in range(4):
        if i > 0:  # warm-up not needed if using RawModule
            start.record()
            _s = time.time()
        brute_force_pairs_kernel(
            (blocks,), (threads,),
            (d_x1, d_y1, d_z1, d_w1,
             d_x2, d_y2, d_z2, d_w2,
             d_rbins_squared, d_result_cp,
             cp.int32(d_x1.shape[0]),
             cp.int32(d_x2.shape[0]),
             cp.int32(d_rbins_squared.shape[0]))
        )
        if i > 0:  # warm-up not needed if using RawModule
            end.record()
            end.synchronize()
            _e = time.time()
            timing_cp += cp.cuda.get_elapsed_time(start, end)
            timing_cp_wall += (_e - _s)

    print('cupy+CUDA events:', timing_cp/3, 'ms')
    print('cupy+CUDA wall  :', timing_cp_wall/3*1000, 'ms')
    d_result_cp = d_result_cp.copy()


if kind in ['both', 'numba']:
    # for GPU timing using Numba
    @cuda.jit
    def count_weighted_pairs_3d_cuda(
            x1, y1, z1, w1, x2, y2, z2, w2, rbins_squared, result):
        start = cuda.grid(1)
        stride = cuda.gridsize(1)

        n1 = x1.shape[0]
        n2 = x2.shape[0]
        g = 0

        for i in range(start, n1, stride):
            px = x1[i]
            py = y1[i]
            pz = z1[i]
            pw = w1[i]
            for j in range(n2):
                qx = x2[j]
                qy = y2[j]
                qz = z2[j]
                qw = w2[j]
                dx = px-qx
                dy = py-qy
                dz = pz-qz
                wprod = pw*qw
                dsq = dx*dx + dy*dy + dz*dz

                g += (dsq * wprod)

        result[0] = g

    start = cuda.event()
    end = cuda.event()
    timing_nb = 0
    timing_nb_wall = 0

    d_x1 = cuda.to_device(x1.astype(np.float32))
    d_y1 = cuda.to_device(y1.astype(np.float32))
    d_z1 = cuda.to_device(z1.astype(np.float32))
    d_w1 = cuda.to_device(w1.astype(np.float32))

    d_x2 = cuda.to_device(x2.astype(np.float32))
    d_y2 = cuda.to_device(y2.astype(np.float32))
    d_z2 = cuda.to_device(z2.astype(np.float32))
    d_w2 = cuda.to_device(w2.astype(np.float32))

    d_rbins_squared = cuda.to_device(
        DEFAULT_RBINS_SQUARED.astype(np.float32))
    d_result_nb = cuda.device_array_like(result.astype(np.float32))

    # running the Numba jit kernel
    for i in range(4):
        if i > 0:
            start.record()
            _s = time.time()
        count_weighted_pairs_3d_cuda[blocks, threads](
            d_x1, d_y1, d_z1, d_w1,
            d_x2, d_y2, d_z2, d_w2,
            d_rbins_squared, d_result_nb)
        if i > 0:
            end.record()
            end.synchronize()
            _e = time.time()
            timing_nb += cuda.event_elapsed_time(start, end)
            timing_nb_wall += (_e - _s)

    print('numba events:', timing_nb/3, 'ms')
    print('numba wall  :', timing_nb_wall/3*1000, 'ms')

    # print(count_weighted_pairs_3d_cuda.inspect_types())

    with open('numba_ptx.txt', 'w') as fp:
        fp.write(
            list(count_weighted_pairs_3d_cuda.definitions.values())[0].ptx)

if kind in ['both'] and npoints <= 10:
    # check that the CUDA kernel agrees with the Numba kernel
    assert cp.allclose(d_result_cp, d_result_nb, rtol=5E-4)

The PTX is attached: Archive.zip

@esc added the CUDA, needtriage, and performance labels on Oct 2, 2019
@stuartarchibald

In the Numba-generated PTX there is this notably suspicious sequence:

	mul.lo.s64 	%rd91, %rd146, %rd38;
	add.s64 	%rd92, %rd91, %rd36;
	mul.lo.s64 	%rd93, %rd146, %rd40;
	add.s64 	%rd94, %rd93, %rd39;
	mul.lo.s64 	%rd95, %rd146, %rd42;
	add.s64 	%rd96, %rd95, %rd41;
	mul.lo.s64 	%rd97, %rd146, %rd44;
	add.s64 	%rd98, %rd97, %rd43;
	ld.f32 	%f41, [%rd92];
	sub.f32 	%f42, %f1, %f41;
	ld.f32 	%f43, [%rd94];
	sub.f32 	%f44, %f2, %f43;
	ld.f32 	%f45, [%rd96];
	sub.f32 	%f46, %f3, %f45;
	ld.f32 	%f47, [%rd98];

which is basically doing multiply-adds to compute the value of the induction variable j in the inner loop, followed by the subtractions from the loop invariants (the p{x,y,z} variables). This is the most notable difference from what is produced via CUDA C. Adjusting the code so that range is not used and restricting the number of registers used:

    from numba import types  # needed below for the explicit uint32/float32 casts

    @cuda.jit(max_registers=256)
    def count_weighted_pairs_3d_cuda(
            x1, y1, z1, w1, x2, y2, z2, w2, rbins_squared, result):
        start = types.uint32(cuda.grid(1))
        stride = types.uint32(cuda.gridsize(1))

        n1 = types.uint32(x1.shape[types.uint32(0)])
        n2 = types.uint32(x2.shape[types.uint32(0)])
        g = types.float32(0)

        i = types.uint32(start)
        while i < n1:
            px = x1[i]
            py = y1[i]
            pz = z1[i]
            pw = w1[i]
            i = types.uint32(i + stride)

            j = types.uint32(0)
            while j < n2:
                qx = x2[j]
                qy = y2[j]
                qz = z2[j]
                qw = w2[j]
                dx = px-qx
                dy = py-qy
                dz = pz-qz
                wprod = pw*qw
                dsq = dx*dx + dy*dy + dz*dz

                g += (dsq * wprod)
                j = types.uint32(j + types.uint32(1))

        result[0] = g

gives me:

cupy+CUDA events: 392.3850504557292 ms
cupy+CUDA wall  : 528.5851160685221 ms
numba events: 541.5867513020834 ms
numba wall  : 721.2473551432291 ms

which is getting a lot closer. The offending PTX region now looks like:

	add.s64 	%rd108, %rd35, %rd145;
	add.s64 	%rd109, %rd34, %rd146;
	add.s64 	%rd110, %rd33, %rd147;
	ld.f32 	%f53, [%rd141];
	sub.f32 	%f54, %f2, %f53;
	ld.f32 	%f55, [%rd108];
	sub.f32 	%f56, %f3, %f55;
	ld.f32 	%f57, [%rd109];
	sub.f32 	%f58, %f4, %f57;
	ld.f32 	%f59, [%rd110];


beckermr commented Oct 2, 2019

Thanks @stuartarchibald! The CUDA code is using size_t for the loop variables, which should be equivalent to uint64 IIUC. Is there a reason you went with uint32? (It should be faster on the GPU, of course, but the numba code is still slower.)


stuartarchibald commented Oct 2, 2019

Thanks @stuartarchibald! The CUDA code is using size_t for the loop variables, which should be equivalent to uint64 IIUC. Is there a reason you went with uint32? (It should be faster on the GPU, of course, but the numba code is still slower.)

The use of uint64 seems to trigger the mul+add pattern; I'm not sure why yet. AFAICT the PTX for the uint32 version is still using 64-bit arithmetic (e.g. the add.s64 seen above), it's just that the "bad" pattern isn't present.

@beckermr

Hi @stuartarchibald! What is the path to getting this issue addressed? Is this something within numba, or is it in LLVM?

@stuartarchibald

@beckermr It's something that needs to happen in Numba for the CUDA target. The most reliable way to get this to work would be to write a Numba compiler transform pass that switches out ranges for while loops. I also checked: this only impacts the NVVM backend; CPU-side LLVM range compilation is fine.

@beckermr

That sounds like I might be able to handle it. Can you point me to the section of code and give me some hints? I can try my hand at a PR and learn something new!

@stuartarchibald

Sounds great, though this may get tricky so please shout if you get stuck :)

The 0.46 release has an updated API for creating new compilers, compiler pipelines and compiler passes. The second half of the 0.46 release developers' demo notebook (available on Binder) has working examples, and there is also a lot of documentation on customising the compiler. Given that this transform just operates on Numba IR, it'd probably be easier and quicker to develop it on a standard @njit CPU function. What you'd need to do is implement a new compiler pass that does the transform by essentially rewriting the Numba IR; have a look in numba.typed_passes and numba.untyped_passes for existing passes. A rough skeleton is sketched below.
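
As a rough starting point (not from the thread), a skeleton of such a pass developed against a CPU @njit function might look like this. The module paths follow the 0.46-era layout (they moved under numba.core in later releases), and run_pass is left as a stub to be filled in with the actual range-to-while rewrite:

from numba import njit
from numba.compiler import CompilerBase, DefaultPassBuilder
from numba.compiler_machinery import FunctionPass, register_pass
from numba.untyped_passes import IRProcessing

@register_pass(mutates_CFG=True, analysis_only=False)
class RangeToWhilePass(FunctionPass):
    _name = "range_to_while"

    def __init__(self):
        FunctionPass.__init__(self)

    def run_pass(self, state):
        # state.func_ir holds the Numba IR; rewrite range-based loops here
        return True  # report that the IR was mutated

class RangeToWhileCompiler(CompilerBase):
    def define_pipelines(self):
        pm = DefaultPassBuilder.define_nopython_pipeline(self.state)
        pm.add_pass_after(RangeToWhilePass, IRProcessing)
        pm.finalize()
        return [pm]

@njit(pipeline_class=RangeToWhileCompiler)
def demo(n):
    total = 0.0
    for i in range(n):
        total += i
    return total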

Again, if you get stuck/need a hand, the Numba core developers are also available on gitter.im real time chat. Thanks!


leofang commented Oct 15, 2019

Thanks, @stuartarchibald, for such detailed instructions. Just curious,

I also checked: this only impacts the NVVM backend; CPU-side LLVM range compilation is fine.

Doesn't this imply that @beckermr could simply steal the CPU side's pass code and make it work on GPU? Could you point us to where the CPU counterpart is?


seibert commented Oct 15, 2019

In this case, I think Stuart means that the LLVM target for the CPU is able to optimize away any inefficiencies in the generated code, whereas NVVM is not. The optimization passes in the two compiler libraries don't do the same thing (which is not surprising), so we'll need to work around that by generating different code for the GPU.

@beckermr

Right, ok. I'm definitely having trouble understanding the LLVM IR, but I'll keep at it. More help or hints would be awesome.

@noahstier

I've started to experiment with the test code above, and at first I didn't get the same mul/add pattern in the output PTX. I was able to get that output by checking out commits from early October when this issue was made, and I narrowed it down to this commit that fixes the "bug": 599ecce

Here are the first 10 lines of the loop body before the fix:

BB0_17:
	mul.lo.s64 	%rd91, %rd146, %rd38;
	add.s64 	%rd92, %rd91, %rd36;
	mul.lo.s64 	%rd93, %rd146, %rd40;
	add.s64 	%rd94, %rd93, %rd39;
	mul.lo.s64 	%rd95, %rd146, %rd42;
	add.s64 	%rd96, %rd95, %rd41;
	mul.lo.s64 	%rd97, %rd146, %rd44;
	add.s64 	%rd98, %rd97, %rd43;
	ld.f32 	%f41, [%rd92];
	sub.f32 	%f42, %f1, %f41;

And after the fix:

BB0_18:
	add.s64 	%rd84, %rd4, %rd96;
	add.s64 	%rd85, %rd3, %rd96;
	add.s64 	%rd86, %rd2, %rd96;
	add.s64 	%rd87, %rd1, %rd96;
	ld.global.f32 	%f41, [%rd84];
	sub.f32 	%f42, %f1, %f41;
	ld.global.f32 	%f43, [%rd85];
	sub.f32 	%f44, %f2, %f43;
	ld.global.f32 	%f45, [%rd86];
	sub.f32 	%f46, %f3, %f45;

Looking at the diff from that PR, it seems like using arrays with stride 0 as function arguments would result in the old behavior, and I confirmed this by creating inputs like this:

d_x1 = cp.broadcast_to(cp.ones(1), (n1,))
d_x1.strides  # -> (0,)

instead of this:

d_x1 = cuda.to_device(x1.astype(np.float32))
d_x1.strides  # -> (4,)

And then I see mul/adds in the numba PTX but not the cupy PTX, and a significant performance degradation:

cupy+CUDA events: 311.50421142578125 ms
cupy+CUDA wall  : 419.1913604736328 ms
numba events: 2001.8206380208333 ms
numba wall  : 3332.796812057495 ms

So this issue is still present although it now seems a lot more niche. I'm not sure how useful it will be to fix it, but for our class project it's still a good starting point.

@jakebliss @andrewdt23

@beckermr

Wooooo! I'm very glad to see this issue getting some love!

cc @aphearin @leofang

@aphearin

Yes, definitely happy to see progress on this!


noahstier commented Feb 20, 2020

@stuartarchibald Do you think a compiler pass would be feasible / desirable to generate code that is specialized for input array stride?

I'm finding in experiments that there is significant slowdown for non-contiguous arrays, since their stride doesn't get hard-coded. We are already specializing for the two cases of stride == datatype_width (contiguous) and stride != datatype_width (non-contiguous), so it doesn't seem outlandish to further specialize the non-contiguous cases by stride size.

I imagine we'd need to

  • add strides as a property of numba.types.Array in order to consider stride as part of a function signature
  • add a pass:
    • for each non-contiguous input, get its stride and re-write the numba IR with stride as a constant

Once stride is a constant in numba IR, NVVM should handle hard-coding stride into the PTX.
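
To make the first bullet concrete, here is a hedged illustration of where that information would live. Today numba.types.Array only carries the dtype, number of dimensions and layout, so every non-contiguous ('A' layout) array compiles to the same specialization regardless of its actual stride; the strides keyword shown in the comment is hypothetical:

from numba import types

# What the signature captures today: layout only, no numeric stride.
contig = types.Array(types.float32, 1, 'C')   # stride known to equal the itemsize
anylay = types.Array(types.float32, 1, 'A')   # stride only known at runtime

# The proposal would (hypothetically) extend this to something like
#   types.Array(types.float32, 1, 'A', strides=(0,))
# so the stride becomes a compile-time constant that NVVM can fold into the
# generated indexing code.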

PS - I looked into how cupy handles this to see if we could use their solution, and I noticed that cupy's output PTX hardcodes offsets equal to the size of the datatype (i.e. not aware of stride at all). Then if you pass a stride-0 array it reads the wrong (potentially out of bounds) elements. I made an issue about it, maybe the result will be interesting for numba as well.
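
For illustration, a small hedged example of the hazard being described (the dtype is added for clarity and is not taken from the linked issue):

import cupy as cp

x = cp.broadcast_to(cp.ones(1, dtype=cp.float32), (1000,))
print(x.strides)  # (0,): a single element backs the whole view, so a RawKernel
                  # that indexes it as contiguous data reads past the allocation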

@stuartarchibald

@noahstier I think that this is worth exploring. If the stride is encoded in the Array type then it would become a compile-time constant and permissible for use in e.g. looping. The downside may be that more compilation happens, as a result of the stride variation in A-ordered arrays now being part of the function type specialisation.

@noahstier

Cool! We'll continue to investigate.

Re more compilation: we will want to do some performance analysis, which should include compile times, so it's on our radar. One thing we should look into is what tests the compiler can run to determine whether the optimization is worthwhile, i.e. it could specialize only for arrays of a given size, only up to a predefined number of specializations, etc.

@stuartarchibald

Great, thanks. My concern isn't so much the time for an individual compilation, more that there's a huge number of potential variants for stride, so compiling for each one could get costly compared to the A-layout catch-all used now.

@gmarkall

I need to re-run the test code above with #6112 to see if it makes any difference here - making the indices unsigned simplifies the computation of some indices in PTX.
