1 change: 1 addition & 0 deletions offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
@@ -81,6 +81,7 @@ DLWRAP(cuDevicePrimaryCtxSetFlags, 2)
DLWRAP(cuDevicePrimaryCtxRetain, 2)
DLWRAP(cuModuleLoadDataEx, 5)
DLWRAP(cuOccupancyMaxPotentialBlockSize, 6)
DLWRAP(cuFuncGetParamInfo, 4)

DLWRAP(cuDeviceCanAccessPeer, 3)
DLWRAP(cuCtxEnablePeerAccess, 2)
1 change: 1 addition & 0 deletions offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -390,5 +390,6 @@ CUresult cuMemGetAllocationGranularity(size_t *granularity,
CUmemAllocationGranularity_flags option);
CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
CUoccupancyB2DSize, size_t, int);
CUresult cuFuncGetParamInfo(CUfunction, size_t, size_t *, size_t *);

#endif
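
For context: each DLWRAP entry in dynamic_cuda generates a thin forwarding stub so the plugin resolves CUDA driver symbols at runtime instead of linking libcuda directly, which is why the new cuFuncGetParamInfo needs both the DLWRAP line above and this declaration. A minimal sketch of that pattern, purely illustrative (the stub name, library path, and error handling are assumptions, not the actual macro expansion):

#include <dlfcn.h>

// Illustrative runtime-resolution stub for the new entry point. The real
// DLWRAP macro generates an equivalent forwarder from the name and arity.
static CUresult cuFuncGetParamInfoStub(CUfunction Func, size_t ParamIndex,
                                       size_t *ParamOffset, size_t *ParamSize) {
  using FnTy = CUresult (*)(CUfunction, size_t, size_t *, size_t *);
  static void *Handle = dlopen("libcuda.so.1", RTLD_NOW | RTLD_GLOBAL);
  static FnTy Fn =
      Handle ? reinterpret_cast<FnTy>(dlsym(Handle, "cuFuncGetParamInfo"))
             : nullptr;
  if (!Fn)
    return CUDA_ERROR_NOT_INITIALIZED;
  return Fn(Func, ParamIndex, ParamOffset, ParamSize);
}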
33 changes: 31 additions & 2 deletions offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -149,7 +149,8 @@ struct CUDAKernelTy : public GenericKernelTy {
// The maximum number of threads cannot exceed the maximum of the kernel.
MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads);

return Plugin::success();
// Retrieve the size of the arguments.
return initArgsSize();
}

/// Launch the CUDA kernel function.
@@ -173,11 +174,32 @@ struct CUDAKernelTy : public GenericKernelTy {
}

private:
/// Initialize the size of the arguments.
Error initArgsSize() {
CUresult Res;
size_t ArgOffset, ArgSize;
size_t Arg = 0;

ArgsSize = 0;

// Find the last argument to know the total size of the arguments.
while ((Res = cuFuncGetParamInfo(Func, Arg++, &ArgOffset, &ArgSize)) ==
CUDA_SUCCESS)
ArgsSize = ArgOffset + ArgSize;

if (Res != CUDA_ERROR_INVALID_VALUE)
return Plugin::check(Res, "error in cuFuncGetParamInfo: %s");
return Plugin::success();
}

/// The CUDA kernel function to execute.
CUfunction Func;
/// The maximum amount of dynamic shared memory per thread group. By default,
/// this is set to 48 KB.
mutable uint32_t MaxDynCGroupMemLimit = 49152;

/// The size of the kernel arguments.
size_t ArgsSize;
};

/// Class wrapping a CUDA stream reference. These are the objects handled by the
@@ -1430,16 +1452,23 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
CUDADeviceTy &CUDADevice = static_cast<CUDADeviceTy &>(GenericDevice);

// The size in LaunchParams may include tail padding from the host-side
// argument struct, which the CUDA driver does not accept.
if (ArgsSize > LaunchParams.Size)
return Plugin::error(ErrorCode::INVALID_ARGUMENT,
"mismatch in kernel arguments");

CUstream Stream;
if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream))
return Err;

uint32_t MaxDynCGroupMem =
std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize());

size_t ConfigArgsSize = ArgsSize;
void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data,
CU_LAUNCH_PARAM_BUFFER_SIZE,
reinterpret_cast<void *>(&LaunchParams.Size),
reinterpret_cast<void *>(&ConfigArgsSize),
CU_LAUNCH_PARAM_END};

// If we are running an RPC server we want to wake up the server thread
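
Taken together, the two hunks above implement one technique: query the driver for the kernel's true parameter-buffer size, then launch with that size instead of the (possibly padded) host-side size. A standalone sketch against the raw CUDA driver API, assuming an already-loaded CUfunction and a packed argument buffer (cuFuncGetParamInfo is a relatively recent driver addition, so real code should guard for its availability):

// Walk the parameters until the driver reports CUDA_ERROR_INVALID_VALUE;
// the offset + size of the last parameter is the exact buffer size required.
static CUresult getParamBufferSize(CUfunction Func, size_t &ArgsSize) {
  ArgsSize = 0;
  size_t Offset = 0, Size = 0;
  CUresult Res;
  for (size_t I = 0;
       (Res = cuFuncGetParamInfo(Func, I, &Offset, &Size)) == CUDA_SUCCESS;
       ++I)
    ArgsSize = Offset + Size;
  return Res == CUDA_ERROR_INVALID_VALUE ? CUDA_SUCCESS : Res;
}

// Launch with an explicitly sized packed buffer. Passing the queried size via
// CU_LAUNCH_PARAM_BUFFER_SIZE avoids handing the driver a size inflated by
// the host struct's tail padding.
static CUresult launchPacked(CUfunction Func, CUstream Stream, void *Buffer,
                             size_t ArgsSize) {
  void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, Buffer,
                    CU_LAUNCH_PARAM_BUFFER_SIZE, &ArgsSize,
                    CU_LAUNCH_PARAM_END};
  return cuLaunchKernel(Func, /*gridDim=*/1, 1, 1, /*blockDim=*/1, 1, 1,
                        /*sharedMemBytes=*/0, Stream,
                        /*kernelParams=*/nullptr, Config);
}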
3 changes: 0 additions & 3 deletions offload/test/offloading/CUDA/basic_launch_multi_arg.cu
@@ -6,9 +6,6 @@
// clang-format on

// REQUIRES: gpu
//
// FIXME: https://github.com/llvm/llvm-project/issues/161265
// UNSUPPORTED: gpu

#include <stdio.h>

2 changes: 2 additions & 0 deletions offload/unittests/OffloadAPI/device_code/CMakeLists.txt
@@ -2,6 +2,7 @@ add_offload_test_device_code(foo.cpp foo)
add_offload_test_device_code(bar.cpp bar)
# Compile with optimizations to eliminate AMDGPU implicit arguments.
add_offload_test_device_code(noargs.cpp noargs -O3)
add_offload_test_device_code(multiargs.cpp multiargs -O3)
add_offload_test_device_code(byte.cpp byte)
add_offload_test_device_code(localmem.cpp localmem)
add_offload_test_device_code(localmem_reduction.cpp localmem_reduction)
@@ -15,6 +16,7 @@ add_custom_target(offload_device_binaries DEPENDS
foo.bin
bar.bin
noargs.bin
multiargs.bin
byte.bin
localmem.bin
localmem_reduction.bin
3 changes: 3 additions & 0 deletions offload/unittests/OffloadAPI/device_code/multiargs.cpp
@@ -0,0 +1,3 @@
#include <gpuintrin.h>

extern "C" __gpu_kernel void multiargs(char, int *, short) { (void)0; }
14 changes: 14 additions & 0 deletions offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
@@ -55,6 +55,7 @@ struct LaunchSingleKernelTestBase : LaunchKernelTestBase {

KERNEL_TEST(Foo, foo)
KERNEL_TEST(NoArgs, noargs)
KERNEL_TEST(MultiArgs, multiargs)
KERNEL_TEST(Byte, byte)
KERNEL_TEST(LocalMem, localmem)
KERNEL_TEST(LocalMemReduction, localmem_reduction)
@@ -135,6 +136,19 @@ TEST_P(olLaunchKernelNoArgsTest, Success) {
ASSERT_SUCCESS(olSyncQueue(Queue));
}

TEST_P(olLaunchKernelMultiArgsTest, Success) {
struct {
char A;
int *B;
short C;
} Args{0, nullptr, 0};

ASSERT_SUCCESS(
olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args), &LaunchArgs));

ASSERT_SUCCESS(olSyncQueue(Queue));
}

TEST_P(olLaunchKernelFooTest, SuccessSynchronous) {
void *Mem;
ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
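
Why the new MultiArgs test exercises the fix: the host struct {char, int *, short} carries tail padding that the kernel's own parameter layout lacks, which is exactly the mismatch the removed FIXME/UNSUPPORTED in basic_launch_multi_arg.cu worked around. A worked illustration of the sizes involved (offsets assume a typical LP64 ABI; real values come from the compiler and driver):

// Host side: A at offset 0 (size 1), B aligned up to 8 (size 8), C at 16
// (size 2), then tail padding up to the struct's 8-byte alignment, so
// sizeof(Args) == 24.
// Device side, per cuFuncGetParamInfo on multiargs(char, int *, short):
//   A at 0 (size 1), B at 8 (size 8), C at 16 (size 2)
//   => ArgsSize == 16 + 2 == 18, with no tail padding.
// The plugin therefore reports 18 through CU_LAUNCH_PARAM_BUFFER_SIZE while
// the test supplies a 24-byte buffer; since 18 <= 24, the new mismatch check
// in launchImpl passes and the driver accepts the launch.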