[CUDA] add support for the new kernel launch API in CUDA-9.2+.
Instead of calling the CUDA runtime to arrange function arguments, the
new API constructs the arguments in a local array and the kernels are
launched with cudaLaunchKernel().

The old API has been deprecated and is expected to go away
in the next CUDA release.
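
Roughly, the change replaces per-argument marshalling with a single
pointer-array launch. A hand-written sketch of what the generated host stub
amounts to under each API (illustrative only; the real stub is emitted as
LLVM IR by CGCUDANV.cpp, and the stub names here are made up):

// Host-side stub for __global__ void kernel(int x), sketched in CUDA C++.

// Legacy API (pre-9.2): marshal each argument through the runtime, then
// launch by stub address.
void stub_legacy(int x) {
  if (cudaSetupArgument(&x, sizeof(x), /*offset=*/0) == cudaSuccess)
    cudaLaunch((const void *)stub_legacy);
}

// New API (9.2+): pack pointers to the arguments into a local array, pop the
// <<<>>> configuration pushed at the call site, then launch in one call.
void stub_new(int x) {
  void *kernel_args[] = {&x};
  dim3 grid_dim, block_dim;
  size_t shmem_size;
  cudaStream_t stream;
  __cudaPopCallConfiguration(&grid_dim, &block_dim, &shmem_size, &stream);
  cudaLaunchKernel((const void *)stub_new, grid_dim, block_dim, kernel_args,
                   shmem_size, stream);
}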

Differential Revision: https://reviews.llvm.org/D57488

llvm-svn: 352799
Artem-B committed Jan 31, 2019
1 parent 8fa28a0 commit c62214d
Showing 14 changed files with 250 additions and 46 deletions.
2 changes: 1 addition & 1 deletion clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -7143,7 +7143,7 @@ def err_kern_type_not_void_return : Error<
def err_kern_is_nonstatic_method : Error<
"kernel function %0 must be a free function or static member function">;
def err_config_scalar_return : Error<
"CUDA special function 'cudaConfigureCall' must have scalar return type">;
"CUDA special function '%0' must have scalar return type">;
def err_kern_call_not_global_function : Error<
"kernel call to non-global function %0">;
def err_global_call_not_config : Error<
5 changes: 5 additions & 0 deletions clang/include/clang/Sema/Sema.h
@@ -10348,6 +10348,11 @@ class Sema {
/// Copies target attributes from the template TD to the function FD.
void inheritCUDATargetAttrs(FunctionDecl *FD, const FunctionTemplateDecl &TD);

/// Returns the name of the launch configuration function. This is the name
/// of the function that will be called to configure the kernel call, with the
/// parameters specified via <<<>>>.
std::string getCudaConfigureFuncName() const;

/// \name Code completion
//@{
/// Describes the context in which code completion occurs.
110 changes: 106 additions & 4 deletions clang/lib/CodeGen/CGCUDANV.cpp
@@ -15,6 +15,8 @@
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
#include "clang/AST/Decl.h"
#include "clang/Basic/Cuda.h"
#include "clang/CodeGen/CodeGenABITypes.h"
#include "clang/CodeGen/ConstantInitBuilder.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
@@ -102,7 +104,8 @@ class CGNVCUDARuntime : public CGCUDARuntime {
return DummyFunc;
}

-void emitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args);
+void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
+void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);

public:
CGNVCUDARuntime(CodeGenModule &CGM);
@@ -187,11 +190,110 @@ llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
FunctionArgList &Args) {
EmittedKernels.push_back(CGF.CurFn);
-emitDeviceStubBody(CGF, Args);
+if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
+                       CudaFeature::CUDA_USES_NEW_LAUNCH))
+  emitDeviceStubBodyNew(CGF, Args);
+else
+  emitDeviceStubBodyLegacy(CGF, Args);
}

-void CGNVCUDARuntime::emitDeviceStubBody(CodeGenFunction &CGF,
-                                         FunctionArgList &Args) {
+// CUDA 9.2+ uses a new way to launch kernels. Parameters are packed in a
+// local array and kernels are launched using cudaLaunchKernel().
+void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
+                                            FunctionArgList &Args) {
// Calculate amount of space we will need for all arguments. If we have no
// args, allocate a single pointer so we still have a valid pointer to the
// argument array that we can pass to runtime, even if it will be unused.
Address KernelArgs = CGF.CreateTempAlloca(
VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args",
llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
// Store pointers to the arguments in a locally allocated kernel_args.
for (unsigned i = 0; i < Args.size(); ++i) {
llvm::Value* VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer();
llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy);
CGF.Builder.CreateDefaultAlignedStore(
VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i));
}

llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");

// Lookup cudaLaunchKernel function.
// cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
// void **args, size_t sharedMem,
// cudaStream_t stream);
TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
IdentifierInfo &cudaLaunchKernelII =
CGM.getContext().Idents.get("cudaLaunchKernel");
FunctionDecl *cudaLaunchKernelFD = nullptr;
for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) {
if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
cudaLaunchKernelFD = FD;
}

if (cudaLaunchKernelFD == nullptr) {
CGM.Error(CGF.CurFuncDecl->getLocation(),
"Can't find declaration for cudaLaunchKernel()");
return;
}
// Create temporary dim3 grid_dim, block_dim.
ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
QualType Dim3Ty = GridDimParam->getType();
Address GridDim =
CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
Address BlockDim =
CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
Address ShmemSize =
CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
Address Stream =
CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream");
llvm::Constant *cudaPopConfigFn = CGM.CreateRuntimeFunction(
llvm::FunctionType::get(IntTy,
{/*gridDim=*/GridDim.getType(),
/*blockDim=*/BlockDim.getType(),
/*ShmemSize=*/ShmemSize.getType(),
/*Stream=*/Stream.getType()},
/*isVarArg=*/false),
"__cudaPopCallConfiguration");

CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
{GridDim.getPointer(), BlockDim.getPointer(),
ShmemSize.getPointer(), Stream.getPointer()});

// Emit the call to cudaLaunchKernel.
llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy);
CallArgList LaunchKernelArgs;
LaunchKernelArgs.add(RValue::get(Kernel),
cudaLaunchKernelFD->getParamDecl(0)->getType());
LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()),
cudaLaunchKernelFD->getParamDecl(3)->getType());
LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
cudaLaunchKernelFD->getParamDecl(4)->getType());
LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
cudaLaunchKernelFD->getParamDecl(5)->getType());

QualType QT = cudaLaunchKernelFD->getType();
QualType CQT = QT.getCanonicalType();
llvm::Type *Ty = CGM.getTypes().ConvertFunctionType(CQT, cudaLaunchKernelFD);
llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty);

const CGFunctionInfo &FI =
CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
llvm::Constant *cudaLaunchKernelFn =
CGM.CreateRuntimeFunction(FTy, "cudaLaunchKernel");
CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
LaunchKernelArgs);
CGF.EmitBranch(EndBlock);

CGF.EmitBlock(EndBlock);
}

void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
FunctionArgList &Args) {
// Emit a call to cudaSetupArgument for each arg in Args.
llvm::Constant *cudaSetupArgFn = getSetupArgumentFn();
llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
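The CudaFeatureEnabled(..., CUDA_USES_NEW_LAUNCH) check in emitDeviceStub
above keys off the SDK version supplied with -target-sdk-version. A sketch of
its effective behavior for this feature (an assumption-level simplification;
the real helper lives in clang/Basic/Cuda.cpp):

#include "llvm/Support/VersionTuple.h"

// Illustrative only: the new launch API is selected for CUDA SDK >= 9.2.
static bool usesNewLaunchAPI(const llvm::VersionTuple &SDKVersion) {
  return SDKVersion >= llvm::VersionTuple(9, 2);
}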
10 changes: 10 additions & 0 deletions clang/lib/Headers/__clang_cuda_runtime_wrapper.h
@@ -426,5 +426,15 @@ __device__ inline __cuda_builtin_gridDim_t::operator dim3() const {
#pragma pop_macro("__USE_FAST_MATH__")
#pragma pop_macro("__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__")

// CUDA runtime uses this undocumented function to access kernel launch
// configuration. The declaration is in crt/device_functions.h but that file
// includes a lot of other stuff we don't want. Instead, we'll provide our own
// declaration for it here.
#if CUDA_VERSION >= 9020
extern "C" unsigned __cudaPushCallConfiguration(dim3 gridDim, dim3 blockDim,
size_t sharedMem = 0,
void *stream = 0);
#endif

#endif // __CUDA__
#endif // __CLANG_CUDA_RUNTIME_WRAPPER_H__
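
With this declaration in place, a triple-chevron launch conceptually lowers
to a push of the configuration followed by a guarded call to the kernel's
host stub. A sketch (kernel_stub is a hypothetical stand-in for the
compiler-generated stub; the actual lowering happens in CodeGen, not in this
header):

void kernel_stub(int a, int b); // hypothetical generated stub

// kernel<<<grid, block, shmem, stream>>>(a, b) behaves roughly like:
void launch_site(dim3 grid, dim3 block, size_t shmem, cudaStream_t stream,
                 int a, int b) {
  if (__cudaPushCallConfiguration(grid, block, shmem, stream) == 0)
    kernel_stub(a, b); // pops the configuration, then calls cudaLaunchKernel()
}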
19 changes: 16 additions & 3 deletions clang/lib/Sema/SemaCUDA.cpp
@@ -13,6 +13,7 @@
#include "clang/AST/ASTContext.h"
#include "clang/AST/Decl.h"
#include "clang/AST/ExprCXX.h"
#include "clang/Basic/Cuda.h"
#include "clang/Lex/Preprocessor.h"
#include "clang/Sema/Lookup.h"
#include "clang/Sema/Sema.h"
@@ -41,9 +42,8 @@ ExprResult Sema::ActOnCUDAExecConfigExpr(Scope *S, SourceLocation LLLLoc,
SourceLocation GGGLoc) {
FunctionDecl *ConfigDecl = Context.getcudaConfigureCallDecl();
if (!ConfigDecl)
-return ExprError(
-    Diag(LLLLoc, diag::err_undeclared_var_use)
-    << (getLangOpts().HIP ? "hipConfigureCall" : "cudaConfigureCall"));
+return ExprError(Diag(LLLLoc, diag::err_undeclared_var_use)
+                 << getCudaConfigureFuncName());
QualType ConfigQTy = ConfigDecl->getType();

DeclRefExpr *ConfigDR = new (Context)
@@ -957,3 +957,16 @@ void Sema::inheritCUDATargetAttrs(FunctionDecl *FD,
copyAttrIfPresent<CUDAHostAttr>(*this, FD, TemplateFD);
copyAttrIfPresent<CUDADeviceAttr>(*this, FD, TemplateFD);
}

std::string Sema::getCudaConfigureFuncName() const {
if (getLangOpts().HIP)
return "hipConfigureCall";

// New CUDA kernel launch sequence.
if (CudaFeatureEnabled(Context.getTargetInfo().getSDKVersion(),
CudaFeature::CUDA_USES_NEW_LAUNCH))
return "__cudaPushCallConfiguration";

// Legacy CUDA kernel configuration call
return "cudaConfigureCall";
}
7 changes: 3 additions & 4 deletions clang/lib/Sema/SemaDecl.cpp
@@ -9146,13 +9146,12 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC,

if (getLangOpts().CUDA) {
IdentifierInfo *II = NewFD->getIdentifier();
-if (II &&
-    II->isStr(getLangOpts().HIP ? "hipConfigureCall"
-                                : "cudaConfigureCall") &&
+if (II && II->isStr(getCudaConfigureFuncName()) &&
!NewFD->isInvalidDecl() &&
NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
if (!R->getAs<FunctionType>()->getReturnType()->isScalarType())
-Diag(NewFD->getLocation(), diag::err_config_scalar_return);
+Diag(NewFD->getLocation(), diag::err_config_scalar_return)
+    << getCudaConfigureFuncName();
Context.setcudaConfigureCallDecl(NewFD);
}

13 changes: 10 additions & 3 deletions clang/test/CodeGenCUDA/Inputs/cuda.h
@@ -15,13 +15,20 @@ struct dim3 {
};

typedef struct cudaStream *cudaStream_t;

+typedef enum cudaError {} cudaError_t;
#ifdef __HIP__
int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
cudaStream_t stream = 0);
#else
-int cudaConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
-                      cudaStream_t stream = 0);
+extern "C" int cudaConfigureCall(dim3 gridSize, dim3 blockSize,
+                                 size_t sharedSize = 0,
+                                 cudaStream_t stream = 0);
+extern "C" int __cudaPushCallConfiguration(dim3 gridSize, dim3 blockSize,
+                                           size_t sharedSize = 0,
+                                           cudaStream_t stream = 0);
+extern "C" cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim,
+                                        dim3 blockDim, void **args,
+                                        size_t sharedMem, cudaStream_t stream);
#endif

extern "C" __device__ int printf(const char*, ...);
65 changes: 53 additions & 12 deletions clang/test/CodeGenCUDA/device-stub.cu
@@ -1,14 +1,36 @@
// RUN: echo "GPU binary would be here" > %t
// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
-// RUN: -fcuda-include-gpubinary %t -o - \
-// RUN: | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,NORDC,CUDA,CUDANORDC
+// RUN: -target-sdk-version=8.0 -fcuda-include-gpubinary %t -o - \
+// RUN: | FileCheck -allow-deprecated-dag-overlap %s \
+// RUN: --check-prefixes=ALL,NORDC,CUDA,CUDANORDC,CUDA-OLD
// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
-// RUN: -fcuda-include-gpubinary %t -o - -DNOGLOBALS \
-// RUN: | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=NOGLOBALS,CUDANOGLOBALS
+// RUN: -target-sdk-version=8.0 -fcuda-include-gpubinary %t \
+// RUN: -o - -DNOGLOBALS \
+// RUN: | FileCheck -allow-deprecated-dag-overlap %s \
+// RUN: -check-prefixes=NOGLOBALS,CUDANOGLOBALS
// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
-// RUN: -fgpu-rdc -fcuda-include-gpubinary %t -o - \
-// RUN: | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,RDC,CUDA,CUDARDC
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - \
+// RUN: -target-sdk-version=8.0 -fgpu-rdc -fcuda-include-gpubinary %t \
+// RUN: -o - \
+// RUN: | FileCheck -allow-deprecated-dag-overlap %s \
+// RUN: --check-prefixes=ALL,RDC,CUDA,CUDARDC,CUDA-OLD
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
+// RUN: -target-sdk-version=8.0 -o - \
// RUN: | FileCheck -allow-deprecated-dag-overlap %s -check-prefix=NOGPUBIN
+
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
+// RUN: -target-sdk-version=9.2 -fcuda-include-gpubinary %t -o - \
+// RUN: | FileCheck %s -allow-deprecated-dag-overlap \
+// RUN: --check-prefixes=ALL,NORDC,CUDA,CUDANORDC,CUDA-NEW
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
+// RUN: -target-sdk-version=9.2 -fcuda-include-gpubinary %t -o - -DNOGLOBALS \
+// RUN: | FileCheck -allow-deprecated-dag-overlap %s \
+// RUN: --check-prefixes=NOGLOBALS,CUDANOGLOBALS
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
+// RUN: -target-sdk-version=9.2 -fgpu-rdc -fcuda-include-gpubinary %t -o - \
+// RUN: | FileCheck %s -allow-deprecated-dag-overlap \
+// RUN: --check-prefixes=ALL,RDC,CUDA,CUDARDC,CUDA-NEW
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
+// RUN: -target-sdk-version=9.2 -o - \
+// RUN: | FileCheck -allow-deprecated-dag-overlap %s -check-prefix=NOGPUBIN

// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
@@ -103,15 +125,34 @@ void use_pointers() {
// by a call to cudaLaunch.

// ALL: define{{.*}}kernelfunc
-// ALL: call{{.*}}[[PREFIX]]SetupArgument
-// ALL: call{{.*}}[[PREFIX]]SetupArgument
-// ALL: call{{.*}}[[PREFIX]]SetupArgument
-// ALL: call{{.*}}[[PREFIX]]Launch
+
+// New launch sequence stores arguments into a local buffer and passes an
+// array of pointers to them directly to cudaLaunchKernel.
+// CUDA-NEW: alloca
+// CUDA-NEW: store
+// CUDA-NEW: store
+// CUDA-NEW: store
+// CUDA-NEW: call{{.*}}__cudaPopCallConfiguration
+// CUDA-NEW: call{{.*}}cudaLaunchKernel
+
+// Legacy style launch sequence sets up arguments by passing them to
+// [cuda|hip]SetupArgument.
+// CUDA-OLD: call{{.*}}[[PREFIX]]SetupArgument
+// CUDA-OLD: call{{.*}}[[PREFIX]]SetupArgument
+// CUDA-OLD: call{{.*}}[[PREFIX]]SetupArgument
+// CUDA-OLD: call{{.*}}[[PREFIX]]Launch
+
+// HIP: call{{.*}}[[PREFIX]]SetupArgument
+// HIP: call{{.*}}[[PREFIX]]SetupArgument
+// HIP: call{{.*}}[[PREFIX]]SetupArgument
+// HIP: call{{.*}}[[PREFIX]]Launch
__global__ void kernelfunc(int i, int j, int k) {}

// Test that we've built correct kernel launch sequence.
// ALL: define{{.*}}hostfunc
-// ALL: call{{.*}}[[PREFIX]]ConfigureCall
+// CUDA-OLD: call{{.*}}[[PREFIX]]ConfigureCall
+// CUDA-NEW: call{{.*}}__cudaPushCallConfiguration
+// HIP: call{{.*}}[[PREFIX]]ConfigureCall
// ALL: call{{.*}}kernelfunc
void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
#endif
16 changes: 10 additions & 6 deletions clang/test/CodeGenCUDA/kernel-args-alignment.cu
@@ -1,8 +1,12 @@
-// RUN: %clang_cc1 --std=c++11 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | \
-// RUN: FileCheck -check-prefix HOST -check-prefix CHECK %s
+// New CUDA kernel launch sequence does not require explicit specification of
+// size/offset for each argument, so only the old way is tested.
+//
+// RUN: %clang_cc1 --std=c++11 -triple x86_64-unknown-linux-gnu -emit-llvm \
+// RUN: -target-sdk-version=8.0 -o - %s \
+// RUN: | FileCheck -check-prefixes=HOST-OLD,CHECK %s

// RUN: %clang_cc1 --std=c++11 -fcuda-is-device -triple nvptx64-nvidia-cuda \
-// RUN: -emit-llvm -o - %s | FileCheck -check-prefix DEVICE -check-prefix CHECK %s
+// RUN: -emit-llvm -o - %s | FileCheck -check-prefixes=DEVICE,CHECK %s

#include "Inputs/cuda.h"

@@ -27,9 +31,9 @@ static_assert(alignof(S) == 8, "Unexpected alignment.");
// 1. offset 0, width 1
// 2. offset 8 (because alignof(S) == 8), width 16
// 3. offset 24, width 8
-// HOST: call i32 @cudaSetupArgument({{[^,]*}}, i64 1, i64 0)
-// HOST: call i32 @cudaSetupArgument({{[^,]*}}, i64 16, i64 8)
-// HOST: call i32 @cudaSetupArgument({{[^,]*}}, i64 8, i64 24)
+// HOST-OLD: call i32 @cudaSetupArgument({{[^,]*}}, i64 1, i64 0)
+// HOST-OLD: call i32 @cudaSetupArgument({{[^,]*}}, i64 16, i64 8)
+// HOST-OLD: call i32 @cudaSetupArgument({{[^,]*}}, i64 8, i64 24)

// DEVICE-LABEL: @_Z6kernelc1SPi
// DEVICE-SAME: i8{{[^,]*}}, %struct.S* byval align 8{{[^,]*}}, i32*
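The offsets asserted above follow from ordinary struct-layout rules. A quick
host-side way to sanity-check them (hedged: S here is a stand-in with
sizeof == 16 and alignof == 8, matching the test's static_assert, and the
kernel is assumed to take (char, S, int *)):

#include <cstddef>
#include <cstdio>

struct S { int *ptr; char a; }; // sizeof == 16, alignof == 8
static_assert(sizeof(S) == 16 && alignof(S) == 8, "layout assumption");

// Mirror of the kernel parameter area for kernel(char, S, int *).
struct ParamArea {
  char c;  // offset 0, width 1
  S s;     // offset 8 (rounded up to alignof(S)), width 16
  int *p;  // offset 24, width 8
};

int main() {
  std::printf("%zu %zu %zu\n", offsetof(ParamArea, c), offsetof(ParamArea, s),
              offsetof(ParamArea, p)); // prints: 0 8 24
}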
17 changes: 12 additions & 5 deletions clang/test/CodeGenCUDA/kernel-call.cu
@@ -1,20 +1,27 @@
-// RUN: %clang_cc1 -emit-llvm %s -o - | FileCheck %s --check-prefixes=CUDA,CHECK
-// RUN: %clang_cc1 -x hip -emit-llvm %s -o - | FileCheck %s --check-prefixes=HIP,CHECK
+// RUN: %clang_cc1 -target-sdk-version=8.0 -emit-llvm %s -o - \
+// RUN: | FileCheck %s --check-prefixes=CUDA-OLD,CHECK
+// RUN: %clang_cc1 -target-sdk-version=9.2 -emit-llvm %s -o - \
+// RUN: | FileCheck %s --check-prefixes=CUDA-NEW,CHECK
+// RUN: %clang_cc1 -x hip -emit-llvm %s -o - \
+// RUN: | FileCheck %s --check-prefixes=HIP,CHECK


#include "Inputs/cuda.h"

// CHECK-LABEL: define{{.*}}g1
// HIP: call{{.*}}hipSetupArgument
// HIP: call{{.*}}hipLaunchByPtr
-// CUDA: call{{.*}}cudaSetupArgument
-// CUDA: call{{.*}}cudaLaunch
+// CUDA-OLD: call{{.*}}cudaSetupArgument
+// CUDA-OLD: call{{.*}}cudaLaunch
+// CUDA-NEW: call{{.*}}__cudaPopCallConfiguration
+// CUDA-NEW: call{{.*}}cudaLaunchKernel
__global__ void g1(int x) {}

// CHECK-LABEL: define{{.*}}main
int main(void) {
// HIP: call{{.*}}hipConfigureCall
-// CUDA: call{{.*}}cudaConfigureCall
+// CUDA-OLD: call{{.*}}cudaConfigureCall
+// CUDA-NEW: call{{.*}}__cudaPushCallConfiguration
// CHECK: icmp
// CHECK: br
// CHECK: call{{.*}}g1
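
Putting the two halves together, the call order the CUDA-NEW checks above
assert for a launch such as g1<<<1, 1>>>(42) is (a hedged reconstruction of
the emitted sequence, not literal compiler output):

// main:    __cudaPushCallConfiguration({1,1,1}, {1,1,1}, 0, nullptr)
// main:    icmp/br on the result; on success, call g1's host stub
// g1 stub: __cudaPopCallConfiguration(&grid, &block, &shmem, &stream)
// g1 stub: cudaLaunchKernel(<stub address>, grid, block, args, shmem, stream)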