-
Notifications
You must be signed in to change notification settings - Fork 15.1k
[CUDA] Add device-side kernel launch support #165519
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
|
@llvm/pr-subscribers-clang-codegen @llvm/pr-subscribers-clang Author: None (darkbuck) Changes
Patch is 33.18 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/165519.diff 20 Files Affected:
diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index 33aa2d343aa7a..f64e29be3205f 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -488,6 +488,10 @@ class ASTContext : public RefCountedBase<ASTContext> {
/// Declaration for the CUDA cudaConfigureCall function.
FunctionDecl *cudaConfigureCallDecl = nullptr;
+ /// Declaration for the CUDA cudaGetParameterBuffer function.
+ FunctionDecl *cudaGetParameterBufferDecl = nullptr;
+ /// Declaration for the CUDA cudaLaunchDevice function.
+ FunctionDecl *cudaLaunchDeviceDecl = nullptr;
/// Keeps track of all declaration attributes.
///
@@ -1641,6 +1645,18 @@ class ASTContext : public RefCountedBase<ASTContext> {
return cudaConfigureCallDecl;
}
+ void setcudaGetParameterBufferDecl(FunctionDecl *FD) {
+ cudaGetParameterBufferDecl = FD;
+ }
+
+ FunctionDecl *getcudaGetParameterBufferDecl() {
+ return cudaGetParameterBufferDecl;
+ }
+
+ void setcudaLaunchDeviceDecl(FunctionDecl *FD) { cudaLaunchDeviceDecl = FD; }
+
+ FunctionDecl *getcudaLaunchDeviceDecl() { return cudaLaunchDeviceDecl; }
+
/// Returns true iff we need copy/dispose helpers for the given type.
bool BlockRequiresCopying(QualType Ty, const VarDecl *D);
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 4e369be0bbb92..5e010cb52954d 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -9499,6 +9499,8 @@ def err_kern_is_nonstatic_method : Error<
"kernel function %0 must be a free function or static member function">;
def err_config_scalar_return : Error<
"CUDA special function '%0' must have scalar return type">;
+def err_config_pointer_return
+ : Error<"CUDA special function '%0' must have pointer return type">;
def err_kern_call_not_global_function : Error<
"kernel call to non-global function %0">;
def err_global_call_not_config : Error<
@@ -13690,4 +13692,8 @@ def err_amdgcn_load_lds_size_invalid_value : Error<"invalid size value">;
def note_amdgcn_load_lds_size_valid_value : Note<"size must be %select{1, 2, or 4|1, 2, 4, 12 or 16}0">;
def err_amdgcn_coop_atomic_invalid_as : Error<"cooperative atomic requires a global or generic pointer">;
+
+def err_cuda_device_kernel_launch_require_rdc
+ : Error<"kernel launch from __device__ or __global__ function requires "
+ "relocatable device code, also known as separate compilation mode">;
} // end of sema component.
diff --git a/clang/include/clang/Sema/SemaCUDA.h b/clang/include/clang/Sema/SemaCUDA.h
index dbc1432860d89..dbb4290f5d149 100644
--- a/clang/include/clang/Sema/SemaCUDA.h
+++ b/clang/include/clang/Sema/SemaCUDA.h
@@ -273,6 +273,11 @@ class SemaCUDA : public SemaBase {
/// of the function that will be called to configure kernel call, with the
/// parameters specified via <<<>>>.
std::string getConfigureFuncName() const;
+ /// Return the name of the parameter buffer allocation function for the
+ /// device kernel launch.
+ std::string getGetParameterBufferFuncName() const;
+ /// Return the name of the device kernel launch function.
+ std::string getLaunchDeviceFuncName() const;
/// Record variables that are potentially ODR-used in CUDA/HIP.
void recordPotentialODRUsedVariable(MultiExprArg Args,
diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h
index af856a8097ab1..a65f7fd2d1d43 100644
--- a/clang/include/clang/Serialization/ASTReader.h
+++ b/clang/include/clang/Serialization/ASTReader.h
@@ -1013,7 +1013,7 @@ class ASTReader
///
/// The AST context tracks a few important decls, currently cudaConfigureCall,
/// directly.
- SmallVector<GlobalDeclID, 2> CUDASpecialDeclRefs;
+ SmallVector<GlobalDeclID, 4> CUDASpecialDeclRefs;
/// The floating point pragma option settings.
SmallVector<uint64_t, 1> FPPragmaOptions;
diff --git a/clang/lib/CodeGen/CGCUDARuntime.cpp b/clang/lib/CodeGen/CGCUDARuntime.cpp
index 121a481213396..cd1476ebd6754 100644
--- a/clang/lib/CodeGen/CGCUDARuntime.cpp
+++ b/clang/lib/CodeGen/CGCUDARuntime.cpp
@@ -22,6 +22,116 @@ using namespace CodeGen;
CGCUDARuntime::~CGCUDARuntime() {}
+static llvm::Value *emitGetParamBuf(CodeGenFunction &CGF,
+ const CUDAKernelCallExpr *E) {
+ auto *GetParamBuf = CGF.getContext().getcudaGetParameterBufferDecl();
+ const FunctionProtoType *GetParamBufProto =
+ GetParamBuf->getType()->getAs<FunctionProtoType>();
+
+ DeclRefExpr *DRE = DeclRefExpr::Create(
+ CGF.getContext(), {}, {}, GetParamBuf,
+ /*RefersToEnclosingVariableOrCapture=*/false, GetParamBuf->getNameInfo(),
+ GetParamBuf->getType(), VK_PRValue);
+ auto *ImpCast = ImplicitCastExpr::Create(
+ CGF.getContext(), CGF.getContext().getPointerType(GetParamBuf->getType()),
+ CK_FunctionToPointerDecay, DRE, nullptr, VK_PRValue, FPOptionsOverride());
+
+ CGCallee Callee = CGF.EmitCallee(ImpCast);
+ CallArgList Args;
+ // Use 64B alignment.
+ Args.add(RValue::get(CGF.CGM.getSize(CharUnits::fromQuantity(64))),
+ CGF.getContext().getSizeType());
+ // Calculate parameter sizes.
+ const PointerType *PT = E->getCallee()->getType()->getAs<PointerType>();
+ const FunctionProtoType *FTP =
+ PT->getPointeeType()->getAs<FunctionProtoType>();
+ CharUnits Offset = CharUnits::Zero();
+ for (auto ArgTy : FTP->getParamTypes()) {
+ auto TInfo = CGF.CGM.getContext().getTypeInfoInChars(ArgTy);
+ Offset = Offset.alignTo(TInfo.Align);
+ Offset += TInfo.Width;
+ }
+ Args.add(RValue::get(CGF.CGM.getSize(Offset)),
+ CGF.getContext().getSizeType());
+ const CGFunctionInfo &CallInfo = CGF.CGM.getTypes().arrangeFreeFunctionCall(
+ Args, GetParamBufProto, /*ChainCall=*/false);
+ auto Ret = CGF.EmitCall(CallInfo, Callee, /*ReturnValue=*/{}, Args);
+
+ return Ret.getScalarVal();
+}
+
+RValue CGCUDARuntime::EmitCUDADeviceKernelCallExpr(
+ CodeGenFunction &CGF, const CUDAKernelCallExpr *E,
+ ReturnValueSlot ReturnValue, llvm::CallBase **CallOrInvoke) {
+ ASTContext &Ctx = CGM.getContext();
+ assert(Ctx.getcudaLaunchDeviceDecl() == E->getConfig()->getDirectCallee());
+
+ llvm::BasicBlock *ConfigOKBlock = CGF.createBasicBlock("dkcall.configok");
+ llvm::BasicBlock *ContBlock = CGF.createBasicBlock("dkcall.end");
+
+ llvm::Value *Config = emitGetParamBuf(CGF, E);
+ CGF.Builder.CreateCondBr(
+ CGF.Builder.CreateICmpNE(Config,
+ llvm::Constant::getNullValue(Config->getType())),
+ ConfigOKBlock, ContBlock);
+
+ CodeGenFunction::ConditionalEvaluation eval(CGF);
+
+ eval.begin(CGF);
+ CGF.EmitBlock(ConfigOKBlock);
+
+ QualType KernelCalleeFuncTy =
+ E->getCallee()->getType()->getAs<PointerType>()->getPointeeType();
+ CGCallee KernelCallee = CGF.EmitCallee(E->getCallee());
+ // Emit kernel arguments.
+ CallArgList KernelCallArgs;
+ CGF.EmitCallArgs(
+ KernelCallArgs,
+ dyn_cast<FunctionProtoType>(KernelCalleeFuncTy->castAs<FunctionType>()),
+ E->arguments(), E->getDirectCallee());
+ // Copy emitted kernel arguments into that parameter buffer.
+ RawAddress CfgBase(Config, CGM.Int8Ty,
+ /*Alignment=*/CharUnits::fromQuantity(64));
+ CharUnits Offset = CharUnits::Zero();
+ for (auto &Arg : KernelCallArgs) {
+ auto TInfo = CGM.getContext().getTypeInfoInChars(Arg.getType());
+ Offset = Offset.alignTo(TInfo.Align);
+ Address Addr =
+ CGF.Builder.CreateConstInBoundsGEP(CfgBase, Offset.getQuantity());
+ Arg.copyInto(CGF, Addr);
+ Offset += TInfo.Width;
+ }
+ // Make `cudaLaunchDevice` call, i.e. E->getConfig().
+ const CallExpr *LaunchCall = E->getConfig();
+ QualType LaunchCalleeFuncTy = LaunchCall->getCallee()
+ ->getType()
+ ->getAs<PointerType>()
+ ->getPointeeType();
+ CGCallee LaunchCallee = CGF.EmitCallee(LaunchCall->getCallee());
+ CallArgList LaunchCallArgs;
+ CGF.EmitCallArgs(
+ LaunchCallArgs,
+ dyn_cast<FunctionProtoType>(LaunchCalleeFuncTy->castAs<FunctionType>()),
+ LaunchCall->arguments(), LaunchCall->getDirectCallee());
+ // Replace func and parameter buffer arguments.
+ LaunchCallArgs[0] = CallArg(RValue::get(KernelCallee.getFunctionPointer()),
+ CGM.getContext().VoidPtrTy);
+ LaunchCallArgs[1] = CallArg(RValue::get(Config), CGM.getContext().VoidPtrTy);
+ const CGFunctionInfo &LaunchCallInfo = CGM.getTypes().arrangeFreeFunctionCall(
+ LaunchCallArgs,
+ dyn_cast<FunctionProtoType>(LaunchCalleeFuncTy->castAs<FunctionType>()),
+ /*ChainCall=*/false);
+ CGF.EmitCall(LaunchCallInfo, LaunchCallee, ReturnValue, LaunchCallArgs,
+ CallOrInvoke,
+ /*IsMustTail=*/false, E->getExprLoc());
+ CGF.EmitBranch(ContBlock);
+
+ CGF.EmitBlock(ContBlock);
+ eval.end(CGF);
+
+ return RValue::get(nullptr);
+}
+
RValue CGCUDARuntime::EmitCUDAKernelCallExpr(CodeGenFunction &CGF,
const CUDAKernelCallExpr *E,
ReturnValueSlot ReturnValue,
diff --git a/clang/lib/CodeGen/CGCUDARuntime.h b/clang/lib/CodeGen/CGCUDARuntime.h
index 86f776004ee7c..64fb9a31422e0 100644
--- a/clang/lib/CodeGen/CGCUDARuntime.h
+++ b/clang/lib/CodeGen/CGCUDARuntime.h
@@ -88,6 +88,10 @@ class CGCUDARuntime {
ReturnValueSlot ReturnValue,
llvm::CallBase **CallOrInvoke = nullptr);
+ virtual RValue EmitCUDADeviceKernelCallExpr(
+ CodeGenFunction &CGF, const CUDAKernelCallExpr *E,
+ ReturnValueSlot ReturnValue, llvm::CallBase **CallOrInvoke = nullptr);
+
/// Emits a kernel launch stub.
virtual void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) = 0;
diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp
index 14d8db32bafc6..0c01933790100 100644
--- a/clang/lib/CodeGen/CGExprCXX.cpp
+++ b/clang/lib/CodeGen/CGExprCXX.cpp
@@ -503,6 +503,12 @@ RValue CodeGenFunction::EmitCXXOperatorMemberCallExpr(
RValue CodeGenFunction::EmitCUDAKernelCallExpr(const CUDAKernelCallExpr *E,
ReturnValueSlot ReturnValue,
llvm::CallBase **CallOrInvoke) {
+ auto *FD = E->getConfig()->getDirectCallee();
+ // Emit as a device kernel call if the config is prepared using
+ // 'cudaGetParameterBuffer'.
+ if (FD && CGM.getContext().getcudaLaunchDeviceDecl() == FD)
+ return CGM.getCUDARuntime().EmitCUDADeviceKernelCallExpr(
+ *this, E, ReturnValue, CallOrInvoke);
return CGM.getCUDARuntime().EmitCUDAKernelCallExpr(*this, E, ReturnValue,
CallOrInvoke);
}
diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp
index 31735a0f5feb3..a60a32dcb9e4c 100644
--- a/clang/lib/Sema/SemaCUDA.cpp
+++ b/clang/lib/Sema/SemaCUDA.cpp
@@ -52,16 +52,85 @@ bool SemaCUDA::PopForceHostDevice() {
ExprResult SemaCUDA::ActOnExecConfigExpr(Scope *S, SourceLocation LLLLoc,
MultiExprArg ExecConfig,
SourceLocation GGGLoc) {
- FunctionDecl *ConfigDecl = getASTContext().getcudaConfigureCallDecl();
+ bool IsDeviceKernelCall = false;
+ switch (CurrentTarget()) {
+ case CUDAFunctionTarget::Global:
+ case CUDAFunctionTarget::Device:
+ IsDeviceKernelCall = true;
+ break;
+ case CUDAFunctionTarget::HostDevice:
+ if (getLangOpts().CUDAIsDevice) {
+ // Under the device compilation, config call under an HD function should
+ // be treated as a device kernel call. But, for implicit HD ones (such as
+ // lambdas), need to check whether RDC is enabled or not.
+ IsDeviceKernelCall = true;
+ if (!getLangOpts().GPURelocatableDeviceCode) {
+ FunctionDecl *Caller = SemaRef.getCurFunctionDecl(/*AllowLambda=*/true);
+ if (Caller && isImplicitHostDeviceFunction(Caller))
+ IsDeviceKernelCall = false;
+ }
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (IsDeviceKernelCall && !getLangOpts().GPURelocatableDeviceCode)
+ return ExprError(
+ Diag(LLLLoc, diag::err_cuda_device_kernel_launch_require_rdc));
+
+ FunctionDecl *ConfigDecl = IsDeviceKernelCall
+ ? getASTContext().getcudaLaunchDeviceDecl()
+ : getASTContext().getcudaConfigureCallDecl();
if (!ConfigDecl)
return ExprError(Diag(LLLLoc, diag::err_undeclared_var_use)
- << getConfigureFuncName());
+ << (IsDeviceKernelCall ? getLaunchDeviceFuncName()
+ : getConfigureFuncName()));
+ // Additional check on the launch function if it's a device kernel call.
+ if (IsDeviceKernelCall) {
+ auto *GetParamBuf = getASTContext().getcudaGetParameterBufferDecl();
+ if (!GetParamBuf)
+ return ExprError(Diag(LLLLoc, diag::err_undeclared_var_use)
+ << getGetParameterBufferFuncName());
+ }
+
QualType ConfigQTy = ConfigDecl->getType();
DeclRefExpr *ConfigDR = new (getASTContext()) DeclRefExpr(
getASTContext(), ConfigDecl, false, ConfigQTy, VK_LValue, LLLLoc);
SemaRef.MarkFunctionReferenced(LLLLoc, ConfigDecl);
+ if (IsDeviceKernelCall) {
+ SmallVector<Expr *> Args;
+ // Use a null pointer as the kernel function, which may not be resolvable
+ // here. For example, resolving that kernel function may need additional
+ // kernel arguments.
+ llvm::APInt Zero(SemaRef.Context.getTypeSize(SemaRef.Context.IntTy), 0);
+ Args.push_back(IntegerLiteral::Create(SemaRef.Context, Zero,
+ SemaRef.Context.IntTy, LLLLoc));
+ // Use a null pointer as the parameter buffer, which should be allocated in
+ // the codegen.
+ Args.push_back(IntegerLiteral::Create(SemaRef.Context, Zero,
+ SemaRef.Context.IntTy, LLLLoc));
+ // Add the original config arguments.
+ llvm::append_range(Args, ExecConfig);
+ // Add the default blockDim if it's missing.
+ if (Args.size() < 4) {
+ llvm::APInt One(SemaRef.Context.getTypeSize(SemaRef.Context.IntTy), 1);
+ Args.push_back(IntegerLiteral::Create(SemaRef.Context, One,
+ SemaRef.Context.IntTy, LLLLoc));
+ }
+ // Add the default sharedMemSize if it's missing.
+ if (Args.size() < 5)
+ Args.push_back(IntegerLiteral::Create(SemaRef.Context, Zero,
+ SemaRef.Context.IntTy, LLLLoc));
+ // Add the default stream if it's missing.
+ if (Args.size() < 6)
+ Args.push_back(IntegerLiteral::Create(SemaRef.Context, Zero,
+ SemaRef.Context.IntTy, LLLLoc));
+ return SemaRef.BuildCallExpr(S, ConfigDR, LLLLoc, Args, GGGLoc, nullptr,
+ /*IsExecConfig=*/true);
+ }
return SemaRef.BuildCallExpr(S, ConfigDR, LLLLoc, ExecConfig, GGGLoc, nullptr,
/*IsExecConfig=*/true);
}
@@ -251,7 +320,7 @@ SemaCUDA::IdentifyPreference(const FunctionDecl *Caller,
if (CalleeTarget == CUDAFunctionTarget::Global &&
(CallerTarget == CUDAFunctionTarget::Global ||
CallerTarget == CUDAFunctionTarget::Device))
- return CFP_Never;
+ return CFP_Native;
// (b) Calling HostDevice is OK for everyone.
if (CalleeTarget == CUDAFunctionTarget::HostDevice)
@@ -279,7 +348,8 @@ SemaCUDA::IdentifyPreference(const FunctionDecl *Caller,
if (CallerTarget == CUDAFunctionTarget::HostDevice) {
// It's OK to call a compilation-mode matching function from an HD one.
if ((getLangOpts().CUDAIsDevice &&
- CalleeTarget == CUDAFunctionTarget::Device) ||
+ (CalleeTarget == CUDAFunctionTarget::Device ||
+ CalleeTarget == CUDAFunctionTarget::Global)) ||
(!getLangOpts().CUDAIsDevice &&
(CalleeTarget == CUDAFunctionTarget::Host ||
CalleeTarget == CUDAFunctionTarget::Global)))
@@ -1103,6 +1173,18 @@ std::string SemaCUDA::getConfigureFuncName() const {
return "cudaConfigureCall";
}
+std::string SemaCUDA::getGetParameterBufferFuncName() const {
+ // FIXME: Use the API from CUDA programming guide. Add V2 support when
+ // necessary.
+ return "cudaGetParameterBuffer";
+}
+
+std::string SemaCUDA::getLaunchDeviceFuncName() const {
+ // FIXME: Use the API from CUDA programming guide. Add V2 support when
+ // necessary.
+ return "cudaLaunchDevice";
+}
+
// Record any local constexpr variables that are passed one way on the host
// and another on the device.
void SemaCUDA::recordPotentialODRUsedVariable(
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index fc3aabf5741ca..1e39bfb5e42cd 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -11050,14 +11050,30 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC,
}
if (getLangOpts().CUDA) {
- IdentifierInfo *II = NewFD->getIdentifier();
- if (II && II->isStr(CUDA().getConfigureFuncName()) &&
- !NewFD->isInvalidDecl() &&
- NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
- if (!R->castAs<FunctionType>()->getReturnType()->isScalarType())
- Diag(NewFD->getLocation(), diag::err_config_scalar_return)
- << CUDA().getConfigureFuncName();
- Context.setcudaConfigureCallDecl(NewFD);
+ if (IdentifierInfo *II = NewFD->getIdentifier()) {
+ if (II->isStr(CUDA().getConfigureFuncName()) && !NewFD->isInvalidDecl() &&
+ NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
+ if (!R->castAs<FunctionType>()->getReturnType()->isScalarType())
+ Diag(NewFD->getLocation(), diag::err_config_scalar_return)
+ << CUDA().getConfigureFuncName();
+ Context.setcudaConfigureCallDecl(NewFD);
+ }
+ if (II->isStr(CUDA().getGetParameterBufferFuncName()) &&
+ !NewFD->isInvalidDecl() &&
+ NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
+ if (!R->castAs<FunctionType>()->getReturnType()->isPointerType())
+ Diag(NewFD->getLocation(), diag::err_config_pointer_return)
+ << CUDA().getConfigureFuncName();
+ Context.setcudaGetParameterBufferDecl(NewFD);
+ }
+ if (II->isStr(CUDA().getLaunchDeviceFuncName()) &&
+ !NewFD->isInvalidDecl() &&
+ NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
+ if (!R->castAs<FunctionType>()->getReturnType()->isScalarType())
+ Diag(NewFD->getLocation(), diag::err_config_scalar_return)
+ << CUDA().getConfigureFuncName();
+ Context.setcudaLaunchDeviceDecl(NewFD);
+ }
}
}
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index c1b5cb730e4a4..e415d5816ab01 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -5588,9 +5588,13 @@ void ASTReader::InitializeContext() {
// If there were any CUDA special declarations, deserialize them.
if (!CUDASpecialDeclRefs.empty()) {
- assert(CUDASpecialDeclRefs.size() == 1 && "More decl refs than expected!");
+ assert(CUDASpecialDeclRefs.size() == 3 && "More decl refs than expected!");
Context.setcudaConfigureCallDecl(
- cast<FunctionDecl>(GetDecl(CUDASpecialDeclRefs[0])));
+ cast_or_null<FunctionDecl>(GetDecl(CUDASpecialDeclRefs[0])));
+ Context.setcudaGetParameterBufferDecl(
+ cast_or_null<FunctionDecl>(GetDecl(CUDASpecialDeclRefs[1])));
+ Context.setcudaLaunchDeviceDecl(
+ cast_or_null<FunctionDecl>(GetDecl(CUDASpecialDeclRefs[2])));
}
// Re-export any modules that were imported by a non-module AST file.
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 377e3966874f3..8e527db972fb0 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -5714,8 +5714,13 @@ void ASTWriter::PrepareWritingSpecialDecls(Sema &Sem...
[truncated]
|
|
@llvm/pr-subscribers-clang-modules Author: None (darkbuck) Changes
Patch is 33.18 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/165519.diff 20 Files Affected:
diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index 33aa2d343aa7a..f64e29be3205f 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -488,6 +488,10 @@ class ASTContext : public RefCountedBase<ASTContext> {
/// Declaration for the CUDA cudaConfigureCall function.
FunctionDecl *cudaConfigureCallDecl = nullptr;
+ /// Declaration for the CUDA cudaGetParameterBuffer function.
+ FunctionDecl *cudaGetParameterBufferDecl = nullptr;
+ /// Declaration for the CUDA cudaLaunchDevice function.
+ FunctionDecl *cudaLaunchDeviceDecl = nullptr;
/// Keeps track of all declaration attributes.
///
@@ -1641,6 +1645,18 @@ class ASTContext : public RefCountedBase<ASTContext> {
return cudaConfigureCallDecl;
}
+ void setcudaGetParameterBufferDecl(FunctionDecl *FD) {
+ cudaGetParameterBufferDecl = FD;
+ }
+
+ FunctionDecl *getcudaGetParameterBufferDecl() {
+ return cudaGetParameterBufferDecl;
+ }
+
+ void setcudaLaunchDeviceDecl(FunctionDecl *FD) { cudaLaunchDeviceDecl = FD; }
+
+ FunctionDecl *getcudaLaunchDeviceDecl() { return cudaLaunchDeviceDecl; }
+
/// Returns true iff we need copy/dispose helpers for the given type.
bool BlockRequiresCopying(QualType Ty, const VarDecl *D);
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 4e369be0bbb92..5e010cb52954d 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -9499,6 +9499,8 @@ def err_kern_is_nonstatic_method : Error<
"kernel function %0 must be a free function or static member function">;
def err_config_scalar_return : Error<
"CUDA special function '%0' must have scalar return type">;
+def err_config_pointer_return
+ : Error<"CUDA special function '%0' must have pointer return type">;
def err_kern_call_not_global_function : Error<
"kernel call to non-global function %0">;
def err_global_call_not_config : Error<
@@ -13690,4 +13692,8 @@ def err_amdgcn_load_lds_size_invalid_value : Error<"invalid size value">;
def note_amdgcn_load_lds_size_valid_value : Note<"size must be %select{1, 2, or 4|1, 2, 4, 12 or 16}0">;
def err_amdgcn_coop_atomic_invalid_as : Error<"cooperative atomic requires a global or generic pointer">;
+
+def err_cuda_device_kernel_launch_require_rdc
+ : Error<"kernel launch from __device__ or __global__ function requires "
+ "relocatable device code, also known as separate compilation mode">;
} // end of sema component.
diff --git a/clang/include/clang/Sema/SemaCUDA.h b/clang/include/clang/Sema/SemaCUDA.h
index dbc1432860d89..dbb4290f5d149 100644
--- a/clang/include/clang/Sema/SemaCUDA.h
+++ b/clang/include/clang/Sema/SemaCUDA.h
@@ -273,6 +273,11 @@ class SemaCUDA : public SemaBase {
/// of the function that will be called to configure kernel call, with the
/// parameters specified via <<<>>>.
std::string getConfigureFuncName() const;
+ /// Return the name of the parameter buffer allocation function for the
+ /// device kernel launch.
+ std::string getGetParameterBufferFuncName() const;
+ /// Return the name of the device kernel launch function.
+ std::string getLaunchDeviceFuncName() const;
/// Record variables that are potentially ODR-used in CUDA/HIP.
void recordPotentialODRUsedVariable(MultiExprArg Args,
diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h
index af856a8097ab1..a65f7fd2d1d43 100644
--- a/clang/include/clang/Serialization/ASTReader.h
+++ b/clang/include/clang/Serialization/ASTReader.h
@@ -1013,7 +1013,7 @@ class ASTReader
///
/// The AST context tracks a few important decls, currently cudaConfigureCall,
/// directly.
- SmallVector<GlobalDeclID, 2> CUDASpecialDeclRefs;
+ SmallVector<GlobalDeclID, 4> CUDASpecialDeclRefs;
/// The floating point pragma option settings.
SmallVector<uint64_t, 1> FPPragmaOptions;
diff --git a/clang/lib/CodeGen/CGCUDARuntime.cpp b/clang/lib/CodeGen/CGCUDARuntime.cpp
index 121a481213396..cd1476ebd6754 100644
--- a/clang/lib/CodeGen/CGCUDARuntime.cpp
+++ b/clang/lib/CodeGen/CGCUDARuntime.cpp
@@ -22,6 +22,116 @@ using namespace CodeGen;
CGCUDARuntime::~CGCUDARuntime() {}
+static llvm::Value *emitGetParamBuf(CodeGenFunction &CGF,
+ const CUDAKernelCallExpr *E) {
+ auto *GetParamBuf = CGF.getContext().getcudaGetParameterBufferDecl();
+ const FunctionProtoType *GetParamBufProto =
+ GetParamBuf->getType()->getAs<FunctionProtoType>();
+
+ DeclRefExpr *DRE = DeclRefExpr::Create(
+ CGF.getContext(), {}, {}, GetParamBuf,
+ /*RefersToEnclosingVariableOrCapture=*/false, GetParamBuf->getNameInfo(),
+ GetParamBuf->getType(), VK_PRValue);
+ auto *ImpCast = ImplicitCastExpr::Create(
+ CGF.getContext(), CGF.getContext().getPointerType(GetParamBuf->getType()),
+ CK_FunctionToPointerDecay, DRE, nullptr, VK_PRValue, FPOptionsOverride());
+
+ CGCallee Callee = CGF.EmitCallee(ImpCast);
+ CallArgList Args;
+ // Use 64B alignment.
+ Args.add(RValue::get(CGF.CGM.getSize(CharUnits::fromQuantity(64))),
+ CGF.getContext().getSizeType());
+ // Calculate parameter sizes.
+ const PointerType *PT = E->getCallee()->getType()->getAs<PointerType>();
+ const FunctionProtoType *FTP =
+ PT->getPointeeType()->getAs<FunctionProtoType>();
+ CharUnits Offset = CharUnits::Zero();
+ for (auto ArgTy : FTP->getParamTypes()) {
+ auto TInfo = CGF.CGM.getContext().getTypeInfoInChars(ArgTy);
+ Offset = Offset.alignTo(TInfo.Align);
+ Offset += TInfo.Width;
+ }
+ Args.add(RValue::get(CGF.CGM.getSize(Offset)),
+ CGF.getContext().getSizeType());
+ const CGFunctionInfo &CallInfo = CGF.CGM.getTypes().arrangeFreeFunctionCall(
+ Args, GetParamBufProto, /*ChainCall=*/false);
+ auto Ret = CGF.EmitCall(CallInfo, Callee, /*ReturnValue=*/{}, Args);
+
+ return Ret.getScalarVal();
+}
+
+RValue CGCUDARuntime::EmitCUDADeviceKernelCallExpr(
+ CodeGenFunction &CGF, const CUDAKernelCallExpr *E,
+ ReturnValueSlot ReturnValue, llvm::CallBase **CallOrInvoke) {
+ ASTContext &Ctx = CGM.getContext();
+ assert(Ctx.getcudaLaunchDeviceDecl() == E->getConfig()->getDirectCallee());
+
+ llvm::BasicBlock *ConfigOKBlock = CGF.createBasicBlock("dkcall.configok");
+ llvm::BasicBlock *ContBlock = CGF.createBasicBlock("dkcall.end");
+
+ llvm::Value *Config = emitGetParamBuf(CGF, E);
+ CGF.Builder.CreateCondBr(
+ CGF.Builder.CreateICmpNE(Config,
+ llvm::Constant::getNullValue(Config->getType())),
+ ConfigOKBlock, ContBlock);
+
+ CodeGenFunction::ConditionalEvaluation eval(CGF);
+
+ eval.begin(CGF);
+ CGF.EmitBlock(ConfigOKBlock);
+
+ QualType KernelCalleeFuncTy =
+ E->getCallee()->getType()->getAs<PointerType>()->getPointeeType();
+ CGCallee KernelCallee = CGF.EmitCallee(E->getCallee());
+ // Emit kernel arguments.
+ CallArgList KernelCallArgs;
+ CGF.EmitCallArgs(
+ KernelCallArgs,
+ dyn_cast<FunctionProtoType>(KernelCalleeFuncTy->castAs<FunctionType>()),
+ E->arguments(), E->getDirectCallee());
+ // Copy emitted kernel arguments into that parameter buffer.
+ RawAddress CfgBase(Config, CGM.Int8Ty,
+ /*Alignment=*/CharUnits::fromQuantity(64));
+ CharUnits Offset = CharUnits::Zero();
+ for (auto &Arg : KernelCallArgs) {
+ auto TInfo = CGM.getContext().getTypeInfoInChars(Arg.getType());
+ Offset = Offset.alignTo(TInfo.Align);
+ Address Addr =
+ CGF.Builder.CreateConstInBoundsGEP(CfgBase, Offset.getQuantity());
+ Arg.copyInto(CGF, Addr);
+ Offset += TInfo.Width;
+ }
+ // Make `cudaLaunchDevice` call, i.e. E->getConfig().
+ const CallExpr *LaunchCall = E->getConfig();
+ QualType LaunchCalleeFuncTy = LaunchCall->getCallee()
+ ->getType()
+ ->getAs<PointerType>()
+ ->getPointeeType();
+ CGCallee LaunchCallee = CGF.EmitCallee(LaunchCall->getCallee());
+ CallArgList LaunchCallArgs;
+ CGF.EmitCallArgs(
+ LaunchCallArgs,
+ dyn_cast<FunctionProtoType>(LaunchCalleeFuncTy->castAs<FunctionType>()),
+ LaunchCall->arguments(), LaunchCall->getDirectCallee());
+ // Replace func and parameter buffer arguments.
+ LaunchCallArgs[0] = CallArg(RValue::get(KernelCallee.getFunctionPointer()),
+ CGM.getContext().VoidPtrTy);
+ LaunchCallArgs[1] = CallArg(RValue::get(Config), CGM.getContext().VoidPtrTy);
+ const CGFunctionInfo &LaunchCallInfo = CGM.getTypes().arrangeFreeFunctionCall(
+ LaunchCallArgs,
+ dyn_cast<FunctionProtoType>(LaunchCalleeFuncTy->castAs<FunctionType>()),
+ /*ChainCall=*/false);
+ CGF.EmitCall(LaunchCallInfo, LaunchCallee, ReturnValue, LaunchCallArgs,
+ CallOrInvoke,
+ /*IsMustTail=*/false, E->getExprLoc());
+ CGF.EmitBranch(ContBlock);
+
+ CGF.EmitBlock(ContBlock);
+ eval.end(CGF);
+
+ return RValue::get(nullptr);
+}
+
RValue CGCUDARuntime::EmitCUDAKernelCallExpr(CodeGenFunction &CGF,
const CUDAKernelCallExpr *E,
ReturnValueSlot ReturnValue,
diff --git a/clang/lib/CodeGen/CGCUDARuntime.h b/clang/lib/CodeGen/CGCUDARuntime.h
index 86f776004ee7c..64fb9a31422e0 100644
--- a/clang/lib/CodeGen/CGCUDARuntime.h
+++ b/clang/lib/CodeGen/CGCUDARuntime.h
@@ -88,6 +88,10 @@ class CGCUDARuntime {
ReturnValueSlot ReturnValue,
llvm::CallBase **CallOrInvoke = nullptr);
+ virtual RValue EmitCUDADeviceKernelCallExpr(
+ CodeGenFunction &CGF, const CUDAKernelCallExpr *E,
+ ReturnValueSlot ReturnValue, llvm::CallBase **CallOrInvoke = nullptr);
+
/// Emits a kernel launch stub.
virtual void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) = 0;
diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp
index 14d8db32bafc6..0c01933790100 100644
--- a/clang/lib/CodeGen/CGExprCXX.cpp
+++ b/clang/lib/CodeGen/CGExprCXX.cpp
@@ -503,6 +503,12 @@ RValue CodeGenFunction::EmitCXXOperatorMemberCallExpr(
RValue CodeGenFunction::EmitCUDAKernelCallExpr(const CUDAKernelCallExpr *E,
ReturnValueSlot ReturnValue,
llvm::CallBase **CallOrInvoke) {
+ auto *FD = E->getConfig()->getDirectCallee();
+ // Emit as a device kernel call if the config is prepared using
+ // 'cudaGetParameterBuffer'.
+ if (FD && CGM.getContext().getcudaLaunchDeviceDecl() == FD)
+ return CGM.getCUDARuntime().EmitCUDADeviceKernelCallExpr(
+ *this, E, ReturnValue, CallOrInvoke);
return CGM.getCUDARuntime().EmitCUDAKernelCallExpr(*this, E, ReturnValue,
CallOrInvoke);
}
diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp
index 31735a0f5feb3..a60a32dcb9e4c 100644
--- a/clang/lib/Sema/SemaCUDA.cpp
+++ b/clang/lib/Sema/SemaCUDA.cpp
@@ -52,16 +52,85 @@ bool SemaCUDA::PopForceHostDevice() {
ExprResult SemaCUDA::ActOnExecConfigExpr(Scope *S, SourceLocation LLLLoc,
MultiExprArg ExecConfig,
SourceLocation GGGLoc) {
- FunctionDecl *ConfigDecl = getASTContext().getcudaConfigureCallDecl();
+ bool IsDeviceKernelCall = false;
+ switch (CurrentTarget()) {
+ case CUDAFunctionTarget::Global:
+ case CUDAFunctionTarget::Device:
+ IsDeviceKernelCall = true;
+ break;
+ case CUDAFunctionTarget::HostDevice:
+ if (getLangOpts().CUDAIsDevice) {
+ // Under the device compilation, config call under an HD function should
+ // be treated as a device kernel call. But, for implicit HD ones (such as
+ // lambdas), need to check whether RDC is enabled or not.
+ IsDeviceKernelCall = true;
+ if (!getLangOpts().GPURelocatableDeviceCode) {
+ FunctionDecl *Caller = SemaRef.getCurFunctionDecl(/*AllowLambda=*/true);
+ if (Caller && isImplicitHostDeviceFunction(Caller))
+ IsDeviceKernelCall = false;
+ }
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (IsDeviceKernelCall && !getLangOpts().GPURelocatableDeviceCode)
+ return ExprError(
+ Diag(LLLLoc, diag::err_cuda_device_kernel_launch_require_rdc));
+
+ FunctionDecl *ConfigDecl = IsDeviceKernelCall
+ ? getASTContext().getcudaLaunchDeviceDecl()
+ : getASTContext().getcudaConfigureCallDecl();
if (!ConfigDecl)
return ExprError(Diag(LLLLoc, diag::err_undeclared_var_use)
- << getConfigureFuncName());
+ << (IsDeviceKernelCall ? getLaunchDeviceFuncName()
+ : getConfigureFuncName()));
+ // Additional check on the launch function if it's a device kernel call.
+ if (IsDeviceKernelCall) {
+ auto *GetParamBuf = getASTContext().getcudaGetParameterBufferDecl();
+ if (!GetParamBuf)
+ return ExprError(Diag(LLLLoc, diag::err_undeclared_var_use)
+ << getGetParameterBufferFuncName());
+ }
+
QualType ConfigQTy = ConfigDecl->getType();
DeclRefExpr *ConfigDR = new (getASTContext()) DeclRefExpr(
getASTContext(), ConfigDecl, false, ConfigQTy, VK_LValue, LLLLoc);
SemaRef.MarkFunctionReferenced(LLLLoc, ConfigDecl);
+ if (IsDeviceKernelCall) {
+ SmallVector<Expr *> Args;
+ // Use a null pointer as the kernel function, which may not be resolvable
+ // here. For example, resolving that kernel function may need additional
+ // kernel arguments.
+ llvm::APInt Zero(SemaRef.Context.getTypeSize(SemaRef.Context.IntTy), 0);
+ Args.push_back(IntegerLiteral::Create(SemaRef.Context, Zero,
+ SemaRef.Context.IntTy, LLLLoc));
+ // Use a null pointer as the parameter buffer, which should be allocated in
+ // the codegen.
+ Args.push_back(IntegerLiteral::Create(SemaRef.Context, Zero,
+ SemaRef.Context.IntTy, LLLLoc));
+ // Add the original config arguments.
+ llvm::append_range(Args, ExecConfig);
+ // Add the default blockDim if it's missing.
+ if (Args.size() < 4) {
+ llvm::APInt One(SemaRef.Context.getTypeSize(SemaRef.Context.IntTy), 1);
+ Args.push_back(IntegerLiteral::Create(SemaRef.Context, One,
+ SemaRef.Context.IntTy, LLLLoc));
+ }
+ // Add the default sharedMemSize if it's missing.
+ if (Args.size() < 5)
+ Args.push_back(IntegerLiteral::Create(SemaRef.Context, Zero,
+ SemaRef.Context.IntTy, LLLLoc));
+ // Add the default stream if it's missing.
+ if (Args.size() < 6)
+ Args.push_back(IntegerLiteral::Create(SemaRef.Context, Zero,
+ SemaRef.Context.IntTy, LLLLoc));
+ return SemaRef.BuildCallExpr(S, ConfigDR, LLLLoc, Args, GGGLoc, nullptr,
+ /*IsExecConfig=*/true);
+ }
return SemaRef.BuildCallExpr(S, ConfigDR, LLLLoc, ExecConfig, GGGLoc, nullptr,
/*IsExecConfig=*/true);
}
@@ -251,7 +320,7 @@ SemaCUDA::IdentifyPreference(const FunctionDecl *Caller,
if (CalleeTarget == CUDAFunctionTarget::Global &&
(CallerTarget == CUDAFunctionTarget::Global ||
CallerTarget == CUDAFunctionTarget::Device))
- return CFP_Never;
+ return CFP_Native;
// (b) Calling HostDevice is OK for everyone.
if (CalleeTarget == CUDAFunctionTarget::HostDevice)
@@ -279,7 +348,8 @@ SemaCUDA::IdentifyPreference(const FunctionDecl *Caller,
if (CallerTarget == CUDAFunctionTarget::HostDevice) {
// It's OK to call a compilation-mode matching function from an HD one.
if ((getLangOpts().CUDAIsDevice &&
- CalleeTarget == CUDAFunctionTarget::Device) ||
+ (CalleeTarget == CUDAFunctionTarget::Device ||
+ CalleeTarget == CUDAFunctionTarget::Global)) ||
(!getLangOpts().CUDAIsDevice &&
(CalleeTarget == CUDAFunctionTarget::Host ||
CalleeTarget == CUDAFunctionTarget::Global)))
@@ -1103,6 +1173,18 @@ std::string SemaCUDA::getConfigureFuncName() const {
return "cudaConfigureCall";
}
+std::string SemaCUDA::getGetParameterBufferFuncName() const {
+ // FIXME: Use the API from CUDA programming guide. Add V2 support when
+ // necessary.
+ return "cudaGetParameterBuffer";
+}
+
+std::string SemaCUDA::getLaunchDeviceFuncName() const {
+ // FIXME: Use the API from CUDA programming guide. Add V2 support when
+ // necessary.
+ return "cudaLaunchDevice";
+}
+
// Record any local constexpr variables that are passed one way on the host
// and another on the device.
void SemaCUDA::recordPotentialODRUsedVariable(
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index fc3aabf5741ca..1e39bfb5e42cd 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -11050,14 +11050,30 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC,
}
if (getLangOpts().CUDA) {
- IdentifierInfo *II = NewFD->getIdentifier();
- if (II && II->isStr(CUDA().getConfigureFuncName()) &&
- !NewFD->isInvalidDecl() &&
- NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
- if (!R->castAs<FunctionType>()->getReturnType()->isScalarType())
- Diag(NewFD->getLocation(), diag::err_config_scalar_return)
- << CUDA().getConfigureFuncName();
- Context.setcudaConfigureCallDecl(NewFD);
+ if (IdentifierInfo *II = NewFD->getIdentifier()) {
+ if (II->isStr(CUDA().getConfigureFuncName()) && !NewFD->isInvalidDecl() &&
+ NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
+ if (!R->castAs<FunctionType>()->getReturnType()->isScalarType())
+ Diag(NewFD->getLocation(), diag::err_config_scalar_return)
+ << CUDA().getConfigureFuncName();
+ Context.setcudaConfigureCallDecl(NewFD);
+ }
+ if (II->isStr(CUDA().getGetParameterBufferFuncName()) &&
+ !NewFD->isInvalidDecl() &&
+ NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
+ if (!R->castAs<FunctionType>()->getReturnType()->isPointerType())
+ Diag(NewFD->getLocation(), diag::err_config_pointer_return)
+ << CUDA().getConfigureFuncName();
+ Context.setcudaGetParameterBufferDecl(NewFD);
+ }
+ if (II->isStr(CUDA().getLaunchDeviceFuncName()) &&
+ !NewFD->isInvalidDecl() &&
+ NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
+ if (!R->castAs<FunctionType>()->getReturnType()->isScalarType())
+ Diag(NewFD->getLocation(), diag::err_config_scalar_return)
+ << CUDA().getConfigureFuncName();
+ Context.setcudaLaunchDeviceDecl(NewFD);
+ }
}
}
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index c1b5cb730e4a4..e415d5816ab01 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -5588,9 +5588,13 @@ void ASTReader::InitializeContext() {
// If there were any CUDA special declarations, deserialize them.
if (!CUDASpecialDeclRefs.empty()) {
- assert(CUDASpecialDeclRefs.size() == 1 && "More decl refs than expected!");
+ assert(CUDASpecialDeclRefs.size() == 3 && "More decl refs than expected!");
Context.setcudaConfigureCallDecl(
- cast<FunctionDecl>(GetDecl(CUDASpecialDeclRefs[0])));
+ cast_or_null<FunctionDecl>(GetDecl(CUDASpecialDeclRefs[0])));
+ Context.setcudaGetParameterBufferDecl(
+ cast_or_null<FunctionDecl>(GetDecl(CUDASpecialDeclRefs[1])));
+ Context.setcudaLaunchDeviceDecl(
+ cast_or_null<FunctionDecl>(GetDecl(CUDASpecialDeclRefs[2])));
}
// Re-export any modules that were imported by a non-module AST file.
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 377e3966874f3..8e527db972fb0 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -5714,8 +5714,13 @@ void ASTWriter::PrepareWritingSpecialDecls(Sema &Sem...
[truncated]
|
|
I should mention that the generated code sequence differs from the one produced by CUDA 12.9. The latter uses |
bb68180 to
a5320cb
Compare
01eb84a to
b0cdc5e
Compare
- CUDA's dynamic parallelism extension allows device-side kernel
launches, which share the identical syntax to host-side launches,
e.g.,
kernel<<<Dg, Db, Ns, S>>>(arguments);
but differ in code generation. Such a device-side kernel launch
is eventually translated into the following sequence
config = cudaGetParameterBuffer(alignment, size);
// setup arguments by copying them into `config`.
cudaLaunchDevice(func, config, Dg, Db, Ns, S);
- To support the device-side kernel launch, 'CUDAKernelCallExpr' is
reused but its config expr is set to a call to 'cudaLaunchDevice'.
During the code generation, 'CUDAKernelCallExpr' is expanded into the
sequence aforementioned.
- As the device-side kernel launch requires the source to be compiled as
  relocatable device code and linked with '-lcudadevrt', linkers are
  changed to pass the relevant link options to 'nvlink'.
b0cdc5e to
7f4de97
Compare
|
rebase |
CUDA's dynamic parallelism extension allows device-side kernel launches, which share the identical syntax to host-side launches, e.g.,
kernel<<<Dg, Db, Ns, S>>>(arguments);
but differ in code generation. Such a device-side kernel launch is eventually translated into the following sequence
config = cudaGetParameterBuffer(alignment, size);
// setup arguments by copying them into `config`.
cudaLaunchDevice(func, config, Dg, Db, Ns, S);
To support the device-side kernel launch, 'CUDAKernelCallExpr' is reused but its config expr is set to a call to 'cudaLaunchDevice'. During the code generation, 'CUDAKernelCallExpr' is expanded into the sequence aforementioned.
As the device-side kernel launch requires the source to be compiled as relocatable device code and linked with '-lcudadevrt', linkers are changed to pass the relevant link options to 'nvlink'.