[HIP] Add hip input kind and codegen for kernel launching
HIP is a language similar to CUDA (https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_kernel_language.md).
The kernel language syntax is close enough to CUDA's that Clang can compile a HIP program as a CUDA program. The main difference
is the host API: HIP defines a set of vendor-neutral host APIs that can be implemented on different platforms. Currently there is an
open-source implementation of the HIP runtime for the amdgpu target (https://github.com/ROCm-Developer-Tools/HIP).
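For illustration, a minimal HIP program could look like the sketch below (not part of this patch; hipMalloc, hipFree, hipDeviceSynchronize, and the hip_runtime.h header come from the HIP runtime linked above):

    // Sketch: the kernel language matches CUDA; only the host API is hip-prefixed.
    #include <hip/hip_runtime.h>

    __global__ void axpy(float a, const float *x, float *y) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      y[i] = a * x[i] + y[i];
    }

    int main() {
      float *x = nullptr, *y = nullptr;
      hipMalloc((void **)&x, 256 * sizeof(float)); // hip* host API instead of cuda*
      hipMalloc((void **)&y, 256 * sizeof(float));
      axpy<<<1, 256>>>(2.0f, x, y);                // same launch syntax as CUDA
      hipDeviceSynchronize();
      hipFree(x);
      hipFree(y);
      return 0;
    }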

This patch adds support for the HIP input kind and the 'hip' language standard.

When a HIP file is compiled, both LangOpts.CUDA and LangOpts.HIP are turned on. This allows a HIP program to be compiled as CUDA
in most cases; where HIP needs special handling, LangOpts.HIP is checked.
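The resulting guard pattern is sketched below (illustrative, not a literal excerpt from the patch):

    // HIP reuses the CUDA paths; HIP-only behavior is keyed off LangOpts.HIP.
    if (LangOpts.CUDA) {   // true for both -x cuda and -x hip
      if (LangOpts.HIP)
        ; // HIP-specific handling, e.g. hip-prefixed runtime entry points
      else
        ; // CUDA-only handling, e.g. predefining __CUDA__
    }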

This patch also adds support for launching the kernels of a HIP program through the HIP host API.
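Conceptually, the host stub for a kernel launch now expands to the hip-prefixed call sequence below (a sketch; grid, block, arg, and kernel_stub are placeholders):

    // For kernel<<<grid, block>>>(arg) under -x hip, codegen emits roughly:
    hipConfigureCall(grid, block, /*sharedMem=*/0, /*stream=*/0);
    hipSetupArgument(&arg, sizeof(arg), /*offset=*/0); // repeated per argument
    hipLaunchByPtr((char *)kernel_stub);               // instead of cudaLaunch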

When -x hip is not specified, there is no behaviour change for CUDA.

Patch by Greg Rodgers.
Revised and lit test added by Yaxun Liu.

Differential Revision: https://reviews.llvm.org/D44984

llvm-svn: 330790
yxsamliu committed Apr 25, 2018
1 parent 7282d32 commit 887c569
Showing 13 changed files with 151 additions and 67 deletions.
6 changes: 6 additions & 0 deletions clang/include/clang/Basic/IdentifierTable.h
@@ -98,6 +98,12 @@ class IdentifierInfo {
memcmp(getNameStart(), Str, StrLen-1) == 0;
}

/// \brief Return true if this is the identifier for the specified StringRef.
bool isStr(llvm::StringRef Str) const {
llvm::StringRef ThisStr(getNameStart(), getLength());
return ThisStr == Str;
}
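// Note: the preexisting isStr above is a template over string literals with
// compile-time length; this StringRef overload accepts names selected at
// run time, as this patch needs in SemaDecl, e.g.:
//   II->isStr(getLangOpts().HIP ? "hipConfigureCall" : "cudaConfigureCall")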

/// \brief Return the beginning of the actual null-terminated string for this
/// identifier.
const char *getNameStart() const {
1 change: 1 addition & 0 deletions clang/include/clang/Basic/LangOptions.def
@@ -195,6 +195,7 @@ LANGOPT(NativeHalfType , 1, 0, "Native half type support")
LANGOPT(NativeHalfArgsAndReturns, 1, 0, "Native half args and returns")
LANGOPT(HalfArgsAndReturns, 1, 0, "half args and returns")
LANGOPT(CUDA , 1, 0, "CUDA")
LANGOPT(HIP , 1, 0, "HIP")
LANGOPT(OpenMP , 32, 0, "OpenMP support and version of OpenMP (31, 40 or 45)")
LANGOPT(OpenMPSimd , 1, 0, "Use SIMD only OpenMP support.")
LANGOPT(OpenMPUseTLS , 1, 0, "Use TLS for threadprivates or runtime calls")
1 change: 1 addition & 0 deletions clang/include/clang/Frontend/FrontendOptions.h
@@ -161,6 +161,7 @@ class InputKind {
OpenCL,
CUDA,
RenderScript,
HIP,
///@}
};

4 changes: 4 additions & 0 deletions clang/include/clang/Frontend/LangStandards.def
@@ -168,6 +168,10 @@ LANGSTANDARD_ALIAS_DEPR(opencl20, "CL2.0")
LANGSTANDARD(cuda, "cuda", CUDA, "NVIDIA CUDA(tm)",
LineComment | CPlusPlus | Digraphs)

// HIP
LANGSTANDARD(hip, "hip", HIP, "HIP",
LineComment | CPlusPlus | Digraphs)

#undef LANGSTANDARD
#undef LANGSTANDARD_ALIAS
#undef LANGSTANDARD_ALIAS_DEPR
58 changes: 42 additions & 16 deletions clang/lib/CodeGen/CGCUDANV.cpp
@@ -55,6 +55,8 @@ class CGNVCUDARuntime : public CGCUDARuntime {
llvm::FunctionType *getRegisterGlobalsFnTy() const;
llvm::FunctionType *getCallbackFnTy() const;
llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
std::string addPrefixToName(StringRef FuncName) const;
std::string addUnderscoredPrefixToName(StringRef FuncName) const;

/// Creates a function to register all kernel stubs generated in this module.
llvm::Function *makeRegisterGlobalsFn();
@@ -114,6 +116,18 @@ class CGNVCUDARuntime : public CGCUDARuntime {

}

std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
if (CGM.getLangOpts().HIP)
return ((Twine("hip") + Twine(FuncName)).str());
return ((Twine("cuda") + Twine(FuncName)).str());
}
std::string
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
if (CGM.getLangOpts().HIP)
return ((Twine("__hip") + Twine(FuncName)).str());
return ((Twine("__cuda") + Twine(FuncName)).str());
}
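// For example (illustrative): addPrefixToName("SetupArgument") yields
// "hipSetupArgument" for HIP and "cudaSetupArgument" otherwise;
// addUnderscoredPrefixToName("RegisterFatBinary") yields
// "__hipRegisterFatBinary" vs. "__cudaRegisterFatBinary".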

CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
: CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
TheModule(CGM.getModule()),
@@ -133,15 +147,21 @@ CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
llvm::Constant *CGNVCUDARuntime::getSetupArgumentFn() const {
// cudaError_t cudaSetupArgument(void *, size_t, size_t)
llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy};
- return CGM.CreateRuntimeFunction(llvm::FunctionType::get(IntTy,
-                                  Params, false),
-                                  "cudaSetupArgument");
+ return CGM.CreateRuntimeFunction(
+     llvm::FunctionType::get(IntTy, Params, false),
+     addPrefixToName("SetupArgument"));
}

llvm::Constant *CGNVCUDARuntime::getLaunchFn() const {
- // cudaError_t cudaLaunch(char *)
- return CGM.CreateRuntimeFunction(
-     llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
+ if (CGM.getLangOpts().HIP) {
+   // hipError_t hipLaunchByPtr(char *);
+   return CGM.CreateRuntimeFunction(
+       llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr");
+ } else {
+   // cudaError_t cudaLaunch(char *);
+   return CGM.CreateRuntimeFunction(
+       llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
+ }
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
@@ -222,7 +242,7 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {

llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
"__cuda_register_globals", &TheModule);
addUnderscoredPrefixToName("_register_globals"), &TheModule);
llvm::BasicBlock *EntryBB =
llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
CGBuilderTy Builder(CGM, Context);
@@ -235,7 +255,7 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
VoidPtrTy, VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
llvm::Constant *RegisterFunc = CGM.CreateRuntimeFunction(
llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
"__cudaRegisterFunction");
addUnderscoredPrefixToName("RegisterFunction"));

// Extract GpuBinaryHandle passed as the first argument passed to
// __cuda_register_globals() and generate __cudaRegisterFunction() call for
@@ -259,7 +279,7 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
IntTy, IntTy};
llvm::Constant *RegisterVar = CGM.CreateRuntimeFunction(
llvm::FunctionType::get(IntTy, RegisterVarParams, false),
"__cudaRegisterVar");
addUnderscoredPrefixToName("RegisterVar"));
for (auto &Pair : DeviceVars) {
llvm::GlobalVariable *Var = Pair.first;
unsigned Flags = Pair.second;
@@ -305,7 +325,7 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
// void ** __cudaRegisterFatBinary(void *);
llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction(
llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
"__cudaRegisterFatBinary");
addUnderscoredPrefixToName("RegisterFatBinary"));
// struct { int magic, int version, void * gpu_binary, void * dont_care };
llvm::StructType *FatbinWrapperTy =
llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy);
@@ -324,7 +344,8 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {

llvm::Function *ModuleCtorFunc = llvm::Function::Create(
llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
llvm::GlobalValue::InternalLinkage, "__cuda_module_ctor", &TheModule);
llvm::GlobalValue::InternalLinkage,
addUnderscoredPrefixToName("_module_ctor"), &TheModule);
llvm::BasicBlock *CtorEntryBB =
llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
CGBuilderTy CtorBuilder(CGM, Context);
@@ -357,7 +378,7 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
// Unused in fatbin v1.
Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
"__cuda_fatbin_wrapper", CGM.getPointerAlign(),
addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
/*constant*/ true);
FatbinWrapper->setSection(FatbinSectionName);

@@ -370,7 +391,9 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
GpuBinaryHandle = new llvm::GlobalVariable(
TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
llvm::ConstantPointerNull::get(VoidPtrPtrTy),
addUnderscoredPrefixToName("_gpubin_handle"));

CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
CGM.getPointerAlign());

@@ -392,7 +415,8 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {

// void __cudaRegisterLinkedBinary%NVModuleID%(void (*)(void *), void *,
// void *, void (*)(void **))
SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
SmallString<128> RegisterLinkedBinaryName(
addUnderscoredPrefixToName("RegisterLinkedBinary"));
RegisterLinkedBinaryName += NVModuleID;
llvm::Constant *RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);
@@ -424,11 +448,13 @@ llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
// void __cudaUnregisterFatBinary(void ** handle);
llvm::Constant *UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
"__cudaUnregisterFatBinary");
addUnderscoredPrefixToName("UnregisterFatBinary"));

llvm::Function *ModuleDtorFunc = llvm::Function::Create(
llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
llvm::GlobalValue::InternalLinkage, "__cuda_module_dtor", &TheModule);
llvm::GlobalValue::InternalLinkage,
addUnderscoredPrefixToName("_module_dtor"), &TheModule);

llvm::BasicBlock *DtorEntryBB =
llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
CGBuilderTy DtorBuilder(CGM, Context);
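With these hooks in place, the registration code generated for HIP references hip-prefixed entry points throughout; a rough C++-level sketch of the emitted module constructor and destructor (symbol names follow the prefix helpers above; signatures and the wrapper layout are simplified, not compilable against a real runtime):

    // Sketch of codegen output for -x hip:
    extern "C" void **__hipRegisterFatBinary(void *wrapper);
    extern "C" void __hipUnregisterFatBinary(void **handle);
    static void __hip_register_globals(void **handle); // emits __hipRegisterFunction
                                                       // and __hipRegisterVar calls
    static struct { int magic, version; void *binary, *unused; } __hip_fatbin_wrapper;
    static void **__hip_gpubin_handle;

    static void __hip_module_ctor(void *) {
      __hip_gpubin_handle = __hipRegisterFatBinary(&__hip_fatbin_wrapper);
      __hip_register_globals(__hip_gpubin_handle);
    }
    static void __hip_module_dtor(void *) {
      __hipUnregisterFatBinary(__hip_gpubin_handle);
    }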
13 changes: 12 additions & 1 deletion clang/lib/Frontend/CompilerInvocation.cpp
@@ -1608,6 +1608,7 @@ static InputKind ParseFrontendArgs(FrontendOptions &Opts, ArgList &Args,
.Case("c", InputKind::C)
.Case("cl", InputKind::OpenCL)
.Case("cuda", InputKind::CUDA)
.Case("hip", InputKind::HIP)
.Case("c++", InputKind::CXX)
.Case("objective-c", InputKind::ObjC)
.Case("objective-c++", InputKind::ObjCXX)
@@ -1887,6 +1888,9 @@ void CompilerInvocation::setLangDefaults(LangOptions &Opts, InputKind IK,
case InputKind::RenderScript:
LangStd = LangStandard::lang_c99;
break;
case InputKind::HIP:
LangStd = LangStandard::lang_hip;
break;
}
}

@@ -1934,7 +1938,8 @@ void CompilerInvocation::setLangDefaults(LangOptions &Opts, InputKind IK,
}
}

- Opts.CUDA = IK.getLanguage() == InputKind::CUDA;
+ Opts.HIP = IK.getLanguage() == InputKind::HIP;
+ Opts.CUDA = IK.getLanguage() == InputKind::CUDA || Opts.HIP;
if (Opts.CUDA)
// Set default FP_CONTRACT to FAST.
Opts.setDefaultFPContractMode(LangOptions::FPC_Fast);
@@ -2005,6 +2010,10 @@ static bool IsInputCompatibleWithStandard(InputKind IK,
return S.getLanguage() == InputKind::CUDA ||
S.getLanguage() == InputKind::CXX;

case InputKind::HIP:
return S.getLanguage() == InputKind::CXX ||
S.getLanguage() == InputKind::HIP;

case InputKind::Asm:
// Accept (and ignore) all -std= values.
// FIXME: The -std= value is not ignored; it affects the tokenization
@@ -2032,6 +2041,8 @@ static const StringRef GetInputKindName(InputKind IK) {
return "CUDA";
case InputKind::RenderScript:
return "RenderScript";
case InputKind::HIP:
return "HIP";

case InputKind::Asm:
return "Asm";
1 change: 1 addition & 0 deletions clang/lib/Frontend/FrontendActions.cpp
@@ -733,6 +733,7 @@ void PrintPreambleAction::ExecuteAction() {
case InputKind::ObjCXX:
case InputKind::OpenCL:
case InputKind::CUDA:
case InputKind::HIP:
break;

case InputKind::Unknown:
4 changes: 3 additions & 1 deletion clang/lib/Frontend/InitPreprocessor.cpp
@@ -471,8 +471,10 @@ static void InitializeStandardPredefinedMacros(const TargetInfo &TI,
// Not "standard" per se, but available even with the -undef flag.
if (LangOpts.AsmPreprocessor)
Builder.defineMacro("__ASSEMBLER__");
- if (LangOpts.CUDA)
+ if (LangOpts.CUDA && !LangOpts.HIP)
Builder.defineMacro("__CUDA__");
+ if (LangOpts.HIP)
+ Builder.defineMacro("__HIP__");
}

/// Initialize the predefined C++ language feature test macros defined in
5 changes: 3 additions & 2 deletions clang/lib/Sema/SemaCUDA.cpp
@@ -42,8 +42,9 @@ ExprResult Sema::ActOnCUDAExecConfigExpr(Scope *S, SourceLocation LLLLoc,
SourceLocation GGGLoc) {
FunctionDecl *ConfigDecl = Context.getcudaConfigureCallDecl();
if (!ConfigDecl)
- return ExprError(Diag(LLLLoc, diag::err_undeclared_var_use)
-                  << "cudaConfigureCall");
+ return ExprError(
+     Diag(LLLLoc, diag::err_undeclared_var_use)
+     << (getLangOpts().HIP ? "hipConfigureCall" : "cudaConfigureCall"));
QualType ConfigQTy = ConfigDecl->getType();

DeclRefExpr *ConfigDR = new (Context)
6 changes: 4 additions & 2 deletions clang/lib/Sema/SemaDecl.cpp
@@ -9056,11 +9056,13 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC,

if (getLangOpts().CUDA) {
IdentifierInfo *II = NewFD->getIdentifier();
if (II && II->isStr("cudaConfigureCall") && !NewFD->isInvalidDecl() &&
if (II &&
II->isStr(getLangOpts().HIP ? "hipConfigureCall"
: "cudaConfigureCall") &&
!NewFD->isInvalidDecl() &&
NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
if (!R->getAs<FunctionType>()->getReturnType()->isScalarType())
Diag(NewFD->getLocation(), diag::err_config_scalar_return);

Context.setcudaConfigureCallDecl(NewFD);
}

5 changes: 5 additions & 0 deletions clang/test/CodeGenCUDA/Inputs/cuda.h
@@ -16,7 +16,12 @@ struct dim3 {

typedef struct cudaStream *cudaStream_t;

#ifdef __HIP__
int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
cudaStream_t stream = 0);
#else
int cudaConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
cudaStream_t stream = 0);
#endif

extern "C" __device__ int printf(const char*, ...);
