Skip to content

Commit

Permalink
[HIP] Allow partial linking for -fgpu-rdc (#81700)
Browse files Browse the repository at this point in the history
`-fgpu-rdc` mode allows device functions to call device functions in
different TUs. However, currently all device objects have to be linked
together since only one fat binary is supported. This is time-consuming
for the AMDGPU backend since it only supports LTO.

There are use cases where objects can be divided into groups in which
device functions are self-contained but host functions are not. It is
desirable to link/optimize/codegen the device code and generate a fatbin
for each group, while partially linking the host code with `ld -r` or
generating a static library by using the `--emit-static-lib` option of
clang. This avoids linking all device code together and therefore
decreases the linking time for `-fgpu-rdc`.

Previously, clang emitted an external symbol `__hip_fatbin` for all
objects for `-fgpu-rdc`. With this patch, clang emits a unique external
symbol `__hip_fatbin_{cuid}` for the fat binary for each object. When a
group of objects is linked together to generate a fatbin, the symbols
are merged by alias and point to the same fat binary. Each group has its
own fat binary. One executable or shared library can have multiple fat
binaries. Device linking is done for undefined fat binary symbols only
to avoid repeated linking. `__hip_gpubin_handle` is also uniquified and
merged to avoid repeated registration. Symbol `__hip_cuid_{cuid}` is
introduced to facilitate debugging and tooling.

Fixes: #77018
  • Loading branch information
yxsamliu committed Feb 22, 2024
1 parent cc83927 commit 33a6ce1
Show file tree
Hide file tree
Showing 11 changed files with 469 additions and 50 deletions.
22 changes: 11 additions & 11 deletions clang/lib/CodeGen/CGCUDANV.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -760,10 +760,10 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
// to contain the fat binary but will be populated somewhere else,
// e.g. by lld through link script.
FatBinStr = new llvm::GlobalVariable(
CGM.getModule(), CGM.Int8Ty,
/*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
"__hip_fatbin", nullptr,
llvm::GlobalVariable::NotThreadLocal);
CGM.getModule(), CGM.Int8Ty,
/*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
"__hip_fatbin_" + CGM.getContext().getCUIDHash(), nullptr,
llvm::GlobalVariable::NotThreadLocal);
cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
}

Expand Down Expand Up @@ -816,8 +816,8 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
// thread safety of the loaded program. Therefore we can assume sequential
// execution of constructor functions here.
if (IsHIP) {
auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage :
llvm::GlobalValue::LinkOnceAnyLinkage;
auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage
: llvm::GlobalValue::ExternalLinkage;
llvm::BasicBlock *IfBlock =
llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
llvm::BasicBlock *ExitBlock =
Expand All @@ -826,11 +826,11 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
// of HIP ABI.
GpuBinaryHandle = new llvm::GlobalVariable(
TheModule, PtrTy, /*isConstant=*/false, Linkage,
/*Initializer=*/llvm::ConstantPointerNull::get(PtrTy),
"__hip_gpubin_handle");
if (Linkage == llvm::GlobalValue::LinkOnceAnyLinkage)
GpuBinaryHandle->setComdat(
CGM.getModule().getOrInsertComdat(GpuBinaryHandle->getName()));
/*Initializer=*/
CudaGpuBinary ? llvm::ConstantPointerNull::get(PtrTy) : nullptr,
CudaGpuBinary
? "__hip_gpubin_handle"
: "__hip_gpubin_handle_" + CGM.getContext().getCUIDHash());
GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
// Prevent the weak symbol in different shared libraries being merged.
if (Linkage != llvm::GlobalValue::InternalLinkage)
Expand Down
10 changes: 9 additions & 1 deletion clang/lib/CodeGen/CodeGenModule.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -915,7 +915,15 @@ void CodeGenModule::Release() {
llvm::ConstantArray::get(ATy, UsedArray), "__clang_gpu_used_external");
addCompilerUsedGlobal(GV);
}

if (LangOpts.HIP) {
// Emit a unique ID so that host and device binaries from the same
// compilation unit can be associated.
auto *GV = new llvm::GlobalVariable(
getModule(), Int8Ty, false, llvm::GlobalValue::ExternalLinkage,
llvm::Constant::getNullValue(Int8Ty),
"__hip_cuid_" + getContext().getCUIDHash());
addCompilerUsedGlobal(GV);
}
emitLLVMUsed();
if (SanStats)
SanStats->finish();
Expand Down
40 changes: 38 additions & 2 deletions clang/lib/Driver/OffloadBundler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -588,8 +588,15 @@ class ObjectFileHandler final : public FileHandler {
StringRef Content = *ContentOrErr;

// Copy fat object contents to the output when extracting host bundle.
if (Content.size() == 1u && Content.front() == 0)
Content = StringRef(Input.getBufferStart(), Input.getBufferSize());
std::string ModifiedContent;
if (Content.size() == 1u && Content.front() == 0) {
auto HostBundleOrErr = getHostBundle();
if (!HostBundleOrErr)
return HostBundleOrErr.takeError();

ModifiedContent = std::move(*HostBundleOrErr);
Content = ModifiedContent;
}

OS.write(Content.data(), Content.size());
return Error::success();
Expand Down Expand Up @@ -692,6 +699,35 @@ class ObjectFileHandler final : public FileHandler {
}
return Error::success();
}

/// Produce the host bundle by stripping the offload bundle sections from the
/// first input file.
///
/// Invokes objcopy (via executeObjcopy) with
/// `--regex --remove-section=__CLANG_OFFLOAD_BUNDLE__.*` on
/// BundlerConfig.InputFileNames.front(), writing the result to a temporary
/// file, and then reads that file back into memory.
///
/// \returns the contents of the stripped object file, or an error if the
/// temporary file cannot be created, executeObjcopy reports an error, or the
/// resulting file cannot be read back.
Expected<std::string> getHostBundle() {
  // NOTE(review): TempFileHandlerRAII presumably deletes the temporary file
  // when it goes out of scope, which is why the content is copied into a
  // std::string before returning -- confirm against its definition.
  TempFileHandlerRAII TempFiles;

  auto ModifiedObjPathOrErr = TempFiles.Create(std::nullopt);
  if (!ModifiedObjPathOrErr)
    return ModifiedObjPathOrErr.takeError();
  StringRef ModifiedObjPath = *ModifiedObjPathOrErr;

  // NOTE(review): this reads the first input file from disk and assumes
  // BundlerConfig.InputFileNames is non-empty -- verify at the call site.
  SmallVector<StringRef, 16> ObjcopyArgs{"llvm-objcopy"};
  ObjcopyArgs.push_back("--regex");
  ObjcopyArgs.push_back("--remove-section=__CLANG_OFFLOAD_BUNDLE__.*");
  // "--" ends option parsing so the file names are never taken as options.
  ObjcopyArgs.push_back("--");
  ObjcopyArgs.push_back(BundlerConfig.InputFileNames.front());
  ObjcopyArgs.push_back(ModifiedObjPath);

  if (Error Err = executeObjcopy(BundlerConfig.ObjcopyPath, ObjcopyArgs))
    // Error is move-only; moving it converts it into the Expected result.
    return std::move(Err);

  auto BufOrErr = MemoryBuffer::getFile(ModifiedObjPath);
  if (!BufOrErr)
    return createStringError(BufOrErr.getError(),
                             "Failed to read back the modified object file");

  // Copy the buffer out so the result does not refer to the MemoryBuffer,
  // which is destroyed when this function returns.
  return BufOrErr->get()->getBuffer().str();
}
};

/// Handler for text files. The bundled file will have the following format.
Expand Down

0 comments on commit 33a6ce1

Please sign in to comment.