Skip to content

Commit

Permalink
[CUDA] Include single GPU binary, NFCI.
Browse files Browse the repository at this point in the history
Binaries for multiple architectures are combined into a single fatbin by
fatbinary, so the existing support for including multiple GPU binaries on the
host side was effectively not needed.

Differential Revision: https://reviews.llvm.org/D43461

llvm-svn: 326342
  • Loading branch information
hahnjo committed Feb 28, 2018
1 parent 9f9e468 commit e768132
Show file tree
Hide file tree
Showing 5 changed files with 87 additions and 93 deletions.
7 changes: 3 additions & 4 deletions clang/include/clang/Frontend/CodeGenOptions.h
Original file line number Diff line number Diff line change
Expand Up @@ -205,10 +205,9 @@ class CodeGenOptions : public CodeGenOptionsBase {
/// the summary and module symbol table (and not, e.g. any debug metadata).
std::string ThinLinkBitcodeFile;

/// A list of file names passed with -fcuda-include-gpubinary options to
/// forward to CUDA runtime back-end for incorporating them into host-side
/// object file.
std::vector<std::string> CudaGpuBinaryFileNames;
/// Name of file passed with -fcuda-include-gpubinary option to forward to
/// CUDA runtime back-end for incorporating it into host-side object file.
std::string CudaGpuBinaryFileName;

/// The name of the file to which the backend should save YAML optimization
/// records.
Expand Down
135 changes: 60 additions & 75 deletions clang/lib/CodeGen/CGCUDANV.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,10 @@ class CGNVCUDARuntime : public CGCUDARuntime {
/// Keeps track of kernel launch stubs emitted in this module
llvm::SmallVector<llvm::Function *, 16> EmittedKernels;
llvm::SmallVector<std::pair<llvm::GlobalVariable *, unsigned>, 16> DeviceVars;
/// Keeps track of variables containing handles of GPU binaries. Populated by
/// Keeps track of the variable containing the handle of the GPU binary. Populated by
/// ModuleCtorFunction() and used to create corresponding cleanup calls in
/// ModuleDtorFunction()
llvm::SmallVector<llvm::GlobalVariable *, 16> GpuBinaryHandles;
llvm::GlobalVariable *GpuBinaryHandle = nullptr;

llvm::Constant *getSetupArgumentFn() const;
llvm::Constant *getLaunchFn() const;
Expand Down Expand Up @@ -245,16 +245,14 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
/// Creates a global constructor function for the module:
/// \code
/// void __cuda_module_ctor(void*) {
/// Handle0 = __cudaRegisterFatBinary(GpuBinaryBlob0);
/// __cuda_register_globals(Handle0);
/// ...
/// HandleN = __cudaRegisterFatBinary(GpuBinaryBlobN);
/// __cuda_register_globals(HandleN);
/// Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
/// __cuda_register_globals(Handle);
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
// No need to generate ctors/dtors if there are no GPU binaries.
if (CGM.getCodeGenOpts().CudaGpuBinaryFileNames.empty())
// No need to generate ctors/dtors if there is no GPU binary.
std::string GpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
if (GpuBinaryFileName.empty())
return nullptr;

// void __cuda_register_globals(void* handle);
Expand All @@ -267,6 +265,18 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
llvm::StructType *FatbinWrapperTy =
llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy);

// Register GPU binary with the CUDA runtime, store returned handle in a
// global variable and save a reference in GpuBinaryHandle to be cleaned up
// in destructor on exit. Then associate all known kernels with the GPU binary
// handle so CUDA runtime can figure out what to call on the GPU side.
llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GpuBinaryOrErr =
llvm::MemoryBuffer::getFileOrSTDIN(GpuBinaryFileName);
if (std::error_code EC = GpuBinaryOrErr.getError()) {
CGM.getDiags().Report(diag::err_cannot_open_file)
<< GpuBinaryFileName << EC.message();
return nullptr;
}

llvm::Function *ModuleCtorFunc = llvm::Function::Create(
llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
llvm::GlobalValue::InternalLinkage, "__cuda_module_ctor", &TheModule);
Expand All @@ -276,79 +286,56 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {

CtorBuilder.SetInsertPoint(CtorEntryBB);

// For each GPU binary, register it with the CUDA runtime and store returned
// handle in a global variable and save the handle in GpuBinaryHandles vector
// to be cleaned up in destructor on exit. Then associate all known kernels
// with the GPU binary handle so CUDA runtime can figure out what to call on
// the GPU side.
for (const std::string &GpuBinaryFileName :
CGM.getCodeGenOpts().CudaGpuBinaryFileNames) {
llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GpuBinaryOrErr =
llvm::MemoryBuffer::getFileOrSTDIN(GpuBinaryFileName);
if (std::error_code EC = GpuBinaryOrErr.getError()) {
CGM.getDiags().Report(diag::err_cannot_open_file) << GpuBinaryFileName
<< EC.message();
continue;
}

const char *FatbinConstantName =
CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
// NVIDIA's cuobjdump looks for fatbins in this section.
const char *FatbinSectionName =
CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";

// Create initialized wrapper structure that points to the loaded GPU binary
ConstantInitBuilder Builder(CGM);
auto Values = Builder.beginStruct(FatbinWrapperTy);
// Fatbin wrapper magic.
Values.addInt(IntTy, 0x466243b1);
// Fatbin version.
Values.addInt(IntTy, 1);
// Data.
Values.add(makeConstantString(GpuBinaryOrErr.get()->getBuffer(),
"", FatbinConstantName, 8));
// Unused in fatbin v1.
Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
llvm::GlobalVariable *FatbinWrapper =
Values.finishAndCreateGlobal("__cuda_fatbin_wrapper",
CGM.getPointerAlign(),
/*constant*/ true);
FatbinWrapper->setSection(FatbinSectionName);

// GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
RegisterFatbinFunc,
CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
llvm::GlobalVariable *GpuBinaryHandle = new llvm::GlobalVariable(
TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
CGM.getPointerAlign());

// Call __cuda_register_globals(GpuBinaryHandle);
if (RegisterGlobalsFunc)
CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);

// Save GpuBinaryHandle so we can unregister it in destructor.
GpuBinaryHandles.push_back(GpuBinaryHandle);
}
const char *FatbinConstantName =
CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
// NVIDIA's cuobjdump looks for fatbins in this section.
const char *FatbinSectionName =
CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";

// Create initialized wrapper structure that points to the loaded GPU binary
ConstantInitBuilder Builder(CGM);
auto Values = Builder.beginStruct(FatbinWrapperTy);
// Fatbin wrapper magic.
Values.addInt(IntTy, 0x466243b1);
// Fatbin version.
Values.addInt(IntTy, 1);
// Data.
Values.add(makeConstantString(GpuBinaryOrErr.get()->getBuffer(), "",
FatbinConstantName, 8));
// Unused in fatbin v1.
Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
"__cuda_fatbin_wrapper", CGM.getPointerAlign(),
/*constant*/ true);
FatbinWrapper->setSection(FatbinSectionName);

// GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
RegisterFatbinFunc, CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
GpuBinaryHandle = new llvm::GlobalVariable(
TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
CGM.getPointerAlign());

// Call __cuda_register_globals(GpuBinaryHandle);
if (RegisterGlobalsFunc)
CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);

CtorBuilder.CreateRetVoid();
return ModuleCtorFunc;
}

/// Creates a global destructor function that unregisters all GPU code blobs
/// Creates a global destructor function that unregisters the GPU code blob
/// registered by constructor.
/// \code
/// void __cuda_module_dtor(void*) {
/// __cudaUnregisterFatBinary(Handle0);
/// ...
/// __cudaUnregisterFatBinary(HandleN);
/// __cudaUnregisterFatBinary(Handle);
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
// No need for destructor if we don't have handles to unregister.
if (GpuBinaryHandles.empty())
// No need for destructor if we don't have a handle to unregister.
if (!GpuBinaryHandle)
return nullptr;

// void __cudaUnregisterFatBinary(void ** handle);
Expand All @@ -364,11 +351,9 @@ llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
CGBuilderTy DtorBuilder(CGM, Context);
DtorBuilder.SetInsertPoint(DtorEntryBB);

for (llvm::GlobalVariable *GpuBinaryHandle : GpuBinaryHandles) {
auto HandleValue =
auto HandleValue =
DtorBuilder.CreateAlignedLoad(GpuBinaryHandle, CGM.getPointerAlign());
DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
}
DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);

DtorBuilder.CreateRetVoid();
return ModuleDtorFunc;
Expand Down
11 changes: 5 additions & 6 deletions clang/lib/Driver/ToolChains/Clang.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4677,13 +4677,12 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
}

if (IsCuda) {
// Host-side cuda compilation receives device-side outputs as Inputs[1...].
// Include them with -fcuda-include-gpubinary.
// Host-side cuda compilation receives all device-side outputs in a single
// fatbin as Inputs[1]. Include the binary with -fcuda-include-gpubinary.
if (Inputs.size() > 1) {
for (auto I = std::next(Inputs.begin()), E = Inputs.end(); I != E; ++I) {
CmdArgs.push_back("-fcuda-include-gpubinary");
CmdArgs.push_back(I->getFilename());
}
assert(Inputs.size() == 2 && "More than one GPU binary!");
CmdArgs.push_back("-fcuda-include-gpubinary");
CmdArgs.push_back(Inputs[1].getFilename());
}

if (Args.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc, false))
Expand Down
4 changes: 2 additions & 2 deletions clang/lib/Frontend/CompilerInvocation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1046,8 +1046,8 @@ static bool ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, InputKind IK,
Args.getAllArgValues(OPT_fsanitize_trap_EQ), Diags,
Opts.SanitizeTrap);

Opts.CudaGpuBinaryFileNames =
Args.getAllArgValues(OPT_fcuda_include_gpubinary);
Opts.CudaGpuBinaryFileName =
Args.getLastArgValue(OPT_fcuda_include_gpubinary);

Opts.Backchain = Args.hasArg(OPT_mbackchain);

Expand Down
23 changes: 17 additions & 6 deletions clang/test/Driver/cuda-options.cu
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,10 @@
// and that all results are included on the host side.
// RUN: %clang -### -target x86_64-linux-gnu \
// RUN: --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_30 -c %s 2>&1 \
// RUN: | FileCheck -check-prefix DEVICE -check-prefix DEVICE-NOSAVE \
// RUN: -check-prefix DEVICE2 -check-prefix DEVICE-SM30 \
// RUN: -check-prefix DEVICE2-SM35 -check-prefix HOST \
// RUN: -check-prefix HOST-NOSAVE -check-prefix INCLUDES-DEVICE \
// RUN: -check-prefix NOLINK %s
// RUN: | FileCheck -check-prefixes DEVICE,DEVICE-NOSAVE,DEVICE2 \
// RUN: -check-prefixes DEVICE-SM30,DEVICE2-SM35 \
// RUN: -check-prefixes INCLUDES-DEVICE,INCLUDES-DEVICE2 \
// RUN: -check-prefixes HOST,HOST-NOSAVE,NOLINK %s

// Verify that device-side results are passed to the correct tool when
// -save-temps is used.
Expand Down Expand Up @@ -182,9 +181,15 @@
// DEVICE2-SAME: "-aux-triple" "x86_64--linux-gnu"
// DEVICE2-SAME: "-fcuda-is-device"
// DEVICE2-SM35-SAME: "-target-cpu" "sm_35"
// DEVICE2-SAME: "-o" "[[GPUBINARY2:[^"]*]]"
// DEVICE2-SAME: "-o" "[[PTXFILE2:[^"]*]]"
// DEVICE2-SAME: "-x" "cuda"

// Match another call to ptxas.
// DEVICE2: ptxas
// DEVICE2-SM35-DAG: "--gpu-name" "sm_35"
// DEVICE2-DAG: "--output-file" "[[CUBINFILE2:[^"]*]]"
// DEVICE2-DAG: "[[PTXFILE2]]"

// Match no device-side compilation.
// NODEVICE-NOT: "-cc1" "-triple" "nvptx64-nvidia-cuda"
// NODEVICE-NOT: "-fcuda-is-device"
Expand All @@ -193,6 +198,8 @@
// INCLUDES-DEVICE-DAG: "--create" "[[FATBINARY:[^"]*]]"
// INCLUDES-DEVICE-DAG: "--image=profile=sm_{{[0-9]+}},file=[[CUBINFILE]]"
// INCLUDES-DEVICE-DAG: "--image=profile=compute_{{[0-9]+}},file=[[PTXFILE]]"
// INCLUDES-DEVICE2-DAG: "--image=profile=sm_{{[0-9]+}},file=[[CUBINFILE2]]"
// INCLUDES-DEVICE2-DAG: "--image=profile=compute_{{[0-9]+}},file=[[PTXFILE2]]"

// Match host-side preprocessor job with -save-temps.
// HOST-SAVE: "-cc1" "-triple" "x86_64--linux-gnu"
Expand All @@ -207,7 +214,11 @@
// HOST-SAME: "-o" "[[HOSTOUTPUT:[^"]*]]"
// HOST-NOSAVE-SAME: "-x" "cuda"
// HOST-SAVE-SAME: "-x" "cuda-cpp-output"
// There is only one GPU binary after combining it with fatbinary!
// INCLUDES-DEVICE2-NOT: "-fcuda-include-gpubinary"
// INCLUDES-DEVICE-SAME: "-fcuda-include-gpubinary" "[[FATBINARY]]"
// There is only one GPU binary after combining it with fatbinary.
// INCLUDES-DEVICE2-NOT: "-fcuda-include-gpubinary"

// Match external assembler that uses compilation output.
// HOST-AS: "-o" "{{.*}}.o" "[[HOSTOUTPUT]]"
Expand Down

0 comments on commit e768132

Please sign in to comment.