New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Offload] Change unregister library to use `atexit` instead of destructor
#86830
Conversation
@llvm/pr-subscribers-clang-driver @llvm/pr-subscribers-clang Author: Joseph Huber (jhuber6) ChangesSummary: Secondly, there were issues with the CUDA runtime when destructed with a Full diff: https://github.com/llvm/llvm-project/pull/86830.diff 2 Files Affected:
diff --git a/clang/test/Driver/linker-wrapper-image.c b/clang/test/Driver/linker-wrapper-image.c
index 75475264135224..5d5d62805e174d 100644
--- a/clang/test/Driver/linker-wrapper-image.c
+++ b/clang/test/Driver/linker-wrapper-image.c
@@ -26,12 +26,12 @@
// OPENMP: @.omp_offloading.device_image = internal unnamed_addr constant [[[SIZE:[0-9]+]] x i8] c"\10\FF\10\AD{{.*}}", section ".llvm.offloading", align 8
// OPENMP-NEXT: @.omp_offloading.device_images = internal unnamed_addr constant [1 x %__tgt_device_image] [%__tgt_device_image { ptr getelementptr inbounds ([[[BEGIN:[0-9]+]] x i8], ptr @.omp_offloading.device_image, i64 1, i64 0), ptr getelementptr inbounds ([[[END:[0-9]+]] x i8], ptr @.omp_offloading.device_image, i64 1, i64 0), ptr @__start_omp_offloading_entries, ptr @__stop_omp_offloading_entries }]
// OPENMP-NEXT: @.omp_offloading.descriptor = internal constant %__tgt_bin_desc { i32 1, ptr @.omp_offloading.device_images, ptr @__start_omp_offloading_entries, ptr @__stop_omp_offloading_entries }
-// OPENMP-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.omp_offloading.descriptor_reg, ptr null }]
-// OPENMP-NEXT: @llvm.global_dtors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.omp_offloading.descriptor_unreg, ptr null }]
+// OPENMP-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 101, ptr @.omp_offloading.descriptor_reg, ptr null }]
// OPENMP: define internal void @.omp_offloading.descriptor_reg() section ".text.startup" {
// OPENMP-NEXT: entry:
// OPENMP-NEXT: call void @__tgt_register_lib(ptr @.omp_offloading.descriptor)
+// OPENMP-NEXT: %0 = call i32 @atexit(ptr @.omp_offloading.descriptor_unreg)
// OPENMP-NEXT: ret void
// OPENMP-NEXT: }
@@ -62,7 +62,7 @@
// CUDA-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1180844977, i32 1, ptr @.fatbin_image, ptr null }, section ".nvFatBinSegment", align 8
// CUDA-NEXT: @.cuda.binary_handle = internal global ptr null
-// CUDA: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.cuda.fatbin_reg, ptr null }]
+// CUDA: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 101, ptr @.cuda.fatbin_reg, ptr null }]
// CUDA: define internal void @.cuda.fatbin_reg() section ".text.startup" {
// CUDA-NEXT: entry:
@@ -162,7 +162,7 @@
// HIP-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1212764230, i32 1, ptr @.fatbin_image, ptr null }, section ".hipFatBinSegment", align 8
// HIP-NEXT: @.hip.binary_handle = internal global ptr null
-// HIP: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.hip.fatbin_reg, ptr null }]
+// HIP: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 101, ptr @.hip.fatbin_reg, ptr null }]
// HIP: define internal void @.hip.fatbin_reg() section ".text.startup" {
// HIP-NEXT: entry:
diff --git a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp
index fec1bdbe9d8c74..4f9494d2143fce 100644
--- a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp
+++ b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp
@@ -186,57 +186,60 @@ GlobalVariable *createBinDesc(Module &M, ArrayRef<ArrayRef<char>> Bufs,
".omp_offloading.descriptor" + Suffix);
}
-void createRegisterFunction(Module &M, GlobalVariable *BinDesc,
- StringRef Suffix) {
+Function *createUnregisterFunction(Module &M, GlobalVariable *BinDesc,
+ StringRef Suffix) {
LLVMContext &C = M.getContext();
auto *FuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false);
- auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage,
- ".omp_offloading.descriptor_reg" + Suffix, &M);
+ auto *Func =
+ Function::Create(FuncTy, GlobalValue::InternalLinkage,
+ ".omp_offloading.descriptor_unreg" + Suffix, &M);
Func->setSection(".text.startup");
- // Get __tgt_register_lib function declaration.
- auto *RegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M),
- /*isVarArg*/ false);
- FunctionCallee RegFuncC =
- M.getOrInsertFunction("__tgt_register_lib", RegFuncTy);
+ // Get __tgt_unregister_lib function declaration.
+ auto *UnRegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M),
+ /*isVarArg*/ false);
+ FunctionCallee UnRegFuncC =
+ M.getOrInsertFunction("__tgt_unregister_lib", UnRegFuncTy);
// Construct function body
IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func));
- Builder.CreateCall(RegFuncC, BinDesc);
+ Builder.CreateCall(UnRegFuncC, BinDesc);
Builder.CreateRetVoid();
- // Add this function to constructors.
- // Set priority to 1 so that __tgt_register_lib is executed AFTER
- // __tgt_register_requires (we want to know what requirements have been
- // asked for before we load a libomptarget plugin so that by the time the
- // plugin is loaded it can report how many devices there are which can
- // satisfy these requirements).
- appendToGlobalCtors(M, Func, /*Priority*/ 1);
+ return Func;
}
-void createUnregisterFunction(Module &M, GlobalVariable *BinDesc,
- StringRef Suffix) {
+void createRegisterFunction(Module &M, GlobalVariable *BinDesc,
+ StringRef Suffix) {
LLVMContext &C = M.getContext();
auto *FuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false);
- auto *Func =
- Function::Create(FuncTy, GlobalValue::InternalLinkage,
- ".omp_offloading.descriptor_unreg" + Suffix, &M);
+ auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage,
+ ".omp_offloading.descriptor_reg" + Suffix, &M);
Func->setSection(".text.startup");
- // Get __tgt_unregister_lib function declaration.
- auto *UnRegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M),
- /*isVarArg*/ false);
- FunctionCallee UnRegFuncC =
- M.getOrInsertFunction("__tgt_unregister_lib", UnRegFuncTy);
+ // Get __tgt_register_lib function declaration.
+ auto *RegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M),
+ /*isVarArg*/ false);
+ FunctionCallee RegFuncC =
+ M.getOrInsertFunction("__tgt_register_lib", RegFuncTy);
+
+ auto *AtExitTy = FunctionType::get(
+ Type::getInt32Ty(C), PointerType::getUnqual(C), /*isVarArg=*/false);
+ FunctionCallee AtExit = M.getOrInsertFunction("atexit", AtExitTy);
+
+ Function *UnregFunc = createUnregisterFunction(M, BinDesc, Suffix);
// Construct function body
IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func));
- Builder.CreateCall(UnRegFuncC, BinDesc);
+ Builder.CreateCall(RegFuncC, BinDesc);
+
+ // Register the destructors with 'atexit', This is expected by the CUDA
+ // runtime and ensures that we clean up before dynamic objects are destroyed.
+ Builder.CreateCall(AtExit, UnregFunc);
Builder.CreateRetVoid();
- // Add this function to global destructors.
- // Match priority of __tgt_register_lib
- appendToGlobalDtors(M, Func, /*Priority*/ 1);
+ // Add this function to constructors.
+ appendToGlobalCtors(M, Func, /*Priority=*/101);
}
// struct fatbin_wrapper {
@@ -578,7 +581,7 @@ void createRegisterFatbinFunction(Module &M, GlobalVariable *FatbinDesc,
DtorBuilder.CreateRetVoid();
// Add this function to constructors.
- appendToGlobalCtors(M, CtorFunc, /*Priority*/ 1);
+ appendToGlobalCtors(M, CtorFunc, /*Priority=*/101);
}
} // namespace
@@ -591,7 +594,6 @@ Error offloading::wrapOpenMPBinaries(Module &M, ArrayRef<ArrayRef<char>> Images,
return createStringError(inconvertibleErrorCode(),
"No binary descriptors created.");
createRegisterFunction(M, Desc, Suffix);
- createUnregisterFunction(M, Desc, Suffix);
return Error::success();
}
|
@@ -186,57 +186,60 @@ GlobalVariable *createBinDesc(Module &M, ArrayRef<ArrayRef<char>> Bufs, | |||
".omp_offloading.descriptor" + Suffix); | |||
} | |||
|
|||
void createRegisterFunction(Module &M, GlobalVariable *BinDesc, | |||
StringRef Suffix) { | |||
Function *createUnregisterFunction(Module &M, GlobalVariable *BinDesc, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It seems the order of createRegisterFunction and createUnregisterFunction is swapped. This causes some artificial differences. Is it OK to keep their original order?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, since I need to call this function from `createRegisterFunction` now. I could forward declare it, but I don't think there's a point given it's inside an anonymous namespace.
So, looking into |
…ctor Summary: The 'new driver' sets up the lifetime of a registered library using global constructors and destructors. Currently, this is put at priority 1, which isn't strictly conformant as it will conflict with system utilities. We now use 101, as this is the lowest priority suggested for non-system constructors and will still run before user constructors. Secondly, there were issues with the CUDA runtime when destructed with a global destructor. Because the global ones run in any order and potentially run before other things, we were hitting an edge case where the OpenMP runtime was deinitialized *after* `_dl_fini` was called. This would result in us erroring when we call into a destroyed `libcuda.so` instance. Using `atexit` is what CUDA / HIP use, and it prevents this from happening. Most everything uses `atexit` except system utilities, and because of the constructor priority it will be unregistered *after* everything else but not after `_dl_fini`.
1583db2
to
875ed36
Compare
Fixed, I neglected the fact that OpenMP registers more destructors inside of the constructor itself. Passes all the tests now. |
|
||
// Construct function body | ||
IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func)); | ||
Builder.CreateCall(UnRegFuncC, BinDesc); | ||
|
||
// Register the destructors with 'atexit', This is expected by the CUDA |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Typo. ,
-> .
|
||
// Construct function body | ||
IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func)); | ||
Builder.CreateCall(UnRegFuncC, BinDesc); | ||
|
||
// Register the destructors with 'atexit', This is expected by the CUDA |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
> This is expected by the CUDA runtime

I'd add a reference to `clang/lib/CodeGen/CGCUDANV.cpp`, which provides some history on why we switched to `atexit`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think that's actually in this file somewhere for the CUDA wrapper portion.
Summary:
The 'new driver' sets up the lifetime of a registered library using
global constructors and destructors. Currently, this is put at priority
1, which isn't strictly conformant as it will conflict with system
utilities. We now use 101, as this is the lowest priority suggested for
non-system constructors and will still run before user constructors.
Secondly, there were issues with the CUDA runtime when destructed with a
global destructor. Because the global ones run in any order and
potentially run before other things, we were hitting an edge case where
the OpenMP runtime was deinitialized after `_dl_fini` was called. This
would result in us erroring when we call into a destroyed `libcuda.so`
instance. Using `atexit` is what CUDA / HIP use, and it prevents this
from happening. Most everything uses `atexit` except system utilities,
and because of the constructor priority it will be unregistered after
everything else but not after `_dl_fini`.