diff --git a/clang/test/Driver/linker-wrapper-image.c b/clang/test/Driver/linker-wrapper-image.c index 7547526413522..d01445e3aed04 100644 --- a/clang/test/Driver/linker-wrapper-image.c +++ b/clang/test/Driver/linker-wrapper-image.c @@ -26,11 +26,11 @@ // OPENMP: @.omp_offloading.device_image = internal unnamed_addr constant [[[SIZE:[0-9]+]] x i8] c"\10\FF\10\AD{{.*}}", section ".llvm.offloading", align 8 // OPENMP-NEXT: @.omp_offloading.device_images = internal unnamed_addr constant [1 x %__tgt_device_image] [%__tgt_device_image { ptr getelementptr inbounds ([[[BEGIN:[0-9]+]] x i8], ptr @.omp_offloading.device_image, i64 1, i64 0), ptr getelementptr inbounds ([[[END:[0-9]+]] x i8], ptr @.omp_offloading.device_image, i64 1, i64 0), ptr @__start_omp_offloading_entries, ptr @__stop_omp_offloading_entries }] // OPENMP-NEXT: @.omp_offloading.descriptor = internal constant %__tgt_bin_desc { i32 1, ptr @.omp_offloading.device_images, ptr @__start_omp_offloading_entries, ptr @__stop_omp_offloading_entries } -// OPENMP-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.omp_offloading.descriptor_reg, ptr null }] -// OPENMP-NEXT: @llvm.global_dtors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.omp_offloading.descriptor_unreg, ptr null }] +// OPENMP-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 101, ptr @.omp_offloading.descriptor_reg, ptr null }] // OPENMP: define internal void @.omp_offloading.descriptor_reg() section ".text.startup" { // OPENMP-NEXT: entry: +// OPENMP-NEXT: %0 = call i32 @atexit(ptr @.omp_offloading.descriptor_unreg) // OPENMP-NEXT: call void @__tgt_register_lib(ptr @.omp_offloading.descriptor) // OPENMP-NEXT: ret void // OPENMP-NEXT: } @@ -62,7 +62,7 @@ // CUDA-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1180844977, i32 1, ptr @.fatbin_image, ptr null }, section ".nvFatBinSegment", align 8 // CUDA-NEXT: @.cuda.binary_handle = internal global ptr null -// CUDA: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.cuda.fatbin_reg, ptr null }] +// CUDA: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 101, ptr @.cuda.fatbin_reg, ptr null }] // CUDA: define internal void @.cuda.fatbin_reg() section ".text.startup" { // CUDA-NEXT: entry: @@ -162,7 +162,7 @@ // HIP-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1212764230, i32 1, ptr @.fatbin_image, ptr null }, section ".hipFatBinSegment", align 8 // HIP-NEXT: @.hip.binary_handle = internal global ptr null -// HIP: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.hip.fatbin_reg, ptr null }] +// HIP: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 101, ptr @.hip.fatbin_reg, ptr null }] // HIP: define internal void @.hip.fatbin_reg() section ".text.startup" { // HIP-NEXT: entry: diff --git a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp index fec1bdbe9d8c7..7241d15ed1c67 100644 --- a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp +++ b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp @@ -186,57 +186,62 @@ GlobalVariable *createBinDesc(Module &M, ArrayRef> Bufs, ".omp_offloading.descriptor" + Suffix); } -void createRegisterFunction(Module &M, GlobalVariable *BinDesc, - StringRef Suffix) { +Function *createUnregisterFunction(Module &M, GlobalVariable *BinDesc, + StringRef Suffix) { LLVMContext &C = M.getContext(); auto *FuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false); - auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage, - ".omp_offloading.descriptor_reg" + Suffix, &M); + auto *Func = + Function::Create(FuncTy, GlobalValue::InternalLinkage, + ".omp_offloading.descriptor_unreg" + Suffix, &M); Func->setSection(".text.startup"); - // Get __tgt_register_lib function declaration. - auto *RegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M), - /*isVarArg*/ false); - FunctionCallee RegFuncC = - M.getOrInsertFunction("__tgt_register_lib", RegFuncTy); + // Get __tgt_unregister_lib function declaration. + auto *UnRegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M), + /*isVarArg*/ false); + FunctionCallee UnRegFuncC = + M.getOrInsertFunction("__tgt_unregister_lib", UnRegFuncTy); // Construct function body IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func)); - Builder.CreateCall(RegFuncC, BinDesc); + Builder.CreateCall(UnRegFuncC, BinDesc); Builder.CreateRetVoid(); - // Add this function to constructors. - // Set priority to 1 so that __tgt_register_lib is executed AFTER - // __tgt_register_requires (we want to know what requirements have been - // asked for before we load a libomptarget plugin so that by the time the - // plugin is loaded it can report how many devices there are which can - // satisfy these requirements). - appendToGlobalCtors(M, Func, /*Priority*/ 1); + return Func; } -void createUnregisterFunction(Module &M, GlobalVariable *BinDesc, - StringRef Suffix) { +void createRegisterFunction(Module &M, GlobalVariable *BinDesc, + StringRef Suffix) { LLVMContext &C = M.getContext(); auto *FuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false); - auto *Func = - Function::Create(FuncTy, GlobalValue::InternalLinkage, - ".omp_offloading.descriptor_unreg" + Suffix, &M); + auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage, + ".omp_offloading.descriptor_reg" + Suffix, &M); Func->setSection(".text.startup"); - // Get __tgt_unregister_lib function declaration. - auto *UnRegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M), - /*isVarArg*/ false); - FunctionCallee UnRegFuncC = - M.getOrInsertFunction("__tgt_unregister_lib", UnRegFuncTy); + // Get __tgt_register_lib function declaration. + auto *RegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M), + /*isVarArg*/ false); + FunctionCallee RegFuncC = + M.getOrInsertFunction("__tgt_register_lib", RegFuncTy); + + auto *AtExitTy = FunctionType::get( + Type::getInt32Ty(C), PointerType::getUnqual(C), /*isVarArg=*/false); + FunctionCallee AtExit = M.getOrInsertFunction("atexit", AtExitTy); + + Function *UnregFunc = createUnregisterFunction(M, BinDesc, Suffix); // Construct function body IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func)); - Builder.CreateCall(UnRegFuncC, BinDesc); + + // Register the destructors with 'atexit'. This is expected by the CUDA + // runtime and ensures that we clean up before dynamic objects are destroyed. + // This needs to be done before the runtime is called and registers its own. + Builder.CreateCall(AtExit, UnregFunc); + + Builder.CreateCall(RegFuncC, BinDesc); Builder.CreateRetVoid(); - // Add this function to global destructors. - // Match priority of __tgt_register_lib - appendToGlobalDtors(M, Func, /*Priority*/ 1); + // Add this function to constructors. + appendToGlobalCtors(M, Func, /*Priority=*/101); } // struct fatbin_wrapper { @@ -578,7 +583,7 @@ void createRegisterFatbinFunction(Module &M, GlobalVariable *FatbinDesc, DtorBuilder.CreateRetVoid(); // Add this function to constructors. - appendToGlobalCtors(M, CtorFunc, /*Priority*/ 1); + appendToGlobalCtors(M, CtorFunc, /*Priority=*/101); } } // namespace @@ -591,7 +596,6 @@ Error offloading::wrapOpenMPBinaries(Module &M, ArrayRef> Images, return createStringError(inconvertibleErrorCode(), "No binary descriptors created."); createRegisterFunction(M, Desc, Suffix); - createUnregisterFunction(M, Desc, Suffix); return Error::success(); }