diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp index 8424d0f5df082..e7a38a93c9acb 100644 --- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp @@ -1976,12 +1976,18 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { virtual Error callGlobalConstructors(GenericPluginTy &Plugin, DeviceImageTy &Image) override { - return callGlobalCtorDtorCommon(Plugin, Image, "amdgcn.device.init"); + GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler(); + if (Handler.isSymbolInImage(*this, Image, "amdgcn.device.fini")) + Image.setPendingGlobalDtors(); + + return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/true); } virtual Error callGlobalDestructors(GenericPluginTy &Plugin, DeviceImageTy &Image) override { - return callGlobalCtorDtorCommon(Plugin, Image, "amdgcn.device.fini"); + if (Image.hasPendingGlobalDtors()) + return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false); + return Plugin::success(); } const uint64_t getStreamBusyWaitMicroseconds() const { @@ -2701,15 +2707,17 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { /// Common method to invoke a single threaded constructor or destructor /// kernel by name. Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image, - const char *Name) { + bool IsCtor) { + const char *KernelName = + IsCtor ? "amdgcn.device.init" : "amdgcn.device.fini"; // Perform a quick check for the named kernel in the image. The kernel // should be created by the 'amdgpu-lower-ctor-dtor' pass. GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler(); - if (!Handler.isSymbolInImage(*this, Image, Name)) + if (IsCtor && !Handler.isSymbolInImage(*this, Image, KernelName)) return Plugin::success(); // Allocate and construct the AMDGPU kernel. - AMDGPUKernelTy AMDGPUKernel(Name); + AMDGPUKernelTy AMDGPUKernel(KernelName); if (auto Err = AMDGPUKernel.init(*this, Image)) return Err; diff --git a/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h index b85dc146d86d2..f7f723610dd05 100644 --- a/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h +++ b/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h @@ -218,12 +218,19 @@ class DeviceImageTy { const __tgt_device_image *TgtImage; const __tgt_device_image *TgtImageBitcode; + /// If this image has any global destructors that much be called. + /// FIXME: This is only required because we currently have no invariants + /// towards the lifetime of the underlying image. We should either copy + /// the image into memory locally or erase the pointers after init. + bool PendingGlobalDtors; + /// Table of offload entries. OffloadEntryTableTy OffloadEntryTable; public: DeviceImageTy(int32_t Id, const __tgt_device_image *Image) - : ImageId(Id), TgtImage(Image), TgtImageBitcode(nullptr) { + : ImageId(Id), TgtImage(Image), TgtImageBitcode(nullptr), + PendingGlobalDtors(false) { assert(TgtImage && "Invalid target image"); } @@ -255,6 +262,10 @@ class DeviceImageTy { "Image"); } + /// Accessors to the boolean value + bool setPendingGlobalDtors() { return PendingGlobalDtors = true; } + bool hasPendingGlobalDtors() const { return PendingGlobalDtors; } + /// Get a reference to the offload entry table for the image. OffloadEntryTableTy &getOffloadEntryTable() { return OffloadEntryTable; } }; diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp index 7ccd0b4fef284..ce6b39898ae95 100644 --- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp @@ -392,12 +392,20 @@ struct CUDADeviceTy : public GenericDeviceTy { virtual Error callGlobalConstructors(GenericPluginTy &Plugin, DeviceImageTy &Image) override { + // Check for the presense of global destructors at initialization time. This + // is required when the image may be deallocated before destructors are run. + GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler(); + if (Handler.isSymbolInImage(*this, Image, "nvptx$device$fini")) + Image.setPendingGlobalDtors(); + return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/true); } virtual Error callGlobalDestructors(GenericPluginTy &Plugin, DeviceImageTy &Image) override { - return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false); + if (Image.hasPendingGlobalDtors()) + return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false); + return Plugin::success(); } Expected> @@ -1145,7 +1153,7 @@ struct CUDADeviceTy : public GenericDeviceTy { // Perform a quick check for the named kernel in the image. The kernel // should be created by the 'nvptx-lower-ctor-dtor' pass. GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler(); - if (!Handler.isSymbolInImage(*this, Image, KernelName)) + if (IsCtor && !Handler.isSymbolInImage(*this, Image, KernelName)) return Plugin::success(); // The Nvidia backend cannot handle creating the ctor / dtor array