diff --git a/openmp/libomptarget/include/Shared/APITypes.h b/openmp/libomptarget/include/Shared/APITypes.h index 763a22f0a5e86..94521b4fbb577 100644 --- a/openmp/libomptarget/include/Shared/APITypes.h +++ b/openmp/libomptarget/include/Shared/APITypes.h @@ -62,6 +62,11 @@ struct __tgt_target_table { *EntriesEnd; // End of the table with all the entries (non inclusive) }; +/// This struct contains a handle to a loaded binary in the plugin device. +struct __tgt_device_binary { + uintptr_t handle; +}; + // clang-format on /// This struct contains information exchanged between different asynchronous diff --git a/openmp/libomptarget/include/Shared/PluginAPI.h b/openmp/libomptarget/include/Shared/PluginAPI.h index aece53d7ee1ca..5de5f106045b5 100644 --- a/openmp/libomptarget/include/Shared/PluginAPI.h +++ b/openmp/libomptarget/include/Shared/PluginAPI.h @@ -57,8 +57,18 @@ int32_t __tgt_rtl_init_device(int32_t ID); // return NULL. Otherwise, return a pointer to the built address table. // Individual entries in the table may also be NULL, when the corresponding // offload region is not supported on the target device. -__tgt_target_table *__tgt_rtl_load_binary(int32_t ID, - __tgt_device_image *Image); +int32_t __tgt_rtl_load_binary(int32_t ID, __tgt_device_image *Image, + __tgt_device_binary *Binary); + +// Look up the device address of the named symbol in the given binary. Returns +// non-zero on failure. +int32_t __tgt_rtl_get_global(__tgt_device_binary Binary, uint64_t Size, + const char *Name, void **DevicePtr); + +// Look up the device address of the named kernel in the given binary. Returns +// non-zero on failure. +int32_t __tgt_rtl_get_function(__tgt_device_binary Binary, const char *Name, + void **DevicePtr); // Allocate data on the particular target device, of the specified size. 
// HostPtr is a address of the host data the allocated target data diff --git a/openmp/libomptarget/include/Shared/PluginAPI.inc b/openmp/libomptarget/include/Shared/PluginAPI.inc index b842c6eef1d4f..5f8a9dd11fdce 100644 --- a/openmp/libomptarget/include/Shared/PluginAPI.inc +++ b/openmp/libomptarget/include/Shared/PluginAPI.inc @@ -19,6 +19,8 @@ PLUGIN_API_HANDLE(is_data_exchangable, false); PLUGIN_API_HANDLE(number_of_devices, true); PLUGIN_API_HANDLE(init_device, true); PLUGIN_API_HANDLE(load_binary, true); +PLUGIN_API_HANDLE(get_global, true); +PLUGIN_API_HANDLE(get_function, true); PLUGIN_API_HANDLE(data_alloc, true); PLUGIN_API_HANDLE(data_submit, true); PLUGIN_API_HANDLE(data_submit_async, false); diff --git a/openmp/libomptarget/include/device.h b/openmp/libomptarget/include/device.h index 3023fba6cc64d..1dc82e36f6813 100644 --- a/openmp/libomptarget/include/device.h +++ b/openmp/libomptarget/include/device.h @@ -70,7 +70,7 @@ struct DeviceTy { /// Provide access to the mapping handler. MappingInfoTy &getMappingInfo() { return MappingInfo; } - __tgt_target_table *loadBinary(__tgt_device_image *Img); + llvm::Expected<__tgt_device_binary> loadBinary(__tgt_device_image *Img); // device memory allocation/deallocation routines /// Allocates \p Size bytes on the device, host or shared memory space diff --git a/openmp/libomptarget/include/rtl.h b/openmp/libomptarget/include/rtl.h index d110e89de5f14..5e198bdad4364 100644 --- a/openmp/libomptarget/include/rtl.h +++ b/openmp/libomptarget/include/rtl.h @@ -26,11 +26,16 @@ /// are trying to (re)register an existing lib or really have a new one. struct TranslationTable { __tgt_target_table HostTable; + llvm::SmallVector<__tgt_target_table> DeviceTables; // Image assigned to a given device. llvm::SmallVector<__tgt_device_image *> TargetsImages; // One image per device ID. + // Arrays of entries active on the device. + llvm::SmallVector> + TargetsEntries; // One table per device ID. 
+ // Table of entry points or NULL if it was not already computed. llvm::SmallVector<__tgt_target_table *> TargetsTable; // One table per device ID. diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp index 8066a231ef93f..81634ae1edc49 100644 --- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp @@ -439,8 +439,9 @@ struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy { /// Class implementing the AMDGPU device images' properties. struct AMDGPUDeviceImageTy : public DeviceImageTy { /// Create the AMDGPU image with the id and the target image pointer. - AMDGPUDeviceImageTy(int32_t ImageId, const __tgt_device_image *TgtImage) - : DeviceImageTy(ImageId, TgtImage) {} + AMDGPUDeviceImageTy(int32_t ImageId, GenericDeviceTy &Device, + const __tgt_device_image *TgtImage) + : DeviceImageTy(ImageId, Device, TgtImage) {} /// Prepare and load the executable corresponding to the image. Error loadExecutable(const AMDGPUDeviceTy &Device); @@ -2105,14 +2106,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { uint64_t getClockFrequency() const override { return ClockFrequency; } /// Allocate and construct an AMDGPU kernel. - Expected - constructKernel(const __tgt_offload_entry &KernelEntry) override { + Expected constructKernel(const char *Name) override { // Allocate and construct the AMDGPU kernel. AMDGPUKernelTy *AMDGPUKernel = Plugin::get().allocate(); if (!AMDGPUKernel) return Plugin::error("Failed to allocate memory for AMDGPU kernel"); - new (AMDGPUKernel) AMDGPUKernelTy(KernelEntry.name); + new (AMDGPUKernel) AMDGPUKernelTy(Name); return *AMDGPUKernel; } @@ -2160,7 +2160,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { // Allocate and initialize the image object. 
AMDGPUDeviceImageTy *AMDImage = Plugin::get().allocate(); - new (AMDImage) AMDGPUDeviceImageTy(ImageId, TgtImage); + new (AMDImage) AMDGPUDeviceImageTy(ImageId, *this, TgtImage); // Load the HSA executable. if (Error Err = AMDImage->loadExecutable(*this)) diff --git a/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h b/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h index 8707e7b4c504e..5c767995126b7 100644 --- a/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h +++ b/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h @@ -47,9 +47,6 @@ class GlobalTy { GlobalTy(const std::string &Name, uint32_t Size, void *Ptr = nullptr) : Name(Name), Size(Size), Ptr(Ptr) {} - GlobalTy(const __tgt_offload_entry &Entry) - : Name(Entry.name), Size(Entry.size), Ptr(Entry.addr) {} - const std::string &getName() const { return Name; } uint32_t getSize() const { return Size; } void *getPtr() const { return Ptr; } diff --git a/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h index d55dfbdd9e4c1..3c2a4d7e6c0e7 100644 --- a/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h +++ b/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h @@ -182,34 +182,6 @@ class InfoQueueTy { /// specific device. This class is responsible for storing and managing /// the offload entries for an image on a device. class DeviceImageTy { - - /// Class representing the offload entry table. The class stores the - /// __tgt_target_table and a map to search in the table faster. - struct OffloadEntryTableTy { - /// Add new entry to the table. - void addEntry(const __tgt_offload_entry &Entry) { - Entries.push_back(Entry); - TTTablePtr.EntriesBegin = &Entries[0]; - TTTablePtr.EntriesEnd = TTTablePtr.EntriesBegin + Entries.size(); - } - - /// Get the raw pointer to the __tgt_target_table. 
- operator __tgt_target_table *() { - if (Entries.empty()) - return nullptr; - return &TTTablePtr; - } - - private: - __tgt_target_table TTTablePtr; - llvm::SmallVector<__tgt_offload_entry> Entries; - - public: - using const_iterator = decltype(Entries)::const_iterator; - const_iterator begin() const { return Entries.begin(); } - const_iterator end() const { return Entries.end(); } - }; - /// Image identifier within the corresponding device. Notice that this id is /// not unique between different device; they may overlap. int32_t ImageId; @@ -218,18 +190,19 @@ class DeviceImageTy { const __tgt_device_image *TgtImage; const __tgt_device_image *TgtImageBitcode; + /// Reference to the device this image is loaded on. + GenericDeviceTy &Device; + /// If this image has any global destructors that much be called. /// FIXME: This is only required because we currently have no invariants /// towards the lifetime of the underlying image. We should either copy /// the image into memory locally or erase the pointers after init. bool PendingGlobalDtors; - /// Table of offload entries. - OffloadEntryTableTy OffloadEntryTable; - public: - DeviceImageTy(int32_t Id, const __tgt_device_image *Image) - : ImageId(Id), TgtImage(Image), TgtImageBitcode(nullptr), + DeviceImageTy(int32_t Id, GenericDeviceTy &Device, + const __tgt_device_image *Image) + : ImageId(Id), TgtImage(Image), TgtImageBitcode(nullptr), Device(Device), PendingGlobalDtors(false) { assert(TgtImage && "Invalid target image"); } @@ -237,6 +210,9 @@ class DeviceImageTy { /// Get the image identifier within the device. int32_t getId() const { return ImageId; } + /// Get the device that this image is loaded onto. + GenericDeviceTy &getDevice() const { return Device; } + /// Get the pointer to the raw __tgt_device_image. 
const __tgt_device_image *getTgtImage() const { return TgtImage; } @@ -261,13 +237,9 @@ class DeviceImageTy { return MemoryBufferRef(StringRef((const char *)getStart(), getSize()), "Image"); } - /// Accessors to the boolean value bool setPendingGlobalDtors() { return PendingGlobalDtors = true; } bool hasPendingGlobalDtors() const { return PendingGlobalDtors; } - - /// Get a reference to the offload entry table for the image. - OffloadEntryTableTy &getOffloadEntryTable() { return OffloadEntryTable; } }; /// Class implementing common functionalities of offload kernels. Each plugin @@ -661,8 +633,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy { virtual Error deinitImpl() = 0; /// Load the binary image into the device and return the target table. - Expected<__tgt_target_table *> loadBinary(GenericPluginTy &Plugin, - const __tgt_device_image *TgtImage); + Expected loadBinary(GenericPluginTy &Plugin, + const __tgt_device_image *TgtImage); virtual Expected loadBinaryImpl(const __tgt_device_image *TgtImage, int32_t ImageId) = 0; @@ -680,9 +652,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy { // up to the target to override this using the shouldSetupRPCServer function. Error setupRPCServer(GenericPluginTy &Plugin, DeviceImageTy &Image); - /// Register the offload entries for a specific image on the device. - Error registerOffloadEntries(DeviceImageTy &Image); - /// Synchronize the current thread with the pending operations on the /// __tgt_async_info structure. Error synchronize(__tgt_async_info *AsyncInfo); @@ -888,21 +857,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy { bool useAutoZeroCopy(); virtual bool useAutoZeroCopyImpl() { return false; } -private: - /// Register offload entry for global variable. - Error registerGlobalOffloadEntry(DeviceImageTy &DeviceImage, - const __tgt_offload_entry &GlobalEntry, - __tgt_offload_entry &DeviceEntry); - - /// Register offload entry for kernel function. 
- Error registerKernelOffloadEntry(DeviceImageTy &DeviceImage, - const __tgt_offload_entry &KernelEntry, - __tgt_offload_entry &DeviceEntry); - /// Allocate and construct a kernel object. - virtual Expected - constructKernel(const __tgt_offload_entry &KernelEntry) = 0; + virtual Expected constructKernel(const char *Name) = 0; +private: /// Get and set the stack size and heap size for the device. If not used, the /// plugin can implement the setters as no-op and setting the output /// value to zero for the getters. diff --git a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp index 6ae30e78ce8c2..def9c14fa53f8 100644 --- a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp @@ -61,6 +61,14 @@ struct RecordReplayTy { bool UsedVAMap = false; uintptr_t MemoryOffset = 0; + // A list of all globals mapped to the device. + struct GlobalEntry { + const char *Name; + uint64_t Size; + void *Addr; + }; + llvm::SmallVector GlobalEntries{}; + void *suggestAddress(uint64_t MaxMemoryAllocation) { // Get a valid pointer address for this system void *Addr = @@ -189,6 +197,9 @@ struct RecordReplayTy { } void setStatus(RRStatusTy Status) { this->Status = Status; } bool isSaveOutputEnabled() const { return ReplaySaveOutput; } + void addEntry(const char *Name, uint64_t Size, void *Addr) { + GlobalEntries.emplace_back(GlobalEntry{Name, Size, Addr}); + } void saveImage(const char *Name, const DeviceImageTy &Image) { SmallString<128> ImageName = {Name, ".image"}; @@ -211,12 +222,12 @@ struct RecordReplayTy { void dumpGlobals(StringRef Filename, DeviceImageTy &Image) { int32_t Size = 0; - for (auto &OffloadEntry : Image.getOffloadEntryTable()) { - if (!OffloadEntry.size) + for (auto &OffloadEntry : GlobalEntries) { + if (!OffloadEntry.Size) continue; // Get the total size of the string and entry including the null 
byte. - Size += std::strlen(OffloadEntry.name) + 1 + sizeof(uint32_t) + - OffloadEntry.size; + Size += std::strlen(OffloadEntry.Name) + 1 + sizeof(uint32_t) + + OffloadEntry.Size; } ErrorOr> GlobalsMB = @@ -225,26 +236,26 @@ struct RecordReplayTy { report_fatal_error("Error creating MemoryBuffer for globals memory"); void *BufferPtr = GlobalsMB.get()->getBufferStart(); - for (auto &OffloadEntry : Image.getOffloadEntryTable()) { - if (!OffloadEntry.size) + for (auto &OffloadEntry : GlobalEntries) { + if (!OffloadEntry.Size) continue; - int32_t NameLength = std::strlen(OffloadEntry.name) + 1; - memcpy(BufferPtr, OffloadEntry.name, NameLength); + int32_t NameLength = std::strlen(OffloadEntry.Name) + 1; + memcpy(BufferPtr, OffloadEntry.Name, NameLength); BufferPtr = advanceVoidPtr(BufferPtr, NameLength); - *((uint32_t *)(BufferPtr)) = OffloadEntry.size; + *((uint32_t *)(BufferPtr)) = OffloadEntry.Size; BufferPtr = advanceVoidPtr(BufferPtr, sizeof(uint32_t)); auto Err = Plugin::success(); { - if (auto Err = Device->dataRetrieve(BufferPtr, OffloadEntry.addr, - OffloadEntry.size, nullptr)) + if (auto Err = Device->dataRetrieve(BufferPtr, OffloadEntry.Addr, + OffloadEntry.Size, nullptr)) report_fatal_error("Error retrieving data for global"); } if (Err) report_fatal_error("Error retrieving data for global"); - BufferPtr = advanceVoidPtr(BufferPtr, OffloadEntry.size); + BufferPtr = advanceVoidPtr(BufferPtr, OffloadEntry.Size); } assert(BufferPtr == GlobalsMB->get()->getBufferEnd() && "Buffer over/under-filled."); @@ -841,7 +852,7 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) { return deinitImpl(); } -Expected<__tgt_target_table *> +Expected GenericDeviceTy::loadBinary(GenericPluginTy &Plugin, const __tgt_device_image *InputTgtImage) { assert(InputTgtImage && "Expected non-null target image"); @@ -885,10 +896,6 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin, return std::move(Err); } - // Register all offload entries of the image. 
- if (auto Err = registerOffloadEntries(*Image)) - return std::move(Err); - if (auto Err = setupRPCServer(Plugin, *Image)) return std::move(Err); @@ -909,7 +916,7 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin, return std::move(Err); // Return the pointer to the table of entries. - return Image->getOffloadEntryTable(); + return Image; } Error GenericDeviceTy::setupDeviceEnvironment(GenericPluginTy &Plugin, @@ -1018,99 +1025,6 @@ Error GenericDeviceTy::setupRPCServer(GenericPluginTy &Plugin, return Plugin::success(); } -Error GenericDeviceTy::registerOffloadEntries(DeviceImageTy &Image) { - const __tgt_offload_entry *Begin = Image.getTgtImage()->EntriesBegin; - const __tgt_offload_entry *End = Image.getTgtImage()->EntriesEnd; - for (const __tgt_offload_entry *Entry = Begin; Entry != End; ++Entry) { - // The host should have always something in the address to uniquely - // identify the entry. - if (!Entry->addr) - return Plugin::error("Failure to register entry without address"); - - __tgt_offload_entry DeviceEntry = {0}; - - if (Entry->size) { - if (auto Err = registerGlobalOffloadEntry(Image, *Entry, DeviceEntry)) - return Err; - } else { - if (auto Err = registerKernelOffloadEntry(Image, *Entry, DeviceEntry)) - return Err; - } - - assert(DeviceEntry.addr && "Device addr of offload entry cannot be null"); - - DP("Entry point " DPxMOD " maps to%s %s (" DPxMOD ")\n", - DPxPTR(Entry - Begin), (Entry->size) ? " global" : "", Entry->name, - DPxPTR(DeviceEntry.addr)); - } - return Plugin::success(); -} - -Error GenericDeviceTy::registerGlobalOffloadEntry( - DeviceImageTy &Image, const __tgt_offload_entry &GlobalEntry, - __tgt_offload_entry &DeviceEntry) { - - GenericPluginTy &Plugin = Plugin::get(); - - DeviceEntry = GlobalEntry; - - // Create a metadata object for the device global. - GlobalTy DeviceGlobal(GlobalEntry.name, GlobalEntry.size); - - // Get the address of the device of the global. 
- GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler(); - if (auto Err = - GHandler.getGlobalMetadataFromDevice(*this, Image, DeviceGlobal)) - return Err; - - // Store the device address on the device entry. - DeviceEntry.addr = DeviceGlobal.getPtr(); - assert(DeviceEntry.addr && "Invalid device global's address"); - - // Note: In the current implementation declare target variables - // can either be link or to. This means that once unified - // memory is activated via the requires directive, the variable - // can be used directly from the host in both cases. - if (Plugin.getRequiresFlags() & OMP_REQ_UNIFIED_SHARED_MEMORY) { - // If unified memory is present any target link or to variables - // can access host addresses directly. There is no longer a - // need for device copies. - GlobalTy HostGlobal(GlobalEntry); - if (auto Err = - GHandler.writeGlobalToDevice(*this, HostGlobal, DeviceGlobal)) - return Err; - } - - // Add the device entry on the entry table. - Image.getOffloadEntryTable().addEntry(DeviceEntry); - - return Plugin::success(); -} - -Error GenericDeviceTy::registerKernelOffloadEntry( - DeviceImageTy &Image, const __tgt_offload_entry &KernelEntry, - __tgt_offload_entry &DeviceEntry) { - DeviceEntry = KernelEntry; - - // Create a kernel object. - auto KernelOrErr = constructKernel(KernelEntry); - if (!KernelOrErr) - return KernelOrErr.takeError(); - - GenericKernelTy &Kernel = *KernelOrErr; - - // Initialize the kernel. - if (auto Err = Kernel.init(*this, Image)) - return Err; - - // Set the device entry address to the kernel address and store the entry on - // the entry table. - DeviceEntry.addr = (void *)&Kernel; - Image.getOffloadEntryTable().addEntry(DeviceEntry); - - return Plugin::success(); -} - Error PinnedAllocationMapTy::insertEntry(void *HstPtr, void *DevAccessiblePtr, size_t Size, bool ExternallyLocked) { // Insert the new entry into the map. 
@@ -1757,23 +1671,25 @@ int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId, int64_t MemorySize, return OFFLOAD_SUCCESS; } -__tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId, - __tgt_device_image *TgtImage) { +int32_t __tgt_rtl_load_binary(int32_t DeviceId, __tgt_device_image *TgtImage, + __tgt_device_binary *Binary) { GenericPluginTy &Plugin = Plugin::get(); GenericDeviceTy &Device = Plugin.getDevice(DeviceId); - auto TableOrErr = Device.loadBinary(Plugin, TgtImage); - if (!TableOrErr) { - auto Err = TableOrErr.takeError(); + auto ImageOrErr = Device.loadBinary(Plugin, TgtImage); + if (!ImageOrErr) { + auto Err = ImageOrErr.takeError(); REPORT("Failure to load binary image %p on device %d: %s\n", TgtImage, DeviceId, toString(std::move(Err)).data()); - return nullptr; + return OFFLOAD_FAIL; } - __tgt_target_table *Table = *TableOrErr; - assert(Table != nullptr && "Invalid table"); + DeviceImageTy *Image = *ImageOrErr; + assert(Image != nullptr && "Invalid Image"); + + *Binary = __tgt_device_binary{reinterpret_cast(Image)}; - return Table; + return OFFLOAD_SUCCESS; } void *__tgt_rtl_data_alloc(int32_t DeviceId, int64_t Size, void *HostPtr, @@ -2077,6 +1993,58 @@ int32_t __tgt_rtl_use_auto_zero_copy(int32_t DeviceId) { return false; return Plugin::get().getDevice(DeviceId).useAutoZeroCopy(); } + +int32_t __tgt_rtl_get_global(__tgt_device_binary Binary, uint64_t Size, + const char *Name, void **DevicePtr) { + assert(Binary.handle && "Invalid device binary handle"); + DeviceImageTy &Image = *reinterpret_cast(Binary.handle); + + GenericPluginTy &Plugin = Plugin::get(); + GenericDeviceTy &Device = Image.getDevice(); + + GlobalTy DeviceGlobal(Name, Size); + GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler(); + if (auto Err = + GHandler.getGlobalMetadataFromDevice(Device, Image, DeviceGlobal)) { + REPORT("Failure to look up global address: %s\n", + toString(std::move(Err)).data()); + return OFFLOAD_FAIL; + } + + *DevicePtr = 
DeviceGlobal.getPtr(); + assert(*DevicePtr && "Invalid device global's address"); + + // Save the loaded globals if we are recording. + if (RecordReplay.isRecording()) + RecordReplay.addEntry(Name, Size, *DevicePtr); + + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_get_function(__tgt_device_binary Binary, const char *Name, + void **KernelPtr) { + assert(Binary.handle && "Invalid device binary handle"); + DeviceImageTy &Image = *reinterpret_cast(Binary.handle); + + GenericDeviceTy &Device = Image.getDevice(); + + auto KernelOrErr = Device.constructKernel(Name); + if (Error Err = KernelOrErr.takeError()) { + REPORT("Failure to look up kernel: %s\n", toString(std::move(Err)).data()); + return OFFLOAD_FAIL; + } + + GenericKernelTy &Kernel = *KernelOrErr; + if (auto Err = Kernel.init(Device, Image)) { + REPORT("Failure to init kernel: %s\n", toString(std::move(Err)).data()); + return OFFLOAD_FAIL; + } + + // Note that this is not the kernel's device address. + *KernelPtr = &Kernel; + return OFFLOAD_SUCCESS; +} + #ifdef __cplusplus } #endif diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp index ce6b39898ae95..5ed73d103584d 100644 --- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp @@ -77,8 +77,9 @@ CUresult cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream) {} /// Class implementing the CUDA device images properties. struct CUDADeviceImageTy : public DeviceImageTy { /// Create the CUDA image with the id and the target image pointer. - CUDADeviceImageTy(int32_t ImageId, const __tgt_device_image *TgtImage) - : DeviceImageTy(ImageId, TgtImage), Module(nullptr) {} + CUDADeviceImageTy(int32_t ImageId, GenericDeviceTy &Device, + const __tgt_device_image *TgtImage) + : DeviceImageTy(ImageId, Device, TgtImage), Module(nullptr) {} /// Load the image as a CUDA module. 
Error loadModule() { @@ -468,14 +469,13 @@ struct CUDADeviceTy : public GenericDeviceTy { } /// Allocate and construct a CUDA kernel. - Expected - constructKernel(const __tgt_offload_entry &KernelEntry) override { + Expected constructKernel(const char *Name) override { // Allocate and construct the CUDA kernel. CUDAKernelTy *CUDAKernel = Plugin::get().allocate(); if (!CUDAKernel) return Plugin::error("Failed to allocate memory for CUDA kernel"); - new (CUDAKernel) CUDAKernelTy(KernelEntry.name); + new (CUDAKernel) CUDAKernelTy(Name); return *CUDAKernel; } @@ -530,7 +530,7 @@ struct CUDADeviceTy : public GenericDeviceTy { // Allocate and initialize the image object. CUDADeviceImageTy *CUDAImage = Plugin::get().allocate(); - new (CUDAImage) CUDADeviceImageTy(ImageId, TgtImage); + new (CUDAImage) CUDADeviceImageTy(ImageId, *this, TgtImage); // Load the CUDA module. if (auto Err = CUDAImage->loadModule()) diff --git a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp index 6466afc543b56..38fc275804faf 100644 --- a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp @@ -111,8 +111,9 @@ struct GenELF64KernelTy : public GenericKernelTy { /// Class implementing the GenELF64 device images properties. struct GenELF64DeviceImageTy : public DeviceImageTy { /// Create the GenELF64 image with the id and the target image pointer. - GenELF64DeviceImageTy(int32_t ImageId, const __tgt_device_image *TgtImage) - : DeviceImageTy(ImageId, TgtImage), DynLib() {} + GenELF64DeviceImageTy(int32_t ImageId, GenericDeviceTy &Device, + const __tgt_device_image *TgtImage) + : DeviceImageTy(ImageId, Device, TgtImage), DynLib() {} /// Getter and setter for the dynamic library. 
DynamicLibrary &getDynamicLibrary() { return DynLib; } @@ -141,15 +142,14 @@ struct GenELF64DeviceTy : public GenericDeviceTy { std::string getComputeUnitKind() const override { return "generic-64bit"; } /// Construct the kernel for a specific image on the device. - Expected - constructKernel(const __tgt_offload_entry &KernelEntry) override { + Expected constructKernel(const char *Name) override { // Allocate and construct the kernel. GenELF64KernelTy *GenELF64Kernel = Plugin::get().allocate(); if (!GenELF64Kernel) return Plugin::error("Failed to allocate memory for GenELF64 kernel"); - new (GenELF64Kernel) GenELF64KernelTy(KernelEntry.name); + new (GenELF64Kernel) GenELF64KernelTy(Name); return *GenELF64Kernel; } @@ -163,7 +163,7 @@ struct GenELF64DeviceTy : public GenericDeviceTy { // Allocate and initialize the image object. GenELF64DeviceImageTy *Image = Plugin::get().allocate(); - new (Image) GenELF64DeviceImageTy(ImageId, TgtImage); + new (Image) GenELF64DeviceImageTy(ImageId, *this, TgtImage); // Create a temporary file. 
char TmpFileName[] = "/tmp/tmpfile_XXXXXX"; diff --git a/openmp/libomptarget/src/PluginManager.cpp b/openmp/libomptarget/src/PluginManager.cpp index 50059ba23b1a7..f65ffc47d89a1 100644 --- a/openmp/libomptarget/src/PluginManager.cpp +++ b/openmp/libomptarget/src/PluginManager.cpp @@ -192,7 +192,9 @@ static void registerImageIntoTranslationTable(TranslationTable &TT, RTL.DeviceOffset + RTL.getNumberOfUserDevices(); if (TT.TargetsTable.size() < TargetsTableMinimumSize) { + TT.DeviceTables.resize(TargetsTableMinimumSize, {}); TT.TargetsImages.resize(TargetsTableMinimumSize, 0); + TT.TargetsEntries.resize(TargetsTableMinimumSize, {}); TT.TargetsTable.resize(TargetsTableMinimumSize, 0); } diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp index 404d7b6174e4a..9bdc6b7cd8c9a 100644 --- a/openmp/libomptarget/src/device.cpp +++ b/openmp/libomptarget/src/device.cpp @@ -107,8 +107,14 @@ llvm::Error DeviceTy::init() { } // Load binary to device. -__tgt_target_table *DeviceTy::loadBinary(__tgt_device_image *Img) { - return RTL->load_binary(RTLDeviceID, Img); +llvm::Expected<__tgt_device_binary> +DeviceTy::loadBinary(__tgt_device_image *Img) { + __tgt_device_binary Binary; + + if (RTL->load_binary(RTLDeviceID, Img, &Binary) != OFFLOAD_SUCCESS) + return llvm::createStringError(llvm::inconvertibleErrorCode(), + "Failed to load binary %p", Img); + return Binary; } void *DeviceTy::allocData(int64_t Size, void *HstPtr, int32_t Kind) { diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp index eb2ecfc2bc56b..04490ab076b65 100644 --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -168,19 +168,57 @@ static int initLibrary(DeviceTy &Device) { Rc = OFFLOAD_FAIL; break; } - // 2) load image into the target table. - __tgt_target_table *TargetTable = TransTable->TargetsTable[DeviceId] = - Device.loadBinary(Img); - // Unable to get table for this image: invalidate image and fail. 
- if (!TargetTable) { - REPORT("Unable to generate entries table for device id %d.\n", - DeviceId); - TransTable->TargetsImages[DeviceId] = 0; + + // 2) Load the image onto the given device. + auto BinaryOrErr = Device.loadBinary(Img); + if (llvm::Error Err = BinaryOrErr.takeError()) { + REPORT("Failed to load image %s\n", + llvm::toString(std::move(Err)).c_str()); Rc = OFFLOAD_FAIL; break; } - // Verify whether the two table sizes match. + // 3) Create the translation table. + llvm::SmallVector<__tgt_offload_entry> &DeviceEntries = + TransTable->TargetsEntries[DeviceId]; + for (__tgt_offload_entry &Entry : + llvm::make_range(Img->EntriesBegin, Img->EntriesEnd)) { + __tgt_device_binary &Binary = *BinaryOrErr; + + __tgt_offload_entry DeviceEntry = Entry; + if (Entry.size) { + if (Device.RTL->get_global(Binary, Entry.size, Entry.name, + &DeviceEntry.addr) != OFFLOAD_SUCCESS) + REPORT("Failed to load symbol %s\n", Entry.name); + + // If unified memory is active, the corresponding global is a device + // reference to the host global. We need to initialize the pointer on + // the device to point to the memory on the host. + if (PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY) { + if (Device.RTL->data_submit(DeviceId, DeviceEntry.addr, Entry.addr, + Entry.size) != OFFLOAD_SUCCESS) + REPORT("Failed to write symbol for USM %s\n", Entry.name); + } + } else { + if (Device.RTL->get_function(Binary, Entry.name, &DeviceEntry.addr) != + OFFLOAD_SUCCESS) + REPORT("Failed to load kernel %s\n", Entry.name); + } + DP("Entry point " DPxMOD " maps to%s %s (" DPxMOD ")\n", + DPxPTR(Entry.addr), (Entry.size) ? " global" : "", Entry.name, + DPxPTR(DeviceEntry.addr)); + + DeviceEntries.emplace_back(DeviceEntry); + } + + // Set the storage for the table and get a pointer to it. 
+ __tgt_target_table DeviceTable{DeviceEntries.data(), + DeviceEntries.data() + DeviceEntries.size()}; + TransTable->DeviceTables[DeviceId] = DeviceTable; + __tgt_target_table *TargetTable = TransTable->TargetsTable[DeviceId] = + &TransTable->DeviceTables[DeviceId]; + + // 4) Verify whether the two table sizes match. size_t Hsize = TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin; size_t Tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin;