diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp index a529c379844e9..c66279fc0bcc5 100644 --- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp @@ -2653,7 +2653,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { // Allocate and construct the AMDGPU kernel. AMDGPUKernelTy AMDGPUKernel(Name); if (auto Err = AMDGPUKernel.init(*this, Image)) - return std::move(Err); + return Err; AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr); @@ -2661,12 +2661,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (auto Err = AMDGPUKernel.launchImpl(*this, /*NumThread=*/1u, /*NumBlocks=*/1ul, KernelArgs, /*Args=*/nullptr, AsyncInfoWrapper)) - return std::move(Err); + return Err; Error Err = Plugin::success(); AsyncInfoWrapper.finalize(Err); - return std::move(Err); + return Err; } /// Envar for controlling the number of HSA queues per device. 
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp index 3f798a908e736..08946e2103501 100644 --- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp @@ -231,9 +231,9 @@ struct RecordReplayTy { OS.close(); } - void saveKernelDescr(const char *Name, void **ArgPtrs, ptrdiff_t *ArgOffsets, - int32_t NumArgs, uint64_t NumTeamsClause, - uint32_t ThreadLimitClause, uint64_t LoopTripCount) { + void saveKernelDescr(const char *Name, void **ArgPtrs, int32_t NumArgs, + uint64_t NumTeamsClause, uint32_t ThreadLimitClause, + uint64_t LoopTripCount) { json::Object JsonKernelInfo; JsonKernelInfo["Name"] = Name; JsonKernelInfo["NumArgs"] = NumArgs; @@ -251,7 +251,7 @@ struct RecordReplayTy { json::Array JsonArgOffsets; for (int I = 0; I < NumArgs; ++I) - JsonArgOffsets.push_back(ArgOffsets[I]); + JsonArgOffsets.push_back(0); JsonKernelInfo["ArgOffsets"] = json::Value(std::move(JsonArgOffsets)); SmallString<128> JsonFilename = {Name, ".json"}; @@ -427,6 +427,11 @@ Expected GenericKernelTy::getKernelLaunchEnvironment( GenericDeviceTy &GenericDevice, AsyncInfoWrapperTy &AsyncInfoWrapper) const { + // Ctor/Dtor have no arguments, replaying uses the original kernel launch + // environment. + if (isCtorOrDtor() || RecordReplay.isReplaying()) + return nullptr; + // TODO: Check if the kernel needs a launch environment. auto AllocOrErr = GenericDevice.dataAlloc(sizeof(KernelLaunchEnvironmentTy), /*HostPtr=*/nullptr, @@ -501,6 +506,15 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs, getNumBlocks(GenericDevice, KernelArgs.NumTeams, KernelArgs.Tripcount, NumThreads, KernelArgs.ThreadLimit[0] > 0); + // Record the kernel description after we modified the argument count and num + // blocks/threads. 
+ if (RecordReplay.isRecording()) { + RecordReplay.saveImage(getName(), getImage()); + RecordReplay.saveKernelInput(getName(), getImage()); + RecordReplay.saveKernelDescr(getName(), Ptrs.data(), KernelArgs.NumArgs, + NumBlocks, NumThreads, KernelArgs.Tripcount); + } + if (auto Err = printLaunchInfo(GenericDevice, KernelArgs, NumThreads, NumBlocks)) return Err; @@ -517,16 +531,20 @@ void *GenericKernelTy::prepareArgs( if (isCtorOrDtor()) return nullptr; - NumArgs += 1; + uint32_t KLEOffset = !!KernelLaunchEnvironment; + NumArgs += KLEOffset; Args.resize(NumArgs); Ptrs.resize(NumArgs); - Ptrs[0] = KernelLaunchEnvironment; - Args[0] = &Ptrs[0]; + if (KernelLaunchEnvironment) { + Ptrs[0] = KernelLaunchEnvironment; + Args[0] = &Ptrs[0]; + } - for (int I = 1; I < NumArgs; ++I) { - Ptrs[I] = (void *)((intptr_t)ArgPtrs[I - 1] + ArgOffsets[I - 1]); + for (int I = KLEOffset; I < NumArgs; ++I) { + Ptrs[I] = + (void *)((intptr_t)ArgPtrs[I - KLEOffset] + ArgOffsets[I - KLEOffset]); Args[I] = &Ptrs[I]; } return &Args[0]; @@ -719,7 +737,7 @@ Error GenericDeviceTy::init(GenericPluginTy &Plugin) { Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) { for (DeviceImageTy *Image : LoadedImages) if (auto Err = callGlobalDestructors(Plugin, *Image)) - return std::move(Err); + return Err; if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) { GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler(); @@ -808,7 +826,7 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin, return std::move(Err); // Setup the global device memory pool if needed. 
- if (shouldSetupDeviceMemoryPool()) { + if (!RecordReplay.isReplaying() && shouldSetupDeviceMemoryPool()) { uint64_t HeapSize; auto SizeOrErr = getDeviceHeapSize(HeapSize); if (SizeOrErr) { @@ -1413,21 +1431,9 @@ Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs, GenericKernelTy &GenericKernel = *reinterpret_cast(EntryPtr); - if (RecordReplay.isRecording()) { - RecordReplay.saveImage(GenericKernel.getName(), GenericKernel.getImage()); - RecordReplay.saveKernelInput(GenericKernel.getName(), - GenericKernel.getImage()); - } - auto Err = GenericKernel.launch(*this, ArgPtrs, ArgOffsets, KernelArgs, AsyncInfoWrapper); - if (RecordReplay.isRecording()) - RecordReplay.saveKernelDescr(GenericKernel.getName(), ArgPtrs, ArgOffsets, - KernelArgs.NumArgs, KernelArgs.NumTeams[0], - KernelArgs.ThreadLimit[0], - KernelArgs.Tripcount); - // 'finalize' here to guarantee next record-replay actions are in-sync AsyncInfoWrapper.finalize(Err); diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp index a6e28574a7f08..d1473c94af8a6 100644 --- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp @@ -1106,30 +1106,30 @@ struct CUDADeviceTy : public GenericDeviceTy { for (auto [Name, Priority] : Funcs) { GlobalTy FunctionAddr(Name.str(), sizeof(void *), &FunctionPtrs[Idx++]); if (auto Err = Handler.readGlobalFromDevice(*this, Image, FunctionAddr)) - return std::move(Err); + return Err; } // Copy the local buffer to the device. if (auto Err = dataSubmit(GlobalPtrStart, FunctionPtrs.data(), FunctionPtrs.size() * sizeof(void *), nullptr)) - return std::move(Err); + return Err; // Copy the created buffer to the appropriate symbols so the kernel can // iterate through them. GlobalTy StartGlobal(IsCtor ? 
"__init_array_start" : "__fini_array_start", sizeof(void *), &GlobalPtrStart); if (auto Err = Handler.writeGlobalToDevice(*this, Image, StartGlobal)) - return std::move(Err); + return Err; GlobalTy StopGlobal(IsCtor ? "__init_array_end" : "__fini_array_end", sizeof(void *), &GlobalPtrStop); if (auto Err = Handler.writeGlobalToDevice(*this, Image, StopGlobal)) - return std::move(Err); + return Err; CUDAKernelTy CUDAKernel(KernelName); if (auto Err = CUDAKernel.init(*this, Image)) - return std::move(Err); + return Err; AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr); @@ -1137,7 +1137,7 @@ struct CUDADeviceTy : public GenericDeviceTy { if (auto Err = CUDAKernel.launchImpl(*this, /*NumThread=*/1u, /*NumBlocks=*/1ul, KernelArgs, nullptr, AsyncInfoWrapper)) - return std::move(Err); + return Err; Error Err = Plugin::success(); AsyncInfoWrapper.finalize(Err); @@ -1145,7 +1145,7 @@ struct CUDADeviceTy : public GenericDeviceTy { if (free(Buffer, TARGET_ALLOC_DEVICE) != OFFLOAD_SUCCESS) return Plugin::error("Failed to free memory for global buffer"); - return std::move(Err); + return Err; } /// Stream manager for CUDA streams.