diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp index a529c379844e9..c66279fc0bcc5 100644 --- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp @@ -2653,7 +2653,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { // Allocate and construct the AMDGPU kernel. AMDGPUKernelTy AMDGPUKernel(Name); if (auto Err = AMDGPUKernel.init(*this, Image)) - return std::move(Err); + return Err; AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr); @@ -2661,12 +2661,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (auto Err = AMDGPUKernel.launchImpl(*this, /*NumThread=*/1u, /*NumBlocks=*/1ul, KernelArgs, /*Args=*/nullptr, AsyncInfoWrapper)) - return std::move(Err); + return Err; Error Err = Plugin::success(); AsyncInfoWrapper.finalize(Err); - return std::move(Err); + return Err; } /// Envar for controlling the number of HSA queues per device. 
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp index 3f798a908e736..08946e2103501 100644 --- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp @@ -231,9 +231,9 @@ struct RecordReplayTy { OS.close(); } - void saveKernelDescr(const char *Name, void **ArgPtrs, ptrdiff_t *ArgOffsets, - int32_t NumArgs, uint64_t NumTeamsClause, - uint32_t ThreadLimitClause, uint64_t LoopTripCount) { + void saveKernelDescr(const char *Name, void **ArgPtrs, int32_t NumArgs, + uint64_t NumTeamsClause, uint32_t ThreadLimitClause, + uint64_t LoopTripCount) { json::Object JsonKernelInfo; JsonKernelInfo["Name"] = Name; JsonKernelInfo["NumArgs"] = NumArgs; @@ -251,7 +251,7 @@ struct RecordReplayTy { json::Array JsonArgOffsets; for (int I = 0; I < NumArgs; ++I) - JsonArgOffsets.push_back(ArgOffsets[I]); + JsonArgOffsets.push_back(0); JsonKernelInfo["ArgOffsets"] = json::Value(std::move(JsonArgOffsets)); SmallString<128> JsonFilename = {Name, ".json"}; @@ -427,6 +427,11 @@ Expected GenericKernelTy::getKernelLaunchEnvironment( GenericDeviceTy &GenericDevice, AsyncInfoWrapperTy &AsyncInfoWrapper) const { + // Ctor/Dtor have no arguments, replaying uses the original kernel launch + // environment. + if (isCtorOrDtor() || RecordReplay.isReplaying()) + return nullptr; + // TODO: Check if the kernel needs a launch environment. auto AllocOrErr = GenericDevice.dataAlloc(sizeof(KernelLaunchEnvironmentTy), /*HostPtr=*/nullptr, @@ -501,6 +506,15 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs, getNumBlocks(GenericDevice, KernelArgs.NumTeams, KernelArgs.Tripcount, NumThreads, KernelArgs.ThreadLimit[0] > 0); + // Record the kernel description after we modified the argument count and num + // blocks/threads. 
+ if (RecordReplay.isRecording()) { + RecordReplay.saveImage(getName(), getImage()); + RecordReplay.saveKernelInput(getName(), getImage()); + RecordReplay.saveKernelDescr(getName(), Ptrs.data(), KernelArgs.NumArgs, + NumBlocks, NumThreads, KernelArgs.Tripcount); + } + if (auto Err = printLaunchInfo(GenericDevice, KernelArgs, NumThreads, NumBlocks)) return Err; @@ -517,16 +531,20 @@ void *GenericKernelTy::prepareArgs( if (isCtorOrDtor()) return nullptr; - NumArgs += 1; + uint32_t KLEOffset = !!KernelLaunchEnvironment; + NumArgs += KLEOffset; Args.resize(NumArgs); Ptrs.resize(NumArgs); - Ptrs[0] = KernelLaunchEnvironment; - Args[0] = &Ptrs[0]; + if (KernelLaunchEnvironment) { + Ptrs[0] = KernelLaunchEnvironment; + Args[0] = &Ptrs[0]; + } - for (int I = 1; I < NumArgs; ++I) { - Ptrs[I] = (void *)((intptr_t)ArgPtrs[I - 1] + ArgOffsets[I - 1]); + for (int I = KLEOffset; I < NumArgs; ++I) { + Ptrs[I] = + (void *)((intptr_t)ArgPtrs[I - KLEOffset] + ArgOffsets[I - KLEOffset]); Args[I] = &Ptrs[I]; } return &Args[0]; @@ -719,7 +737,7 @@ Error GenericDeviceTy::init(GenericPluginTy &Plugin) { Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) { for (DeviceImageTy *Image : LoadedImages) if (auto Err = callGlobalDestructors(Plugin, *Image)) - return std::move(Err); + return Err; if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) { GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler(); @@ -808,7 +826,7 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin, return std::move(Err); // Setup the global device memory pool if needed. 
- if (shouldSetupDeviceMemoryPool()) { + if (!RecordReplay.isReplaying() && shouldSetupDeviceMemoryPool()) { uint64_t HeapSize; auto SizeOrErr = getDeviceHeapSize(HeapSize); if (SizeOrErr) { @@ -1413,21 +1431,9 @@ Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs, GenericKernelTy &GenericKernel = *reinterpret_cast(EntryPtr); - if (RecordReplay.isRecording()) { - RecordReplay.saveImage(GenericKernel.getName(), GenericKernel.getImage()); - RecordReplay.saveKernelInput(GenericKernel.getName(), - GenericKernel.getImage()); - } - auto Err = GenericKernel.launch(*this, ArgPtrs, ArgOffsets, KernelArgs, AsyncInfoWrapper); - if (RecordReplay.isRecording()) - RecordReplay.saveKernelDescr(GenericKernel.getName(), ArgPtrs, ArgOffsets, - KernelArgs.NumArgs, KernelArgs.NumTeams[0], - KernelArgs.ThreadLimit[0], - KernelArgs.Tripcount); - // 'finalize' here to guarantee next record-replay actions are in-sync AsyncInfoWrapper.finalize(Err); diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp index a6e28574a7f08..d1473c94af8a6 100644 --- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp @@ -1106,30 +1106,30 @@ struct CUDADeviceTy : public GenericDeviceTy { for (auto [Name, Priority] : Funcs) { GlobalTy FunctionAddr(Name.str(), sizeof(void *), &FunctionPtrs[Idx++]); if (auto Err = Handler.readGlobalFromDevice(*this, Image, FunctionAddr)) - return std::move(Err); + return Err; } // Copy the local buffer to the device. if (auto Err = dataSubmit(GlobalPtrStart, FunctionPtrs.data(), FunctionPtrs.size() * sizeof(void *), nullptr)) - return std::move(Err); + return Err; // Copy the created buffer to the appropriate symbols so the kernel can // iterate through them. GlobalTy StartGlobal(IsCtor ? 
"__init_array_start" : "__fini_array_start", sizeof(void *), &GlobalPtrStart); if (auto Err = Handler.writeGlobalToDevice(*this, Image, StartGlobal)) - return std::move(Err); + return Err; GlobalTy StopGlobal(IsCtor ? "__init_array_end" : "__fini_array_end", sizeof(void *), &GlobalPtrStop); if (auto Err = Handler.writeGlobalToDevice(*this, Image, StopGlobal)) - return std::move(Err); + return Err; CUDAKernelTy CUDAKernel(KernelName); if (auto Err = CUDAKernel.init(*this, Image)) - return std::move(Err); + return Err; AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr); @@ -1137,7 +1137,7 @@ struct CUDADeviceTy : public GenericDeviceTy { if (auto Err = CUDAKernel.launchImpl(*this, /*NumThread=*/1u, /*NumBlocks=*/1ul, KernelArgs, nullptr, AsyncInfoWrapper)) - return std::move(Err); + return Err; Error Err = Plugin::success(); AsyncInfoWrapper.finalize(Err); @@ -1145,7 +1145,7 @@ struct CUDADeviceTy : public GenericDeviceTy { if (free(Buffer, TARGET_ALLOC_DEVICE) != OFFLOAD_SUCCESS) return Plugin::error("Failed to free memory for global buffer"); - return std::move(Err); + return Err; } /// Stream manager for CUDA streams.