diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
index fb51f96c07b74..7ccd0b4fef284 100644
--- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
@@ -28,6 +28,9 @@
 #include "llvm/Frontend/OpenMP/OMPConstants.h"
 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
 #include "llvm/Support/Error.h"
+#include "llvm/Support/FileOutputBuffer.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Program.h"
 
 namespace llvm {
 namespace omp {
@@ -397,6 +400,65 @@ struct CUDADeviceTy : public GenericDeviceTy {
     return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false);
   }
 
+  /// Compile the PTX in \p MB to a binary by running 'ptxas' on temporary
+  /// files, which are cleaned up (best effort) on every exit path.
+  /// \returns the 'ptxas' output, or an error if any step failed.
+  Expected<std::unique_ptr<MemoryBuffer>>
+  doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const override {
+    // TODO: We should be able to use the 'nvidia-ptxjitcompiler' interface to
+    //       avoid the call to 'ptxas'.
+    // Best-effort cleanup so early error returns do not leak temporary files.
+    struct RemoveOnExit {
+      StringRef Path;
+      ~RemoveOnExit() { (void)sys::fs::remove(Path); }
+    };
+
+    SmallString<128> PTXInputFilePath;
+    if (sys::fs::createTemporaryFile("nvptx-pre-link-jit", "s",
+                                     PTXInputFilePath))
+      return Plugin::error("Failed to create temporary file for ptxas");
+    RemoveOnExit InputCleanup{PTXInputFilePath};
+
+    // Write the PTX held by the memory buffer to the input file.
+    Expected<std::unique_ptr<FileOutputBuffer>> OutputOrErr =
+        FileOutputBuffer::create(PTXInputFilePath, MB->getBuffer().size());
+    if (!OutputOrErr)
+      return OutputOrErr.takeError();
+    llvm::copy(MB->getBuffer(), (*OutputOrErr)->getBufferStart());
+    if (Error E = (*OutputOrErr)->commit())
+      return std::move(E);
+
+    SmallString<128> PTXOutputFilePath;
+    if (sys::fs::createTemporaryFile("nvptx-post-link-jit", "cubin",
+                                     PTXOutputFilePath))
+      return Plugin::error("Failed to create temporary file for ptxas");
+    RemoveOnExit OutputCleanup{PTXOutputFilePath};
+
+    // Try to find `ptxas` in the path to compile the PTX to a binary.
+    const auto ErrorOrPath = sys::findProgramByName("ptxas");
+    if (!ErrorOrPath)
+      return Plugin::error("Failed to find 'ptxas' on the PATH.");
+
+    std::string Arch = getComputeUnitKind();
+    StringRef Args[] = {*ErrorOrPath, "-m64", "-O2", "--gpu-name", Arch,
+                        "--output-file", PTXOutputFilePath, PTXInputFilePath};
+    std::string ErrMsg;
+    if (sys::ExecuteAndWait(*ErrorOrPath, Args, std::nullopt, {}, 0, 0,
+                            &ErrMsg))
+      return Plugin::error("Running 'ptxas' failed: %s\n", ErrMsg.c_str());
+
+    // Pass the path itself; its data() is not guaranteed NUL-terminated.
+    auto BufferOrErr = MemoryBuffer::getFileOrSTDIN(PTXOutputFilePath);
+    if (!BufferOrErr)
+      return Plugin::error("Failed to open temporary file for ptxas");
+
+    // On success a failed removal is still reported to the caller.
+    if (sys::fs::remove(PTXOutputFilePath) || sys::fs::remove(PTXInputFilePath))
+      return Plugin::error("Failed to remove temporary file for ptxas");
+
+    return std::move(*BufferOrErr);
+  }
+
   /// Allocate and construct a CUDA kernel.
   Expected<GenericKernelTy &>
   constructKernel(const __tgt_offload_entry &KernelEntry) override {