diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp index fb51f96c07b74..7ccd0b4fef284 100644 --- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp @@ -28,6 +28,9 @@ #include "llvm/Frontend/OpenMP/OMPConstants.h" #include "llvm/Frontend/OpenMP/OMPGridValues.h" #include "llvm/Support/Error.h" +#include "llvm/Support/FileOutputBuffer.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/Program.h" namespace llvm { namespace omp { @@ -397,6 +400,65 @@ struct CUDADeviceTy : public GenericDeviceTy { return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false); } + Expected> + doJITPostProcessing(std::unique_ptr MB) const override { + // TODO: We should be able to use the 'nvidia-ptxjitcompiler' interface to + // avoid the call to 'ptxas'. + SmallString<128> PTXInputFilePath; + std::error_code EC = sys::fs::createTemporaryFile("nvptx-pre-link-jit", "s", + PTXInputFilePath); + if (EC) + return Plugin::error("Failed to create temporary file for ptxas"); + + // Write the file's contents to the output file. + Expected> OutputOrErr = + FileOutputBuffer::create(PTXInputFilePath, MB->getBuffer().size()); + if (!OutputOrErr) + return OutputOrErr.takeError(); + std::unique_ptr Output = std::move(*OutputOrErr); + llvm::copy(MB->getBuffer(), Output->getBufferStart()); + if (Error E = Output->commit()) + return std::move(E); + + SmallString<128> PTXOutputFilePath; + EC = sys::fs::createTemporaryFile("nvptx-post-link-jit", "cubin", + PTXOutputFilePath); + if (EC) + return Plugin::error("Failed to create temporary file for ptxas"); + + // Try to find `ptxas` in the path to compile the PTX to a binary. + const auto ErrorOrPath = sys::findProgramByName("ptxas"); + if (!ErrorOrPath) + return Plugin::error("Failed to find 'ptxas' on the PATH."); + + std::string Arch = getComputeUnitKind(); + StringRef Args[] = {*ErrorOrPath, + "-m64", + "-O2", + "--gpu-name", + Arch, + "--output-file", + PTXOutputFilePath, + PTXInputFilePath}; + + std::string ErrMsg; + if (sys::ExecuteAndWait(*ErrorOrPath, Args, std::nullopt, {}, 0, 0, + &ErrMsg)) + return Plugin::error("Running 'ptxas' failed: %s\n", ErrMsg.c_str()); + + auto BufferOrErr = MemoryBuffer::getFileOrSTDIN(PTXOutputFilePath.data()); + if (!BufferOrErr) + return Plugin::error("Failed to open temporary file for ptxas"); + + // Clean up the temporary files afterwards. + if (sys::fs::remove(PTXOutputFilePath)) + return Plugin::error("Failed to remove temporary file for ptxas"); + if (sys::fs::remove(PTXInputFilePath)) + return Plugin::error("Failed to remove temporary file for ptxas"); + + return std::move(*BufferOrErr); + } + /// Allocate and construct a CUDA kernel. Expected constructKernel(const __tgt_offload_entry &KernelEntry) override {