Skip to content

Commit

Permalink
[CUDA] Include PTX in non-RDC mode using the new driver (#84367)
Browse files Browse the repository at this point in the history
Summary:
The old driver embed PTX in rdc-mode and so does the `nvcc` compiler.
The new drivers currently does not do this, so we should keep it
consistent in this case. This simply requires adding the assembler
output as an input to the offloading action that gets fed to fatbin.
  • Loading branch information
jhuber6 committed Mar 7, 2024
1 parent 69b8bc7 commit 3a56b5a
Show file tree
Hide file tree
Showing 4 changed files with 36 additions and 22 deletions.
3 changes: 3 additions & 0 deletions clang/docs/ReleaseNotes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,9 @@ RISC-V Support
CUDA/HIP Language Changes
^^^^^^^^^^^^^^^^^^^^^^^^^

- PTX is no longer included by default when compiling for CUDA. Using
``--cuda-include-ptx=all`` will return the old behavior.

CUDA Support
^^^^^^^^^^^^

Expand Down
8 changes: 8 additions & 0 deletions clang/lib/Driver/Driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4625,7 +4625,15 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
DDeps.add(*A, *TCAndArch->first, TCAndArch->second.data(), Kind);
OffloadAction::DeviceDependences DDep;
DDep.add(*A, *TCAndArch->first, TCAndArch->second.data(), Kind);

// Compiling CUDA in non-RDC mode uses the PTX output if available.
for (Action *Input : A->getInputs())
if (Kind == Action::OFK_Cuda && A->getType() == types::TY_Object &&
!Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
false))
DDep.add(*Input, *TCAndArch->first, TCAndArch->second.data(), Kind);
OffloadActions.push_back(C.MakeAction<OffloadAction>(DDep, A->getType()));

++TCAndArch;
}
}
Expand Down
22 changes: 12 additions & 10 deletions clang/lib/Driver/ToolChains/Cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -503,18 +503,20 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
Exec, CmdArgs, Inputs, Output));
}

static bool shouldIncludePTX(const ArgList &Args, const char *gpu_arch) {
bool includePTX = true;
for (Arg *A : Args) {
if (!(A->getOption().matches(options::OPT_cuda_include_ptx_EQ) ||
A->getOption().matches(options::OPT_no_cuda_include_ptx_EQ)))
continue;
static bool shouldIncludePTX(const ArgList &Args, StringRef InputArch) {
// The new driver does not include PTX by default to avoid overhead.
bool includePTX = !Args.hasFlag(options::OPT_offload_new_driver,
options::OPT_no_offload_new_driver, false);
for (Arg *A : Args.filtered(options::OPT_cuda_include_ptx_EQ,
options::OPT_no_cuda_include_ptx_EQ)) {
A->claim();
const StringRef ArchStr = A->getValue();
if (ArchStr == "all" || ArchStr == gpu_arch) {
includePTX = A->getOption().matches(options::OPT_cuda_include_ptx_EQ);
continue;
}
if (A->getOption().matches(options::OPT_cuda_include_ptx_EQ) &&
(ArchStr == "all" || ArchStr == InputArch))
includePTX = true;
else if (A->getOption().matches(options::OPT_no_cuda_include_ptx_EQ) &&
(ArchStr == "all" || ArchStr == InputArch))
includePTX = false;
}
return includePTX;
}
Expand Down
25 changes: 13 additions & 12 deletions clang/test/Driver/cuda-phases.cu
Original file line number Diff line number Diff line change
Expand Up @@ -244,31 +244,32 @@
// NEW-DRIVER-RDC-NEXT: 18: assembler, {17}, object, (host-cuda)
// NEW-DRIVER-RDC-NEXT: 19: clang-linker-wrapper, {18}, image, (host-cuda)

// RUN: %clang -### -target powerpc64le-ibm-linux-gnu -ccc-print-phases --offload-new-driver -fgpu-rdc \
// RUN: %clang -### -target powerpc64le-ibm-linux-gnu -ccc-print-phases --offload-new-driver \
// RUN: --offload-arch=sm_52 --offload-arch=sm_70 %s 2>&1 | FileCheck --check-prefix=NEW-DRIVER %s
// NEW-DRIVER: 0: input, "[[INPUT:.+]]", cuda
// NEW-DRIVER-NEXT: 1: preprocessor, {0}, cuda-cpp-output
// NEW-DRIVER-NEXT: 2: compiler, {1}, ir
// NEW-DRIVER-NEXT: 3: input, "[[INPUT]]", cuda, (device-cuda, sm_52)
// NEW-DRIVER: 0: input, "[[CUDA:.+]]", cuda, (host-cuda)
// NEW-DRIVER-NEXT: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
// NEW-DRIVER-NEXT: 2: compiler, {1}, ir, (host-cuda)
// NEW-DRIVER-NEXT: 3: input, "[[CUDA]]", cuda, (device-cuda, sm_52)
// NEW-DRIVER-NEXT: 4: preprocessor, {3}, cuda-cpp-output, (device-cuda, sm_52)
// NEW-DRIVER-NEXT: 5: compiler, {4}, ir, (device-cuda, sm_52)
// NEW-DRIVER-NEXT: 6: backend, {5}, assembler, (device-cuda, sm_52)
// NEW-DRIVER-NEXT: 7: assembler, {6}, object, (device-cuda, sm_52)
// NEW-DRIVER-NEXT: 8: offload, "device-cuda (nvptx64-nvidia-cuda:sm_52)" {7}, object
// NEW-DRIVER-NEXT: 9: input, "[[INPUT]]", cuda, (device-cuda, sm_70)
// NEW-DRIVER-NEXT: 8: offload, "device-cuda (nvptx64-nvidia-cuda:sm_52)" {7}, "device-cuda (nvptx64-nvidia-cuda:sm_52)" {6}, object
// NEW-DRIVER-NEXT: 9: input, "[[CUDA]]", cuda, (device-cuda, sm_70)
// NEW-DRIVER-NEXT: 10: preprocessor, {9}, cuda-cpp-output, (device-cuda, sm_70)
// NEW-DRIVER-NEXT: 11: compiler, {10}, ir, (device-cuda, sm_70)
// NEW-DRIVER-NEXT: 12: backend, {11}, assembler, (device-cuda, sm_70)
// NEW-DRIVER-NEXT: 13: assembler, {12}, object, (device-cuda, sm_70)
// NEW-DRIVER-NEXT: 14: offload, "device-cuda (nvptx64-nvidia-cuda:sm_70)" {13}, object
// NEW-DRIVER-NEXT: 15: clang-offload-packager, {8, 14}, image
// NEW-DRIVER-NEXT: 16: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, "device-cuda (powerpc64le-ibm-linux-gnu)" {15}, ir
// NEW-DRIVER-NEXT: 14: offload, "device-cuda (nvptx64-nvidia-cuda:sm_70)" {13}, "device-cuda (nvptx64-nvidia-cuda:sm_70)" {12}, object
// NEW-DRIVER-NEXT: 15: linker, {8, 14}, cuda-fatbin, (device-cuda)
// NEW-DRIVER-NEXT: 16: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, "device-cuda (nvptx64-nvidia-cuda)" {15}, ir
// NEW-DRIVER-NEXT: 17: backend, {16}, assembler, (host-cuda)
// NEW-DRIVER-NEXT: 18: assembler, {17}, object, (host-cuda)
// NEW-DRIVER-NEXT: 19: clang-linker-wrapper, {18}, image, (host-cuda)

// RUN: %clang -### --target=powerpc64le-ibm-linux-gnu -ccc-print-phases --offload-new-driver \
// RUN: --offload-arch=sm_52 --offload-arch=sm_70 %s %S/Inputs/empty.cpp 2>&1 | FileCheck --check-prefix=NON-CUDA-INPUT %s

// NON-CUDA-INPUT: 0: input, "[[CUDA:.+]]", cuda, (host-cuda)
// NON-CUDA-INPUT-NEXT: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
// NON-CUDA-INPUT-NEXT: 2: compiler, {1}, ir, (host-cuda)
Expand All @@ -277,13 +278,13 @@
// NON-CUDA-INPUT-NEXT: 5: compiler, {4}, ir, (device-cuda, sm_52)
// NON-CUDA-INPUT-NEXT: 6: backend, {5}, assembler, (device-cuda, sm_52)
// NON-CUDA-INPUT-NEXT: 7: assembler, {6}, object, (device-cuda, sm_52)
// NON-CUDA-INPUT-NEXT: 8: offload, "device-cuda (nvptx64-nvidia-cuda:sm_52)" {7}, object
// NON-CUDA-INPUT-NEXT: 8: offload, "device-cuda (nvptx64-nvidia-cuda:sm_52)" {7}, "device-cuda (nvptx64-nvidia-cuda:sm_52)" {6}, object
// NON-CUDA-INPUT-NEXT: 9: input, "[[CUDA]]", cuda, (device-cuda, sm_70)
// NON-CUDA-INPUT-NEXT: 10: preprocessor, {9}, cuda-cpp-output, (device-cuda, sm_70)
// NON-CUDA-INPUT-NEXT: 11: compiler, {10}, ir, (device-cuda, sm_70)
// NON-CUDA-INPUT-NEXT: 12: backend, {11}, assembler, (device-cuda, sm_70)
// NON-CUDA-INPUT-NEXT: 13: assembler, {12}, object, (device-cuda, sm_70)
// NON-CUDA-INPUT-NEXT: 14: offload, "device-cuda (nvptx64-nvidia-cuda:sm_70)" {13}, object
// NON-CUDA-INPUT-NEXT: 14: offload, "device-cuda (nvptx64-nvidia-cuda:sm_70)" {13}, "device-cuda (nvptx64-nvidia-cuda:sm_70)" {12}, object
// NON-CUDA-INPUT-NEXT: 15: linker, {8, 14}, cuda-fatbin, (device-cuda)
// NON-CUDA-INPUT-NEXT: 16: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, "device-cuda (nvptx64-nvidia-cuda)" {15}, ir
// NON-CUDA-INPUT-NEXT: 17: backend, {16}, assembler, (host-cuda)
Expand Down

0 comments on commit 3a56b5a

Please sign in to comment.