Skip to content

Commit

Permalink
[HIP]: Add -fhip-emit-relocatable to override link job creation for -…
Browse files Browse the repository at this point in the history
…fno-gpu-rdc

Differential Revision: https://reviews.llvm.org/D153667

Change-Id: Idcc5c7c25dc350b8dc9a1865fd67982904d06ecd
  • Loading branch information
jrbyrnes committed Jun 29, 2023
1 parent 21b6da3 commit be8a65b
Show file tree
Hide file tree
Showing 6 changed files with 118 additions and 10 deletions.
4 changes: 4 additions & 0 deletions clang/include/clang/Driver/Options.td
Original file line number Diff line number Diff line change
Expand Up @@ -1124,6 +1124,10 @@ def gpu_bundle_output : Flag<["--"], "gpu-bundle-output">,
Group<f_Group>, HelpText<"Bundle output files of HIP device compilation">;
def no_gpu_bundle_output : Flag<["--"], "no-gpu-bundle-output">,
Group<f_Group>, HelpText<"Do not bundle output files of HIP device compilation">;
// Driver flag `-fhip-emit-relocatable` (member of f_Group; no CC1Option flag is
// attached here). The HIP action builder in clang/lib/Driver/Driver.cpp reads
// it to skip creation of the device link job, and diagnoses its use together
// with `-fgpu-rdc` or without device-only compilation.
def fhip_emit_relocatable : Flag<["-"], "fhip-emit-relocatable">, Group<f_Group>,
HelpText<"Compile HIP source to relocatable">;
// Negative form of `-fhip-emit-relocatable`. The HIP action builder queries the
// two options as a pair through Args.hasFlag(...) with a default of false.
def fno_hip_emit_relocatable : Flag<["-"], "fno-hip-emit-relocatable">, Group<f_Group>,
HelpText<"Do not override toolchain to compile HIP source to relocatable">;
def cuid_EQ : Joined<["-"], "cuid=">, Flags<[CC1Option]>,
HelpText<"An ID for compilation unit, which should be the same for the same "
"compilation unit but different for different compilation units. "
Expand Down
51 changes: 41 additions & 10 deletions clang/lib/Driver/Driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2946,7 +2946,12 @@ class OffloadingActionBuilder final {
/// Set up the state that derived builders (e.g. HIPActionBuilder) rely on.
///
/// All arguments are forwarded to DeviceActionBuilder. In addition this
/// records whether relocatable device code was requested via -fgpu-rdc
/// (off unless requested) and whether the driver is performing a
/// device-only offload compilation, so both values are already available
/// while a derived class constructor runs.
CudaActionBuilderBase(Compilation &C, DerivedArgList &Args,
                      const Driver::InputList &Inputs,
                      Action::OffloadKind OFKind)
    : DeviceActionBuilder(C, Args, Inputs, OFKind) {
  // -fgpu-rdc / -fno-gpu-rdc; neither present means no RDC.
  const bool RDCRequested =
      Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
                   /*Default=*/false);
  Relocatable = RDCRequested;

  // Ask the driver whether only the device side is being compiled.
  const Driver &TheDriver = C.getDriver();
  CompileDeviceOnly = TheDriver.offloadDeviceOnly();
}

ActionBuilderReturnCode addDeviceDependences(Action *HostAction) override {
// While generating code for CUDA, we only depend on the host input action
Expand Down Expand Up @@ -3099,9 +3104,6 @@ class OffloadingActionBuilder final {
!C.hasOffloadToolChain<Action::OFK_HIP>())
return false;

Relocatable = Args.hasFlag(options::OPT_fgpu_rdc,
options::OPT_fno_gpu_rdc, /*Default=*/false);

const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
assert(HostTC && "No toolchain for host compilation.");
if (HostTC->getTriple().isNVPTX() ||
Expand All @@ -3120,7 +3122,6 @@ class OffloadingActionBuilder final {
: C.getSingleOffloadToolChain<Action::OFK_HIP>());

CompileHostOnly = C.getDriver().offloadHostOnly();
CompileDeviceOnly = C.getDriver().offloadDeviceOnly();
EmitLLVM = Args.getLastArg(options::OPT_emit_llvm);
EmitAsm = Args.getLastArg(options::OPT_S);
FixedCUID = Args.getLastArgValue(options::OPT_cuid_EQ);
Expand Down Expand Up @@ -3352,16 +3353,40 @@ class OffloadingActionBuilder final {
// only compilation. Bundle other type of output files only if
// --gpu-bundle-output is specified for device only compilation.
std::optional<bool> BundleOutput;
// Holds the value of -fhip-emit-relocatable / -fno-hip-emit-relocatable when
// either flag is given on the command line; left empty otherwise. A true
// value disables the device link step (see ShouldLink) and forces
// BundleOutput to false when a bundle-output flag is also given.
std::optional<bool> EmitReloc;

public:
/// Build the HIP device action builder.
///
/// Selects gfx906 as the default GPU arch, captures the state of
/// -fhip-emit-relocatable / -fno-hip-emit-relocatable and of
/// --gpu-bundle-output / --no-gpu-bundle-output, and reports invalid
/// combinations involving -fhip-emit-relocatable through the driver's
/// diagnostics.
HIPActionBuilder(Compilation &C, DerivedArgList &Args,
                 const Driver::InputList &Inputs)
    : CudaActionBuilderBase(C, Args, Inputs, Action::OFK_HIP) {

  DefaultCudaArch = CudaArch::GFX906;

  // EmitReloc stays empty unless one of the two flags was spelled out.
  const bool RelocFlagSeen =
      Args.hasArg(options::OPT_fhip_emit_relocatable,
                  options::OPT_fno_hip_emit_relocatable);
  if (RelocFlagSeen)
    EmitReloc = Args.hasFlag(options::OPT_fhip_emit_relocatable,
                             options::OPT_fno_hip_emit_relocatable, false);

  // True only when the flag pair was given and resolves to "emit".
  const bool WantsReloc = EmitReloc && *EmitReloc;

  // Relocatable output cannot be combined with -fgpu-rdc.
  if (WantsReloc && Relocatable)
    C.getDriver().Diag(diag::err_opt_not_valid_with_opt)
        << "-fhip-emit-relocatable"
        << "-fgpu-rdc";

  // Relocatable output is only accepted for device-only compilation.
  if (WantsReloc && !CompileDeviceOnly)
    C.getDriver().Diag(diag::err_opt_not_valid_without_opt)
        << "-fhip-emit-relocatable"
        << "--cuda-device-only";

  // BundleOutput stays empty unless a bundle flag was spelled out. A request
  // to bundle is overridden when relocatable output is being emitted.
  if (Args.hasArg(options::OPT_gpu_bundle_output,
                  options::OPT_no_gpu_bundle_output)) {
    const bool BundleRequested =
        Args.hasFlag(options::OPT_gpu_bundle_output,
                     options::OPT_no_gpu_bundle_output, true);
    BundleOutput = BundleRequested && !WantsReloc;
  }
}

/// Reports that this HIP builder can work with the offload
/// bundler/unbundler; the answer does not depend on any state.
bool canUseBundlerUnbundler() const override {
  return true;
}
Expand Down Expand Up @@ -3408,8 +3433,10 @@ class OffloadingActionBuilder final {
assert(!CompileHostOnly &&
"Not expecting HIP actions in host-only compilation.");

bool ShouldLink = !EmitReloc || !*EmitReloc;

if (!Relocatable && CurPhase == phases::Backend && !EmitLLVM &&
!EmitAsm) {
!EmitAsm && ShouldLink) {
// If we are in backend phase, we attempt to generate the fat binary.
// We compile each arch to IR and use a link action to generate code
// object containing ISA. Then we use a special "link" action to create
Expand Down Expand Up @@ -3485,6 +3512,8 @@ class OffloadingActionBuilder final {

return CompileDeviceOnly ? ABRT_Ignore_Host : ABRT_Success;
} else if (CurPhase == phases::Link) {
if (!ShouldLink)
return ABRT_Success;
// Save CudaDeviceActions to DeviceLinkerInputs for each GPU subarch.
// This happens to each device action originated from each input file.
// Later on, device actions in DeviceLinkerInputs are used to create
Expand Down Expand Up @@ -3522,8 +3551,11 @@ class OffloadingActionBuilder final {
CudaDeviceActions.clear();
}

return (CompileDeviceOnly && CurPhase == FinalPhase) ? ABRT_Ignore_Host
: ABRT_Success;
return (CompileDeviceOnly &&
(CurPhase == FinalPhase ||
(!ShouldLink && CurPhase == phases::Assemble)))
? ABRT_Ignore_Host
: ABRT_Success;
}

void appendLinkDeviceActions(ActionList &AL) override {
Expand Down Expand Up @@ -3674,7 +3706,6 @@ class OffloadingActionBuilder final {
++InactiveBuilders;
continue;
}

auto RetCode =
SB->getDeviceDependences(DDeps, CurPhase, FinalPhase, Phases);

Expand Down
17 changes: 17 additions & 0 deletions clang/test/Driver/hip-dependent-options.hip
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
// RUN: %clang -### --target=x86_64-linux-gnu \
// RUN: -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
// RUN: -c -fhip-emit-relocatable -nogpuinc -nogpulib --cuda-device-only -fgpu-rdc \
// RUN: %S/Inputs/hip_multiple_inputs/a.cu \
// RUN: %S/Inputs/hip_multiple_inputs/b.hip --gpu-bundle-output \
// RUN: 2>&1 | FileCheck -check-prefixes=RELOCRDC %s

// RELOCRDC: error: option '-fhip-emit-relocatable' cannot be specified with '-fgpu-rdc'

// RUN: %clang -### --target=x86_64-linux-gnu \
// RUN: -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
// RUN: -c -fhip-emit-relocatable -nogpuinc -nogpulib \
// RUN: %S/Inputs/hip_multiple_inputs/a.cu \
// RUN: %S/Inputs/hip_multiple_inputs/b.hip --gpu-bundle-output \
// RUN: 2>&1 | FileCheck -check-prefixes=RELOCHOST %s

// RELOCHOST: error: option '-fhip-emit-relocatable' cannot be specified without '--cuda-device-only'
9 changes: 9 additions & 0 deletions clang/test/Driver/hip-device-compile.hip
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,14 @@
// RUN: %S/Inputs/hip_multiple_inputs/a.cu \
// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,ASM,NBUN %s

// Output relocatable object.
// RUN: %clang -c --cuda-device-only -### --target=x86_64-linux-gnu \
// RUN: -o a.o -x hip --cuda-gpu-arch=gfx900 -fhip-emit-relocatable \
// RUN: --hip-device-lib=lib1.bc \
// RUN: --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \
// RUN: %S/Inputs/hip_multiple_inputs/a.cu \
// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,NBUN,RELOC %s

// Output bundled assembly.
// RUN: %clang -c -S --cuda-device-only -### --target=x86_64-linux-gnu \
// RUN: -o a.s -x hip --cuda-gpu-arch=gfx900 --no-gpu-bundle-output \
Expand All @@ -68,6 +76,7 @@
// LLBUN-SAME: "-o" "{{.*}}.ll"
// ASM-SAME: "-o" "a.s"
// ASMBUN-SAME: "-o" "{{.*}}.s"
// RELOC-SAME: "-o" "a.o"
// CHECK-SAME: {{".*a.cu"}}

// CHECK-NOT: {{"*.llvm-link"}}
Expand Down
37 changes: 37 additions & 0 deletions clang/test/Driver/hip-phases.hip
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,43 @@
// DASM-NOT: clang-offload-bundler
// DASM-NOT: host

//
// Test single gpu architecture with compile to relocatable in device-only
// compilation mode.
//
// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
// RUN: --cuda-gpu-arch=gfx803 %s --cuda-device-only -fhip-emit-relocatable 2>&1 \
// RUN: | FileCheck -check-prefixes=RELOC %s
// RELOC-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
// RELOC-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
// RELOC-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]])
// RELOC-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]])
// RELOC-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-[[T]], [[ARCH]])
// RELOC-NOT: linker
// RELOC-DAG: [[P5:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P4]]}, object

//
// Test two gpu architectures with compile to relocatable in device-only
// compilation mode.
//
// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-device-only -fhip-emit-relocatable 2>&1 \
// RUN: | FileCheck -check-prefixes=RELOC2 %s
// RELOC2-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
// RELOC2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
// RELOC2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]])
// RELOC2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]])
// RELOC2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-[[T]], [[ARCH]])
// RELOC2-NOT: [[P5:[0-9]+]]: linker, {[[P4]]}, image, (device-[[T]], [[ARCH]])
// RELOC2-DAG: [[P5:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P4]]}, object
// RELOC2-DAG: [[P6:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (device-[[T]], [[ARCH2:gfx900]])
// RELOC2-DAG: [[P7:[0-9]+]]: preprocessor, {[[P6]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]])
// RELOC2-DAG: [[P8:[0-9]+]]: compiler, {[[P7]]}, ir, (device-[[T]], [[ARCH2]])
// RELOC2-DAG: [[P9:[0-9]+]]: backend, {[[P8]]}, assembler, (device-[[T]], [[ARCH2]])
// RELOC2-DAG: [[P10:[0-9]+]]: assembler, {[[P9]]}, object, (device-[[T]], [[ARCH2]])
// RELOC2-NOT: linker
// RELOC2-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH2]])" {[[P10]]}, object

//
// Test two gpu architectures with complete compilation in device-only
// compilation mode.
Expand Down
10 changes: 10 additions & 0 deletions clang/test/Driver/hip-rdc-device-only.hip
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,16 @@
// RUN: %S/Inputs/hip_multiple_inputs/b.hip --gpu-bundle-output \
// RUN: 2>&1 | FileCheck -check-prefixes=COMMON,EMITBC %s

// With `-fno-hip-emit-relocatable`, the output should be the same as for the preceding RUN line,
// because `-fgpu-rdc` in HIP implies `-fno-hip-emit-relocatable`.

// RUN: %clang -### --target=x86_64-linux-gnu \
// RUN: -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
// RUN: -c -fno-hip-emit-relocatable -nogpuinc -nogpulib --cuda-device-only -fgpu-rdc \
// RUN: %S/Inputs/hip_multiple_inputs/a.cu \
// RUN: %S/Inputs/hip_multiple_inputs/b.hip --gpu-bundle-output \
// RUN: 2>&1 | FileCheck -check-prefixes=COMMON,EMITBC %s

// RUN: %clang -### --target=x86_64-linux-gnu \
// RUN: -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
// RUN: -S -nogpuinc -nogpulib --cuda-device-only -fgpu-rdc \
Expand Down

0 comments on commit be8a65b

Please sign in to comment.