-
Notifications
You must be signed in to change notification settings - Fork 15.2k
HIP non-RDC: enable new offload driver on Windows via linker wrapper #167918
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-clang-codegen @llvm/pr-subscribers-clang-driver Author: Yaxun (Sam) Liu (yxsamliu) Changes: Use the clang linker wrapper to device-link and embed the HIP fat binary directly. Match the CUDA non-RDC flow in the new driver by producing .hipfb like .fatbin. Previously, llvm-offload-binary was used to package the device IRs and embed them in the host object file; the clang linker wrapper was then run on each host object file to extract the device IRs, perform device linking, bundle the code objects into a fat binary, wrap it in a host object file, and merge that with the original host object using the host linker's '-r' option. However, the host linker in the MSVC toolchain does not support the '-r' option. The new approach still packages the device IRs with llvm-offload-binary, but instead of embedding the package in a host object, it passes the package to the clang linker wrapper directly, where the device IRs are extracted and linked and the fat binary is generated; the fat binary is then embedded in the host object directly. Compared with the old offload driver, this approach can parallelize the device linking for different GPUs by using the parallelization feature of the clang linker wrapper. Fixes: SWDEV-565994 Patch is 24.52 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/167918.diff 9 Files Affected:
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index b463f889df373..34bf41d8b7ed1 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -1280,8 +1280,7 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() {
return nullptr;
}
if (CGM.getLangOpts().OffloadViaLLVM ||
- (CGM.getLangOpts().OffloadingNewDriver &&
- (CGM.getLangOpts().HIP || RelocatableDeviceCode)))
+ (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode))
createOffloadingEntries();
else
return makeModuleCtorFunction();
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 9fd64d4aac514..e54c0e30bddab 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -4413,10 +4413,6 @@ void Driver::BuildDefaultActions(Compilation &C, DerivedArgList &Args,
options::OPT_no_offload_new_driver,
C.isOffloadingHostKind(Action::OFK_Cuda));
- bool HIPNoRDC =
- C.isOffloadingHostKind(Action::OFK_HIP) &&
- !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false);
-
// Builder to be used to build offloading actions.
std::unique_ptr<OffloadingActionBuilder> OffloadBuilder =
!UseNewOffloadingDriver
@@ -4550,7 +4546,7 @@ void Driver::BuildDefaultActions(Compilation &C, DerivedArgList &Args,
// Check if this Linker Job should emit a static library.
if (ShouldEmitStaticLibrary(Args)) {
LA = C.MakeAction<StaticLibJobAction>(LinkerInputs, types::TY_Image);
- } else if ((UseNewOffloadingDriver && !HIPNoRDC) ||
+ } else if (UseNewOffloadingDriver ||
Args.hasArg(options::OPT_offload_link)) {
LA = C.MakeAction<LinkerWrapperJobAction>(LinkerInputs, types::TY_Image);
LA->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
@@ -4887,20 +4883,6 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
<< "-fhip-emit-relocatable"
<< "--offload-device-only";
- // For HIP non-rdc non-device-only compilation, create a linker wrapper
- // action for each host object to link, bundle and wrap device files in
- // it.
- if ((isa<AssembleJobAction>(HostAction) ||
- (isa<BackendJobAction>(HostAction) &&
- HostAction->getType() == types::TY_LTO_BC)) &&
- HIPNoRDC && !offloadDeviceOnly()) {
- ActionList AL{HostAction};
- HostAction = C.MakeAction<LinkerWrapperJobAction>(AL, types::TY_Object);
- HostAction->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
- /*BoundArch=*/nullptr);
- return HostAction;
- }
-
// Don't build offloading actions if we do not have a compile action. If
// preprocessing only ignore embedding.
if (!(isa<CompileJobAction>(HostAction) ||
@@ -5070,8 +5052,22 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
// embedded in the host and linked.
Action *PackagerAction =
C.MakeAction<OffloadPackagerJobAction>(OffloadActions, types::TY_Image);
- DDep.add(*PackagerAction, *C.getSingleOffloadToolChain<Action::OFK_Host>(),
- nullptr, C.getActiveOffloadKinds());
+
+ // For HIP non-RDC compilation, wrap the device binary with linker wrapper
+ // before bundling with host code. Do not bind a specific GPU arch here,
+ // as the packaged image may contain entries for multiple GPUs.
+ if (HIPNoRDC && !offloadDeviceOnly()) {
+ ActionList AL{PackagerAction};
+ PackagerAction =
+ C.MakeAction<LinkerWrapperJobAction>(AL, types::TY_HIP_FATBIN);
+ DDep.add(*PackagerAction,
+ *C.getOffloadToolChains<Action::OFK_HIP>().first->second,
+ /*BoundArch=*/nullptr, Action::OFK_HIP);
+ } else {
+ DDep.add(*PackagerAction,
+ *C.getSingleOffloadToolChain<Action::OFK_Host>(), nullptr,
+ C.getActiveOffloadKinds());
+ }
}
// HIP wants '--offload-device-only' to create a fatbinary by default.
@@ -5194,6 +5190,16 @@ Action *Driver::ConstructPhaseAction(
return C.MakeAction<CompileJobAction>(Input, types::TY_LLVM_BC);
}
case phases::Backend: {
+ // Skip a redundant Backend phase for HIP device code when using the new
+ // offload driver, where mid-end is done in linker wrapper.
+ {
+ if (TargetDeviceOffloadKind == Action::OFK_HIP &&
+ Args.hasFlag(options::OPT_offload_new_driver,
+ options::OPT_no_offload_new_driver, false) &&
+ !offloadDeviceOnly())
+ return Input;
+ }
+
if (isUsingLTO() && TargetDeviceOffloadKind == Action::OFK_None) {
types::ID Output;
if (Args.hasArg(options::OPT_ffat_lto_objects) &&
@@ -5213,7 +5219,8 @@ Action *Driver::ConstructPhaseAction(
if (Args.hasArg(options::OPT_emit_llvm) ||
TargetDeviceOffloadKind == Action::OFK_SYCL ||
(((Input->getOffloadingToolChain() &&
- Input->getOffloadingToolChain()->getTriple().isAMDGPU()) ||
+ Input->getOffloadingToolChain()->getTriple().isAMDGPU() &&
+ TargetDeviceOffloadKind != Action::OFK_None) ||
TargetDeviceOffloadKind == Action::OFK_HIP) &&
((Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
false) ||
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 80389937ee218..2456cbb0fb399 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7636,7 +7636,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
CmdArgs.push_back("-fcuda-include-gpubinary");
CmdArgs.push_back(CudaDeviceInput->getFilename());
} else if (!HostOffloadingInputs.empty()) {
- if (IsCuda && !IsRDCMode) {
+ if ((IsCuda || IsHIP) && !IsRDCMode) {
assert(HostOffloadingInputs.size() == 1 && "Only one input expected");
CmdArgs.push_back("-fcuda-include-gpubinary");
CmdArgs.push_back(HostOffloadingInputs.front().getFilename());
@@ -9086,7 +9086,7 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
auto ShouldForward = [&](const llvm::DenseSet<unsigned> &Set, Arg *A,
const ToolChain &TC) {
// CMake hack to avoid printing verbose informatoin for HIP non-RDC mode.
- if (A->getOption().matches(OPT_v) && JA.getType() == types::TY_Object)
+ if (A->getOption().matches(OPT_v) && JA.getType() == types::TY_HIP_FATBIN)
return false;
return (Set.contains(A->getOption().getID()) ||
(A->getOption().getGroup().isValid() &&
@@ -9168,7 +9168,7 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
// non-RDC mode compilation. This confuses default CMake implicit linker
// argument parsing when the language is set to HIP and the system linker is
// also `ld.lld`.
- if (Args.hasArg(options::OPT_v) && JA.getType() != types::TY_Object)
+ if (Args.hasArg(options::OPT_v) && JA.getType() != types::TY_HIP_FATBIN)
CmdArgs.push_back("--wrapper-verbose");
if (Arg *A = Args.getLastArg(options::OPT_cuda_path_EQ))
CmdArgs.push_back(
@@ -9240,14 +9240,13 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
// We use action type to differentiate two use cases of the linker wrapper.
// TY_Image for normal linker wrapper work.
- // TY_Object for HIP fno-gpu-rdc embedding device binary in a relocatable
- // object.
- assert(JA.getType() == types::TY_Object || JA.getType() == types::TY_Image);
- if (JA.getType() == types::TY_Object) {
+ // TY_HIP_FATBIN for HIP fno-gpu-rdc emitting a fat binary without wrapping.
+ assert(JA.getType() == types::TY_HIP_FATBIN ||
+ JA.getType() == types::TY_Image);
+ if (JA.getType() == types::TY_HIP_FATBIN) {
CmdArgs.append({"-o", Output.getFilename()});
for (auto Input : Inputs)
CmdArgs.push_back(Input.getFilename());
- CmdArgs.push_back("-r");
} else
for (const char *LinkArg : LinkCommand->getArguments())
CmdArgs.push_back(LinkArg);
diff --git a/clang/test/Driver/hip-binding.hip b/clang/test/Driver/hip-binding.hip
index 4d15f9739ba7f..1c02cfaadfad2 100644
--- a/clang/test/Driver/hip-binding.hip
+++ b/clang/test/Driver/hip-binding.hip
@@ -101,6 +101,6 @@
// RUN: | FileCheck -check-prefix=LTO-NO-RDC %s
// LTO-NO-RDC: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[LTO_908:.+]]"
// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT]]"], output: "[[LTO_90A:.+]]"
-// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "Offload::Packager", inputs: ["[[LTO_908]]", "[[LTO_90A]]"], output: "[[PKG:.+]]"
-// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT]]", "[[PKG]]"], output: "[[OBJ:.+]]"
-// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[OBJ]]"], output: "hip-binding.o"
+// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "Offload::Packager", inputs: ["[[LTO_908]]", "[[LTO_90A]]"], output: "[[PKG:.+]]"
+// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "Offload::Linker", inputs: ["[[PKG]]"], output: "[[HIPFB:.+]]"
+// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT]]", "[[HIPFB]]"], output: "hip-binding.o"
diff --git a/clang/test/Driver/hip-phases.hip b/clang/test/Driver/hip-phases.hip
index 13f682f18a3ab..47e4bfca68a39 100644
--- a/clang/test/Driver/hip-phases.hip
+++ b/clang/test/Driver/hip-phases.hip
@@ -33,32 +33,33 @@
// BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
// BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-[[T]], [[ARCH]])
// OLDN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]])
-// NEW-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
// OLDN-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]])
// OLDR-DAG: [[P7:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
// OLD-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, image, (device-[[T]], [[ARCH]])
// OLD-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P8]]}, image
-// NEW-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P6]]}, ir
+// NEW-DAG: [[P6:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P5]]}, ir
// OLDN-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, hip-fatbin, (device-[[T]])
-// NEW-DAG: [[P10:[0-9]+]]: llvm-offload-binary, {[[P9]]}, image, (device-[[T]])
+// NEW-DAG: [[P7:[0-9]+]]: llvm-offload-binary, {[[P6]]}, image, (device-[[T]])
+// NEWN-DAG: [[P8:[0-9]+]]: clang-linker-wrapper, {[[P7]]}, hip-fatbin, (device-[[T]])
+// NEWLTO-DAG: [[P8:[0-9]+]]: clang-linker-wrapper, {[[P7]]}, hip-fatbin, (device-[[T]])
// OLDR-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, object, (device-[[T]])
// OLDN-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, ir
-// NEW-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (x86_64-unknown-linux-gnu)" {[[P10]]}, ir
+// NEWN-DAG: [[P9:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P8]]}, ir
+// NEWLTO-DAG: [[P9:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P8]]}, ir
+// NEWR-DAG: [[P8:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (x86_64-unknown-linux-gnu)" {[[P7]]}, ir
// OLDR-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, object
// OLDN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
// OLDN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
-// NEWN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
-// NEWN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
-// NEWLTO-DAG: [[P13:[0-9]+]]: backend, {[[P11]]}, lto-bc, (host-hip)
-// NEWR-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
-// NEWR-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
+// NEWN-DAG: [[P10:[0-9]+]]: backend, {[[P9]]}, assembler, (host-[[T]])
+// NEWN-DAG: [[P11:[0-9]+]]: assembler, {[[P10]]}, object, (host-[[T]])
+// NEWLTO-DAG: [[P10:[0-9]+]]: backend, {[[P9]]}, lto-bc, (host-hip)
+// NEWR-DAG: [[P9:[0-9]+]]: backend, {[[P8]]}, assembler, (host-[[T]])
+// NEWR-DAG: [[P10:[0-9]+]]: assembler, {[[P9]]}, object, (host-[[T]])
// OLDN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
-// NEWN-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, object, (host-[[T]])
-// NEWLTO-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, object, (host-[[T]])
+// NEWN-DAG: [[P12:[0-9]+]]: clang-linker-wrapper, {[[P11]]}, image, (host-[[T]])
// OLDR-DAG: [[P14:[0-9]+]]: linker, {[[P13]], [[P11]]}, image, (host-[[T]])
-// NEWR-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, image, (host-[[T]])
-// NEWN-DAG: [[P15:[0-9]+]]: linker, {[[P14]]}, image
+// NEWR-DAG: [[P11:[0-9]+]]: clang-linker-wrapper, {[[P10]]}, image, (host-[[T]])
//
// Test single gpu architecture up to the assemble phase.
@@ -613,7 +614,6 @@
// MIXED-DAG: input, "{{.*}}empty.hip", hip, (device-hip, gfx803)
// MIXED-DAG: input, "{{.*}}empty.hip", hip, (device-hip, gfx900)
// MIXED-DAG: input, "{{.*}}empty.cpp", c++
-// MIXED-NEG-NOT: input, "{{.*}}empty.cpp", c++, (host-hip)
// MIXED-NEG-NOT: input, "{{.*}}empty.cpp", c++, (device-hip
// MIXED2-DAG: input, "{{.*}}empty.hip", hip, (host-hip)
@@ -658,17 +658,15 @@
// LTO-NEXT: 3: input, "[[INPUT]]", hip, (device-hip, gfx908)
// LTO-NEXT: 4: preprocessor, {3}, hip-cpp-output, (device-hip, gfx908)
// LTO-NEXT: 5: compiler, {4}, ir, (device-hip, gfx908)
-// LTO-NEXT: 6: backend, {5}, lto-bc, (device-hip, gfx908)
-// LTO-NEXT: 7: offload, "device-hip (amdgcn-amd-amdhsa:gfx908)" {6}, lto-bc
-// LTO-NEXT: 8: input, "[[INPUT]]", hip, (device-hip, gfx90a)
-// LTO-NEXT: 9: preprocessor, {8}, hip-cpp-output, (device-hip, gfx90a)
-// LTO-NEXT: 10: compiler, {9}, ir, (device-hip, gfx90a)
-// LTO-NEXT: 11: backend, {10}, lto-bc, (device-hip, gfx90a)
-// LTO-NEXT: 12: offload, "device-hip (amdgcn-amd-amdhsa:gfx90a)" {11}, lto-bc
-// LTO-NEXT: 13: llvm-offload-binary, {7, 12}, image, (device-hip)
-// LTO-NEXT: 14: offload, "host-hip (x86_64-unknown-linux-gnu)" {2}, "device-hip (x86_64-unknown-linux-gnu)" {13}, ir
-// LTO-NEXT: 15: backend, {14}, assembler, (host-hip)
-// LTO-NEXT: 16: assembler, {15}, object, (host-hip)
+// LTO-NEXT: 6: offload, "device-hip (amdgcn-amd-amdhsa:gfx908)" {5}, ir
+// LTO-NEXT: 7: input, "[[INPUT]]", hip, (device-hip, gfx90a)
+// LTO-NEXT: 8: preprocessor, {7}, hip-cpp-output, (device-hip, gfx90a)
+// LTO-NEXT: 9: compiler, {8}, ir, (device-hip, gfx90a)
+// LTO-NEXT: 10: offload, "device-hip (amdgcn-amd-amdhsa:gfx90a)" {9}, ir
+// LTO-NEXT: 11: llvm-offload-binary, {6, 10}, image, (device-hip)
+// LTO-NEXT: 12: offload, "host-hip (x86_64-unknown-linux-gnu)" {2}, "device-hip (x86_64-unknown-linux-gnu)" {11}, ir
+// LTO-NEXT: 13: backend, {12}, assembler, (host-hip)
+// LTO-NEXT: 14: assembler, {13}, object, (host-hip)
//
// Test the new driver when not bundling
diff --git a/clang/test/Driver/hip-spirv-translator-new-driver.c b/clang/test/Driver/hip-spirv-translator-new-driver.c
index 67d894e2eb506..64ae58e5c34cf 100644
--- a/clang/test/Driver/hip-spirv-translator-new-driver.c
+++ b/clang/test/Driver/hip-spirv-translator-new-driver.c
@@ -6,4 +6,4 @@
// RUN: | FileCheck %s
// CHECK-NOT: {{".*clang-linker-wrapper.*"}} {{.*}} "-o" "[[OUTPUT_FILE:.*.o]]" {{.*}}"[[OUTPUT_FILE]]"
-// CHECK: {{".*clang-linker-wrapper.*"}} {{.*}} "-o" {{".*.tmp.o"}}
+// CHECK: {{".*clang-linker-wrapper.*"}} {{.*}} "-o" {{".*.hipfb"}}
diff --git a/clang/test/Driver/hip-toolchain-no-rdc.hip b/clang/test/Driver/hip-toolchain-no-rdc.hip
index 840334e19e7f2..40db927f619cc 100644
--- a/clang/test/Driver/hip-toolchain-no-rdc.hip
+++ b/clang/test/Driver/hip-toolchain-no-rdc.hip
@@ -101,19 +101,18 @@
// NEW-SAME: "--image=file=[[OBJ_DEV_A_803]],triple=amdgcn-amd-amdhsa,arch=gfx803,kind=hip"
// NEW-SAME: "--image=file=[[OBJ_DEV_A_900]],triple=amdgcn-amd-amdhsa,arch=gfx900,kind=hip"
+// NEW: [[WRAPPER:".*clang-linker-wrapper]]"
+// NEW-SAME: "-o" "[[HIPFB_A:.*.hipfb]]" "[[PACKAGE_A]]"
+
// CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
// CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa"
// CHECK-SAME: "-emit-obj"
// CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
// OLD-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
-// NEW-SAME: {{.*}} "-fembed-offload-object=[[PACKAGE_A]]"
-// OLD-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip"
-// NEW-SAME: {{.*}} "-o" [[A_OBJ_HOST_TMP:".*o"]] "-x" "hip"
+// NEW-SAME: {{.*}} "-fcuda-include-gpubinary" "[[HIPFB_A]]"
+// CHECK-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip"
// CHECK-SAME: {{.*}} [[A_SRC]]
-// NEW: [[WRAPPER:".*clang-linker-wrapper]]" {{.*}}"--host-triple=x86_64-unknown-linux-gnu"
-// NEW: "--linker-path={{.*}}" "-o" [[A_OBJ_HOST:".*o"]] [[A_OBJ_HOST_TMP]] "-r"
-
//
// Compile device code in b.hip to code object for gfx803.
//
@@ -173,19 +172,18 @@
// NEW-SAME: "--image=file=[[OBJ_DEV_B_803]],triple=amdgcn-amd-amdhsa,arch=gfx803,kind=hip"
// NEW-SAME: "--image=file=[[OBJ_DEV_B_900]],triple=amdgcn-amd-amdhsa,arch=gfx900,kind=hip"
+// NEW: [[WRAPPER:".*clang-linker-wrapper]]"
+// NEW-SAME: "-o" "[[HIPFB_B:.*.hipfb]]" "[[PACKAGE_B]]"
+
// CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
// CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa"
// CHECK-SAME: "-emit-obj"
// CHECK-SAME: {{.*}} "-main-file-name" "b.hip"
// OLD-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_B]]"
-// NEW-SAME: {{.*}} "-fembed-offload-object=[[PACKAGE_B]]"
-// OLD-SAME: {{.*}} "-o" [[B_OBJ_HOST:".*o"]] "-x" "hip"
-// NEW-SAME: {{.*}} "-o" [[B_OBJ_HOST_TMP:".*o"]] "-x" "hip"
+// NEW-SAME: {{.*}} "-fcuda-include-gpubinary" "[[HIPFB_B]]"
+// CHECK-SAME: {{.*}} "-o" [[B_OBJ_HOST:".*o"]] "-x" "hip"
// CHECK-SAME: {{.*}} [[B_SRC]]
-// NEW: [[WRAPPER:".*clang-linker-wrapper]]" {{.*}}"--host-triple=x86_64-unknown-linux-gnu"
-// NEW: "--linker-path={{.*}}" "-o" [[B_OBJ_HOST:".*o"]] [[B_OBJ_HOST_TMP]] "-r"
-
//
// Link host objects.
//
@@ -219,5 +217,5 @@
// RUN: %clang -### --target=x86_64-linux-gnu -fno-gpu-rdc -nogpulib -nogpuinc \
// RUN: --offload-new-driver --offload-arch=gfx908 -v %s 2>&1 | FileCheck %s --check-prefix=VERBOSE
// VERBOSE: clang-linker-wrapper
-// VERBOSE-NOT: --device-compiler=amdgcn-amd-amdhsa=-v
-// VERBOSE-NOT: --wrapper-verbose
+// VERBOSE-NOT: --device-compiler=amdgcn-amd-amdhsa=-v {{.*}}-o {{.*}}.hipfb
+// VERBOSE-NOT: --wrapper-verbose {{.*}}-o {{.*}}.hipfb
diff --git a/clang/test/Driver/linker-wrapper-hip-no-rdc.c b/clang/test/Driver/linker-wrapper-hip-no-rdc.c
new file mode 100644
index 0000000000000..8bbbaf269bfca
--- /dev/null
+++ b/clang/test/Driver/linker-wrapper-hip-no-rdc.c
@@ -0,0 +1,40 @@
+// UNSUPPORTED: system-windows
+// REQUIRES: amdgpu-registered-target
+
+// Test HIP non-RDC linker wrapper behavior with new offload driver.
+// The linker wrapper should output .hipfb files directly without using -r option.
+
+// An externally visible variable so static libraries extract.
+__attribute__((visibility("protected"), used)) int x;
+
+// Create device binaries and package them
+// RUN: %clang -cc1 %s -triple amdgcn-amd-amdhsa -emit-llvm-bc -o %t.amdgpu.bc
+// RUN: llvm-offload-binary -o %t.out \
+// RUN: --image=file=%t.amdgpu.bc,kind=hip,triple=amdgcn-amd-amdhsa,arch=gfx1100 \
+// RUN: --image=file=%t.amdgpu.bc,kind=hip,triple=amdgcn-amd-amdhsa,arch=gfx1200
+
+// Test that linker wrapper outputs .hipfb file without -r option for HIP non-RDC
+// The linker wrapper is called directly with the packaged device binary (not embedded in host object)
+// Note: When called directly (not through the driver), the linker wrapper processes architectures
+// from the packaged binary. The test verifies it can process at least one architecture correctly.
+// RUN: clang-linker-wrapper %t.out -o %t.hipfb 2>&1
+
+// Verify the fat binary was created
+// RUN: test -f %t.hipfb
+
+// List code objects in the fat binary
+// RUN: clang-offload-bundler -type=o -input=%t.hipfb -list | FileCheck %s --check-prefix=HIP-FATBIN-LIST
+
+// HIP-FATBIN-LIST-DAG: hip-amdgcn-amd-amdhsa--gfx1100
+// HIP-FATBIN-LIST-DAG: hip-amdgcn-amd-a...
[truncated]
|
|
@llvm/pr-subscribers-clang Author: Yaxun (Sam) Liu (yxsamliu) Changes: Use the clang linker wrapper to device-link and embed the HIP fat binary directly. Match the CUDA non-RDC flow in the new driver by producing .hipfb like .fatbin. Previously, llvm-offload-binary was used to package the device IRs and embed them in the host object file; the clang linker wrapper was then run on each host object file to extract the device IRs, perform device linking, bundle the code objects into a fat binary, wrap it in a host object file, and merge that with the original host object using the host linker's '-r' option. However, the host linker in the MSVC toolchain does not support the '-r' option. The new approach still packages the device IRs with llvm-offload-binary, but instead of embedding the package in a host object, it passes the package to the clang linker wrapper directly, where the device IRs are extracted and linked and the fat binary is generated; the fat binary is then embedded in the host object directly. Compared with the old offload driver, this approach can parallelize the device linking for different GPUs by using the parallelization feature of the clang linker wrapper. Fixes: SWDEV-565994 Patch is 24.52 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/167918.diff 9 Files Affected:
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index b463f889df373..34bf41d8b7ed1 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -1280,8 +1280,7 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() {
return nullptr;
}
if (CGM.getLangOpts().OffloadViaLLVM ||
- (CGM.getLangOpts().OffloadingNewDriver &&
- (CGM.getLangOpts().HIP || RelocatableDeviceCode)))
+ (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode))
createOffloadingEntries();
else
return makeModuleCtorFunction();
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 9fd64d4aac514..e54c0e30bddab 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -4413,10 +4413,6 @@ void Driver::BuildDefaultActions(Compilation &C, DerivedArgList &Args,
options::OPT_no_offload_new_driver,
C.isOffloadingHostKind(Action::OFK_Cuda));
- bool HIPNoRDC =
- C.isOffloadingHostKind(Action::OFK_HIP) &&
- !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false);
-
// Builder to be used to build offloading actions.
std::unique_ptr<OffloadingActionBuilder> OffloadBuilder =
!UseNewOffloadingDriver
@@ -4550,7 +4546,7 @@ void Driver::BuildDefaultActions(Compilation &C, DerivedArgList &Args,
// Check if this Linker Job should emit a static library.
if (ShouldEmitStaticLibrary(Args)) {
LA = C.MakeAction<StaticLibJobAction>(LinkerInputs, types::TY_Image);
- } else if ((UseNewOffloadingDriver && !HIPNoRDC) ||
+ } else if (UseNewOffloadingDriver ||
Args.hasArg(options::OPT_offload_link)) {
LA = C.MakeAction<LinkerWrapperJobAction>(LinkerInputs, types::TY_Image);
LA->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
@@ -4887,20 +4883,6 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
<< "-fhip-emit-relocatable"
<< "--offload-device-only";
- // For HIP non-rdc non-device-only compilation, create a linker wrapper
- // action for each host object to link, bundle and wrap device files in
- // it.
- if ((isa<AssembleJobAction>(HostAction) ||
- (isa<BackendJobAction>(HostAction) &&
- HostAction->getType() == types::TY_LTO_BC)) &&
- HIPNoRDC && !offloadDeviceOnly()) {
- ActionList AL{HostAction};
- HostAction = C.MakeAction<LinkerWrapperJobAction>(AL, types::TY_Object);
- HostAction->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
- /*BoundArch=*/nullptr);
- return HostAction;
- }
-
// Don't build offloading actions if we do not have a compile action. If
// preprocessing only ignore embedding.
if (!(isa<CompileJobAction>(HostAction) ||
@@ -5070,8 +5052,22 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
// embedded in the host and linked.
Action *PackagerAction =
C.MakeAction<OffloadPackagerJobAction>(OffloadActions, types::TY_Image);
- DDep.add(*PackagerAction, *C.getSingleOffloadToolChain<Action::OFK_Host>(),
- nullptr, C.getActiveOffloadKinds());
+
+ // For HIP non-RDC compilation, wrap the device binary with linker wrapper
+ // before bundling with host code. Do not bind a specific GPU arch here,
+ // as the packaged image may contain entries for multiple GPUs.
+ if (HIPNoRDC && !offloadDeviceOnly()) {
+ ActionList AL{PackagerAction};
+ PackagerAction =
+ C.MakeAction<LinkerWrapperJobAction>(AL, types::TY_HIP_FATBIN);
+ DDep.add(*PackagerAction,
+ *C.getOffloadToolChains<Action::OFK_HIP>().first->second,
+ /*BoundArch=*/nullptr, Action::OFK_HIP);
+ } else {
+ DDep.add(*PackagerAction,
+ *C.getSingleOffloadToolChain<Action::OFK_Host>(), nullptr,
+ C.getActiveOffloadKinds());
+ }
}
// HIP wants '--offload-device-only' to create a fatbinary by default.
@@ -5194,6 +5190,16 @@ Action *Driver::ConstructPhaseAction(
return C.MakeAction<CompileJobAction>(Input, types::TY_LLVM_BC);
}
case phases::Backend: {
+ // Skip a redundant Backend phase for HIP device code when using the new
+ // offload driver, where mid-end is done in linker wrapper.
+ {
+ if (TargetDeviceOffloadKind == Action::OFK_HIP &&
+ Args.hasFlag(options::OPT_offload_new_driver,
+ options::OPT_no_offload_new_driver, false) &&
+ !offloadDeviceOnly())
+ return Input;
+ }
+
if (isUsingLTO() && TargetDeviceOffloadKind == Action::OFK_None) {
types::ID Output;
if (Args.hasArg(options::OPT_ffat_lto_objects) &&
@@ -5213,7 +5219,8 @@ Action *Driver::ConstructPhaseAction(
if (Args.hasArg(options::OPT_emit_llvm) ||
TargetDeviceOffloadKind == Action::OFK_SYCL ||
(((Input->getOffloadingToolChain() &&
- Input->getOffloadingToolChain()->getTriple().isAMDGPU()) ||
+ Input->getOffloadingToolChain()->getTriple().isAMDGPU() &&
+ TargetDeviceOffloadKind != Action::OFK_None) ||
TargetDeviceOffloadKind == Action::OFK_HIP) &&
((Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
false) ||
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 80389937ee218..2456cbb0fb399 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7636,7 +7636,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
CmdArgs.push_back("-fcuda-include-gpubinary");
CmdArgs.push_back(CudaDeviceInput->getFilename());
} else if (!HostOffloadingInputs.empty()) {
- if (IsCuda && !IsRDCMode) {
+ if ((IsCuda || IsHIP) && !IsRDCMode) {
assert(HostOffloadingInputs.size() == 1 && "Only one input expected");
CmdArgs.push_back("-fcuda-include-gpubinary");
CmdArgs.push_back(HostOffloadingInputs.front().getFilename());
@@ -9086,7 +9086,7 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
auto ShouldForward = [&](const llvm::DenseSet<unsigned> &Set, Arg *A,
const ToolChain &TC) {
// CMake hack to avoid printing verbose informatoin for HIP non-RDC mode.
- if (A->getOption().matches(OPT_v) && JA.getType() == types::TY_Object)
+ if (A->getOption().matches(OPT_v) && JA.getType() == types::TY_HIP_FATBIN)
return false;
return (Set.contains(A->getOption().getID()) ||
(A->getOption().getGroup().isValid() &&
@@ -9168,7 +9168,7 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
// non-RDC mode compilation. This confuses default CMake implicit linker
// argument parsing when the language is set to HIP and the system linker is
// also `ld.lld`.
- if (Args.hasArg(options::OPT_v) && JA.getType() != types::TY_Object)
+ if (Args.hasArg(options::OPT_v) && JA.getType() != types::TY_HIP_FATBIN)
CmdArgs.push_back("--wrapper-verbose");
if (Arg *A = Args.getLastArg(options::OPT_cuda_path_EQ))
CmdArgs.push_back(
@@ -9240,14 +9240,13 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
// We use action type to differentiate two use cases of the linker wrapper.
// TY_Image for normal linker wrapper work.
- // TY_Object for HIP fno-gpu-rdc embedding device binary in a relocatable
- // object.
- assert(JA.getType() == types::TY_Object || JA.getType() == types::TY_Image);
- if (JA.getType() == types::TY_Object) {
+ // TY_HIP_FATBIN for HIP fno-gpu-rdc emitting a fat binary without wrapping.
+ assert(JA.getType() == types::TY_HIP_FATBIN ||
+ JA.getType() == types::TY_Image);
+ if (JA.getType() == types::TY_HIP_FATBIN) {
CmdArgs.append({"-o", Output.getFilename()});
for (auto Input : Inputs)
CmdArgs.push_back(Input.getFilename());
- CmdArgs.push_back("-r");
} else
for (const char *LinkArg : LinkCommand->getArguments())
CmdArgs.push_back(LinkArg);
diff --git a/clang/test/Driver/hip-binding.hip b/clang/test/Driver/hip-binding.hip
index 4d15f9739ba7f..1c02cfaadfad2 100644
--- a/clang/test/Driver/hip-binding.hip
+++ b/clang/test/Driver/hip-binding.hip
@@ -101,6 +101,6 @@
// RUN: | FileCheck -check-prefix=LTO-NO-RDC %s
// LTO-NO-RDC: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[LTO_908:.+]]"
// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT]]"], output: "[[LTO_90A:.+]]"
-// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "Offload::Packager", inputs: ["[[LTO_908]]", "[[LTO_90A]]"], output: "[[PKG:.+]]"
-// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT]]", "[[PKG]]"], output: "[[OBJ:.+]]"
-// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[OBJ]]"], output: "hip-binding.o"
+// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "Offload::Packager", inputs: ["[[LTO_908]]", "[[LTO_90A]]"], output: "[[PKG:.+]]"
+// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "Offload::Linker", inputs: ["[[PKG]]"], output: "[[HIPFB:.+]]"
+// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT]]", "[[HIPFB]]"], output: "hip-binding.o"
diff --git a/clang/test/Driver/hip-phases.hip b/clang/test/Driver/hip-phases.hip
index 13f682f18a3ab..47e4bfca68a39 100644
--- a/clang/test/Driver/hip-phases.hip
+++ b/clang/test/Driver/hip-phases.hip
@@ -33,32 +33,33 @@
// BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
// BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-[[T]], [[ARCH]])
// OLDN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]])
-// NEW-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
// OLDN-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]])
// OLDR-DAG: [[P7:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
// OLD-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, image, (device-[[T]], [[ARCH]])
// OLD-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P8]]}, image
-// NEW-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P6]]}, ir
+// NEW-DAG: [[P6:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P5]]}, ir
// OLDN-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, hip-fatbin, (device-[[T]])
-// NEW-DAG: [[P10:[0-9]+]]: llvm-offload-binary, {[[P9]]}, image, (device-[[T]])
+// NEW-DAG: [[P7:[0-9]+]]: llvm-offload-binary, {[[P6]]}, image, (device-[[T]])
+// NEWN-DAG: [[P8:[0-9]+]]: clang-linker-wrapper, {[[P7]]}, hip-fatbin, (device-[[T]])
+// NEWLTO-DAG: [[P8:[0-9]+]]: clang-linker-wrapper, {[[P7]]}, hip-fatbin, (device-[[T]])
// OLDR-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, object, (device-[[T]])
// OLDN-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, ir
-// NEW-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (x86_64-unknown-linux-gnu)" {[[P10]]}, ir
+// NEWN-DAG: [[P9:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P8]]}, ir
+// NEWLTO-DAG: [[P9:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P8]]}, ir
+// NEWR-DAG: [[P8:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (x86_64-unknown-linux-gnu)" {[[P7]]}, ir
// OLDR-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, object
// OLDN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
// OLDN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
-// NEWN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
-// NEWN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
-// NEWLTO-DAG: [[P13:[0-9]+]]: backend, {[[P11]]}, lto-bc, (host-hip)
-// NEWR-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
-// NEWR-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
+// NEWN-DAG: [[P10:[0-9]+]]: backend, {[[P9]]}, assembler, (host-[[T]])
+// NEWN-DAG: [[P11:[0-9]+]]: assembler, {[[P10]]}, object, (host-[[T]])
+// NEWLTO-DAG: [[P10:[0-9]+]]: backend, {[[P9]]}, lto-bc, (host-hip)
+// NEWR-DAG: [[P9:[0-9]+]]: backend, {[[P8]]}, assembler, (host-[[T]])
+// NEWR-DAG: [[P10:[0-9]+]]: assembler, {[[P9]]}, object, (host-[[T]])
// OLDN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
-// NEWN-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, object, (host-[[T]])
-// NEWLTO-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, object, (host-[[T]])
+// NEWN-DAG: [[P12:[0-9]+]]: clang-linker-wrapper, {[[P11]]}, image, (host-[[T]])
// OLDR-DAG: [[P14:[0-9]+]]: linker, {[[P13]], [[P11]]}, image, (host-[[T]])
-// NEWR-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, image, (host-[[T]])
-// NEWN-DAG: [[P15:[0-9]+]]: linker, {[[P14]]}, image
+// NEWR-DAG: [[P11:[0-9]+]]: clang-linker-wrapper, {[[P10]]}, image, (host-[[T]])
//
// Test single gpu architecture up to the assemble phase.
@@ -613,7 +614,6 @@
// MIXED-DAG: input, "{{.*}}empty.hip", hip, (device-hip, gfx803)
// MIXED-DAG: input, "{{.*}}empty.hip", hip, (device-hip, gfx900)
// MIXED-DAG: input, "{{.*}}empty.cpp", c++
-// MIXED-NEG-NOT: input, "{{.*}}empty.cpp", c++, (host-hip)
// MIXED-NEG-NOT: input, "{{.*}}empty.cpp", c++, (device-hip
// MIXED2-DAG: input, "{{.*}}empty.hip", hip, (host-hip)
@@ -658,17 +658,15 @@
// LTO-NEXT: 3: input, "[[INPUT]]", hip, (device-hip, gfx908)
// LTO-NEXT: 4: preprocessor, {3}, hip-cpp-output, (device-hip, gfx908)
// LTO-NEXT: 5: compiler, {4}, ir, (device-hip, gfx908)
-// LTO-NEXT: 6: backend, {5}, lto-bc, (device-hip, gfx908)
-// LTO-NEXT: 7: offload, "device-hip (amdgcn-amd-amdhsa:gfx908)" {6}, lto-bc
-// LTO-NEXT: 8: input, "[[INPUT]]", hip, (device-hip, gfx90a)
-// LTO-NEXT: 9: preprocessor, {8}, hip-cpp-output, (device-hip, gfx90a)
-// LTO-NEXT: 10: compiler, {9}, ir, (device-hip, gfx90a)
-// LTO-NEXT: 11: backend, {10}, lto-bc, (device-hip, gfx90a)
-// LTO-NEXT: 12: offload, "device-hip (amdgcn-amd-amdhsa:gfx90a)" {11}, lto-bc
-// LTO-NEXT: 13: llvm-offload-binary, {7, 12}, image, (device-hip)
-// LTO-NEXT: 14: offload, "host-hip (x86_64-unknown-linux-gnu)" {2}, "device-hip (x86_64-unknown-linux-gnu)" {13}, ir
-// LTO-NEXT: 15: backend, {14}, assembler, (host-hip)
-// LTO-NEXT: 16: assembler, {15}, object, (host-hip)
+// LTO-NEXT: 6: offload, "device-hip (amdgcn-amd-amdhsa:gfx908)" {5}, ir
+// LTO-NEXT: 7: input, "[[INPUT]]", hip, (device-hip, gfx90a)
+// LTO-NEXT: 8: preprocessor, {7}, hip-cpp-output, (device-hip, gfx90a)
+// LTO-NEXT: 9: compiler, {8}, ir, (device-hip, gfx90a)
+// LTO-NEXT: 10: offload, "device-hip (amdgcn-amd-amdhsa:gfx90a)" {9}, ir
+// LTO-NEXT: 11: llvm-offload-binary, {6, 10}, image, (device-hip)
+// LTO-NEXT: 12: offload, "host-hip (x86_64-unknown-linux-gnu)" {2}, "device-hip (x86_64-unknown-linux-gnu)" {11}, ir
+// LTO-NEXT: 13: backend, {12}, assembler, (host-hip)
+// LTO-NEXT: 14: assembler, {13}, object, (host-hip)
//
// Test the new driver when not bundling
diff --git a/clang/test/Driver/hip-spirv-translator-new-driver.c b/clang/test/Driver/hip-spirv-translator-new-driver.c
index 67d894e2eb506..64ae58e5c34cf 100644
--- a/clang/test/Driver/hip-spirv-translator-new-driver.c
+++ b/clang/test/Driver/hip-spirv-translator-new-driver.c
@@ -6,4 +6,4 @@
// RUN: | FileCheck %s
// CHECK-NOT: {{".*clang-linker-wrapper.*"}} {{.*}} "-o" "[[OUTPUT_FILE:.*.o]]" {{.*}}"[[OUTPUT_FILE]]"
-// CHECK: {{".*clang-linker-wrapper.*"}} {{.*}} "-o" {{".*.tmp.o"}}
+// CHECK: {{".*clang-linker-wrapper.*"}} {{.*}} "-o" {{".*.hipfb"}}
diff --git a/clang/test/Driver/hip-toolchain-no-rdc.hip b/clang/test/Driver/hip-toolchain-no-rdc.hip
index 840334e19e7f2..40db927f619cc 100644
--- a/clang/test/Driver/hip-toolchain-no-rdc.hip
+++ b/clang/test/Driver/hip-toolchain-no-rdc.hip
@@ -101,19 +101,18 @@
// NEW-SAME: "--image=file=[[OBJ_DEV_A_803]],triple=amdgcn-amd-amdhsa,arch=gfx803,kind=hip"
// NEW-SAME: "--image=file=[[OBJ_DEV_A_900]],triple=amdgcn-amd-amdhsa,arch=gfx900,kind=hip"
+// NEW: [[WRAPPER:".*clang-linker-wrapper]]"
+// NEW-SAME: "-o" "[[HIPFB_A:.*.hipfb]]" "[[PACKAGE_A]]"
+
// CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
// CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa"
// CHECK-SAME: "-emit-obj"
// CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
// OLD-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
-// NEW-SAME: {{.*}} "-fembed-offload-object=[[PACKAGE_A]]"
-// OLD-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip"
-// NEW-SAME: {{.*}} "-o" [[A_OBJ_HOST_TMP:".*o"]] "-x" "hip"
+// NEW-SAME: {{.*}} "-fcuda-include-gpubinary" "[[HIPFB_A]]"
+// CHECK-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip"
// CHECK-SAME: {{.*}} [[A_SRC]]
-// NEW: [[WRAPPER:".*clang-linker-wrapper]]" {{.*}}"--host-triple=x86_64-unknown-linux-gnu"
-// NEW: "--linker-path={{.*}}" "-o" [[A_OBJ_HOST:".*o"]] [[A_OBJ_HOST_TMP]] "-r"
-
//
// Compile device code in b.hip to code object for gfx803.
//
@@ -173,19 +172,18 @@
// NEW-SAME: "--image=file=[[OBJ_DEV_B_803]],triple=amdgcn-amd-amdhsa,arch=gfx803,kind=hip"
// NEW-SAME: "--image=file=[[OBJ_DEV_B_900]],triple=amdgcn-amd-amdhsa,arch=gfx900,kind=hip"
+// NEW: [[WRAPPER:".*clang-linker-wrapper]]"
+// NEW-SAME: "-o" "[[HIPFB_B:.*.hipfb]]" "[[PACKAGE_B]]"
+
// CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
// CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa"
// CHECK-SAME: "-emit-obj"
// CHECK-SAME: {{.*}} "-main-file-name" "b.hip"
// OLD-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_B]]"
-// NEW-SAME: {{.*}} "-fembed-offload-object=[[PACKAGE_B]]"
-// OLD-SAME: {{.*}} "-o" [[B_OBJ_HOST:".*o"]] "-x" "hip"
-// NEW-SAME: {{.*}} "-o" [[B_OBJ_HOST_TMP:".*o"]] "-x" "hip"
+// NEW-SAME: {{.*}} "-fcuda-include-gpubinary" "[[HIPFB_B]]"
+// CHECK-SAME: {{.*}} "-o" [[B_OBJ_HOST:".*o"]] "-x" "hip"
// CHECK-SAME: {{.*}} [[B_SRC]]
-// NEW: [[WRAPPER:".*clang-linker-wrapper]]" {{.*}}"--host-triple=x86_64-unknown-linux-gnu"
-// NEW: "--linker-path={{.*}}" "-o" [[B_OBJ_HOST:".*o"]] [[B_OBJ_HOST_TMP]] "-r"
-
//
// Link host objects.
//
@@ -219,5 +217,5 @@
// RUN: %clang -### --target=x86_64-linux-gnu -fno-gpu-rdc -nogpulib -nogpuinc \
// RUN: --offload-new-driver --offload-arch=gfx908 -v %s 2>&1 | FileCheck %s --check-prefix=VERBOSE
// VERBOSE: clang-linker-wrapper
-// VERBOSE-NOT: --device-compiler=amdgcn-amd-amdhsa=-v
-// VERBOSE-NOT: --wrapper-verbose
+// VERBOSE-NOT: --device-compiler=amdgcn-amd-amdhsa=-v {{.*}}-o {{.*}}.hipfb
+// VERBOSE-NOT: --wrapper-verbose {{.*}}-o {{.*}}.hipfb
diff --git a/clang/test/Driver/linker-wrapper-hip-no-rdc.c b/clang/test/Driver/linker-wrapper-hip-no-rdc.c
new file mode 100644
index 0000000000000..8bbbaf269bfca
--- /dev/null
+++ b/clang/test/Driver/linker-wrapper-hip-no-rdc.c
@@ -0,0 +1,40 @@
+// UNSUPPORTED: system-windows
+// REQUIRES: amdgpu-registered-target
+
+// Test HIP non-RDC linker wrapper behavior with new offload driver.
+// The linker wrapper should output .hipfb files directly without using -r option.
+
+// An externally visible variable so static libraries extract.
+__attribute__((visibility("protected"), used)) int x;
+
+// Create device binaries and package them
+// RUN: %clang -cc1 %s -triple amdgcn-amd-amdhsa -emit-llvm-bc -o %t.amdgpu.bc
+// RUN: llvm-offload-binary -o %t.out \
+// RUN: --image=file=%t.amdgpu.bc,kind=hip,triple=amdgcn-amd-amdhsa,arch=gfx1100 \
+// RUN: --image=file=%t.amdgpu.bc,kind=hip,triple=amdgcn-amd-amdhsa,arch=gfx1200
+
+// Test that linker wrapper outputs .hipfb file without -r option for HIP non-RDC
+// The linker wrapper is called directly with the packaged device binary (not embedded in host object)
+// Note: When called directly (not through the driver), the linker wrapper processes architectures
+// from the packaged binary. The test verifies it can process at least one architecture correctly.
+// RUN: clang-linker-wrapper %t.out -o %t.hipfb 2>&1
+
+// Verify the fat binary was created
+// RUN: test -f %t.hipfb
+
+// List code objects in the fat binary
+// RUN: clang-offload-bundler -type=o -input=%t.hipfb -list | FileCheck %s --check-prefix=HIP-FATBIN-LIST
+
+// HIP-FATBIN-LIST-DAG: hip-amdgcn-amd-amdhsa--gfx1100
+// HIP-FATBIN-LIST-DAG: hip-amdgcn-amd-a...
[truncated]
|
jhuber6
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this approach is good overall; my main concern is that it currently checks filenames instead of binary magic.
48f7428 to
66bef6e
Compare
66bef6e to
cdcb6cc
Compare
cdcb6cc to
1ee2d53
Compare
Use the clang linker wrapper to device-link and embed the HIP fat binary directly. Match the CUDA non-RDC flow in the new driver by producing .hipfb like .fatbin. Previously, llvm-offload-binary was used to package the device IR and embed it in the host object file; the clang linker wrapper was then run on each host object file to extract the device IR, perform device linking, bundle the code objects into a fat binary, wrap it in a host object file, and merge that with the original host object using the host linker's '-r' option. However, the host linker in the MSVC toolchain does not support the '-r' option. The new approach still packages the device IR with llvm-offload-binary, but instead of embedding the package in a host object, it passes the package to the clang linker wrapper directly, where the device IR is extracted and linked and a fat binary is generated, which is then embedded in the host object directly. Compared with the old offload driver, this approach can parallelize device linking for different GPUs by using the parallelization feature of the clang linker wrapper.
1ee2d53 to
3f6d07c
Compare
jhuber6
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LG, thanks
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/108/builds/19881 Here is the relevant piece of the build log for the reference |
#167918 broke the buildbots (https://lab.llvm.org/buildbot/#/builders/64/builds/6531 and https://lab.llvm.org/buildbot/#/builders/108/builds/19881) with the error: # | clang: error: unable to execute command: posix_spawn failed: No such file or directory # | clang: error: ld.lld command failed with exit code 1 (use -v to see invocation). This is because the test requires lld, but these buildbots do not build it. Fix the lit test by adding REQUIRES: lld
llvm/llvm-project#167918 broke the buildbots (https://lab.llvm.org/buildbot/#/builders/64/builds/6531 and https://lab.llvm.org/buildbot/#/builders/108/builds/19881) with the error: # | clang: error: unable to execute command: posix_spawn failed: No such file or directory # | clang: error: ld.lld command failed with exit code 1 (use -v to see invocation). This is because the test requires lld, but these buildbots do not build it. Fix the lit test by adding REQUIRES: lld
Use clang linker wrapper to device-link and embed HIP fat binary directly. Match CUDA non-RDC flow in new driver by producing .hipfb like .fatbin.
Previously, llvm-offload-binary was used to package the device IR and embed it in the host object file; the clang linker wrapper was then run on each host object file to extract the device IR, perform device linking, bundle the code objects into a fat binary, wrap it in a host object file, and merge that with the original host object using the host linker's '-r' option. However, the host linker in the MSVC toolchain does not support the '-r' option.
The new approach still packages the device IR with llvm-offload-binary, but instead of embedding the package in a host object, it passes the package to the clang linker wrapper directly, where the device IR is extracted and linked and a fat binary is generated, which is then embedded in the host object directly. Compared with the old offload driver, this approach can parallelize device linking for different GPUs by using the parallelization feature of the clang linker wrapper.
Fixes: SWDEV-565994