diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp index b463f889df373..34bf41d8b7ed1 100644 --- a/clang/lib/CodeGen/CGCUDANV.cpp +++ b/clang/lib/CodeGen/CGCUDANV.cpp @@ -1280,8 +1280,7 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() { return nullptr; } if (CGM.getLangOpts().OffloadViaLLVM || - (CGM.getLangOpts().OffloadingNewDriver && - (CGM.getLangOpts().HIP || RelocatableDeviceCode))) + (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)) createOffloadingEntries(); else return makeModuleCtorFunction(); diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 9fd64d4aac514..04fd68692d8d8 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -4413,10 +4413,6 @@ void Driver::BuildDefaultActions(Compilation &C, DerivedArgList &Args, options::OPT_no_offload_new_driver, C.isOffloadingHostKind(Action::OFK_Cuda)); - bool HIPNoRDC = - C.isOffloadingHostKind(Action::OFK_HIP) && - !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false); - // Builder to be used to build offloading actions. std::unique_ptr OffloadBuilder = !UseNewOffloadingDriver @@ -4550,7 +4546,7 @@ void Driver::BuildDefaultActions(Compilation &C, DerivedArgList &Args, // Check if this Linker Job should emit a static library. if (ShouldEmitStaticLibrary(Args)) { LA = C.MakeAction(LinkerInputs, types::TY_Image); - } else if ((UseNewOffloadingDriver && !HIPNoRDC) || + } else if (UseNewOffloadingDriver || Args.hasArg(options::OPT_offload_link)) { LA = C.MakeAction(LinkerInputs, types::TY_Image); LA->propagateHostOffloadInfo(C.getActiveOffloadKinds(), @@ -4887,20 +4883,6 @@ Action *Driver::BuildOffloadingActions(Compilation &C, << "-fhip-emit-relocatable" << "--offload-device-only"; - // For HIP non-rdc non-device-only compilation, create a linker wrapper - // action for each host object to link, bundle and wrap device files in - // it. 
- if ((isa(HostAction) || - (isa(HostAction) && - HostAction->getType() == types::TY_LTO_BC)) && - HIPNoRDC && !offloadDeviceOnly()) { - ActionList AL{HostAction}; - HostAction = C.MakeAction(AL, types::TY_Object); - HostAction->propagateHostOffloadInfo(C.getActiveOffloadKinds(), - /*BoundArch=*/nullptr); - return HostAction; - } - // Don't build offloading actions if we do not have a compile action. If // preprocessing only ignore embedding. if (!(isa(HostAction) || @@ -5065,6 +5047,21 @@ Action *Driver::BuildOffloadingActions(Compilation &C, DDep.add(*FatbinAction, *C.getOffloadToolChains().first->second, nullptr, Action::OFK_HIP); + } else if (HIPNoRDC) { + // Package all the offloading actions into a single output that can be + // embedded in the host and linked. + Action *PackagerAction = + C.MakeAction(OffloadActions, types::TY_Image); + + // For HIP non-RDC compilation, wrap the device binary with linker wrapper + // before bundling with host code. Do not bind a specific GPU arch here, + // as the packaged image may contain entries for multiple GPUs. + ActionList AL{PackagerAction}; + PackagerAction = + C.MakeAction(AL, types::TY_HIP_FATBIN); + DDep.add(*PackagerAction, + *C.getOffloadToolChains().first->second, + /*BoundArch=*/nullptr, Action::OFK_HIP); } else { // Package all the offloading actions into a single output that can be // embedded in the host and linked. @@ -5194,6 +5191,14 @@ Action *Driver::ConstructPhaseAction( return C.MakeAction(Input, types::TY_LLVM_BC); } case phases::Backend: { + // Skip a redundant Backend phase for HIP device code when using the new + // offload driver, where mid-end is done in linker wrapper. 
+ if (TargetDeviceOffloadKind == Action::OFK_HIP && + Args.hasFlag(options::OPT_offload_new_driver, + options::OPT_no_offload_new_driver, false) && + !offloadDeviceOnly()) + return Input; + if (isUsingLTO() && TargetDeviceOffloadKind == Action::OFK_None) { types::ID Output; if (Args.hasArg(options::OPT_ffat_lto_objects) && @@ -5213,7 +5218,8 @@ Action *Driver::ConstructPhaseAction( if (Args.hasArg(options::OPT_emit_llvm) || TargetDeviceOffloadKind == Action::OFK_SYCL || (((Input->getOffloadingToolChain() && - Input->getOffloadingToolChain()->getTriple().isAMDGPU()) || + Input->getOffloadingToolChain()->getTriple().isAMDGPU() && + TargetDeviceOffloadKind != Action::OFK_None) || TargetDeviceOffloadKind == Action::OFK_HIP) && ((Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false) || diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 4fe9cc4990eb7..c5d40c9825fab 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -7636,7 +7636,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-fcuda-include-gpubinary"); CmdArgs.push_back(CudaDeviceInput->getFilename()); } else if (!HostOffloadingInputs.empty()) { - if (IsCuda && !IsRDCMode) { + if ((IsCuda || IsHIP) && !IsRDCMode) { assert(HostOffloadingInputs.size() == 1 && "Only one input expected"); CmdArgs.push_back("-fcuda-include-gpubinary"); CmdArgs.push_back(HostOffloadingInputs.front().getFilename()); @@ -9093,7 +9093,7 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA, auto ShouldForward = [&](const llvm::DenseSet &Set, Arg *A, const ToolChain &TC) { // CMake hack to avoid printing verbose informatoin for HIP non-RDC mode. 
- if (A->getOption().matches(OPT_v) && JA.getType() == types::TY_Object) + if (A->getOption().matches(OPT_v) && JA.getType() == types::TY_HIP_FATBIN) return false; return (Set.contains(A->getOption().getID()) || (A->getOption().getGroup().isValid() && @@ -9175,7 +9175,7 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA, // non-RDC mode compilation. This confuses default CMake implicit linker // argument parsing when the language is set to HIP and the system linker is // also `ld.lld`. - if (Args.hasArg(options::OPT_v) && JA.getType() != types::TY_Object) + if (Args.hasArg(options::OPT_v) && JA.getType() != types::TY_HIP_FATBIN) CmdArgs.push_back("--wrapper-verbose"); if (Arg *A = Args.getLastArg(options::OPT_cuda_path_EQ)) CmdArgs.push_back( @@ -9247,14 +9247,14 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA, // We use action type to differentiate two use cases of the linker wrapper. // TY_Image for normal linker wrapper work. - // TY_Object for HIP fno-gpu-rdc embedding device binary in a relocatable - // object. - assert(JA.getType() == types::TY_Object || JA.getType() == types::TY_Image); - if (JA.getType() == types::TY_Object) { + // TY_HIP_FATBIN for HIP fno-gpu-rdc emitting a fat binary without wrapping. 
+ assert(JA.getType() == types::TY_HIP_FATBIN || + JA.getType() == types::TY_Image); + if (JA.getType() == types::TY_HIP_FATBIN) { + CmdArgs.push_back("--emit-fatbin-only"); CmdArgs.append({"-o", Output.getFilename()}); for (auto Input : Inputs) CmdArgs.push_back(Input.getFilename()); - CmdArgs.push_back("-r"); } else for (const char *LinkArg : LinkCommand->getArguments()) CmdArgs.push_back(LinkArg); diff --git a/clang/test/Driver/hip-binding.hip b/clang/test/Driver/hip-binding.hip index 4d15f9739ba7f..1c02cfaadfad2 100644 --- a/clang/test/Driver/hip-binding.hip +++ b/clang/test/Driver/hip-binding.hip @@ -101,6 +101,6 @@ // RUN: | FileCheck -check-prefix=LTO-NO-RDC %s // LTO-NO-RDC: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[LTO_908:.+]]" // LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT]]"], output: "[[LTO_90A:.+]]" -// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "Offload::Packager", inputs: ["[[LTO_908]]", "[[LTO_90A]]"], output: "[[PKG:.+]]" -// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT]]", "[[PKG]]"], output: "[[OBJ:.+]]" -// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[OBJ]]"], output: "hip-binding.o" +// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "Offload::Packager", inputs: ["[[LTO_908]]", "[[LTO_90A]]"], output: "[[PKG:.+]]" +// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "Offload::Linker", inputs: ["[[PKG]]"], output: "[[HIPFB:.+]]" +// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT]]", "[[HIPFB]]"], output: "hip-binding.o" diff --git a/clang/test/Driver/hip-phases.hip b/clang/test/Driver/hip-phases.hip index 13f682f18a3ab..47e4bfca68a39 100644 --- a/clang/test/Driver/hip-phases.hip +++ b/clang/test/Driver/hip-phases.hip @@ -33,32 +33,33 @@ // BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]]) // BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-[[T]], [[ARCH]]) // 
OLDN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]]) -// NEW-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]]) // OLDN-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]]) // OLDR-DAG: [[P7:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]]) // OLD-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, image, (device-[[T]], [[ARCH]]) // OLD-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P8]]}, image -// NEW-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P6]]}, ir +// NEW-DAG: [[P6:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P5]]}, ir // OLDN-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, hip-fatbin, (device-[[T]]) -// NEW-DAG: [[P10:[0-9]+]]: llvm-offload-binary, {[[P9]]}, image, (device-[[T]]) +// NEW-DAG: [[P7:[0-9]+]]: llvm-offload-binary, {[[P6]]}, image, (device-[[T]]) +// NEWN-DAG: [[P8:[0-9]+]]: clang-linker-wrapper, {[[P7]]}, hip-fatbin, (device-[[T]]) +// NEWLTO-DAG: [[P8:[0-9]+]]: clang-linker-wrapper, {[[P7]]}, hip-fatbin, (device-[[T]]) // OLDR-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, object, (device-[[T]]) // OLDN-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, ir -// NEW-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (x86_64-unknown-linux-gnu)" {[[P10]]}, ir +// NEWN-DAG: [[P9:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P8]]}, ir +// NEWLTO-DAG: [[P9:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P8]]}, ir +// NEWR-DAG: [[P8:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (x86_64-unknown-linux-gnu)" {[[P7]]}, ir // OLDR-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, object // OLDN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, 
assembler, (host-[[T]]) // OLDN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]]) -// NEWN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]]) -// NEWN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]]) -// NEWLTO-DAG: [[P13:[0-9]+]]: backend, {[[P11]]}, lto-bc, (host-hip) -// NEWR-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]]) -// NEWR-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]]) +// NEWN-DAG: [[P10:[0-9]+]]: backend, {[[P9]]}, assembler, (host-[[T]]) +// NEWN-DAG: [[P11:[0-9]+]]: assembler, {[[P10]]}, object, (host-[[T]]) +// NEWLTO-DAG: [[P10:[0-9]+]]: backend, {[[P9]]}, lto-bc, (host-hip) +// NEWR-DAG: [[P9:[0-9]+]]: backend, {[[P8]]}, assembler, (host-[[T]]) +// NEWR-DAG: [[P10:[0-9]+]]: assembler, {[[P9]]}, object, (host-[[T]]) // OLDN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]]) -// NEWN-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, object, (host-[[T]]) -// NEWLTO-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, object, (host-[[T]]) +// NEWN-DAG: [[P12:[0-9]+]]: clang-linker-wrapper, {[[P11]]}, image, (host-[[T]]) // OLDR-DAG: [[P14:[0-9]+]]: linker, {[[P13]], [[P11]]}, image, (host-[[T]]) -// NEWR-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, image, (host-[[T]]) -// NEWN-DAG: [[P15:[0-9]+]]: linker, {[[P14]]}, image +// NEWR-DAG: [[P11:[0-9]+]]: clang-linker-wrapper, {[[P10]]}, image, (host-[[T]]) // // Test single gpu architecture up to the assemble phase. 
@@ -613,7 +614,6 @@ // MIXED-DAG: input, "{{.*}}empty.hip", hip, (device-hip, gfx803) // MIXED-DAG: input, "{{.*}}empty.hip", hip, (device-hip, gfx900) // MIXED-DAG: input, "{{.*}}empty.cpp", c++ -// MIXED-NEG-NOT: input, "{{.*}}empty.cpp", c++, (host-hip) // MIXED-NEG-NOT: input, "{{.*}}empty.cpp", c++, (device-hip // MIXED2-DAG: input, "{{.*}}empty.hip", hip, (host-hip) @@ -658,17 +658,15 @@ // LTO-NEXT: 3: input, "[[INPUT]]", hip, (device-hip, gfx908) // LTO-NEXT: 4: preprocessor, {3}, hip-cpp-output, (device-hip, gfx908) // LTO-NEXT: 5: compiler, {4}, ir, (device-hip, gfx908) -// LTO-NEXT: 6: backend, {5}, lto-bc, (device-hip, gfx908) -// LTO-NEXT: 7: offload, "device-hip (amdgcn-amd-amdhsa:gfx908)" {6}, lto-bc -// LTO-NEXT: 8: input, "[[INPUT]]", hip, (device-hip, gfx90a) -// LTO-NEXT: 9: preprocessor, {8}, hip-cpp-output, (device-hip, gfx90a) -// LTO-NEXT: 10: compiler, {9}, ir, (device-hip, gfx90a) -// LTO-NEXT: 11: backend, {10}, lto-bc, (device-hip, gfx90a) -// LTO-NEXT: 12: offload, "device-hip (amdgcn-amd-amdhsa:gfx90a)" {11}, lto-bc -// LTO-NEXT: 13: llvm-offload-binary, {7, 12}, image, (device-hip) -// LTO-NEXT: 14: offload, "host-hip (x86_64-unknown-linux-gnu)" {2}, "device-hip (x86_64-unknown-linux-gnu)" {13}, ir -// LTO-NEXT: 15: backend, {14}, assembler, (host-hip) -// LTO-NEXT: 16: assembler, {15}, object, (host-hip) +// LTO-NEXT: 6: offload, "device-hip (amdgcn-amd-amdhsa:gfx908)" {5}, ir +// LTO-NEXT: 7: input, "[[INPUT]]", hip, (device-hip, gfx90a) +// LTO-NEXT: 8: preprocessor, {7}, hip-cpp-output, (device-hip, gfx90a) +// LTO-NEXT: 9: compiler, {8}, ir, (device-hip, gfx90a) +// LTO-NEXT: 10: offload, "device-hip (amdgcn-amd-amdhsa:gfx90a)" {9}, ir +// LTO-NEXT: 11: llvm-offload-binary, {6, 10}, image, (device-hip) +// LTO-NEXT: 12: offload, "host-hip (x86_64-unknown-linux-gnu)" {2}, "device-hip (x86_64-unknown-linux-gnu)" {11}, ir +// LTO-NEXT: 13: backend, {12}, assembler, (host-hip) +// LTO-NEXT: 14: assembler, {13}, object, (host-hip) // 
// Test the new driver when not bundling diff --git a/clang/test/Driver/hip-spirv-translator-new-driver.c b/clang/test/Driver/hip-spirv-translator-new-driver.c index 67d894e2eb506..64ae58e5c34cf 100644 --- a/clang/test/Driver/hip-spirv-translator-new-driver.c +++ b/clang/test/Driver/hip-spirv-translator-new-driver.c @@ -6,4 +6,4 @@ // RUN: | FileCheck %s // CHECK-NOT: {{".*clang-linker-wrapper.*"}} {{.*}} "-o" "[[OUTPUT_FILE:.*.o]]" {{.*}}"[[OUTPUT_FILE]]" -// CHECK: {{".*clang-linker-wrapper.*"}} {{.*}} "-o" {{".*.tmp.o"}} +// CHECK: {{".*clang-linker-wrapper.*"}} {{.*}} "-o" {{".*.hipfb"}} diff --git a/clang/test/Driver/hip-toolchain-no-rdc.hip b/clang/test/Driver/hip-toolchain-no-rdc.hip index 840334e19e7f2..bc560330d5696 100644 --- a/clang/test/Driver/hip-toolchain-no-rdc.hip +++ b/clang/test/Driver/hip-toolchain-no-rdc.hip @@ -101,19 +101,19 @@ // NEW-SAME: "--image=file=[[OBJ_DEV_A_803]],triple=amdgcn-amd-amdhsa,arch=gfx803,kind=hip" // NEW-SAME: "--image=file=[[OBJ_DEV_A_900]],triple=amdgcn-amd-amdhsa,arch=gfx900,kind=hip" +// NEW: [[WRAPPER:".*clang-linker-wrapper]]" +// NEW-SAME: "--emit-fatbin-only" +// NEW-SAME: "-o" "[[HIPFB_A:.*.hipfb]]" "[[PACKAGE_A]]" + // CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu" // CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa" // CHECK-SAME: "-emit-obj" // CHECK-SAME: {{.*}} "-main-file-name" "a.cu" // OLD-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]" -// NEW-SAME: {{.*}} "-fembed-offload-object=[[PACKAGE_A]]" -// OLD-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip" -// NEW-SAME: {{.*}} "-o" [[A_OBJ_HOST_TMP:".*o"]] "-x" "hip" +// NEW-SAME: {{.*}} "-fcuda-include-gpubinary" "[[HIPFB_A]]" +// CHECK-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip" // CHECK-SAME: {{.*}} [[A_SRC]] -// NEW: [[WRAPPER:".*clang-linker-wrapper]]" {{.*}}"--host-triple=x86_64-unknown-linux-gnu" -// NEW: "--linker-path={{.*}}" "-o" [[A_OBJ_HOST:".*o"]] [[A_OBJ_HOST_TMP]] "-r" - // // Compile device code in b.hip to code object 
for gfx803. // @@ -173,19 +173,19 @@ // NEW-SAME: "--image=file=[[OBJ_DEV_B_803]],triple=amdgcn-amd-amdhsa,arch=gfx803,kind=hip" // NEW-SAME: "--image=file=[[OBJ_DEV_B_900]],triple=amdgcn-amd-amdhsa,arch=gfx900,kind=hip" +// NEW: [[WRAPPER:".*clang-linker-wrapper]]" +// NEW-SAME: "--emit-fatbin-only" +// NEW-SAME: "-o" "[[HIPFB_B:.*.hipfb]]" "[[PACKAGE_B]]" + // CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu" // CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa" // CHECK-SAME: "-emit-obj" // CHECK-SAME: {{.*}} "-main-file-name" "b.hip" // OLD-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_B]]" -// NEW-SAME: {{.*}} "-fembed-offload-object=[[PACKAGE_B]]" -// OLD-SAME: {{.*}} "-o" [[B_OBJ_HOST:".*o"]] "-x" "hip" -// NEW-SAME: {{.*}} "-o" [[B_OBJ_HOST_TMP:".*o"]] "-x" "hip" +// NEW-SAME: {{.*}} "-fcuda-include-gpubinary" "[[HIPFB_B]]" +// CHECK-SAME: {{.*}} "-o" [[B_OBJ_HOST:".*o"]] "-x" "hip" // CHECK-SAME: {{.*}} [[B_SRC]] -// NEW: [[WRAPPER:".*clang-linker-wrapper]]" {{.*}}"--host-triple=x86_64-unknown-linux-gnu" -// NEW: "--linker-path={{.*}}" "-o" [[B_OBJ_HOST:".*o"]] [[B_OBJ_HOST_TMP]] "-r" - // // Link host objects. // @@ -219,5 +219,5 @@ // RUN: %clang -### --target=x86_64-linux-gnu -fno-gpu-rdc -nogpulib -nogpuinc \ // RUN: --offload-new-driver --offload-arch=gfx908 -v %s 2>&1 | FileCheck %s --check-prefix=VERBOSE // VERBOSE: clang-linker-wrapper -// VERBOSE-NOT: --device-compiler=amdgcn-amd-amdhsa=-v -// VERBOSE-NOT: --wrapper-verbose +// VERBOSE-NOT: --device-compiler=amdgcn-amd-amdhsa=-v {{.*}}-o {{.*}}.hipfb +// VERBOSE-NOT: --wrapper-verbose {{.*}}-o {{.*}}.hipfb diff --git a/clang/test/Driver/linker-wrapper-hip-no-rdc.c b/clang/test/Driver/linker-wrapper-hip-no-rdc.c new file mode 100644 index 0000000000000..d6838896f7093 --- /dev/null +++ b/clang/test/Driver/linker-wrapper-hip-no-rdc.c @@ -0,0 +1,40 @@ +// UNSUPPORTED: system-windows +// REQUIRES: amdgpu-registered-target + +// Test HIP non-RDC linker wrapper behavior with new offload driver. 
+// The linker wrapper should output .hipfb files directly without using -r option. + +// An externally visible variable so static libraries extract. +__attribute__((visibility("protected"), used)) int x; + +// Create device binaries and package them +// RUN: %clang -cc1 %s -triple amdgcn-amd-amdhsa -emit-llvm-bc -o %t.amdgpu.bc +// RUN: llvm-offload-binary -o %t.out \ +// RUN: --image=file=%t.amdgpu.bc,kind=hip,triple=amdgcn-amd-amdhsa,arch=gfx1100 \ +// RUN: --image=file=%t.amdgpu.bc,kind=hip,triple=amdgcn-amd-amdhsa,arch=gfx1200 + +// Test that linker wrapper outputs .hipfb file without -r option for HIP non-RDC +// The linker wrapper is called directly with the packaged device binary (not embedded in host object) +// Note: When called directly (not through the driver), the linker wrapper processes architectures +// from the packaged binary. The test verifies it can process at least one architecture correctly. +// RUN: clang-linker-wrapper --emit-fatbin-only --linker-path=/usr/bin/ld %t.out -o %t.hipfb 2>&1 + +// Verify the fat binary was created +// RUN: test -f %t.hipfb + +// List code objects in the fat binary +// RUN: clang-offload-bundler -type=o -input=%t.hipfb -list | FileCheck %s --check-prefix=HIP-FATBIN-LIST + +// HIP-FATBIN-LIST-DAG: hip-amdgcn-amd-amdhsa--gfx1100 +// HIP-FATBIN-LIST-DAG: hip-amdgcn-amd-amdhsa--gfx1200 +// HIP-FATBIN-LIST-DAG: host-x86_64-unknown-linux-gnu- + +// Extract code objects for both architectures from the fat binary +// RUN: clang-offload-bundler -type=o -targets=hip-amdgcn-amd-amdhsa--gfx1100,hip-amdgcn-amd-amdhsa--gfx1200 \ +// RUN: -output=%t.gfx1100.co -output=%t.gfx1200.co -input=%t.hipfb -unbundle + +// Verify extracted code objects exist and are not empty +// RUN: test -f %t.gfx1100.co +// RUN: test -s %t.gfx1100.co +// RUN: test -f %t.gfx1200.co +// RUN: test -s %t.gfx1200.co diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index 
bfeca17d2147e..bd4b40192c9f2 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -945,10 +945,12 @@ Error handleOverrideImages( } /// Transforms all the extracted offloading input files into an image that can -/// be registered by the runtime. +/// be registered by the runtime. If NeedsWrapping is false, writes bundled +/// output directly without wrapping or host linking. Expected> linkAndWrapDeviceFiles(ArrayRef> LinkerInputFiles, - const InputArgList &Args, char **Argv, int Argc) { + const InputArgList &Args, char **Argv, int Argc, + bool NeedsWrapping) { llvm::TimeTraceScope TimeScope("Handle all device input"); std::mutex ImageMtx; @@ -1036,8 +1038,9 @@ linkAndWrapDeviceFiles(ArrayRef> LinkerInputFiles, if (Err) return std::move(Err); - // Create a binary image of each offloading image and embed it into a new - // object file. + // Create a binary image of each offloading image and either embed it into a + // new object file or, if NeedsWrapping is false, write the bundled image + // directly to ExecutableName (e.g. .hipfb / .fatbin). 
SmallVector WrappedOutput; for (auto &[Kind, Input] : Images) { // We sort the entries before bundling so they appear in a deterministic @@ -1050,6 +1053,26 @@ linkAndWrapDeviceFiles(ArrayRef> LinkerInputFiles, auto BundledImagesOrErr = bundleLinkedOutput(Input, Args, Kind); if (!BundledImagesOrErr) return BundledImagesOrErr.takeError(); + + if (!NeedsWrapping) { + if (BundledImagesOrErr->size() != 1) + return createStringError( + "Expected a single bundled image for direct fat binary output"); + + Expected> FOBOrErr = + FileOutputBuffer::create( + ExecutableName, BundledImagesOrErr->front()->getBufferSize()); + if (!FOBOrErr) + return FOBOrErr.takeError(); + std::unique_ptr FOB = std::move(*FOBOrErr); + llvm::copy(BundledImagesOrErr->front()->getBuffer(), + FOB->getBufferStart()); + if (Error E = FOB->commit()) + return std::move(E); + + continue; + } + auto OutputOrErr = wrapDeviceImages(*BundledImagesOrErr, Args, Kind); if (!OutputOrErr) return OutputOrErr.takeError(); @@ -1324,15 +1347,22 @@ int main(int Argc, char **Argv) { if (!DeviceInputFiles) reportError(DeviceInputFiles.takeError()); - // Link and wrap the device images extracted from the linker input. - auto FilesOrErr = - linkAndWrapDeviceFiles(*DeviceInputFiles, Args, Argv, Argc); + // Check if we should emit fat binary directly without wrapping or host + // linking. + bool EmitFatbinOnly = Args.hasArg(OPT_emit_fatbin_only); + + // Link and process the device images. The function may emit a direct fat + // binary if --emit-fatbin-only is specified. + auto FilesOrErr = linkAndWrapDeviceFiles(*DeviceInputFiles, Args, Argv, + Argc, !EmitFatbinOnly); if (!FilesOrErr) reportError(FilesOrErr.takeError()); // Run the host linking job with the rendered arguments. 
- if (Error Err = runLinker(*FilesOrErr, Args)) - reportError(std::move(Err)); + if (!EmitFatbinOnly) { + if (Error Err = runLinker(*FilesOrErr, Args)) + reportError(std::move(Err)); + } } if (const opt::Arg *Arg = Args.getLastArg(OPT_wrapper_time_trace_eq)) { diff --git a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td index 87f911c749bf6..ef3a16b2f58bb 100644 --- a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td +++ b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td @@ -64,6 +64,11 @@ def should_extract : CommaJoined<["--"], "should-extract=">, Flags<[WrapperOnlyOption]>, MetaVarName<"">, HelpText<"Set of device architectures we should always extract if found.">; +def emit_fatbin_only + : Flag<["--"], "emit-fatbin-only">, + Flags<[WrapperOnlyOption]>, + HelpText<"Emit fat binary directly without wrapping or host linking">; + // Flags passed to the device linker. def arch_EQ : Joined<["--"], "arch=">, Flags<[DeviceOnlyOption, HelpHidden]>, MetaVarName<"">,