diff --git a/clang/include/clang/Basic/TargetOptions.h b/clang/include/clang/Basic/TargetOptions.h index ba3acd0295871..8bb03249b7f83 100644 --- a/clang/include/clang/Basic/TargetOptions.h +++ b/clang/include/clang/Basic/TargetOptions.h @@ -83,7 +83,7 @@ class TargetOptions { enum CodeObjectVersionKind { COV_None, COV_2 = 200, // Unsupported. - COV_3 = 300, // Unsupported. + COV_3 = 300, COV_4 = 400, COV_5 = 500, }; diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index a89d6b6579f11..640044622fc09 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -4682,9 +4682,9 @@ defm amdgpu_ieee : BoolOption<"m", "amdgpu-ieee", def mcode_object_version_EQ : Joined<["-"], "mcode-object-version=">, Group, HelpText<"Specify code object ABI version. Defaults to 4. (AMDGPU only)">, Visibility<[ClangOption, CC1Option]>, - Values<"none,4,5">, + Values<"none,3,4,5">, NormalizedValuesScope<"TargetOptions">, - NormalizedValues<["COV_None", "COV_4", "COV_5"]>, + NormalizedValues<["COV_None", "COV_3", "COV_4", "COV_5"]>, MarshallingInfoEnum, "COV_4">; defm cumode : SimpleMFlag<"cumode", diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index f104ec5a881cb..25fd940584624 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -2338,7 +2338,7 @@ getAMDGPUCodeObjectArgument(const Driver &D, const llvm::opt::ArgList &Args) { void tools::checkAMDGPUCodeObjectVersion(const Driver &D, const llvm::opt::ArgList &Args) { - const unsigned MinCodeObjVer = 4; + const unsigned MinCodeObjVer = 3; const unsigned MaxCodeObjVer = 5; if (auto *CodeObjArg = getAMDGPUCodeObjectArgument(D, Args)) { diff --git a/clang/test/CodeGenCUDA/amdgpu-code-object-version.cu b/clang/test/CodeGenCUDA/amdgpu-code-object-version.cu index ff5deaf9ab850..0ddd63faf46f2 100644 --- a/clang/test/CodeGenCUDA/amdgpu-code-object-version.cu +++ b/clang/test/CodeGenCUDA/amdgpu-code-object-version.cu @@ -3,6 +3,9 @@ // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \ // RUN: -o - %s | FileCheck %s -check-prefix=V4 +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \ +// RUN: -mcode-object-version=3 -o - %s | FileCheck -check-prefix=V3 %s + // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \ // RUN: -mcode-object-version=4 -o - %s | FileCheck -check-prefix=V4 %s @@ -15,6 +18,7 @@ // RUN: not %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \ // RUN: -mcode-object-version=4.1 -o - %s 2>&1| FileCheck %s -check-prefix=INV +// V3: !{{.*}} = !{i32 1, !"amdgpu_code_object_version", i32 300} // V4: !{{.*}} = !{i32 1, !"amdgpu_code_object_version", i32 400} // V5: !{{.*}} = !{i32 1, !"amdgpu_code_object_version", i32 500} // NONE-NOT: !{{.*}} = !{i32 1, !"amdgpu_code_object_version", diff --git a/clang/test/Driver/hip-code-object-version.hip b/clang/test/Driver/hip-code-object-version.hip index af5f9a3da21df..33559b6576e7d 100644 --- a/clang/test/Driver/hip-code-object-version.hip +++ b/clang/test/Driver/hip-code-object-version.hip @@ -1,5 +1,20 @@ // REQUIRES: amdgpu-registered-target +// Check bundle ID for code object v3. + +// RUN: not %clang -### --target=x86_64-linux-gnu \ +// RUN: -mcode-object-version=3 \ +// RUN: --offload-arch=gfx906 --rocm-path=%S/Inputs/rocm \ +// RUN: %s 2>&1 | FileCheck -check-prefix=V3 %s + +// RUN: not %clang -### --target=x86_64-linux-gnu \ +// RUN: -mcode-object-version=4 -mcode-object-version=3 \ +// RUN: --offload-arch=gfx906 --rocm-path=%S/Inputs/rocm \ +// RUN: %s 2>&1 | FileCheck -check-prefix=V3 %s + +// V3: "-mcode-object-version=3" +// V3: "-mllvm" "--amdhsa-code-object-version=3" +// V3: "-targets=host-x86_64-unknown-linux,hip-amdgcn-amd-amdhsa--gfx906" // Check bundle ID for code object version 4. @@ -47,13 +62,6 @@ // INVALID_2: error: invalid integral value '2' in '-mcode-object-version=2' // INVALID_2-NOT: error: invalid integral value -// RUN: not %clang -### --target=x86_64-linux-gnu \ -// RUN: -mcode-object-version=3 \ -// RUN: --offload-arch=gfx906 --rocm-path=%S/Inputs/rocm \ -// RUN: %s 2>&1 | FileCheck -check-prefix=INVALID_3 %s -// INVALID_3: error: invalid integral value '3' in '-mcode-object-version=3' -// INVALID_3-NOT: error: invalid integral value - // Check LLVM code object version option --amdhsa-code-object-version // is passed to -cc1 and -cc1as, and -mcode-object-version is passed // to -cc1 but not -cc1as. diff --git a/clang/test/Driver/hip-device-libs.hip b/clang/test/Driver/hip-device-libs.hip index 6ac5778721ba5..71d9554da696b 100644 --- a/clang/test/Driver/hip-device-libs.hip +++ b/clang/test/Driver/hip-device-libs.hip @@ -168,6 +168,12 @@ // RUN: --rocm-path=%S/Inputs/rocm %S/Inputs/hip_multiple_inputs/b.hip \ // RUN: 2>&1 | FileCheck %s --check-prefixes=NOABI4 +// Test -mcode-object-version=3 +// RUN: %clang -### --target=x86_64-linux-gnu --offload-arch=gfx900 \ +// RUN: -mcode-object-version=3 \ +// RUN: --rocm-path=%S/Inputs/rocm %S/Inputs/hip_multiple_inputs/b.hip \ +// RUN: 2>&1 | FileCheck %s --check-prefixes=ABI4 + // Test -mcode-object-version=4 // RUN: %clang -### --target=x86_64-linux-gnu --offload-arch=gfx900 \ // RUN: -mcode-object-version=4 \ diff --git a/lld/test/ELF/amdgpu-abi-version.s b/lld/test/ELF/amdgpu-abi-version.s index cda9f5aafa5ee..455a52aec9210 100644 --- a/lld/test/ELF/amdgpu-abi-version.s +++ b/lld/test/ELF/amdgpu-abi-version.s @@ -1,4 +1,11 @@ # REQUIRES: amdgpu +# RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=3 -filetype=obj %s -o %t.o +# RUN: ld.lld -shared %t.o -o %t.so +# RUN: llvm-readobj --file-headers %t.so | FileCheck --check-prefix=COV3 %s + +# COV3: OS/ABI: AMDGPU_HSA (0x40) +# COV3: ABIVersion: 1 + # RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 -filetype=obj %s -o %t.o # RUN: ld.lld -shared %t.o -o %t.so # RUN: llvm-readobj --file-headers %t.so | FileCheck --check-prefix=COV4 %s diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index ed9581ccc93df..8022816d7e616 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -1409,10 +1409,12 @@ The AMDGPU backend uses the following ELF header: object conforms: * ``ELFABIVERSION_AMDGPU_HSA_V2`` is used to specify the version of AMD HSA - runtime ABI for code object V2. Can no longer be emitted by this version of LLVM. + runtime ABI for code object V2. Specify using the Clang option + ``-mcode-object-version=2``. * ``ELFABIVERSION_AMDGPU_HSA_V3`` is used to specify the version of AMD HSA - runtime ABI for code object V3. Can no longer be emitted by this version of LLVM. + runtime ABI for code object V3. Specify using the Clang option + ``-mcode-object-version=3``. * ``ELFABIVERSION_AMDGPU_HSA_V4`` is used to specify the version of AMD HSA runtime ABI for code object V4. Specify using the Clang option @@ -3400,7 +3402,8 @@ Code Object V3 Metadata +++++++++++++++++++++++ .. warning:: - Code object V3 generation is no longer supported by this version of LLVM. + Code object V3 is not the default code object version emitted by this version + of LLVM. Code object V3 and above metadata is specified by the ``NT_AMDGPU_METADATA`` note record (see :ref:`amdgpu-note-records-v3-onwards`). diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 8d0ef67a615df..aadc4a68ea132 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -341,6 +341,9 @@ bool AMDGPUAsmPrinter::doInitialization(Module &M) { if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { switch (CodeObjectVersion) { + case AMDGPU::AMDHSA_COV3: + HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV3()); + break; case AMDGPU::AMDHSA_COV4: HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV4()); break; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index b51a876750b58..5060cd3aec581 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -49,14 +49,14 @@ namespace AMDGPU { namespace HSAMD { //===----------------------------------------------------------------------===// -// HSAMetadataStreamerV4 +// HSAMetadataStreamerV3 //===----------------------------------------------------------------------===// -void MetadataStreamerMsgPackV4::dump(StringRef HSAMetadataString) const { +void MetadataStreamerMsgPackV3::dump(StringRef HSAMetadataString) const { errs() << "AMDGPU HSA Metadata:\n" << HSAMetadataString << '\n'; } -void MetadataStreamerMsgPackV4::verify(StringRef HSAMetadataString) const { +void MetadataStreamerMsgPackV3::verify(StringRef HSAMetadataString) const { errs() << "AMDGPU HSA Metadata Parser Test: "; msgpack::Document FromHSAMetadataString; @@ -78,7 +78,7 @@ void MetadataStreamerMsgPackV4::verify(StringRef HSAMetadataString) const { } std::optional -MetadataStreamerMsgPackV4::getAccessQualifier(StringRef AccQual) const { +MetadataStreamerMsgPackV3::getAccessQualifier(StringRef AccQual) const { return StringSwitch>(AccQual) .Case("read_only", StringRef("read_only")) .Case("write_only", StringRef("write_only")) @@ -86,7 +86,7 @@ MetadataStreamerMsgPackV4::getAccessQualifier(StringRef AccQual) const { .Default(std::nullopt); } -std::optional MetadataStreamerMsgPackV4::getAddressSpaceQualifier( +std::optional MetadataStreamerMsgPackV3::getAddressSpaceQualifier( unsigned AddressSpace) const { switch (AddressSpace) { case AMDGPUAS::PRIVATE_ADDRESS: @@ -107,7 +107,7 @@ std::optional MetadataStreamerMsgPackV4::getAddressSpaceQualifier( } StringRef -MetadataStreamerMsgPackV4::getValueKind(Type *Ty, StringRef TypeQual, +MetadataStreamerMsgPackV3::getValueKind(Type *Ty, StringRef TypeQual, StringRef BaseTypeName) const { if (TypeQual.contains("pipe")) return "pipe"; @@ -134,7 +134,7 @@ MetadataStreamerMsgPackV4::getValueKind(Type *Ty, StringRef TypeQual, : "by_value"); } -std::string MetadataStreamerMsgPackV4::getTypeName(Type *Ty, +std::string MetadataStreamerMsgPackV3::getTypeName(Type *Ty, bool Signed) const { switch (Ty->getTypeID()) { case Type::IntegerTyID: { @@ -173,7 +173,7 @@ std::string MetadataStreamerMsgPackV4::getTypeName(Type *Ty, } msgpack::ArrayDocNode -MetadataStreamerMsgPackV4::getWorkGroupDimensions(MDNode *Node) const { +MetadataStreamerMsgPackV3::getWorkGroupDimensions(MDNode *Node) const { auto Dims = HSAMetadataDoc->getArrayNode(); if (Node->getNumOperands() != 3) return Dims; @@ -184,20 +184,14 @@ MetadataStreamerMsgPackV4::getWorkGroupDimensions(MDNode *Node) const { return Dims; } -void MetadataStreamerMsgPackV4::emitVersion() { +void MetadataStreamerMsgPackV3::emitVersion() { auto Version = HSAMetadataDoc->getArrayNode(); - Version.push_back(Version.getDocument()->getNode(VersionMajorV4)); - Version.push_back(Version.getDocument()->getNode(VersionMinorV4)); + Version.push_back(Version.getDocument()->getNode(VersionMajorV3)); + Version.push_back(Version.getDocument()->getNode(VersionMinorV3)); getRootMetadata("amdhsa.version") = Version; } -void MetadataStreamerMsgPackV4::emitTargetID( - const IsaInfo::AMDGPUTargetID &TargetID) { - getRootMetadata("amdhsa.target") = - HSAMetadataDoc->getNode(TargetID.toString(), /*Copy=*/true); -} - -void MetadataStreamerMsgPackV4::emitPrintf(const Module &Mod) { +void MetadataStreamerMsgPackV3::emitPrintf(const Module &Mod) { auto Node = Mod.getNamedMetadata("llvm.printf.fmts"); if (!Node) return; @@ -210,7 +204,7 @@ void MetadataStreamerMsgPackV4::emitPrintf(const Module &Mod) { getRootMetadata("amdhsa.printf") = Printf; } -void MetadataStreamerMsgPackV4::emitKernelLanguage(const Function &Func, +void MetadataStreamerMsgPackV3::emitKernelLanguage(const Function &Func, msgpack::MapDocNode Kern) { // TODO: What about other languages? auto Node = Func.getParent()->getNamedMetadata("opencl.ocl.version"); @@ -229,7 +223,7 @@ void MetadataStreamerMsgPackV4::emitKernelLanguage(const Function &Func, Kern[".language_version"] = LanguageVersion; } -void MetadataStreamerMsgPackV4::emitKernelAttrs(const Function &Func, +void MetadataStreamerMsgPackV3::emitKernelAttrs(const Function &Func, msgpack::MapDocNode Kern) { if (auto Node = Func.getMetadata("reqd_work_group_size")) @@ -254,7 +248,7 @@ void MetadataStreamerMsgPackV4::emitKernelAttrs(const Function &Func, Kern[".kind"] = Kern.getDocument()->getNode("fini"); } -void MetadataStreamerMsgPackV4::emitKernelArgs(const MachineFunction &MF, +void MetadataStreamerMsgPackV3::emitKernelArgs(const MachineFunction &MF, msgpack::MapDocNode Kern) { auto &Func = MF.getFunction(); unsigned Offset = 0; @@ -267,7 +261,7 @@ void MetadataStreamerMsgPackV4::emitKernelArgs(const MachineFunction &MF, Kern[".args"] = Args; } -void MetadataStreamerMsgPackV4::emitKernelArg(const Argument &Arg, +void MetadataStreamerMsgPackV3::emitKernelArg(const Argument &Arg, unsigned &Offset, msgpack::ArrayDocNode Args) { auto Func = Arg.getParent(); @@ -332,7 +326,7 @@ void MetadataStreamerMsgPackV4::emitKernelArg(const Argument &Arg, AccQual, TypeQual); } -void MetadataStreamerMsgPackV4::emitKernelArg( +void MetadataStreamerMsgPackV3::emitKernelArg( const DataLayout &DL, Type *Ty, Align Alignment, StringRef ValueKind, unsigned &Offset, msgpack::ArrayDocNode Args, MaybeAlign PointeeAlign, StringRef Name, StringRef TypeName, StringRef BaseTypeName, @@ -381,7 +375,7 @@ void MetadataStreamerMsgPackV4::emitKernelArg( Args.push_back(Arg); } -void MetadataStreamerMsgPackV4::emitHiddenKernelArgs( +void MetadataStreamerMsgPackV3::emitHiddenKernelArgs( const MachineFunction &MF, unsigned &Offset, msgpack::ArrayDocNode Args) { auto &Func = MF.getFunction(); const GCNSubtarget &ST = MF.getSubtarget(); @@ -454,10 +448,9 @@ void MetadataStreamerMsgPackV4::emitHiddenKernelArgs( } } -msgpack::MapDocNode -MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF, - const SIProgramInfo &ProgramInfo, - unsigned CodeObjectVersion) const { +msgpack::MapDocNode MetadataStreamerMsgPackV3::getHSAKernelProps( + const MachineFunction &MF, const SIProgramInfo &ProgramInfo, + unsigned CodeObjectVersion) const { const GCNSubtarget &STM = MF.getSubtarget(); const SIMachineFunctionInfo &MFI = *MF.getInfo(); const Function &F = MF.getFunction(); @@ -502,19 +495,18 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF, return Kern; } -bool MetadataStreamerMsgPackV4::emitTo(AMDGPUTargetStreamer &TargetStreamer) { +bool MetadataStreamerMsgPackV3::emitTo(AMDGPUTargetStreamer &TargetStreamer) { return TargetStreamer.EmitHSAMetadata(*HSAMetadataDoc, true); } -void MetadataStreamerMsgPackV4::begin(const Module &Mod, +void MetadataStreamerMsgPackV3::begin(const Module &Mod, const IsaInfo::AMDGPUTargetID &TargetID) { emitVersion(); - emitTargetID(TargetID); emitPrintf(Mod); getRootMetadata("amdhsa.kernels") = HSAMetadataDoc->getArrayNode(); } -void MetadataStreamerMsgPackV4::end() { +void MetadataStreamerMsgPackV3::end() { std::string HSAMetadataString; raw_string_ostream StrOS(HSAMetadataString); HSAMetadataDoc->toYAML(StrOS); @@ -525,7 +517,7 @@ void MetadataStreamerMsgPackV4::end() { verify(StrOS.str()); } -void MetadataStreamerMsgPackV4::emitKernel(const MachineFunction &MF, +void MetadataStreamerMsgPackV3::emitKernel(const MachineFunction &MF, const SIProgramInfo &ProgramInfo) { auto &Func = MF.getFunction(); if (Func.getCallingConv() != CallingConv::AMDGPU_KERNEL && @@ -550,6 +542,31 @@ void MetadataStreamerMsgPackV4::emitKernel(const MachineFunction &MF, Kernels.push_back(Kern); } +//===----------------------------------------------------------------------===// +// HSAMetadataStreamerV4 +//===----------------------------------------------------------------------===// + +void MetadataStreamerMsgPackV4::emitVersion() { + auto Version = HSAMetadataDoc->getArrayNode(); + Version.push_back(Version.getDocument()->getNode(VersionMajorV4)); + Version.push_back(Version.getDocument()->getNode(VersionMinorV4)); + getRootMetadata("amdhsa.version") = Version; +} + +void MetadataStreamerMsgPackV4::emitTargetID( + const IsaInfo::AMDGPUTargetID &TargetID) { + getRootMetadata("amdhsa.target") = + HSAMetadataDoc->getNode(TargetID.toString(), /*Copy=*/true); +} + +void MetadataStreamerMsgPackV4::begin(const Module &Mod, + const IsaInfo::AMDGPUTargetID &TargetID) { + emitVersion(); + emitTargetID(TargetID); + emitPrintf(Mod); + getRootMetadata("amdhsa.kernels") = HSAMetadataDoc->getArrayNode(); +} + //===----------------------------------------------------------------------===// // HSAMetadataStreamerV5 //===----------------------------------------------------------------------===// @@ -663,7 +680,7 @@ void MetadataStreamerMsgPackV5::emitHiddenKernelArgs( void MetadataStreamerMsgPackV5::emitKernelAttrs(const Function &Func, msgpack::MapDocNode Kern) { - MetadataStreamerMsgPackV4::emitKernelAttrs(Func, Kern); + MetadataStreamerMsgPackV3::emitKernelAttrs(Func, Kern); if (Func.getFnAttribute("uniform-work-group-size").getValueAsBool()) Kern[".uniform_work_group_size"] = Kern.getDocument()->getNode(1); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h index 18a7b5d7a9633..d2b3b8917ce0f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h @@ -62,7 +62,7 @@ class MetadataStreamer { msgpack::MapDocNode Kern) = 0; }; -class MetadataStreamerMsgPackV4 : public MetadataStreamer { +class MetadataStreamerMsgPackV3 : public MetadataStreamer { protected: std::unique_ptr HSAMetadataDoc = std::make_unique(); @@ -89,8 +89,6 @@ class MetadataStreamerMsgPackV4 : public MetadataStreamer { void emitVersion() override; - void emitTargetID(const IsaInfo::AMDGPUTargetID &TargetID); - void emitPrintf(const Module &Mod); void emitKernelLanguage(const Function &Func, msgpack::MapDocNode Kern); @@ -122,8 +120,8 @@ class MetadataStreamerMsgPackV4 : public MetadataStreamer { } public: - MetadataStreamerMsgPackV4() = default; - ~MetadataStreamerMsgPackV4() = default; + MetadataStreamerMsgPackV3() = default; + ~MetadataStreamerMsgPackV3() = default; bool emitTo(AMDGPUTargetStreamer &TargetStreamer) override; @@ -136,6 +134,19 @@ class MetadataStreamerMsgPackV4 : public MetadataStreamer { const SIProgramInfo &ProgramInfo) override; }; +class MetadataStreamerMsgPackV4 : public MetadataStreamerMsgPackV3 { +protected: + void emitVersion() override; + void emitTargetID(const IsaInfo::AMDGPUTargetID &TargetID); + +public: + MetadataStreamerMsgPackV4() = default; + ~MetadataStreamerMsgPackV4() = default; + + void begin(const Module &Mod, + const IsaInfo::AMDGPUTargetID &TargetID) override; +}; + class MetadataStreamerMsgPackV5 final : public MetadataStreamerMsgPackV4 { protected: void emitVersion() override; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 21abfb42d11ba..3d70ed150df12 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -6478,6 +6478,11 @@ bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) return legalizeTrapEndpgm(MI, MRI, B); + const Module *M = B.getMF().getFunction().getParent(); + unsigned CodeObjectVersion = AMDGPU::getCodeObjectVersion(*M); + if (CodeObjectVersion <= AMDGPU::AMDHSA_COV3) + return legalizeTrapHsaQueuePtr(MI, MRI, B); + return ST.supportsGetDoorbellID() ? legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B); } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 42af09e27e471..6b8c03c1620d2 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -424,6 +424,7 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( switch (CodeObjectVersion) { default: break; + case AMDGPU::AMDHSA_COV3: case AMDGPU::AMDHSA_COV4: case AMDGPU::AMDHSA_COV5: if (getTargetID()->isXnackSupported()) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 9c5b166c96522..f8f1e6d6c9097 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -5992,6 +5992,11 @@ SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const { Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) return lowerTrapEndpgm(Op, DAG); + const Module *M = DAG.getMachineFunction().getFunction().getParent(); + unsigned CodeObjectVersion = AMDGPU::getCodeObjectVersion(*M); + if (CodeObjectVersion <= AMDGPU::AMDHSA_COV3) + return lowerTrapHsaQueuePtr(Op, DAG); + return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG) : lowerTrapHsaQueuePtr(Op, DAG); } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 5fff19eada75d..d123b384a27d4 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -128,6 +128,8 @@ std::optional getHsaAbiVersion(const MCSubtargetInfo *STI) { return std::nullopt; switch (AmdhsaCodeObjectVersion) { + case 3: + return ELF::ELFABIVERSION_AMDGPU_HSA_V3; case 4: return ELF::ELFABIVERSION_AMDGPU_HSA_V4; case 5: @@ -138,6 +140,12 @@ std::optional getHsaAbiVersion(const MCSubtargetInfo *STI) { } } +bool isHsaAbiVersion3(const MCSubtargetInfo *STI) { + if (std::optional HsaAbiVer = getHsaAbiVersion(STI)) + return *HsaAbiVer == ELF::ELFABIVERSION_AMDGPU_HSA_V3; + return false; +} + bool isHsaAbiVersion4(const MCSubtargetInfo *STI) { if (std::optional HsaAbiVer = getHsaAbiVersion(STI)) return *HsaAbiVer == ELF::ELFABIVERSION_AMDGPU_HSA_V4; @@ -166,6 +174,7 @@ unsigned getCodeObjectVersion(const Module &M) { unsigned getMultigridSyncArgImplicitArgPosition(unsigned CodeObjectVersion) { switch (CodeObjectVersion) { + case AMDHSA_COV3: case AMDHSA_COV4: return 48; case AMDHSA_COV5: @@ -179,6 +188,7 @@ unsigned getMultigridSyncArgImplicitArgPosition(unsigned CodeObjectVersion) { // central TD file. unsigned getHostcallImplicitArgPosition(unsigned CodeObjectVersion) { switch (CodeObjectVersion) { + case AMDHSA_COV3: case AMDHSA_COV4: return 24; case AMDHSA_COV5: @@ -189,6 +199,7 @@ unsigned getHostcallImplicitArgPosition(unsigned CodeObjectVersion) { unsigned getDefaultQueueImplicitArgPosition(unsigned CodeObjectVersion) { switch (CodeObjectVersion) { + case AMDHSA_COV3: case AMDHSA_COV4: return 32; case AMDHSA_COV5: @@ -199,6 +210,7 @@ unsigned getDefaultQueueImplicitArgPosition(unsigned CodeObjectVersion) { unsigned getCompletionActionImplicitArgPosition(unsigned CodeObjectVersion) { switch (CodeObjectVersion) { + case AMDHSA_COV3: case AMDHSA_COV4: return 40; case AMDHSA_COV5: @@ -762,6 +774,15 @@ std::string AMDGPUTargetID::toString() const { std::string Features; if (STI.getTargetTriple().getOS() == Triple::AMDHSA) { switch (CodeObjectVersion) { + case AMDGPU::AMDHSA_COV3: + // xnack. + if (isXnackOnOrAny()) + Features += "+xnack"; + // In code object v2 and v3, "sramecc" feature was spelled with a + // hyphen ("sram-ecc"). + if (isSramEccOnOrAny()) + Features += "+sram-ecc"; + break; case AMDGPU::AMDHSA_COV4: case AMDGPU::AMDHSA_COV5: // sramecc. diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 1e0994d0862cf..bb2964f592f66 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -42,12 +42,19 @@ namespace AMDGPU { struct IsaVersion; -enum { AMDHSA_COV4 = 4, AMDHSA_COV5 = 5 }; +enum { + AMDHSA_COV3 = 3, + AMDHSA_COV4 = 4, + AMDHSA_COV5 = 5 +}; /// \returns True if \p STI is AMDHSA. bool isHsaAbi(const MCSubtargetInfo &STI); /// \returns HSA OS ABI Version identification. std::optional getHsaAbiVersion(const MCSubtargetInfo *STI); +/// \returns True if HSA OS ABI Version identification is 3, +/// false otherwise. +bool isHsaAbiVersion3(const MCSubtargetInfo *STI); /// \returns True if HSA OS ABI Version identification is 4, /// false otherwise. bool isHsaAbiVersion4(const MCSubtargetInfo *STI); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll index 4bdbe6604782a..c25ecafa1f7c0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll @@ -1,11 +1,38 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/CODE_OBJECT_VERSION/300/g' %s | llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 | FileCheck --check-prefix=GFX8V3 %s ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 | FileCheck --check-prefix=GFX8V4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 | FileCheck --check-prefix=GFX8V5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/300/g' %s | llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V3 %s ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V5 %s define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addrspace(3) %ptr.local) { +; GFX8V3-LABEL: addrspacecast: +; GFX8V3: ; %bb.0: +; GFX8V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x40 +; GFX8V3-NEXT: v_mov_b32_e32 v2, 1 +; GFX8V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V3-NEXT: s_mov_b32 s4, s0 +; GFX8V3-NEXT: s_mov_b32 s5, s3 +; GFX8V3-NEXT: s_cmp_lg_u32 s0, -1 +; GFX8V3-NEXT: s_cselect_b64 s[4:5], s[4:5], 0 +; GFX8V3-NEXT: s_mov_b32 s6, s1 +; GFX8V3-NEXT: s_mov_b32 s7, s2 +; GFX8V3-NEXT: s_cmp_lg_u32 s1, -1 +; GFX8V3-NEXT: v_mov_b32_e32 v0, s4 +; GFX8V3-NEXT: s_cselect_b64 s[0:1], s[6:7], 0 +; GFX8V3-NEXT: v_mov_b32_e32 v1, s5 +; GFX8V3-NEXT: flat_store_dword v[0:1], v2 +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V3-NEXT: v_mov_b32_e32 v2, 2 +; GFX8V3-NEXT: v_mov_b32_e32 v1, s1 +; GFX8V3-NEXT: flat_store_dword v[0:1], v2 +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: s_endpgm +; ; GFX8V4-LABEL: addrspacecast: ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 @@ -55,6 +82,30 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX8V5-NEXT: s_waitcnt vmcnt(0) ; GFX8V5-NEXT: s_endpgm ; +; GFX9V3-LABEL: addrspacecast: +; GFX9V3: ; %bb.0: +; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9V3-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX9V3-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX9V3-NEXT: v_mov_b32_e32 v2, 1 +; GFX9V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V3-NEXT: s_mov_b32 s2, s0 +; GFX9V3-NEXT: s_cmp_lg_u32 s0, -1 +; GFX9V3-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX9V3-NEXT: s_mov_b32 s4, s1 +; GFX9V3-NEXT: s_cmp_lg_u32 s1, -1 +; GFX9V3-NEXT: v_mov_b32_e32 v0, s2 +; GFX9V3-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 +; GFX9V3-NEXT: v_mov_b32_e32 v1, s3 +; GFX9V3-NEXT: flat_store_dword v[0:1], v2 +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: v_mov_b32_e32 v0, s0 +; GFX9V3-NEXT: v_mov_b32_e32 v2, 2 +; GFX9V3-NEXT: v_mov_b32_e32 v1, s1 +; GFX9V3-NEXT: flat_store_dword v[0:1], v2 +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: s_endpgm +; ; GFX9V4-LABEL: addrspacecast: ; GFX9V4: ; %bb.0: ; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -110,6 +161,19 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr } define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { +; GFX8V3-LABEL: llvm_amdgcn_is_shared: +; GFX8V3: ; %bb.0: +; GFX8V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V3-NEXT: s_load_dword s0, s[4:5], 0x40 +; GFX8V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V3-NEXT: s_cmp_eq_u32 s1, s0 +; GFX8V3-NEXT: s_cselect_b32 s0, 1, 0 +; GFX8V3-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V3-NEXT: flat_store_dword v[0:1], v0 +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: s_endpgm +; ; GFX8V4-LABEL: llvm_amdgcn_is_shared: ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 @@ -136,6 +200,18 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ; GFX8V5-NEXT: s_waitcnt vmcnt(0) ; GFX8V5-NEXT: s_endpgm ; +; GFX9V3-LABEL: llvm_amdgcn_is_shared: +; GFX9V3: ; %bb.0: +; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9V3-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX9V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V3-NEXT: s_cmp_eq_u32 s1, s3 +; GFX9V3-NEXT: s_cselect_b32 s0, 1, 0 +; GFX9V3-NEXT: v_mov_b32_e32 v0, s0 +; GFX9V3-NEXT: global_store_dword v[0:1], v0, off +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: s_endpgm +; ; GFX9V4-LABEL: llvm_amdgcn_is_shared: ; GFX9V4: ; %bb.0: ; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -166,6 +242,19 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { } define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { +; GFX8V3-LABEL: llvm_amdgcn_is_private: +; GFX8V3: ; %bb.0: +; GFX8V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V3-NEXT: s_load_dword s0, s[4:5], 0x44 +; GFX8V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V3-NEXT: s_cmp_eq_u32 s1, s0 +; GFX8V3-NEXT: s_cselect_b32 s0, 1, 0 +; GFX8V3-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V3-NEXT: flat_store_dword v[0:1], v0 +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: s_endpgm +; ; GFX8V4-LABEL: llvm_amdgcn_is_private: ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 @@ -192,6 +281,18 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { ; GFX8V5-NEXT: s_waitcnt vmcnt(0) ; GFX8V5-NEXT: s_endpgm ; +; GFX9V3-LABEL: llvm_amdgcn_is_private: +; GFX9V3: ; %bb.0: +; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9V3-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX9V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V3-NEXT: s_cmp_eq_u32 s1, s3 +; GFX9V3-NEXT: s_cselect_b32 s0, 1, 0 +; GFX9V3-NEXT: v_mov_b32_e32 v0, s0 +; GFX9V3-NEXT: global_store_dword v[0:1], v0, off +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: s_endpgm +; ; GFX9V4-LABEL: llvm_amdgcn_is_private: ; GFX9V4: ; %bb.0: ; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -222,6 +323,11 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { } define amdgpu_kernel void @llvm_trap() { +; GFX8V3-LABEL: llvm_trap: +; GFX8V3: ; %bb.0: +; GFX8V3-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX8V3-NEXT: s_trap 2 +; ; GFX8V4-LABEL: llvm_trap: ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: s_mov_b64 s[0:1], s[4:5] @@ -233,6 +339,11 @@ define amdgpu_kernel void @llvm_trap() { ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_trap 2 ; +; GFX9V3-LABEL: llvm_trap: +; GFX9V3: ; %bb.0: +; GFX9V3-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX9V3-NEXT: s_trap 2 +; ; GFX9V4-LABEL: llvm_trap: ; GFX9V4: ; %bb.0: ; GFX9V4-NEXT: s_trap 2 @@ -245,6 +356,10 @@ define amdgpu_kernel void @llvm_trap() { } define amdgpu_kernel void @llvm_debugtrap() { +; GFX8V3-LABEL: llvm_debugtrap: +; GFX8V3: ; %bb.0: +; GFX8V3-NEXT: s_trap 3 +; ; GFX8V4-LABEL: llvm_debugtrap: ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: s_trap 3 @@ -253,6 +368,10 @@ define amdgpu_kernel void @llvm_debugtrap() { ; GFX8V5: ; %bb.0: ; GFX8V5-NEXT: s_trap 3 ; +; GFX9V3-LABEL: llvm_debugtrap: +; GFX9V3: ; %bb.0: +; GFX9V3-NEXT: s_trap 3 +; ; GFX9V4-LABEL: llvm_debugtrap: ; GFX9V4: ; %bb.0: ; GFX9V4-NEXT: s_trap 3 @@ -265,6 +384,32 @@ define amdgpu_kernel void @llvm_debugtrap() { } define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) { +; GFX8V3-LABEL: llvm_amdgcn_queue_ptr: +; GFX8V3: ; %bb.0: +; GFX8V3-NEXT: v_mov_b32_e32 v0, s6 +; GFX8V3-NEXT: v_mov_b32_e32 v1, s7 +; GFX8V3-NEXT: s_add_u32 s0, s8, 8 +; GFX8V3-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V3-NEXT: s_addc_u32 s1, s9, 0 +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V3-NEXT: v_mov_b32_e32 v1, s1 +; GFX8V3-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: v_mov_b32_e32 v0, s4 +; GFX8V3-NEXT: v_mov_b32_e32 v1, s5 +; GFX8V3-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V3-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: v_mov_b32_e32 v0, s10 +; GFX8V3-NEXT: v_mov_b32_e32 v1, s11 +; GFX8V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V3-NEXT: v_mov_b32_e32 v3, s1 +; GFX8V3-NEXT: v_mov_b32_e32 v2, s0 +; GFX8V3-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: s_endpgm +; ; GFX8V4-LABEL: llvm_amdgcn_queue_ptr: ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: v_mov_b32_e32 v0, s6 @@ -315,6 +460,23 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) { ; GFX8V5-NEXT: s_waitcnt vmcnt(0) ; GFX8V5-NEXT: s_endpgm ; +; GFX9V3-LABEL: llvm_amdgcn_queue_ptr: +; GFX9V3: ; %bb.0: +; GFX9V3-NEXT: v_mov_b32_e32 v2, 0 +; GFX9V3-NEXT: global_load_ubyte v0, v2, s[6:7] glc +; GFX9V3-NEXT: global_load_ubyte v0, v2, s[8:9] offset:8 glc +; GFX9V3-NEXT: global_load_ubyte v0, v2, s[4:5] glc +; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: v_mov_b32_e32 v0, s10 +; GFX9V3-NEXT: v_mov_b32_e32 v1, s11 +; GFX9V3-NEXT: ; kill: killed $sgpr6_sgpr7 +; GFX9V3-NEXT: ; kill: killed $sgpr4_sgpr5 +; GFX9V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V3-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: s_endpgm +; ; GFX9V4-LABEL: llvm_amdgcn_queue_ptr: ; GFX9V4: ; %bb.0: ; GFX9V4-NEXT: v_mov_b32_e32 v2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-v3.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-v3.ll new file mode 100644 index 0000000000000..20d0aea61f276 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-v3.ll @@ -0,0 +1,148 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs -amdgpu-verify-hsa-metadata -filetype=obj -o /dev/null < %s 2>&1 | FileCheck --check-prefix=PARSER %s + +; CHECK-LABEL: {{^}}min_64_max_64: +; CHECK: SGPRBlocks: 0 +; CHECK: VGPRBlocks: 0 +; CHECK: NumSGPRsForWavesPerEU: 1 +; CHECK: NumVGPRsForWavesPerEU: 1 +define amdgpu_kernel void @min_64_max_64() #0 { +entry: + ret void +} +attributes #0 = {"amdgpu-flat-work-group-size"="64,64"} + +; CHECK-LABEL: {{^}}min_64_max_128: +; CHECK: SGPRBlocks: 0 +; CHECK: VGPRBlocks: 0 +; CHECK: NumSGPRsForWavesPerEU: 1 +; CHECK: NumVGPRsForWavesPerEU: 1 +define amdgpu_kernel void @min_64_max_128() #1 { +entry: + ret void +} +attributes #1 = {"amdgpu-flat-work-group-size"="64,128"} + +; CHECK-LABEL: {{^}}min_128_max_128: +; CHECK: SGPRBlocks: 0 +; CHECK: VGPRBlocks: 0 +; CHECK: NumSGPRsForWavesPerEU: 1 +; CHECK: NumVGPRsForWavesPerEU: 1 +define amdgpu_kernel void @min_128_max_128() #2 { +entry: + ret void +} +attributes #2 = {"amdgpu-flat-work-group-size"="128,128"} + +; CHECK-LABEL: {{^}}min_1024_max_1024 +; CHECK: SGPRBlocks: 0 +; CHECK: VGPRBlocks: 10 +; CHECK: NumSGPRsForWavesPerEU: 2{{$}} +; CHECK: NumVGPRsForWavesPerEU: 43 +@var = addrspace(1) global float 0.0 +define amdgpu_kernel void @min_1024_max_1024() #3 { + %val0 = load volatile float, ptr addrspace(1) @var + %val1 = load volatile float, ptr addrspace(1) @var + %val2 = load volatile float, ptr addrspace(1) @var + %val3 = load volatile float, ptr addrspace(1) @var + %val4 = load volatile float, ptr addrspace(1) @var + %val5 = load volatile float, ptr addrspace(1) @var + %val6 = load volatile float, ptr addrspace(1) @var + %val7 = load volatile float, ptr addrspace(1) @var + %val8 = load volatile float, ptr addrspace(1) @var + %val9 = load volatile float, ptr addrspace(1) @var + %val10 = load volatile float, ptr addrspace(1) @var + %val11 = load volatile float, ptr addrspace(1) @var + %val12 = load volatile float, ptr addrspace(1) @var + %val13 = load volatile float, ptr addrspace(1) @var + %val14 = load volatile float, ptr addrspace(1) @var + %val15 = load volatile float, ptr addrspace(1) @var + %val16 = load volatile float, ptr addrspace(1) @var + %val17 = load volatile float, ptr addrspace(1) @var + %val18 = load volatile float, ptr addrspace(1) @var + %val19 = load volatile float, ptr addrspace(1) @var + %val20 = load volatile float, ptr addrspace(1) @var + %val21 = load volatile float, ptr addrspace(1) @var + %val22 = load volatile float, ptr addrspace(1) @var + %val23 = load volatile float, ptr addrspace(1) @var + %val24 = load volatile float, ptr addrspace(1) @var + %val25 = load volatile float, ptr addrspace(1) @var + %val26 = load volatile float, ptr addrspace(1) @var + %val27 = load volatile float, ptr addrspace(1) @var + %val28 = load volatile float, ptr addrspace(1) @var + %val29 = load volatile float, ptr addrspace(1) @var + %val30 = load volatile float, ptr addrspace(1) @var + %val31 = load volatile float, ptr addrspace(1) @var + %val32 = load volatile float, ptr addrspace(1) @var + %val33 = load volatile float, ptr addrspace(1) @var + %val34 = load volatile float, ptr addrspace(1) @var + %val35 = load volatile float, ptr addrspace(1) @var + %val36 = load volatile float, ptr addrspace(1) @var + %val37 = load volatile float, ptr addrspace(1) @var + %val38 = load volatile float, ptr addrspace(1) @var + %val39 = load volatile float, ptr addrspace(1) @var + %val40 = load volatile float, ptr addrspace(1) @var + + store volatile float %val0, ptr addrspace(1) @var + store volatile float %val1, ptr addrspace(1) @var + store volatile float %val2, ptr addrspace(1) @var + store volatile float %val3, ptr addrspace(1) @var + store volatile float %val4, ptr addrspace(1) @var + store volatile float %val5, ptr addrspace(1) @var + store volatile float %val6, ptr addrspace(1) @var + store volatile float %val7, ptr addrspace(1) @var + store volatile float %val8, ptr addrspace(1) @var + store volatile float %val9, ptr addrspace(1) @var + store volatile float %val10, ptr addrspace(1) @var + store volatile float %val11, ptr addrspace(1) @var + store volatile float %val12, ptr addrspace(1) @var + store volatile float %val13, ptr addrspace(1) @var + store volatile float %val14, ptr addrspace(1) @var + store volatile float %val15, ptr addrspace(1) @var + store volatile float %val16, ptr addrspace(1) @var + store volatile float %val17, ptr addrspace(1) @var + store volatile float %val18, ptr addrspace(1) @var + store volatile float %val19, ptr addrspace(1) @var + store volatile float %val20, ptr addrspace(1) @var + store volatile float %val21, ptr addrspace(1) @var + store volatile float %val22, ptr addrspace(1) @var + store volatile float %val23, ptr addrspace(1) @var + store volatile float %val24, ptr addrspace(1) @var + store volatile float %val25, ptr addrspace(1) @var + store volatile float %val26, ptr addrspace(1) @var + store volatile float %val27, ptr addrspace(1) @var + store volatile float %val28, ptr addrspace(1) @var + store volatile float %val29, ptr addrspace(1) @var + store volatile float %val30, ptr addrspace(1) @var + store volatile float %val31, ptr addrspace(1) @var + store volatile float %val32, ptr addrspace(1) @var + store volatile float %val33, ptr addrspace(1) @var + store volatile float %val34, ptr addrspace(1) @var + store volatile float %val35, ptr addrspace(1) @var + store volatile float %val36, ptr addrspace(1) @var + store volatile float %val37, ptr addrspace(1) @var + store volatile float %val38, ptr addrspace(1) @var + store volatile float %val39, ptr addrspace(1) @var + store volatile float %val40, ptr addrspace(1) @var + + ret void +} +attributes #3 = {"amdgpu-flat-work-group-size"="1024,1024"} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 300} + +; CHECK: amdhsa.kernels: +; CHECK: .max_flat_workgroup_size: 64 +; CHECK: .name: min_64_max_64 +; CHECK: .max_flat_workgroup_size: 128 +; CHECK: .name: min_64_max_128 +; CHECK: .max_flat_workgroup_size: 128 +; CHECK: .name: min_128_max_128 +; CHECK: .max_flat_workgroup_size: 1024 +; CHECK: .name: min_1024_max_1024 +; CHECK: amdhsa.version: +; CHECK: - 1 +; CHECK: - 0 + +; PARSER: AMDGPU HSA Metadata Parser Test: PASS diff --git a/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target-v3.ll b/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target-v3.ll new file mode 100644 index 0000000000000..6c553e3726abf --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target-v3.ll @@ -0,0 +1,168 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 < %s | FileCheck --check-prefixes=V3-GFX600 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti < %s | FileCheck --check-prefixes=V3-GFX600 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx601 < %s | FileCheck --check-prefixes=V3-GFX601 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=pitcairn < %s | FileCheck --check-prefixes=V3-GFX601 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=verde < %s | FileCheck --check-prefixes=V3-GFX601 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx602 < %s | FileCheck --check-prefixes=V3-GFX602 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hainan < %s | FileCheck --check-prefixes=V3-GFX602 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=oland < %s | FileCheck --check-prefixes=V3-GFX602 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=V3-GFX700 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck --check-prefixes=V3-GFX700 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx701 < %s | FileCheck --check-prefixes=V3-GFX701 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck --check-prefixes=V3-GFX701 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx702 < %s | FileCheck --check-prefixes=V3-GFX702 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx703 < %s | FileCheck --check-prefixes=V3-GFX703 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kabini < %s | FileCheck --check-prefixes=V3-GFX703 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=mullins < %s | FileCheck --check-prefixes=V3-GFX703 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx704 < %s | FileCheck --check-prefixes=V3-GFX704 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire < %s | FileCheck --check-prefixes=V3-GFX704 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx705 < %s | FileCheck --check-prefixes=V3-GFX705 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx801 < %s | FileCheck --check-prefixes=V3-GFX801-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx801 -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX801-NOXNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx801 -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX801-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=carrizo < %s | FileCheck --check-prefixes=V3-GFX801-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=carrizo -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX801-NOXNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=carrizo -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX801-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 < %s | FileCheck --check-prefixes=V3-GFX802 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=iceland < %s | FileCheck --check-prefixes=V3-GFX802 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga < %s | FileCheck --check-prefixes=V3-GFX802 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefixes=V3-GFX803 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck --check-prefixes=V3-GFX803 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=polaris10 < %s | FileCheck --check-prefixes=V3-GFX803 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=polaris11 < %s | FileCheck --check-prefixes=V3-GFX803 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx805 < %s | FileCheck --check-prefixes=V3-GFX805 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tongapro < %s | FileCheck --check-prefixes=V3-GFX805 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 < %s | FileCheck --check-prefixes=V3-GFX810-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX810-NOXNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX810-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=stoney < %s | FileCheck --check-prefixes=V3-GFX810-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=stoney -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX810-NOXNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=stoney -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX810-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefixes=V3-GFX900-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX900-NOXNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX900-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx902 < %s | FileCheck --check-prefixes=V3-GFX902-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx902 -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX902-NOXNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx902 -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX902-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx904 < %s | FileCheck --check-prefixes=V3-GFX904-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx904 -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX904-NOXNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx904 -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX904-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 < %s | FileCheck --check-prefixes=V3-GFX906-SRAMECC-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=-sramecc < %s | FileCheck --check-prefixes=V3-GFX906-NOSRAMECC-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+sramecc < %s | FileCheck --check-prefixes=V3-GFX906-SRAMECC-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX906-SRAMECC-NOXNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX906-SRAMECC-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=-sramecc,-xnack < %s | FileCheck --check-prefixes=V3-GFX906-NOSRAMECC-NOXNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+sramecc,-xnack < %s | FileCheck --check-prefixes=V3-GFX906-SRAMECC-NOXNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=-sramecc,+xnack < %s | FileCheck --check-prefixes=V3-GFX906-NOSRAMECC-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+sramecc,+xnack < %s | FileCheck --check-prefixes=V3-GFX906-SRAMECC-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck --check-prefixes=V3-GFX908-SRAMECC-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -mattr=-sramecc < %s | FileCheck --check-prefixes=V3-GFX908-NOSRAMECC-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -mattr=+sramecc < %s | FileCheck --check-prefixes=V3-GFX908-SRAMECC-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX908-SRAMECC-NOXNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX908-SRAMECC-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -mattr=-sramecc,-xnack < %s | FileCheck --check-prefixes=V3-GFX908-NOSRAMECC-NOXNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -mattr=+sramecc,-xnack < %s | FileCheck --check-prefixes=V3-GFX908-SRAMECC-NOXNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -mattr=-sramecc,+xnack < %s | FileCheck --check-prefixes=V3-GFX908-NOSRAMECC-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -mattr=+sramecc,+xnack < %s | FileCheck --check-prefixes=V3-GFX908-SRAMECC-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx909 < %s | FileCheck --check-prefixes=V3-GFX909-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx909 -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX909-NOXNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx909 -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX909-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90c < %s | FileCheck --check-prefixes=V3-GFX90C-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90c -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX90C-NOXNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90c -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX90C-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck --check-prefixes=V3-GFX940-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX940-NOXNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX940-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=V3-GFX1010-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX1010-NOXNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX1010-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1011 < %s | FileCheck --check-prefixes=V3-GFX1011-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1011 -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX1011-NOXNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1011 -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX1011-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1012 < %s | FileCheck --check-prefixes=V3-GFX1012-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1012 -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX1012-NOXNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1012 -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX1012-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1013 < %s | FileCheck --check-prefixes=V3-GFX1013-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1013 -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX1013-NOXNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1013 -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX1013-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck --check-prefixes=V3-GFX1030 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 < %s | FileCheck --check-prefixes=V3-GFX1031 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1032 < %s | FileCheck --check-prefixes=V3-GFX1032 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1033 < %s | FileCheck --check-prefixes=V3-GFX1033 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1034 < %s | FileCheck --check-prefixes=V3-GFX1034 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1035 < %s | FileCheck --check-prefixes=V3-GFX1035 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1036 < %s | FileCheck --check-prefixes=V3-GFX1036 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=V3-GFX1100 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1101 < %s | FileCheck --check-prefixes=V3-GFX1101 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1102 < %s | FileCheck --check-prefixes=V3-GFX1102 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1103 < %s | FileCheck --check-prefixes=V3-GFX1103 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1150 < %s | FileCheck --check-prefixes=V3-GFX1150 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1151 < %s | FileCheck --check-prefixes=V3-GFX1151 %s + +; V3-GFX600: .amdgcn_target "amdgcn-amd-amdhsa--gfx600" +; V3-GFX601: .amdgcn_target "amdgcn-amd-amdhsa--gfx601" +; V3-GFX602: .amdgcn_target "amdgcn-amd-amdhsa--gfx602" +; V3-GFX700: .amdgcn_target "amdgcn-amd-amdhsa--gfx700" +; V3-GFX701: .amdgcn_target "amdgcn-amd-amdhsa--gfx701" +; V3-GFX702: .amdgcn_target "amdgcn-amd-amdhsa--gfx702" +; V3-GFX703: .amdgcn_target "amdgcn-amd-amdhsa--gfx703" +; V3-GFX704: .amdgcn_target "amdgcn-amd-amdhsa--gfx704" +; V3-GFX705: .amdgcn_target "amdgcn-amd-amdhsa--gfx705" +; V3-GFX801-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx801" +; V3-GFX801-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx801+xnack" +; V3-GFX802: .amdgcn_target "amdgcn-amd-amdhsa--gfx802" +; V3-GFX803: .amdgcn_target "amdgcn-amd-amdhsa--gfx803" +; V3-GFX805: .amdgcn_target "amdgcn-amd-amdhsa--gfx805" +; V3-GFX810-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx810" +; V3-GFX810-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx810+xnack" +; V3-GFX900-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx900" +; V3-GFX900-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx900+xnack" +; V3-GFX902-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx902" +; V3-GFX902-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx902+xnack" +; V3-GFX904-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx904" +; V3-GFX904-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx904+xnack" +; V3-GFX906-NOSRAMECC-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx906" +; V3-GFX906-SRAMECC-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx906+sram-ecc" +; V3-GFX906-NOSRAMECC-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx906+xnack" +; V3-GFX906-SRAMECC-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx906+xnack+sram-ecc" +; V3-GFX908-NOSRAMECC-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx908" +; V3-GFX908-SRAMECC-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx908+sram-ecc" +; V3-GFX908-NOSRAMECC-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx908+xnack" +; V3-GFX908-SRAMECC-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx908+xnack+sram-ecc" +; V3-GFX909-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx909" +; V3-GFX909-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx909+xnack" +; V3-GFX90C-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx90c" +; V3-GFX90C-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx90c+xnack" +; V3-GFX940-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx940+sram-ecc" +; V3-GFX940-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx940+xnack+sram-ecc" +; V3-GFX1010-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx1010" +; V3-GFX1010-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx1010+xnack" +; V3-GFX1011-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx1011" +; V3-GFX1011-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx1011+xnack" +; V3-GFX1012-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx1012" +; V3-GFX1012-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx1012+xnack" +; V3-GFX1013-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx1013" +; V3-GFX1013-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx1013+xnack" +; V3-GFX1030: .amdgcn_target "amdgcn-amd-amdhsa--gfx1030" +; V3-GFX1031: .amdgcn_target "amdgcn-amd-amdhsa--gfx1031" +; V3-GFX1032: .amdgcn_target "amdgcn-amd-amdhsa--gfx1032" +; V3-GFX1033: .amdgcn_target "amdgcn-amd-amdhsa--gfx1033" +; V3-GFX1034: .amdgcn_target "amdgcn-amd-amdhsa--gfx1034" +; V3-GFX1035: .amdgcn_target "amdgcn-amd-amdhsa--gfx1035" +; V3-GFX1036: .amdgcn_target "amdgcn-amd-amdhsa--gfx1036" +; V3-GFX1100: .amdgcn_target "amdgcn-amd-amdhsa--gfx1100" +; V3-GFX1101: .amdgcn_target "amdgcn-amd-amdhsa--gfx1101" +; V3-GFX1102: .amdgcn_target "amdgcn-amd-amdhsa--gfx1102" +; V3-GFX1103: .amdgcn_target "amdgcn-amd-amdhsa--gfx1103" +; V3-GFX1150: .amdgcn_target "amdgcn-amd-amdhsa--gfx1150" +; V3-GFX1151: .amdgcn_target "amdgcn-amd-amdhsa--gfx1151" + + + +define amdgpu_kernel void @directive_amdgcn_target() { + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 300} diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel-.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel-v3.ll similarity index 98% rename from llvm/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel-.ll rename to llvm/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel-v3.ll index 042abe382283a..37b124e7f59a0 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel-.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel-v3.ll @@ -142,7 +142,7 @@ define amdgpu_kernel void @test_no_default_queue(i8 %a) #3 ; CHECK: amdhsa.version: ; CHECK-NEXT: - 1 -; CHECK-NEXT: - 1 +; CHECK-NEXT: - 0 ; CHECK-NOT: amdhsa.printf: attributes #0 = { optnone noinline "amdgpu-no-default-queue" "amdgpu-no-completion-action" "amdgpu-implicitarg-num-bytes"="48" } @@ -151,7 +151,7 @@ attributes #2 = { optnone noinline "amdgpu-no-completion-action" "amdgpu-implici attributes #3 = { optnone noinline "amdgpu-no-default-queue" "amdgpu-implicitarg-num-bytes"="48" } !llvm.module.flags = !{!0} -!0 = !{i32 1, !"amdgpu_code_object_version", i32 400} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 300} !1 = !{i32 0} !2 = !{!"none"} diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ctor-dtor-list.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ctor-dtor-list.ll index fb08fd2c45085..8e8023aa16f13 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ctor-dtor-list.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ctor-dtor-list.ll @@ -39,4 +39,4 @@ define internal void @bar.5() { ; PARSER: AMDGPU HSA Metadata Parser Test: PASS !llvm.module.flags = !{!0} -!0 = !{i32 1, !"amdgpu_code_object_version", i32 400} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 300} diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll similarity index 99% rename from llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll rename to llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll index dc3a6e8b633b2..69efc47008e6a 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll @@ -1739,14 +1739,14 @@ define amdgpu_kernel void @unknown_addrspace_kernarg(ptr addrspace(12345) %ptr) ; CHECK-NEXT: - '2:1:8:%g\n' ; CHECK: amdhsa.version: ; CHECK-NEXT: - 1 -; CHECK-NEXT: - 1 +; CHECK-NEXT: - 0 attributes #0 = { optnone noinline "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-implicitarg-num-bytes"="56" } attributes #1 = { optnone noinline "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-implicitarg-num-bytes"="56" "runtime-handle"="__test_block_invoke_kernel_runtime_handle" } attributes #2 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" } !llvm.module.flags = !{!0} -!0 = !{i32 1, !"amdgpu_code_object_version", i32 400} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 300} !llvm.printf.fmts = !{!100, !101} diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v4.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v3.ll similarity index 99% rename from llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v4.ll rename to llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v3.ll index f4892ebdc9c93..47b882494c919 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v4.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v3.ll @@ -286,7 +286,7 @@ entry: ; CHECK: amdhsa.version: ; CHECK-NEXT: - 1 -; CHECK-NEXT: - 1 +; CHECK-NEXT: - 0 ; We don't have a use of llvm.amdgcn.implicitarg.ptr, so optnone to ; avoid optimizing out the implicit argument allocation. @@ -298,4 +298,4 @@ attributes #4 = { optnone noinline "amdgpu-implicitarg-num-bytes"="48" } attributes #5 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" } !llvm.module.flags = !{!0} -!0 = !{i32 1, !"amdgpu_code_object_version", i32 400} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 300} diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present-asan.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present-v3-asan.ll similarity index 96% rename from llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present-asan.ll rename to llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present-v3-asan.ll index 22c6e14776220..cb3ae289721bc 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present-asan.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present-v3-asan.ll @@ -35,12 +35,12 @@ define amdgpu_kernel void @test_kernel(i8 %a) #0 ; CHECK: amdhsa.version: ; CHECK-NEXT: - 1 -; CHECK-NEXT: - 1 +; CHECK-NEXT: - 0 attributes #0 = { sanitize_address "amdgpu-implicitarg-num-bytes"="48" } !llvm.module.flags = !{!0} -!0 = !{i32 1, !"amdgpu_code_object_version", i32 400} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 300} !1 = !{i32 0} !2 = !{!"none"} !3 = !{!"char"} diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v4.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v3.ll similarity index 99% rename from llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v4.ll rename to llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v3.ll index 8f90025fe8e29..a3f8c5cff95df 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v4.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v3.ll @@ -296,4 +296,4 @@ attributes #3 = { "amdgpu-implicitarg-num-bytes"="48" "amdgpu-no-hostcall-ptr" } attributes #4 = { noinline } !llvm.module.flags = !{!0} -!0 = !{i32 1, !"amdgpu_code_object_version", i32 400} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 300} diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-images.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-images-v3.ll similarity index 98% rename from llvm/test/CodeGen/AMDGPU/hsa-metadata-images.ll rename to llvm/test/CodeGen/AMDGPU/hsa-metadata-images-v3.ll index 6d49f22eb429b..b7f58bbb51bb2 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-images.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-images-v3.ll @@ -96,10 +96,10 @@ define amdgpu_kernel void @test(ptr addrspace(1) %a, ; CHECK: amdhsa.version: ; CHECK-NEXT: - 1 -; CHECK-NEXT: - 1 +; CHECK-NEXT: - 0 !llvm.module.flags = !{!0} -!0 = !{i32 1, !"amdgpu_code_object_version", i32 400} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 300} !1 = !{!"image1d_t", !"image1d_array_t", !"image1d_buffer_t", !"image2d_t", !"image2d_array_t", !"image2d_array_depth_t", diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-1.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-1-v3.ll similarity index 80% rename from llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-1.ll rename to llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-1-v3.ll index fc5e6e2731253..8117037baaffc 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-1.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-1-v3.ll @@ -5,9 +5,9 @@ ; CHECK: --- ; CHECK: amdhsa.version: ; CHECK-NEXT: - 1 -; CHECK-NEXT: - 1 +; CHECK-NEXT: - 0 ; CHECK: ... !opencl.ocl.version = !{} !llvm.module.flags = !{!0} -!0 = !{i32 1, !"amdgpu_code_object_version", i32 400} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 300} diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-3.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-3-v3.ll similarity index 81% rename from llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-3.ll rename to llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-3-v3.ll index 1ec79c95bc2a3..ea744863a9b88 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-3.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-3-v3.ll @@ -5,10 +5,10 @@ ; CHECK: --- ; CHECK: amdhsa.version: ; CHECK-NEXT: - 1 -; CHECK-NEXT: - 1 +; CHECK-NEXT: - 0 ; CHECK: ... !opencl.ocl.version = !{!0} !llvm.module.flags = !{!1} !0 = !{i32 1} -!1 = !{i32 1, !"amdgpu_code_object_version", i32 400} +!1 = !{i32 1, !"amdgpu_code_object_version", i32 300} diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll similarity index 99% rename from llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll rename to llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll index e45c4d1786faf..d6f7a92af9dcb 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll @@ -158,11 +158,11 @@ define amdgpu_kernel void @num_spilled_vgprs() #1 { ; CHECK: amdhsa.version: ; CHECK-NEXT: - 1 -; CHECK-NEXT: - 1 +; CHECK-NEXT: - 0 attributes #0 = { "amdgpu-num-sgpr"="14" } attributes #1 = { "amdgpu-num-vgpr"="20" } attributes #2 = { "amdgpu-flat-work-group-size"="1,256" } !llvm.module.flags = !{!0} -!0 = !{i32 1, !"amdgpu_code_object_version", i32 400} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 300} diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll index 9e6c0ef86906d..9760e93eb48e6 100644 --- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll @@ -1,11 +1,36 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/CODE_OBJECT_VERSION/300/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 | FileCheck --check-prefix=GFX8V3 %s ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 | FileCheck --check-prefix=GFX8V4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 | FileCheck --check-prefix=GFX8V5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/300/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V3 %s ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V5 %s define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addrspace(3) %ptr.local) { +; GFX8V3-LABEL: addrspacecast: +; GFX8V3: ; %bb.0: +; GFX8V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x40 +; GFX8V3-NEXT: v_mov_b32_e32 v4, 1 +; GFX8V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V3-NEXT: s_cmp_lg_u32 s0, -1 +; GFX8V3-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8V3-NEXT: s_cselect_b32 s0, s0, 0 +; GFX8V3-NEXT: s_cmp_lg_u32 s1, -1 +; GFX8V3-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V3-NEXT: v_mov_b32_e32 v1, s3 +; GFX8V3-NEXT: s_cselect_b32 s0, s2, 0 +; GFX8V3-NEXT: s_cselect_b32 s1, s1, 0 +; GFX8V3-NEXT: v_mov_b32_e32 v2, s1 +; GFX8V3-NEXT: v_mov_b32_e32 v3, s0 +; GFX8V3-NEXT: flat_store_dword v[0:1], v4 +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: v_mov_b32_e32 v0, 2 +; GFX8V3-NEXT: flat_store_dword v[2:3], v0 +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: s_endpgm +; ; GFX8V4-LABEL: addrspacecast: ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 @@ -52,6 +77,30 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX8V5-NEXT: s_waitcnt vmcnt(0) ; GFX8V5-NEXT: s_endpgm ; +; GFX9V3-LABEL: addrspacecast: +; GFX9V3: ; %bb.0: +; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9V3-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX9V3-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX9V3-NEXT: v_mov_b32_e32 v4, 1 +; GFX9V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V3-NEXT: s_cmp_lg_u32 s0, -1 +; GFX9V3-NEXT: s_cselect_b32 s2, s3, 0 +; GFX9V3-NEXT: s_cselect_b32 s0, s0, 0 +; GFX9V3-NEXT: s_cmp_lg_u32 s1, -1 +; GFX9V3-NEXT: v_mov_b32_e32 v0, s0 +; GFX9V3-NEXT: v_mov_b32_e32 v1, s2 +; GFX9V3-NEXT: s_cselect_b32 s0, s5, 0 +; GFX9V3-NEXT: s_cselect_b32 s1, s1, 0 +; GFX9V3-NEXT: v_mov_b32_e32 v2, s1 +; GFX9V3-NEXT: v_mov_b32_e32 v3, s0 +; GFX9V3-NEXT: flat_store_dword v[0:1], v4 +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: v_mov_b32_e32 v0, 2 +; GFX9V3-NEXT: flat_store_dword v[2:3], v0 +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: s_endpgm +; ; GFX9V4-LABEL: addrspacecast: ; GFX9V4: ; %bb.0: ; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -107,6 +156,18 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr } define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { +; GFX8V3-LABEL: llvm_amdgcn_is_shared: +; GFX8V3: ; %bb.0: +; GFX8V3-NEXT: s_load_dword s0, s[4:5], 0x40 +; GFX8V3-NEXT: s_load_dword s1, s[6:7], 0x4 +; GFX8V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V3-NEXT: s_cmp_eq_u32 s1, s0 +; GFX8V3-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX8V3-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX8V3-NEXT: flat_store_dword v[0:1], v0 +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: s_endpgm +; ; GFX8V4-LABEL: llvm_amdgcn_is_shared: ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: s_load_dword s0, s[4:5], 0x40 @@ -131,6 +192,18 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ; GFX8V5-NEXT: s_waitcnt vmcnt(0) ; GFX8V5-NEXT: s_endpgm ; +; GFX9V3-LABEL: llvm_amdgcn_is_shared: +; GFX9V3: ; %bb.0: +; GFX9V3-NEXT: s_load_dword s2, s[4:5], 0x4 +; GFX9V3-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX9V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V3-NEXT: s_cmp_eq_u32 s2, s1 +; GFX9V3-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9V3-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9V3-NEXT: global_store_dword v[0:1], v0, off +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: s_endpgm +; ; GFX9V4-LABEL: llvm_amdgcn_is_shared: ; GFX9V4: ; %bb.0: ; GFX9V4-NEXT: s_load_dword s2, s[4:5], 0x4 @@ -161,6 +234,18 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { } define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { +; GFX8V3-LABEL: llvm_amdgcn_is_private: +; GFX8V3: ; %bb.0: +; GFX8V3-NEXT: s_load_dword s0, s[4:5], 0x44 +; GFX8V3-NEXT: s_load_dword s1, s[6:7], 0x4 +; GFX8V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V3-NEXT: s_cmp_eq_u32 s1, s0 +; GFX8V3-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX8V3-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX8V3-NEXT: flat_store_dword v[0:1], v0 +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: s_endpgm +; ; GFX8V4-LABEL: llvm_amdgcn_is_private: ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: s_load_dword s0, s[4:5], 0x44 @@ -185,6 +270,18 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { ; GFX8V5-NEXT: s_waitcnt vmcnt(0) ; GFX8V5-NEXT: s_endpgm ; +; GFX9V3-LABEL: llvm_amdgcn_is_private: +; GFX9V3: ; %bb.0: +; GFX9V3-NEXT: s_load_dword s2, s[4:5], 0x4 +; GFX9V3-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX9V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V3-NEXT: s_cmp_eq_u32 s2, s1 +; GFX9V3-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9V3-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9V3-NEXT: global_store_dword v[0:1], v0, off +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: s_endpgm +; ; GFX9V4-LABEL: llvm_amdgcn_is_private: ; GFX9V4: ; %bb.0: ; GFX9V4-NEXT: s_load_dword s2, s[4:5], 0x4 @@ -215,6 +312,11 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { } define amdgpu_kernel void @llvm_trap() { +; GFX8V3-LABEL: llvm_trap: +; GFX8V3: ; %bb.0: +; GFX8V3-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX8V3-NEXT: s_trap 2 +; ; GFX8V4-LABEL: llvm_trap: ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: s_mov_b64 s[0:1], s[4:5] @@ -226,6 +328,11 @@ define amdgpu_kernel void @llvm_trap() { ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_trap 2 ; +; GFX9V3-LABEL: llvm_trap: +; GFX9V3: ; %bb.0: +; GFX9V3-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX9V3-NEXT: s_trap 2 +; ; GFX9V4-LABEL: llvm_trap: ; GFX9V4: ; %bb.0: ; GFX9V4-NEXT: s_trap 2 @@ -238,6 +345,10 @@ define amdgpu_kernel void @llvm_trap() { } define amdgpu_kernel void @llvm_debugtrap() { +; GFX8V3-LABEL: llvm_debugtrap: +; GFX8V3: ; %bb.0: +; GFX8V3-NEXT: s_trap 3 +; ; GFX8V4-LABEL: llvm_debugtrap: ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: s_trap 3 @@ -246,6 +357,10 @@ define amdgpu_kernel void @llvm_debugtrap() { ; GFX8V5: ; %bb.0: ; GFX8V5-NEXT: s_trap 3 ; +; GFX9V3-LABEL: llvm_debugtrap: +; GFX9V3: ; %bb.0: +; GFX9V3-NEXT: s_trap 3 +; ; GFX9V4-LABEL: llvm_debugtrap: ; GFX9V4: ; %bb.0: ; GFX9V4-NEXT: s_trap 3 @@ -258,6 +373,31 @@ define amdgpu_kernel void @llvm_debugtrap() { } define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) { +; GFX8V3-LABEL: llvm_amdgcn_queue_ptr: +; GFX8V3: ; %bb.0: +; GFX8V3-NEXT: v_mov_b32_e32 v0, s6 +; GFX8V3-NEXT: v_mov_b32_e32 v1, s7 +; GFX8V3-NEXT: s_add_u32 s0, s8, 8 +; GFX8V3-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V3-NEXT: s_addc_u32 s1, s9, 0 +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V3-NEXT: v_mov_b32_e32 v1, s1 +; GFX8V3-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: v_mov_b32_e32 v0, s4 +; GFX8V3-NEXT: v_mov_b32_e32 v1, s5 +; GFX8V3-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V3-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX8V3-NEXT: v_mov_b32_e32 v2, s10 +; GFX8V3-NEXT: v_mov_b32_e32 v3, s11 +; GFX8V3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8V3-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V3-NEXT: v_mov_b32_e32 v1, s1 +; GFX8V3-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: s_endpgm +; ; GFX8V4-LABEL: llvm_amdgcn_queue_ptr: ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: v_mov_b32_e32 v0, s6 @@ -306,6 +446,23 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) { ; GFX8V5-NEXT: s_waitcnt vmcnt(0) ; GFX8V5-NEXT: s_endpgm ; +; GFX9V3-LABEL: llvm_amdgcn_queue_ptr: +; GFX9V3: ; %bb.0: +; GFX9V3-NEXT: v_mov_b32_e32 v2, 0 +; GFX9V3-NEXT: global_load_ubyte v0, v2, s[6:7] glc +; GFX9V3-NEXT: global_load_ubyte v0, v2, s[8:9] offset:8 glc +; GFX9V3-NEXT: global_load_ubyte v0, v2, s[4:5] glc +; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: v_mov_b32_e32 v0, s10 +; GFX9V3-NEXT: v_mov_b32_e32 v1, s11 +; GFX9V3-NEXT: ; kill: killed $sgpr6_sgpr7 +; GFX9V3-NEXT: ; kill: killed $sgpr4_sgpr5 +; GFX9V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V3-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: s_endpgm +; ; GFX9V4-LABEL: llvm_amdgcn_queue_ptr: ; GFX9V4: ; %bb.0: ; GFX9V4-NEXT: v_mov_b32_e32 v2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll index a04fe28dbffff..0353e7ee49ab9 100644 --- a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll +++ b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll @@ -1,8 +1,17 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefix=DOORBELL %s ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefix=DOORBELL %s +; RUN: sed 's/CODE_OBJECT_VERSION/300/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefix=HSA %s declare void @llvm.trap() #0 +; HSA: .amdhsa_kernel trap +; HSA-NEXT: .amdhsa_group_segment_fixed_size 0 +; HSA-NEXT: .amdhsa_private_segment_fixed_size 0 +; HSA-NEXT: .amdhsa_kernarg_size 8 +; HSA-NEXT: .amdhsa_user_sgpr_count 8 +; HSA-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 +; HSA: .end_amdhsa_kernel + ; DOORBELL: .amdhsa_kernel trap ; DOORBELL-NEXT: .amdhsa_group_segment_fixed_size 0 ; DOORBELL-NEXT: .amdhsa_private_segment_fixed_size 0 diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll index 9ed896c148e64..792ec2675247f 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll @@ -317,4 +317,4 @@ attributes #1 = { nounwind "stackrealign" } attributes #2 = { nounwind alignstack=128 } !llvm.module.flags = !{!0} -!0 = !{i32 1, !"amdgpu_code_object_version", i32 400} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 300} diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll index 54a15513cf0a5..03ea582698486 100644 --- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll +++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll @@ -1,54 +1,101 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc %s -o - -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck --check-prefix=NOHSA-TRAP-GFX900 %s -; RUN: llc %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs | FileCheck --check-prefix=HSA-TRAP-GFX803 %s -; RUN: llc %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck --check-prefix=HSA-TRAP-GFX900 %s -; RUN: llc %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-trap-handler -verify-machineinstrs | FileCheck --check-prefix=HSA-NOTRAP-GFX900 %s +; RUN: sed 's/CODE_OBJECT_VERSION/300/g' %s | llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck --check-prefix=NOHSA-TRAP-GFX900-V3 %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck --check-prefix=NOHSA-TRAP-GFX900-V4 %s +; RUN: sed 's/CODE_OBJECT_VERSION/300/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs | FileCheck --check-prefix=HSA-TRAP-GFX803-V3 %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs | FileCheck --check-prefix=HSA-TRAP-GFX803-V4 %s +; RUN: sed 's/CODE_OBJECT_VERSION/300/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck --check-prefix=HSA-TRAP-GFX900-V3 %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck --check-prefix=HSA-TRAP-GFX900-V4 %s +; RUN: sed 's/CODE_OBJECT_VERSION/300/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-trap-handler -verify-machineinstrs | FileCheck --check-prefix=HSA-NOTRAP-GFX900-V3 %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-trap-handler -verify-machineinstrs | FileCheck --check-prefix=HSA-NOTRAP-GFX900-V4 %s declare void @llvm.trap() #0 declare void @llvm.debugtrap() #1 define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) { -; NOHSA-TRAP-GFX900-LABEL: trap: -; NOHSA-TRAP-GFX900: ; %bb.0: -; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 -; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v1, 1 -; NOHSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0) -; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[0:1] -; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0) -; NOHSA-TRAP-GFX900-NEXT: s_endpgm -; -; HSA-TRAP-GFX803-LABEL: trap: -; HSA-TRAP-GFX803: ; %bb.0: -; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 -; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v2, 1 -; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[4:5] -; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0) -; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s2 -; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s3 -; HSA-TRAP-GFX803-NEXT: flat_store_dword v[0:1], v2 -; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0) -; HSA-TRAP-GFX803-NEXT: s_trap 2 -; -; HSA-TRAP-GFX900-LABEL: trap: -; HSA-TRAP-GFX900: ; %bb.0: -; HSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; HSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 -; HSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v1, 1 -; HSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0) -; HSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[0:1] -; HSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0) -; HSA-TRAP-GFX900-NEXT: s_trap 2 -; -; HSA-NOTRAP-GFX900-LABEL: trap: -; HSA-NOTRAP-GFX900: ; %bb.0: -; HSA-NOTRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 -; HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v1, 1 -; HSA-NOTRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0) -; HSA-NOTRAP-GFX900-NEXT: global_store_dword v0, v1, s[0:1] -; HSA-NOTRAP-GFX900-NEXT: s_waitcnt vmcnt(0) -; HSA-NOTRAP-GFX900-NEXT: s_endpgm +; NOHSA-TRAP-GFX900-V3-LABEL: trap: +; NOHSA-TRAP-GFX900-V3: ; %bb.0: +; NOHSA-TRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; NOHSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v0, 0 +; NOHSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v1, 1 +; NOHSA-TRAP-GFX900-V3-NEXT: s_waitcnt lgkmcnt(0) +; NOHSA-TRAP-GFX900-V3-NEXT: global_store_dword v0, v1, s[0:1] +; NOHSA-TRAP-GFX900-V3-NEXT: s_waitcnt vmcnt(0) +; NOHSA-TRAP-GFX900-V3-NEXT: s_endpgm +; +; NOHSA-TRAP-GFX900-V4-LABEL: trap: +; NOHSA-TRAP-GFX900-V4: ; %bb.0: +; NOHSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; NOHSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0 +; NOHSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 1 +; NOHSA-TRAP-GFX900-V4-NEXT: s_waitcnt lgkmcnt(0) +; NOHSA-TRAP-GFX900-V4-NEXT: global_store_dword v0, v1, s[0:1] +; NOHSA-TRAP-GFX900-V4-NEXT: s_waitcnt vmcnt(0) +; NOHSA-TRAP-GFX900-V4-NEXT: s_endpgm +; +; HSA-TRAP-GFX803-V3-LABEL: trap: +; HSA-TRAP-GFX803-V3: ; %bb.0: +; HSA-TRAP-GFX803-V3-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; HSA-TRAP-GFX803-V3-NEXT: v_mov_b32_e32 v2, 1 +; HSA-TRAP-GFX803-V3-NEXT: s_mov_b64 s[0:1], s[4:5] +; HSA-TRAP-GFX803-V3-NEXT: s_waitcnt lgkmcnt(0) +; HSA-TRAP-GFX803-V3-NEXT: v_mov_b32_e32 v0, s2 +; HSA-TRAP-GFX803-V3-NEXT: v_mov_b32_e32 v1, s3 +; HSA-TRAP-GFX803-V3-NEXT: flat_store_dword v[0:1], v2 +; HSA-TRAP-GFX803-V3-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX803-V3-NEXT: s_trap 2 +; +; HSA-TRAP-GFX803-V4-LABEL: trap: +; HSA-TRAP-GFX803-V4: ; %bb.0: +; HSA-TRAP-GFX803-V4-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; HSA-TRAP-GFX803-V4-NEXT: v_mov_b32_e32 v2, 1 +; HSA-TRAP-GFX803-V4-NEXT: s_mov_b64 s[0:1], s[4:5] +; HSA-TRAP-GFX803-V4-NEXT: s_waitcnt lgkmcnt(0) +; HSA-TRAP-GFX803-V4-NEXT: v_mov_b32_e32 v0, s2 +; HSA-TRAP-GFX803-V4-NEXT: v_mov_b32_e32 v1, s3 +; HSA-TRAP-GFX803-V4-NEXT: flat_store_dword v[0:1], v2 +; HSA-TRAP-GFX803-V4-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX803-V4-NEXT: s_trap 2 +; +; HSA-TRAP-GFX900-V3-LABEL: trap: +; HSA-TRAP-GFX900-V3: ; %bb.0: +; HSA-TRAP-GFX900-V3-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; HSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v0, 0 +; HSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v1, 1 +; HSA-TRAP-GFX900-V3-NEXT: s_mov_b64 s[0:1], s[4:5] +; HSA-TRAP-GFX900-V3-NEXT: s_waitcnt lgkmcnt(0) +; HSA-TRAP-GFX900-V3-NEXT: global_store_dword v0, v1, s[2:3] +; HSA-TRAP-GFX900-V3-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX900-V3-NEXT: s_trap 2 +; +; HSA-TRAP-GFX900-V4-LABEL: trap: +; HSA-TRAP-GFX900-V4: ; %bb.0: +; HSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; HSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0 +; HSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 1 +; HSA-TRAP-GFX900-V4-NEXT: s_waitcnt lgkmcnt(0) +; HSA-TRAP-GFX900-V4-NEXT: global_store_dword v0, v1, s[0:1] +; HSA-TRAP-GFX900-V4-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX900-V4-NEXT: s_trap 2 +; +; HSA-NOTRAP-GFX900-V3-LABEL: trap: +; HSA-NOTRAP-GFX900-V3: ; %bb.0: +; HSA-NOTRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; HSA-NOTRAP-GFX900-V3-NEXT: v_mov_b32_e32 v0, 0 +; HSA-NOTRAP-GFX900-V3-NEXT: v_mov_b32_e32 v1, 1 +; HSA-NOTRAP-GFX900-V3-NEXT: s_waitcnt lgkmcnt(0) +; HSA-NOTRAP-GFX900-V3-NEXT: global_store_dword v0, v1, s[0:1] +; HSA-NOTRAP-GFX900-V3-NEXT: s_waitcnt vmcnt(0) +; HSA-NOTRAP-GFX900-V3-NEXT: s_endpgm +; +; HSA-NOTRAP-GFX900-V4-LABEL: trap: +; HSA-NOTRAP-GFX900-V4: ; %bb.0: +; HSA-NOTRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; HSA-NOTRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0 +; HSA-NOTRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 1 +; HSA-NOTRAP-GFX900-V4-NEXT: s_waitcnt lgkmcnt(0) +; HSA-NOTRAP-GFX900-V4-NEXT: global_store_dword v0, v1, s[0:1] +; HSA-NOTRAP-GFX900-V4-NEXT: s_waitcnt vmcnt(0) +; HSA-NOTRAP-GFX900-V4-NEXT: s_endpgm store volatile i32 1, ptr addrspace(1) %arg0 call void @llvm.trap() unreachable @@ -57,77 +104,150 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) { } define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr { -; NOHSA-TRAP-GFX900-LABEL: non_entry_trap: -; NOHSA-TRAP-GFX900: ; %bb.0: ; %entry -; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 -; NOHSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0) -; NOHSA-TRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc -; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0) -; NOHSA-TRAP-GFX900-NEXT: v_cmp_eq_u32_e32 vcc, -1, v1 -; NOHSA-TRAP-GFX900-NEXT: s_cbranch_vccz .LBB1_2 -; NOHSA-TRAP-GFX900-NEXT: ; %bb.1: ; %ret -; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v1, 3 -; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[0:1] -; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0) -; NOHSA-TRAP-GFX900-NEXT: s_endpgm -; NOHSA-TRAP-GFX900-NEXT: .LBB1_2: ; %trap -; NOHSA-TRAP-GFX900-NEXT: s_endpgm -; -; HSA-TRAP-GFX803-LABEL: non_entry_trap: -; HSA-TRAP-GFX803: ; %bb.0: ; %entry -; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0) -; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s0 -; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s1 -; HSA-TRAP-GFX803-NEXT: flat_load_dword v0, v[0:1] glc -; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0) -; HSA-TRAP-GFX803-NEXT: v_cmp_eq_u32_e32 vcc, -1, v0 -; HSA-TRAP-GFX803-NEXT: s_cbranch_vccz .LBB1_2 -; HSA-TRAP-GFX803-NEXT: ; %bb.1: ; %ret -; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s0 -; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v2, 3 -; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s1 -; HSA-TRAP-GFX803-NEXT: flat_store_dword v[0:1], v2 -; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0) -; HSA-TRAP-GFX803-NEXT: s_endpgm -; HSA-TRAP-GFX803-NEXT: .LBB1_2: ; %trap -; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[4:5] -; HSA-TRAP-GFX803-NEXT: s_trap 2 -; -; HSA-TRAP-GFX900-LABEL: non_entry_trap: -; HSA-TRAP-GFX900: ; %bb.0: ; %entry -; HSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; HSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 -; HSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0) -; HSA-TRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc -; HSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0) -; HSA-TRAP-GFX900-NEXT: v_cmp_eq_u32_e32 vcc, -1, v1 -; HSA-TRAP-GFX900-NEXT: s_cbranch_vccz .LBB1_2 -; HSA-TRAP-GFX900-NEXT: ; %bb.1: ; %ret -; HSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v1, 3 -; HSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[0:1] -; HSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0) -; HSA-TRAP-GFX900-NEXT: s_endpgm -; HSA-TRAP-GFX900-NEXT: .LBB1_2: ; %trap -; HSA-TRAP-GFX900-NEXT: s_trap 2 -; -; HSA-NOTRAP-GFX900-LABEL: non_entry_trap: -; HSA-NOTRAP-GFX900: ; %bb.0: ; %entry -; HSA-NOTRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 -; HSA-NOTRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0) -; HSA-NOTRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc -; HSA-NOTRAP-GFX900-NEXT: s_waitcnt vmcnt(0) -; HSA-NOTRAP-GFX900-NEXT: v_cmp_eq_u32_e32 vcc, -1, v1 -; HSA-NOTRAP-GFX900-NEXT: s_cbranch_vccz .LBB1_2 -; HSA-NOTRAP-GFX900-NEXT: ; %bb.1: ; %ret -; HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v1, 3 -; HSA-NOTRAP-GFX900-NEXT: global_store_dword v0, v1, s[0:1] -; HSA-NOTRAP-GFX900-NEXT: s_waitcnt vmcnt(0) -; HSA-NOTRAP-GFX900-NEXT: s_endpgm -; HSA-NOTRAP-GFX900-NEXT: .LBB1_2: ; %trap -; HSA-NOTRAP-GFX900-NEXT: s_endpgm +; NOHSA-TRAP-GFX900-V3-LABEL: non_entry_trap: +; NOHSA-TRAP-GFX900-V3: ; %bb.0: ; %entry +; NOHSA-TRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; NOHSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v0, 0 +; NOHSA-TRAP-GFX900-V3-NEXT: s_waitcnt lgkmcnt(0) +; NOHSA-TRAP-GFX900-V3-NEXT: global_load_dword v1, v0, s[0:1] glc +; NOHSA-TRAP-GFX900-V3-NEXT: s_waitcnt vmcnt(0) +; NOHSA-TRAP-GFX900-V3-NEXT: v_cmp_eq_u32_e32 vcc, -1, v1 +; NOHSA-TRAP-GFX900-V3-NEXT: s_cbranch_vccz .LBB1_2 +; NOHSA-TRAP-GFX900-V3-NEXT: ; %bb.1: ; %ret +; NOHSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v1, 3 +; NOHSA-TRAP-GFX900-V3-NEXT: global_store_dword v0, v1, s[0:1] +; NOHSA-TRAP-GFX900-V3-NEXT: s_waitcnt vmcnt(0) +; NOHSA-TRAP-GFX900-V3-NEXT: s_endpgm +; NOHSA-TRAP-GFX900-V3-NEXT: .LBB1_2: ; %trap +; NOHSA-TRAP-GFX900-V3-NEXT: s_endpgm +; +; NOHSA-TRAP-GFX900-V4-LABEL: non_entry_trap: +; NOHSA-TRAP-GFX900-V4: ; %bb.0: ; %entry +; NOHSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; NOHSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0 +; NOHSA-TRAP-GFX900-V4-NEXT: s_waitcnt lgkmcnt(0) +; NOHSA-TRAP-GFX900-V4-NEXT: global_load_dword v1, v0, s[0:1] glc +; NOHSA-TRAP-GFX900-V4-NEXT: s_waitcnt vmcnt(0) +; NOHSA-TRAP-GFX900-V4-NEXT: v_cmp_eq_u32_e32 vcc, -1, v1 +; NOHSA-TRAP-GFX900-V4-NEXT: s_cbranch_vccz .LBB1_2 +; NOHSA-TRAP-GFX900-V4-NEXT: ; %bb.1: ; %ret +; NOHSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 3 +; NOHSA-TRAP-GFX900-V4-NEXT: global_store_dword v0, v1, s[0:1] +; NOHSA-TRAP-GFX900-V4-NEXT: s_waitcnt vmcnt(0) +; NOHSA-TRAP-GFX900-V4-NEXT: s_endpgm +; NOHSA-TRAP-GFX900-V4-NEXT: .LBB1_2: ; %trap +; NOHSA-TRAP-GFX900-V4-NEXT: s_endpgm +; +; HSA-TRAP-GFX803-V3-LABEL: non_entry_trap: +; HSA-TRAP-GFX803-V3: ; %bb.0: ; %entry +; HSA-TRAP-GFX803-V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; HSA-TRAP-GFX803-V3-NEXT: s_waitcnt lgkmcnt(0) +; HSA-TRAP-GFX803-V3-NEXT: v_mov_b32_e32 v0, s0 +; HSA-TRAP-GFX803-V3-NEXT: v_mov_b32_e32 v1, s1 +; HSA-TRAP-GFX803-V3-NEXT: flat_load_dword v0, v[0:1] glc +; HSA-TRAP-GFX803-V3-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX803-V3-NEXT: v_cmp_eq_u32_e32 vcc, -1, v0 +; HSA-TRAP-GFX803-V3-NEXT: s_cbranch_vccz .LBB1_2 +; HSA-TRAP-GFX803-V3-NEXT: ; %bb.1: ; %ret +; HSA-TRAP-GFX803-V3-NEXT: v_mov_b32_e32 v0, s0 +; HSA-TRAP-GFX803-V3-NEXT: v_mov_b32_e32 v2, 3 +; HSA-TRAP-GFX803-V3-NEXT: v_mov_b32_e32 v1, s1 +; HSA-TRAP-GFX803-V3-NEXT: flat_store_dword v[0:1], v2 +; HSA-TRAP-GFX803-V3-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX803-V3-NEXT: s_endpgm +; HSA-TRAP-GFX803-V3-NEXT: .LBB1_2: ; %trap +; HSA-TRAP-GFX803-V3-NEXT: s_mov_b64 s[0:1], s[4:5] +; HSA-TRAP-GFX803-V3-NEXT: s_trap 2 +; +; HSA-TRAP-GFX803-V4-LABEL: non_entry_trap: +; HSA-TRAP-GFX803-V4: ; %bb.0: ; %entry +; HSA-TRAP-GFX803-V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; HSA-TRAP-GFX803-V4-NEXT: s_waitcnt lgkmcnt(0) +; HSA-TRAP-GFX803-V4-NEXT: v_mov_b32_e32 v0, s0 +; HSA-TRAP-GFX803-V4-NEXT: v_mov_b32_e32 v1, s1 +; HSA-TRAP-GFX803-V4-NEXT: flat_load_dword v0, v[0:1] glc +; HSA-TRAP-GFX803-V4-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX803-V4-NEXT: v_cmp_eq_u32_e32 vcc, -1, v0 +; HSA-TRAP-GFX803-V4-NEXT: s_cbranch_vccz .LBB1_2 +; HSA-TRAP-GFX803-V4-NEXT: ; %bb.1: ; %ret +; HSA-TRAP-GFX803-V4-NEXT: v_mov_b32_e32 v0, s0 +; HSA-TRAP-GFX803-V4-NEXT: v_mov_b32_e32 v2, 3 +; HSA-TRAP-GFX803-V4-NEXT: v_mov_b32_e32 v1, s1 +; HSA-TRAP-GFX803-V4-NEXT: flat_store_dword v[0:1], v2 +; HSA-TRAP-GFX803-V4-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX803-V4-NEXT: s_endpgm +; HSA-TRAP-GFX803-V4-NEXT: .LBB1_2: ; %trap +; HSA-TRAP-GFX803-V4-NEXT: s_mov_b64 s[0:1], s[4:5] +; HSA-TRAP-GFX803-V4-NEXT: s_trap 2 +; +; HSA-TRAP-GFX900-V3-LABEL: non_entry_trap: +; HSA-TRAP-GFX900-V3: ; %bb.0: ; %entry +; HSA-TRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; HSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v0, 0 +; HSA-TRAP-GFX900-V3-NEXT: s_waitcnt lgkmcnt(0) +; HSA-TRAP-GFX900-V3-NEXT: global_load_dword v1, v0, s[0:1] glc +; HSA-TRAP-GFX900-V3-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX900-V3-NEXT: v_cmp_eq_u32_e32 vcc, -1, v1 +; HSA-TRAP-GFX900-V3-NEXT: s_cbranch_vccz .LBB1_2 +; HSA-TRAP-GFX900-V3-NEXT: ; %bb.1: ; %ret +; HSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v1, 3 +; HSA-TRAP-GFX900-V3-NEXT: global_store_dword v0, v1, s[0:1] +; HSA-TRAP-GFX900-V3-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX900-V3-NEXT: s_endpgm +; HSA-TRAP-GFX900-V3-NEXT: .LBB1_2: ; %trap +; HSA-TRAP-GFX900-V3-NEXT: s_mov_b64 s[0:1], s[4:5] +; HSA-TRAP-GFX900-V3-NEXT: s_trap 2 +; +; HSA-TRAP-GFX900-V4-LABEL: non_entry_trap: +; HSA-TRAP-GFX900-V4: ; %bb.0: ; %entry +; HSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; HSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0 +; HSA-TRAP-GFX900-V4-NEXT: s_waitcnt lgkmcnt(0) +; HSA-TRAP-GFX900-V4-NEXT: global_load_dword v1, v0, s[0:1] glc +; HSA-TRAP-GFX900-V4-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX900-V4-NEXT: v_cmp_eq_u32_e32 vcc, -1, v1 +; HSA-TRAP-GFX900-V4-NEXT: s_cbranch_vccz .LBB1_2 +; HSA-TRAP-GFX900-V4-NEXT: ; %bb.1: ; %ret +; HSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 3 +; HSA-TRAP-GFX900-V4-NEXT: global_store_dword v0, v1, s[0:1] +; HSA-TRAP-GFX900-V4-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX900-V4-NEXT: s_endpgm +; HSA-TRAP-GFX900-V4-NEXT: .LBB1_2: ; %trap +; HSA-TRAP-GFX900-V4-NEXT: s_trap 2 +; +; HSA-NOTRAP-GFX900-V3-LABEL: non_entry_trap: +; HSA-NOTRAP-GFX900-V3: ; %bb.0: ; %entry +; HSA-NOTRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; HSA-NOTRAP-GFX900-V3-NEXT: v_mov_b32_e32 v0, 0 +; HSA-NOTRAP-GFX900-V3-NEXT: s_waitcnt lgkmcnt(0) +; HSA-NOTRAP-GFX900-V3-NEXT: global_load_dword v1, v0, s[0:1] glc +; HSA-NOTRAP-GFX900-V3-NEXT: s_waitcnt vmcnt(0) +; HSA-NOTRAP-GFX900-V3-NEXT: v_cmp_eq_u32_e32 vcc, -1, v1 +; HSA-NOTRAP-GFX900-V3-NEXT: s_cbranch_vccz .LBB1_2 +; HSA-NOTRAP-GFX900-V3-NEXT: ; %bb.1: ; %ret +; HSA-NOTRAP-GFX900-V3-NEXT: v_mov_b32_e32 v1, 3 +; HSA-NOTRAP-GFX900-V3-NEXT: global_store_dword v0, v1, s[0:1] +; HSA-NOTRAP-GFX900-V3-NEXT: s_waitcnt vmcnt(0) +; HSA-NOTRAP-GFX900-V3-NEXT: s_endpgm +; HSA-NOTRAP-GFX900-V3-NEXT: .LBB1_2: ; %trap +; HSA-NOTRAP-GFX900-V3-NEXT: s_endpgm +; +; HSA-NOTRAP-GFX900-V4-LABEL: non_entry_trap: +; HSA-NOTRAP-GFX900-V4: ; %bb.0: ; %entry +; HSA-NOTRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; HSA-NOTRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0 +; HSA-NOTRAP-GFX900-V4-NEXT: s_waitcnt lgkmcnt(0) +; HSA-NOTRAP-GFX900-V4-NEXT: global_load_dword v1, v0, s[0:1] glc +; HSA-NOTRAP-GFX900-V4-NEXT: s_waitcnt vmcnt(0) +; HSA-NOTRAP-GFX900-V4-NEXT: v_cmp_eq_u32_e32 vcc, -1, v1 +; HSA-NOTRAP-GFX900-V4-NEXT: s_cbranch_vccz .LBB1_2 +; HSA-NOTRAP-GFX900-V4-NEXT: ; %bb.1: ; %ret +; HSA-NOTRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 3 +; HSA-NOTRAP-GFX900-V4-NEXT: global_store_dword v0, v1, s[0:1] +; HSA-NOTRAP-GFX900-V4-NEXT: s_waitcnt vmcnt(0) +; HSA-NOTRAP-GFX900-V4-NEXT: s_endpgm +; HSA-NOTRAP-GFX900-V4-NEXT: .LBB1_2: ; %trap +; HSA-NOTRAP-GFX900-V4-NEXT: s_endpgm entry: %tmp29 = load volatile i32, ptr addrspace(1) %arg0 %cmp = icmp eq i32 %tmp29, -1 @@ -143,60 +263,115 @@ ret: } define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) { -; NOHSA-TRAP-GFX900-LABEL: debugtrap: -; NOHSA-TRAP-GFX900: ; %bb.0: -; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 -; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v1, 1 -; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v2, 2 -; NOHSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0) -; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[0:1] -; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0) -; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v2, s[0:1] -; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0) -; NOHSA-TRAP-GFX900-NEXT: s_endpgm -; -; HSA-TRAP-GFX803-LABEL: debugtrap: -; HSA-TRAP-GFX803: ; %bb.0: -; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v2, 1 -; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v3, 2 -; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0) -; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s0 -; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s1 -; HSA-TRAP-GFX803-NEXT: flat_store_dword v[0:1], v2 -; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0) -; HSA-TRAP-GFX803-NEXT: s_trap 3 -; HSA-TRAP-GFX803-NEXT: flat_store_dword v[0:1], v3 -; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0) -; HSA-TRAP-GFX803-NEXT: s_endpgm -; -; HSA-TRAP-GFX900-LABEL: debugtrap: -; HSA-TRAP-GFX900: ; %bb.0: -; HSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; HSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 -; HSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v1, 1 -; HSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v2, 2 -; HSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0) -; HSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[0:1] -; HSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0) -; HSA-TRAP-GFX900-NEXT: s_trap 3 -; HSA-TRAP-GFX900-NEXT: global_store_dword v0, v2, s[0:1] -; HSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0) -; HSA-TRAP-GFX900-NEXT: s_endpgm -; -; HSA-NOTRAP-GFX900-LABEL: debugtrap: -; HSA-NOTRAP-GFX900: ; %bb.0: -; HSA-NOTRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 -; HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v1, 1 -; HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v2, 2 -; HSA-NOTRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0) -; HSA-NOTRAP-GFX900-NEXT: global_store_dword v0, v1, s[0:1] -; HSA-NOTRAP-GFX900-NEXT: s_waitcnt vmcnt(0) -; HSA-NOTRAP-GFX900-NEXT: global_store_dword v0, v2, s[0:1] -; HSA-NOTRAP-GFX900-NEXT: s_waitcnt vmcnt(0) -; HSA-NOTRAP-GFX900-NEXT: s_endpgm +; NOHSA-TRAP-GFX900-V3-LABEL: debugtrap: +; NOHSA-TRAP-GFX900-V3: ; %bb.0: +; NOHSA-TRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; NOHSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v0, 0 +; NOHSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v1, 1 +; NOHSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v2, 2 +; NOHSA-TRAP-GFX900-V3-NEXT: s_waitcnt lgkmcnt(0) +; NOHSA-TRAP-GFX900-V3-NEXT: global_store_dword v0, v1, s[0:1] +; NOHSA-TRAP-GFX900-V3-NEXT: s_waitcnt vmcnt(0) +; NOHSA-TRAP-GFX900-V3-NEXT: global_store_dword v0, v2, s[0:1] +; NOHSA-TRAP-GFX900-V3-NEXT: s_waitcnt vmcnt(0) +; NOHSA-TRAP-GFX900-V3-NEXT: s_endpgm +; +; NOHSA-TRAP-GFX900-V4-LABEL: debugtrap: +; NOHSA-TRAP-GFX900-V4: ; %bb.0: +; NOHSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; NOHSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0 +; NOHSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 1 +; NOHSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v2, 2 +; NOHSA-TRAP-GFX900-V4-NEXT: s_waitcnt lgkmcnt(0) +; NOHSA-TRAP-GFX900-V4-NEXT: global_store_dword v0, v1, s[0:1] +; NOHSA-TRAP-GFX900-V4-NEXT: s_waitcnt vmcnt(0) +; NOHSA-TRAP-GFX900-V4-NEXT: global_store_dword v0, v2, s[0:1] +; NOHSA-TRAP-GFX900-V4-NEXT: s_waitcnt vmcnt(0) +; NOHSA-TRAP-GFX900-V4-NEXT: s_endpgm +; +; HSA-TRAP-GFX803-V3-LABEL: debugtrap: +; HSA-TRAP-GFX803-V3: ; %bb.0: +; HSA-TRAP-GFX803-V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; HSA-TRAP-GFX803-V3-NEXT: v_mov_b32_e32 v2, 1 +; HSA-TRAP-GFX803-V3-NEXT: v_mov_b32_e32 v3, 2 +; HSA-TRAP-GFX803-V3-NEXT: s_waitcnt lgkmcnt(0) +; HSA-TRAP-GFX803-V3-NEXT: v_mov_b32_e32 v0, s0 +; HSA-TRAP-GFX803-V3-NEXT: v_mov_b32_e32 v1, s1 +; HSA-TRAP-GFX803-V3-NEXT: flat_store_dword v[0:1], v2 +; HSA-TRAP-GFX803-V3-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX803-V3-NEXT: s_trap 3 +; HSA-TRAP-GFX803-V3-NEXT: flat_store_dword v[0:1], v3 +; HSA-TRAP-GFX803-V3-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX803-V3-NEXT: s_endpgm +; +; HSA-TRAP-GFX803-V4-LABEL: debugtrap: +; HSA-TRAP-GFX803-V4: ; %bb.0: +; HSA-TRAP-GFX803-V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; HSA-TRAP-GFX803-V4-NEXT: v_mov_b32_e32 v2, 1 +; HSA-TRAP-GFX803-V4-NEXT: v_mov_b32_e32 v3, 2 +; HSA-TRAP-GFX803-V4-NEXT: s_waitcnt lgkmcnt(0) +; HSA-TRAP-GFX803-V4-NEXT: v_mov_b32_e32 v0, s0 +; HSA-TRAP-GFX803-V4-NEXT: v_mov_b32_e32 v1, s1 +; HSA-TRAP-GFX803-V4-NEXT: flat_store_dword v[0:1], v2 +; HSA-TRAP-GFX803-V4-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX803-V4-NEXT: s_trap 3 +; HSA-TRAP-GFX803-V4-NEXT: flat_store_dword v[0:1], v3 +; HSA-TRAP-GFX803-V4-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX803-V4-NEXT: s_endpgm +; +; HSA-TRAP-GFX900-V3-LABEL: debugtrap: +; HSA-TRAP-GFX900-V3: ; %bb.0: +; HSA-TRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; HSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v0, 0 +; HSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v1, 1 +; HSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v2, 2 +; HSA-TRAP-GFX900-V3-NEXT: s_waitcnt lgkmcnt(0) +; HSA-TRAP-GFX900-V3-NEXT: global_store_dword v0, v1, s[0:1] +; HSA-TRAP-GFX900-V3-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX900-V3-NEXT: s_trap 3 +; HSA-TRAP-GFX900-V3-NEXT: global_store_dword v0, v2, s[0:1] +; HSA-TRAP-GFX900-V3-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX900-V3-NEXT: s_endpgm +; +; HSA-TRAP-GFX900-V4-LABEL: debugtrap: +; HSA-TRAP-GFX900-V4: ; %bb.0: +; HSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; HSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0 +; HSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 1 +; HSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v2, 2 +; HSA-TRAP-GFX900-V4-NEXT: s_waitcnt lgkmcnt(0) +; HSA-TRAP-GFX900-V4-NEXT: global_store_dword v0, v1, s[0:1] +; HSA-TRAP-GFX900-V4-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX900-V4-NEXT: s_trap 3 +; HSA-TRAP-GFX900-V4-NEXT: global_store_dword v0, v2, s[0:1] +; HSA-TRAP-GFX900-V4-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX900-V4-NEXT: s_endpgm +; +; HSA-NOTRAP-GFX900-V3-LABEL: debugtrap: +; HSA-NOTRAP-GFX900-V3: ; %bb.0: +; HSA-NOTRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; HSA-NOTRAP-GFX900-V3-NEXT: v_mov_b32_e32 v0, 0 +; HSA-NOTRAP-GFX900-V3-NEXT: v_mov_b32_e32 v1, 1 +; HSA-NOTRAP-GFX900-V3-NEXT: v_mov_b32_e32 v2, 2 +; HSA-NOTRAP-GFX900-V3-NEXT: s_waitcnt lgkmcnt(0) +; HSA-NOTRAP-GFX900-V3-NEXT: global_store_dword v0, v1, s[0:1] +; HSA-NOTRAP-GFX900-V3-NEXT: s_waitcnt vmcnt(0) +; HSA-NOTRAP-GFX900-V3-NEXT: global_store_dword v0, v2, s[0:1] +; HSA-NOTRAP-GFX900-V3-NEXT: s_waitcnt vmcnt(0) +; HSA-NOTRAP-GFX900-V3-NEXT: s_endpgm +; +; HSA-NOTRAP-GFX900-V4-LABEL: debugtrap: +; HSA-NOTRAP-GFX900-V4: ; %bb.0: +; HSA-NOTRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; HSA-NOTRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0 +; HSA-NOTRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 1 +; HSA-NOTRAP-GFX900-V4-NEXT: v_mov_b32_e32 v2, 2 +; HSA-NOTRAP-GFX900-V4-NEXT: s_waitcnt lgkmcnt(0) +; HSA-NOTRAP-GFX900-V4-NEXT: global_store_dword v0, v1, s[0:1] +; HSA-NOTRAP-GFX900-V4-NEXT: s_waitcnt vmcnt(0) +; HSA-NOTRAP-GFX900-V4-NEXT: global_store_dword v0, v2, s[0:1] +; HSA-NOTRAP-GFX900-V4-NEXT: s_waitcnt vmcnt(0) +; HSA-NOTRAP-GFX900-V4-NEXT: s_endpgm store volatile i32 1, ptr addrspace(1) %arg0 call void @llvm.debugtrap() store volatile i32 2, ptr addrspace(1) %arg0 @@ -207,4 +382,4 @@ attributes #0 = { nounwind noreturn } attributes #1 = { nounwind } !llvm.module.flags = !{!0} -!0 = !{i32 1, !"amdgpu_code_object_version", i32 400} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 CODE_OBJECT_VERSION} diff --git a/llvm/test/MC/AMDGPU/hsa-diag-v4.s b/llvm/test/MC/AMDGPU/hsa-diag-v3.s similarity index 94% rename from llvm/test/MC/AMDGPU/hsa-diag-v4.s rename to llvm/test/MC/AMDGPU/hsa-diag-v3.s index f7a554aedb746..369ac905ad2b2 100644 --- a/llvm/test/MC/AMDGPU/hsa-diag-v4.s +++ b/llvm/test/MC/AMDGPU/hsa-diag-v3.s @@ -1,18 +1,18 @@ -// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx810 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,GFX8,PREGFX10,AMDHSA -// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,GFX10PLUS,GFX10,AMDHSA -// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx1100 -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,GFX10PLUS,GFX11,AMDHSA -// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd- -mcpu=gfx810 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,NONAMDHSA -// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GFX90A,PREGFX10,AMDHSA,ALL +// RUN: not llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd-amdhsa -mcpu=gfx810 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,GFX8,PREGFX10,AMDHSA +// RUN: not llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,GFX10PLUS,GFX10,AMDHSA +// RUN: not llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd-amdhsa -mcpu=gfx1100 -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,GFX10PLUS,GFX11,AMDHSA +// RUN: not llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd- -mcpu=gfx810 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,NONAMDHSA +// RUN: not llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GFX90A,PREGFX10,AMDHSA,ALL .text // GCN-LABEL: warning: test_target // GFX8-NOT: error: -// GFX10: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810:xnack+ does not match the specified target id amdgcn-amd-amdhsa--gfx1010:xnack+ -// GFX11: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810:xnack+ does not match the specified target id amdgcn-amd-amdhsa--gfx1100 -// NONAMDHSA: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810:xnack+ does not match the specified target id amdgcn-amd-unknown--gfx810 +// GFX10: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810+xnack does not match the specified target id amdgcn-amd-amdhsa--gfx1010+xnack +// GFX11: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810+xnack does not match the specified target id amdgcn-amd-amdhsa--gfx1100 +// NONAMDHSA: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810+xnack does not match the specified target id amdgcn-amd-unknown--gfx810 .warning "test_target" -.amdgcn_target "amdgcn-amd-amdhsa--gfx810:xnack+" +.amdgcn_target "amdgcn-amd-amdhsa--gfx810+xnack" // GCN-LABEL: warning: test_amdhsa_kernel_no_name // GCN: error: unknown directive diff --git a/llvm/test/MC/AMDGPU/hsa-gfx10-v3.s b/llvm/test/MC/AMDGPU/hsa-gfx10-v3.s new file mode 100644 index 0000000000000..ba60000837cdc --- /dev/null +++ b/llvm/test/MC/AMDGPU/hsa-gfx10-v3.s @@ -0,0 +1,226 @@ +// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1010 --amdhsa-code-object-version=3 -mattr=+xnack < %s | FileCheck --check-prefix=ASM %s +// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1010 --amdhsa-code-object-version=3 -mattr=+xnack -filetype=obj < %s > %t +// RUN: llvm-readelf -S -r -s %t | FileCheck --check-prefix=READOBJ %s +// RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s + +// READOBJ: Section Headers +// READOBJ: .text PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9]+}} AX {{[0-9]+}} {{[0-9]+}} 256 +// READOBJ: .rodata PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}} 0000c0 {{[0-9]+}} A {{[0-9]+}} {{[0-9]+}} 64 + +// READOBJ: Relocation section '.rela.rodata' at offset +// READOBJ: 0000000000000010 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 10 +// READOBJ: 0000000000000050 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 110 +// READOBJ: 0000000000000090 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 210 + +// READOBJ: Symbol table '.symtab' contains {{[0-9]+}} entries: +// READOBJ: 0000000000000000 0 FUNC LOCAL PROTECTED 2 minimal +// READOBJ-NEXT: 0000000000000100 0 FUNC LOCAL PROTECTED 2 complete +// READOBJ-NEXT: 0000000000000200 0 FUNC LOCAL PROTECTED 2 special_sgpr +// READOBJ-NEXT: 0000000000000000 64 OBJECT LOCAL DEFAULT 3 minimal.kd +// READOBJ-NEXT: 0000000000000040 64 OBJECT LOCAL DEFAULT 3 complete.kd +// READOBJ-NEXT: 0000000000000080 64 OBJECT LOCAL DEFAULT 3 special_sgpr.kd + +// OBJDUMP: Contents of section .rodata +// Note, relocation for KERNEL_CODE_ENTRY_BYTE_OFFSET is not resolved here. +// minimal +// OBJDUMP-NEXT: 0000 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0030 0000ac60 80000000 00000000 00000000 +// complete +// OBJDUMP-NEXT: 0040 01000000 01000000 08000000 00000000 +// OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0070 015001e4 1f0f007f 7f040000 00000000 +// special_sgpr +// OBJDUMP-NEXT: 0080 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0090 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 00a0 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 00b0 00000060 80000000 00000000 00000000 + +.text +// ASM: .text + +.amdgcn_target "amdgcn-amd-amdhsa--gfx1010+xnack" +// ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx1010+xnack" + +.p2align 8 +.type minimal,@function +minimal: + s_endpgm + +.p2align 8 +.type complete,@function +complete: + s_endpgm + +.p2align 8 +.type special_sgpr,@function +special_sgpr: + s_endpgm + +.rodata +// ASM: .rodata + +// Test that only specifying required directives is allowed, and that defaulted +// values are omitted. +.p2align 6 +.amdhsa_kernel minimal + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 + .amdhsa_shared_vgpr_count 0 +.end_amdhsa_kernel + +// ASM: .amdhsa_kernel minimal +// ASM: .amdhsa_next_free_vgpr 0 +// ASM-NEXT: .amdhsa_next_free_sgpr 0 +// ASM: .amdhsa_shared_vgpr_count 0 +// ASM: .end_amdhsa_kernel + +// Test that we can specify all available directives with non-default values. +.p2align 6 +.amdhsa_kernel complete + .amdhsa_group_segment_fixed_size 1 + .amdhsa_private_segment_fixed_size 1 + .amdhsa_kernarg_size 8 + .amdhsa_user_sgpr_private_segment_buffer 1 + .amdhsa_user_sgpr_dispatch_ptr 1 + .amdhsa_user_sgpr_queue_ptr 1 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_user_sgpr_dispatch_id 1 + .amdhsa_user_sgpr_flat_scratch_init 1 + .amdhsa_user_sgpr_private_segment_size 1 + .amdhsa_wavefront_size32 1 + .amdhsa_system_sgpr_private_segment_wavefront_offset 1 + .amdhsa_system_sgpr_workgroup_id_x 0 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_sgpr_workgroup_id_z 1 + .amdhsa_system_sgpr_workgroup_info 1 + .amdhsa_system_vgpr_workitem_id 1 + .amdhsa_next_free_vgpr 9 + .amdhsa_next_free_sgpr 27 + .amdhsa_reserve_vcc 0 + .amdhsa_reserve_flat_scratch 0 + .amdhsa_reserve_xnack_mask 1 + .amdhsa_float_round_mode_32 1 + .amdhsa_float_round_mode_16_64 1 + .amdhsa_float_denorm_mode_32 1 + .amdhsa_float_denorm_mode_16_64 0 + .amdhsa_dx10_clamp 0 + .amdhsa_ieee_mode 0 + .amdhsa_fp16_overflow 1 + .amdhsa_workgroup_processor_mode 1 + .amdhsa_memory_ordered 1 + .amdhsa_forward_progress 1 + .amdhsa_exception_fp_ieee_invalid_op 1 + .amdhsa_exception_fp_denorm_src 1 + .amdhsa_exception_fp_ieee_div_zero 1 + .amdhsa_exception_fp_ieee_overflow 1 + .amdhsa_exception_fp_ieee_underflow 1 + .amdhsa_exception_fp_ieee_inexact 1 + .amdhsa_exception_int_div_zero 1 +.end_amdhsa_kernel + +// ASM: .amdhsa_kernel complete +// ASM-NEXT: .amdhsa_group_segment_fixed_size 1 +// ASM-NEXT: .amdhsa_private_segment_fixed_size 1 +// ASM-NEXT: .amdhsa_kernarg_size 8 +// ASM-NEXT: .amdhsa_user_sgpr_count 15 +// ASM-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1 +// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 1 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 1 +// ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1 +// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 1 +// ASM-NEXT: .amdhsa_wavefront_size32 1 +// ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 0 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info 1 +// ASM-NEXT: .amdhsa_system_vgpr_workitem_id 1 +// ASM-NEXT: .amdhsa_next_free_vgpr 9 +// ASM-NEXT: .amdhsa_next_free_sgpr 27 +// ASM-NEXT: .amdhsa_reserve_vcc 0 +// ASM-NEXT: .amdhsa_reserve_flat_scratch 0 +// ASM-NEXT: .amdhsa_reserve_xnack_mask 1 +// ASM-NEXT: .amdhsa_float_round_mode_32 1 +// ASM-NEXT: .amdhsa_float_round_mode_16_64 1 +// ASM-NEXT: .amdhsa_float_denorm_mode_32 1 +// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 0 +// ASM-NEXT: .amdhsa_dx10_clamp 0 +// ASM-NEXT: .amdhsa_ieee_mode 0 +// ASM-NEXT: .amdhsa_fp16_overflow 1 +// ASM-NEXT: .amdhsa_workgroup_processor_mode 1 +// ASM-NEXT: .amdhsa_memory_ordered 1 +// ASM-NEXT: .amdhsa_forward_progress 1 +// ASM-NEXT: .amdhsa_shared_vgpr_count 0 +// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 1 +// ASM-NEXT: .amdhsa_exception_fp_denorm_src 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact 1 +// ASM-NEXT: .amdhsa_exception_int_div_zero 1 +// ASM-NEXT: .end_amdhsa_kernel + +// Test that we are including special SGPR usage in the granulated count. +.p2align 6 +.amdhsa_kernel special_sgpr + // Same next_free_sgpr as "complete", but... + .amdhsa_next_free_sgpr 27 + // ...on GFX10+ this should require an additional 6 SGPRs, pushing us from + // 3 granules to 4 + .amdhsa_reserve_flat_scratch 1 + + .amdhsa_reserve_vcc 0 + .amdhsa_reserve_xnack_mask 1 + + .amdhsa_float_denorm_mode_16_64 0 + .amdhsa_dx10_clamp 0 + .amdhsa_ieee_mode 0 + .amdhsa_next_free_vgpr 0 +.end_amdhsa_kernel + +// ASM: .amdhsa_kernel special_sgpr +// ASM: .amdhsa_next_free_vgpr 0 +// ASM-NEXT: .amdhsa_next_free_sgpr 27 +// ASM-NEXT: .amdhsa_reserve_vcc 0 +// ASM-NEXT: .amdhsa_reserve_xnack_mask 1 +// ASM: .amdhsa_float_denorm_mode_16_64 0 +// ASM-NEXT: .amdhsa_dx10_clamp 0 +// ASM-NEXT: .amdhsa_ieee_mode 0 +// ASM: .end_amdhsa_kernel + +.section .foo + +.byte .amdgcn.gfx_generation_number +// ASM: .byte 10 + +.byte .amdgcn.next_free_vgpr +// ASM: .byte 0 +.byte .amdgcn.next_free_sgpr +// ASM: .byte 0 + +v_mov_b32_e32 v7, s10 + +.byte .amdgcn.next_free_vgpr +// ASM: .byte 8 +.byte .amdgcn.next_free_sgpr +// ASM: .byte 11 + +.set .amdgcn.next_free_vgpr, 0 +.set .amdgcn.next_free_sgpr, 0 + +.byte .amdgcn.next_free_vgpr +// ASM: .byte 0 +.byte .amdgcn.next_free_sgpr +// ASM: .byte 0 + +v_mov_b32_e32 v16, s3 + +.byte .amdgcn.next_free_vgpr +// ASM: .byte 17 +.byte .amdgcn.next_free_sgpr +// ASM: .byte 4 diff --git a/llvm/test/MC/AMDGPU/hsa-gfx11-v3.s b/llvm/test/MC/AMDGPU/hsa-gfx11-v3.s new file mode 100644 index 0000000000000..7f885b457aa63 --- /dev/null +++ b/llvm/test/MC/AMDGPU/hsa-gfx11-v3.s @@ -0,0 +1,213 @@ +// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1100 --amdhsa-code-object-version=3 < %s | FileCheck --check-prefix=ASM %s +// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1100 --amdhsa-code-object-version=3 -filetype=obj < %s > %t +// RUN: llvm-readelf -S -r -s %t | FileCheck --check-prefix=READOBJ %s +// RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s + +// READOBJ: Section Headers +// READOBJ: .text PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9]+}} AX {{[0-9]+}} {{[0-9]+}} 256 +// READOBJ: .rodata PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}} 0000c0 {{[0-9]+}} A {{[0-9]+}} {{[0-9]+}} 64 + +// READOBJ: Relocation section '.rela.rodata' at offset +// READOBJ: 0000000000000010 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 10 +// READOBJ: 0000000000000050 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 110 +// READOBJ: 0000000000000090 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 210 + +// READOBJ: Symbol table '.symtab' contains {{[0-9]+}} entries: +// READOBJ: 0000000000000000 0 FUNC LOCAL PROTECTED 2 minimal +// READOBJ-NEXT: 0000000000000100 0 FUNC LOCAL PROTECTED 2 complete +// READOBJ-NEXT: 0000000000000200 0 FUNC LOCAL PROTECTED 2 special_sgpr +// READOBJ-NEXT: 0000000000000000 64 OBJECT LOCAL DEFAULT 3 minimal.kd +// READOBJ-NEXT: 0000000000000040 64 OBJECT LOCAL DEFAULT 3 complete.kd +// READOBJ-NEXT: 0000000000000080 64 OBJECT LOCAL DEFAULT 3 special_sgpr.kd + +// OBJDUMP: Contents of section .rodata +// Note, relocation for KERNEL_CODE_ENTRY_BYTE_OFFSET is not resolved here. +// minimal +// OBJDUMP-NEXT: 0000 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0030 0000ac60 80000000 00000000 00000000 +// complete +// OBJDUMP-NEXT: 0040 01000000 01000000 08000000 00000000 +// OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0070 015001e4 130f007f 5e040000 00000000 +// special_sgpr +// OBJDUMP-NEXT: 0080 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0090 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 00a0 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 00b0 00000060 80000000 00000000 00000000 + +.text +// ASM: .text + +.amdgcn_target "amdgcn-amd-amdhsa--gfx1100" +// ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx1100" + +.p2align 8 +.type minimal,@function +minimal: + s_endpgm + +.p2align 8 +.type complete,@function +complete: + s_endpgm + +.p2align 8 +.type special_sgpr,@function +special_sgpr: + s_endpgm + +.rodata +// ASM: .rodata + +// Test that only specifying required directives is allowed, and that defaulted +// values are omitted. +.p2align 6 +.amdhsa_kernel minimal + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 +.end_amdhsa_kernel + +// ASM: .amdhsa_kernel minimal +// ASM: .amdhsa_next_free_vgpr 0 +// ASM-NEXT: .amdhsa_next_free_sgpr 0 +// ASM: .end_amdhsa_kernel + +// Test that we can specify all available directives with non-default values. +.p2align 6 +.amdhsa_kernel complete + .amdhsa_group_segment_fixed_size 1 + .amdhsa_private_segment_fixed_size 1 + .amdhsa_kernarg_size 8 + .amdhsa_user_sgpr_dispatch_ptr 1 + .amdhsa_user_sgpr_queue_ptr 1 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_user_sgpr_dispatch_id 1 + .amdhsa_user_sgpr_private_segment_size 1 + .amdhsa_wavefront_size32 1 + .amdhsa_enable_private_segment 1 + .amdhsa_system_sgpr_workgroup_id_x 0 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_sgpr_workgroup_id_z 1 + .amdhsa_system_sgpr_workgroup_info 1 + .amdhsa_system_vgpr_workitem_id 1 + .amdhsa_next_free_vgpr 9 + .amdhsa_next_free_sgpr 27 + .amdhsa_reserve_vcc 0 + .amdhsa_float_round_mode_32 1 + .amdhsa_float_round_mode_16_64 1 + .amdhsa_float_denorm_mode_32 1 + .amdhsa_float_denorm_mode_16_64 0 + .amdhsa_dx10_clamp 0 + .amdhsa_ieee_mode 0 + .amdhsa_fp16_overflow 1 + .amdhsa_workgroup_processor_mode 1 + .amdhsa_memory_ordered 1 + .amdhsa_forward_progress 1 + .amdhsa_exception_fp_ieee_invalid_op 1 + .amdhsa_exception_fp_denorm_src 1 + .amdhsa_exception_fp_ieee_div_zero 1 + .amdhsa_exception_fp_ieee_overflow 1 + .amdhsa_exception_fp_ieee_underflow 1 + .amdhsa_exception_fp_ieee_inexact 1 + .amdhsa_exception_int_div_zero 1 +.end_amdhsa_kernel + +// ASM: .amdhsa_kernel complete +// ASM-NEXT: .amdhsa_group_segment_fixed_size 1 +// ASM-NEXT: .amdhsa_private_segment_fixed_size 1 +// ASM-NEXT: .amdhsa_kernarg_size 8 +// ASM-NEXT: .amdhsa_user_sgpr_count 9 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1 +// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 1 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 1 +// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 1 +// ASM-NEXT: .amdhsa_wavefront_size32 1 +// ASM-NEXT: .amdhsa_enable_private_segment 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 0 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info 1 +// ASM-NEXT: .amdhsa_system_vgpr_workitem_id 1 +// ASM-NEXT: .amdhsa_next_free_vgpr 9 +// ASM-NEXT: .amdhsa_next_free_sgpr 27 +// ASM-NEXT: .amdhsa_reserve_vcc 0 +// ASM-NEXT: .amdhsa_float_round_mode_32 1 +// ASM-NEXT: .amdhsa_float_round_mode_16_64 1 +// ASM-NEXT: .amdhsa_float_denorm_mode_32 1 +// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 0 +// ASM-NEXT: .amdhsa_dx10_clamp 0 +// ASM-NEXT: .amdhsa_ieee_mode 0 +// ASM-NEXT: .amdhsa_fp16_overflow 1 +// ASM-NEXT: .amdhsa_workgroup_processor_mode 1 +// ASM-NEXT: .amdhsa_memory_ordered 1 +// ASM-NEXT: .amdhsa_forward_progress 1 +// ASM-NEXT: .amdhsa_shared_vgpr_count 0 +// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 1 +// ASM-NEXT: .amdhsa_exception_fp_denorm_src 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact 1 +// ASM-NEXT: .amdhsa_exception_int_div_zero 1 +// ASM-NEXT: .end_amdhsa_kernel + +// Test that we are including special SGPR usage in the granulated count. +.p2align 6 +.amdhsa_kernel special_sgpr + // Same next_free_sgpr as "complete", but... + .amdhsa_next_free_sgpr 27 + // ...on GFX10+ this should require an additional 6 SGPRs, pushing us from + // 3 granules to 4 + + .amdhsa_reserve_vcc 0 + + .amdhsa_float_denorm_mode_16_64 0 + .amdhsa_dx10_clamp 0 + .amdhsa_ieee_mode 0 + .amdhsa_next_free_vgpr 0 +.end_amdhsa_kernel + +// ASM: .amdhsa_kernel special_sgpr +// ASM: .amdhsa_next_free_vgpr 0 +// ASM-NEXT: .amdhsa_next_free_sgpr 27 +// ASM-NEXT: .amdhsa_reserve_vcc 0 +// ASM: .amdhsa_float_denorm_mode_16_64 0 +// ASM-NEXT: .amdhsa_dx10_clamp 0 +// ASM-NEXT: .amdhsa_ieee_mode 0 +// ASM: .end_amdhsa_kernel + +.section .foo + +.byte .amdgcn.gfx_generation_number +// ASM: .byte 11 + +.byte .amdgcn.next_free_vgpr +// ASM: .byte 0 +.byte .amdgcn.next_free_sgpr +// ASM: .byte 0 + +v_mov_b32_e32 v7, s10 + +.byte .amdgcn.next_free_vgpr +// ASM: .byte 8 +.byte .amdgcn.next_free_sgpr +// ASM: .byte 11 + +.set .amdgcn.next_free_vgpr, 0 +.set .amdgcn.next_free_sgpr, 0 + +.byte .amdgcn.next_free_vgpr +// ASM: .byte 0 +.byte .amdgcn.next_free_sgpr +// ASM: .byte 0 + +v_mov_b32_e32 v16, s3 + +.byte .amdgcn.next_free_vgpr +// ASM: .byte 17 +.byte .amdgcn.next_free_sgpr +// ASM: .byte 4 diff --git a/llvm/test/MC/AMDGPU/hsa-gfx90a-v3.s b/llvm/test/MC/AMDGPU/hsa-gfx90a-v3.s new file mode 100644 index 0000000000000..fd84fab8af816 --- /dev/null +++ b/llvm/test/MC/AMDGPU/hsa-gfx90a-v3.s @@ -0,0 +1,184 @@ +// RUN: llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck --check-prefix=ASM %s +// RUN: llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd-amdhsa -mcpu=gfx90a -filetype=obj < %s > %t +// RUN: llvm-readobj --elf-output-style=GNU --sections --symbols --relocations %t | FileCheck --check-prefix=READOBJ %s +// RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s + +// READOBJ: Section Headers +// READOBJ: .text PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9]+}} AX {{[0-9]+}} {{[0-9]+}} 256 +// READOBJ: .rodata PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}} 000080 {{[0-9]+}} A {{[0-9]+}} {{[0-9]+}} 64 + +// READOBJ: Relocation section '.rela.rodata' at offset +// READOBJ: 0000000000000010 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 10 +// READOBJ: 0000000000000050 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 110 + +// READOBJ: Symbol table '.symtab' contains {{[0-9]+}} entries: +// READOBJ-DAG: {{[0-9]+}}: 0000000000000100 0 FUNC LOCAL PROTECTED 2 complete +// READOBJ-DAG: {{[0-9]+}}: 0000000000000040 64 OBJECT LOCAL DEFAULT 3 complete.kd +// READOBJ-DAG: {{[0-9]+}}: 0000000000000000 0 FUNC LOCAL PROTECTED 2 minimal +// READOBJ-DAG: {{[0-9]+}}: 0000000000000000 64 OBJECT LOCAL DEFAULT 3 minimal.kd + +// OBJDUMP: Contents of section .rodata +// Note, relocation for KERNEL_CODE_ENTRY_BYTE_OFFSET is not resolved here. +// minimal +// OBJDUMP-NEXT: 0000 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0030 0000ac00 80000000 00000000 00000000 +// complete +// OBJDUMP-NEXT: 0040 01000000 01000000 08000000 00000000 +// OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000100 +// OBJDUMP-NEXT: 0070 c1500104 210f007f 7f008100 00000000 + +.text +// ASM: .text + +.amdgcn_target "amdgcn-amd-amdhsa--gfx90a+xnack+sram-ecc" +// ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx90a+xnack+sram-ecc" + +.p2align 8 +.type minimal,@function +minimal: + s_endpgm + +.p2align 8 +.type complete,@function +complete: + s_endpgm + +.rodata +// ASM: .rodata + +// Test that only specifying required directives is allowed, and that defaulted +// values are omitted. +.p2align 6 +.amdhsa_kernel minimal + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 + .amdhsa_accum_offset 4 +.end_amdhsa_kernel + +// ASM: .amdhsa_kernel minimal +// ASM: .amdhsa_next_free_vgpr 0 +// ASM-NEXT: .amdhsa_next_free_sgpr 0 +// ASM-NEXT: .amdhsa_accum_offset 4 +// ASM: .amdhsa_tg_split 0 +// ASM: .end_amdhsa_kernel + +// Test that we can specify all available directives with non-default values. +.p2align 6 +.amdhsa_kernel complete + .amdhsa_group_segment_fixed_size 1 + .amdhsa_private_segment_fixed_size 1 + .amdhsa_user_sgpr_private_segment_buffer 1 + .amdhsa_user_sgpr_dispatch_ptr 1 + .amdhsa_user_sgpr_queue_ptr 1 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_user_sgpr_dispatch_id 1 + .amdhsa_user_sgpr_flat_scratch_init 1 + .amdhsa_kernarg_size 8 + .amdhsa_user_sgpr_kernarg_preload_length 1 + .amdhsa_user_sgpr_kernarg_preload_offset 1 + .amdhsa_user_sgpr_private_segment_size 1 + .amdhsa_system_sgpr_private_segment_wavefront_offset 1 + .amdhsa_system_sgpr_workgroup_id_x 0 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_sgpr_workgroup_id_z 1 + .amdhsa_system_sgpr_workgroup_info 1 + .amdhsa_system_vgpr_workitem_id 1 + .amdhsa_next_free_vgpr 9 + .amdhsa_next_free_sgpr 27 + .amdhsa_accum_offset 4 + .amdhsa_reserve_vcc 0 + .amdhsa_reserve_flat_scratch 0 + .amdhsa_float_round_mode_32 1 + .amdhsa_float_round_mode_16_64 1 + .amdhsa_float_denorm_mode_32 1 + .amdhsa_float_denorm_mode_16_64 0 + .amdhsa_dx10_clamp 0 + .amdhsa_ieee_mode 0 + .amdhsa_fp16_overflow 1 + .amdhsa_tg_split 1 + .amdhsa_exception_fp_ieee_invalid_op 1 + .amdhsa_exception_fp_denorm_src 1 + .amdhsa_exception_fp_ieee_div_zero 1 + .amdhsa_exception_fp_ieee_overflow 1 + .amdhsa_exception_fp_ieee_underflow 1 + .amdhsa_exception_fp_ieee_inexact 1 + .amdhsa_exception_int_div_zero 1 +.end_amdhsa_kernel + +// ASM: .amdhsa_kernel complete +// ASM-NEXT: .amdhsa_group_segment_fixed_size 1 +// ASM-NEXT: .amdhsa_private_segment_fixed_size 1 +// ASM-NEXT: .amdhsa_kernarg_size 8 +// ASM-NEXT: .amdhsa_user_sgpr_count 16 +// ASM-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1 +// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 1 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 1 +// ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 1 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 1 +// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 1 +// ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 0 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info 1 +// ASM-NEXT: .amdhsa_system_vgpr_workitem_id 1 +// ASM-NEXT: .amdhsa_next_free_vgpr 9 +// ASM-NEXT: .amdhsa_next_free_sgpr 27 +// ASM-NEXT: .amdhsa_accum_offset 4 +// ASM-NEXT: .amdhsa_reserve_vcc 0 +// ASM-NEXT: .amdhsa_reserve_flat_scratch 0 +// ASM-NEXT: .amdhsa_reserve_xnack_mask 1 +// ASM-NEXT: .amdhsa_float_round_mode_32 1 +// ASM-NEXT: .amdhsa_float_round_mode_16_64 1 +// ASM-NEXT: .amdhsa_float_denorm_mode_32 1 +// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 0 +// ASM-NEXT: .amdhsa_dx10_clamp 0 +// ASM-NEXT: .amdhsa_ieee_mode 0 +// ASM-NEXT: .amdhsa_fp16_overflow 1 +// ASM-NEXT: .amdhsa_tg_split 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 1 +// ASM-NEXT: .amdhsa_exception_fp_denorm_src 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact 1 +// ASM-NEXT: .amdhsa_exception_int_div_zero 1 +// ASM-NEXT: .end_amdhsa_kernel + +.section .foo + +.byte .amdgcn.gfx_generation_number +// ASM: .byte 9 + +.byte .amdgcn.next_free_vgpr +// ASM: .byte 0 +.byte .amdgcn.next_free_sgpr +// ASM: .byte 0 + +v_mov_b32_e32 v7, s10 + +.byte .amdgcn.next_free_vgpr +// ASM: .byte 8 +.byte .amdgcn.next_free_sgpr +// ASM: .byte 11 + +.set .amdgcn.next_free_vgpr, 0 +.set .amdgcn.next_free_sgpr, 0 + +.byte .amdgcn.next_free_vgpr +// ASM: .byte 0 +.byte .amdgcn.next_free_sgpr +// ASM: .byte 0 + +v_mov_b32_e32 v16, s3 + +.byte .amdgcn.next_free_vgpr +// ASM: .byte 17 +.byte .amdgcn.next_free_sgpr +// ASM: .byte 4 diff --git a/llvm/test/MC/AMDGPU/hsa-gfx940-v3.s b/llvm/test/MC/AMDGPU/hsa-gfx940-v3.s new file mode 100644 index 0000000000000..9624515ecd6fb --- /dev/null +++ b/llvm/test/MC/AMDGPU/hsa-gfx940-v3.s @@ -0,0 +1,178 @@ +// RUN: llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck --check-prefix=ASM %s +// RUN: llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=obj < %s > %t +// RUN: llvm-readelf -S -r -s %t | FileCheck --check-prefix=READOBJ %s +// RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s + +// READOBJ: Section Headers +// READOBJ: .text PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9]+}} AX {{[0-9]+}} {{[0-9]+}} 256 +// READOBJ: .rodata PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}} 000080 {{[0-9]+}} A {{[0-9]+}} {{[0-9]+}} 64 + +// READOBJ: Relocation section '.rela.rodata' at offset +// READOBJ: 0000000000000010 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 10 +// READOBJ: 0000000000000050 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 110 + +// READOBJ: Symbol table '.symtab' contains {{[0-9]+}} entries: +// READOBJ-DAG: {{[0-9]+}}: 0000000000000100 0 FUNC LOCAL PROTECTED 2 complete +// READOBJ-DAG: {{[0-9]+}}: 0000000000000040 64 OBJECT LOCAL DEFAULT 3 complete.kd +// READOBJ-DAG: {{[0-9]+}}: 0000000000000000 0 FUNC LOCAL PROTECTED 2 minimal +// READOBJ-DAG: {{[0-9]+}}: 0000000000000000 64 OBJECT LOCAL DEFAULT 3 minimal.kd + +// OBJDUMP: Contents of section .rodata +// Note, relocation for KERNEL_CODE_ENTRY_BYTE_OFFSET is not resolved here. +// minimal +// OBJDUMP-NEXT: 0000 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0030 0000ac00 80000000 00000000 00000000 +// complete +// OBJDUMP-NEXT: 0040 01000000 01000000 08000000 00000000 +// OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000100 +// OBJDUMP-NEXT: 0070 01510104 150f007f 5e008100 00000000 + +.text +// ASM: .text + +.amdgcn_target "amdgcn-amd-amdhsa--gfx940+xnack+sram-ecc" +// ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx940+xnack+sram-ecc" + +.p2align 8 +.type minimal,@function +minimal: + s_endpgm + +.p2align 8 +.type complete,@function +complete: + s_endpgm + +.rodata +// ASM: .rodata + +// Test that only specifying required directives is allowed, and that defaulted +// values are omitted. +.p2align 6 +.amdhsa_kernel minimal + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 + .amdhsa_accum_offset 4 +.end_amdhsa_kernel + +// ASM: .amdhsa_kernel minimal +// ASM: .amdhsa_next_free_vgpr 0 +// ASM-NEXT: .amdhsa_next_free_sgpr 0 +// ASM-NEXT: .amdhsa_accum_offset 4 +// ASM: .amdhsa_tg_split 0 +// ASM: .end_amdhsa_kernel + +// Test that we can specify all available directives with non-default values. +.p2align 6 +.amdhsa_kernel complete + .amdhsa_group_segment_fixed_size 1 + .amdhsa_private_segment_fixed_size 1 + .amdhsa_user_sgpr_dispatch_ptr 1 + .amdhsa_user_sgpr_queue_ptr 1 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_user_sgpr_dispatch_id 1 + .amdhsa_kernarg_size 8 + .amdhsa_user_sgpr_kernarg_preload_length 1 + .amdhsa_user_sgpr_kernarg_preload_offset 1 + .amdhsa_user_sgpr_private_segment_size 1 + .amdhsa_enable_private_segment 1 + .amdhsa_system_sgpr_workgroup_id_x 0 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_sgpr_workgroup_id_z 1 + .amdhsa_system_sgpr_workgroup_info 1 + .amdhsa_system_vgpr_workitem_id 1 + .amdhsa_next_free_vgpr 9 + .amdhsa_next_free_sgpr 27 + .amdhsa_accum_offset 4 + .amdhsa_reserve_vcc 0 + .amdhsa_float_round_mode_32 1 + .amdhsa_float_round_mode_16_64 1 + .amdhsa_float_denorm_mode_32 1 + .amdhsa_float_denorm_mode_16_64 0 + .amdhsa_dx10_clamp 0 + .amdhsa_ieee_mode 0 + .amdhsa_fp16_overflow 1 + .amdhsa_tg_split 1 + .amdhsa_exception_fp_ieee_invalid_op 1 + .amdhsa_exception_fp_denorm_src 1 + .amdhsa_exception_fp_ieee_div_zero 1 + .amdhsa_exception_fp_ieee_overflow 1 + .amdhsa_exception_fp_ieee_underflow 1 + .amdhsa_exception_fp_ieee_inexact 1 + .amdhsa_exception_int_div_zero 1 +.end_amdhsa_kernel + +// ASM: .amdhsa_kernel complete +// ASM-NEXT: .amdhsa_group_segment_fixed_size 1 +// ASM-NEXT: .amdhsa_private_segment_fixed_size 1 +// ASM-NEXT: .amdhsa_kernarg_size 8 +// ASM-NEXT: .amdhsa_user_sgpr_count 10 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1 +// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 1 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 1 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 1 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 1 +// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 1 +// ASM-NEXT: .amdhsa_enable_private_segment 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 0 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info 1 +// ASM-NEXT: .amdhsa_system_vgpr_workitem_id 1 +// ASM-NEXT: .amdhsa_next_free_vgpr 9 +// ASM-NEXT: .amdhsa_next_free_sgpr 27 +// ASM-NEXT: .amdhsa_accum_offset 4 +// ASM-NEXT: .amdhsa_reserve_vcc 0 +// ASM-NEXT: .amdhsa_reserve_xnack_mask 1 +// ASM-NEXT: .amdhsa_float_round_mode_32 1 +// ASM-NEXT: .amdhsa_float_round_mode_16_64 1 +// ASM-NEXT: .amdhsa_float_denorm_mode_32 1 +// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 0 +// ASM-NEXT: .amdhsa_dx10_clamp 0 +// ASM-NEXT: .amdhsa_ieee_mode 0 +// ASM-NEXT: .amdhsa_fp16_overflow 1 +// ASM-NEXT: .amdhsa_tg_split 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 1 +// ASM-NEXT: .amdhsa_exception_fp_denorm_src 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact 1 +// ASM-NEXT: .amdhsa_exception_int_div_zero 1 +// ASM-NEXT: .end_amdhsa_kernel + +.section .foo + +.byte .amdgcn.gfx_generation_number +// ASM: .byte 9 + +.byte .amdgcn.next_free_vgpr +// ASM: .byte 0 +.byte .amdgcn.next_free_sgpr +// ASM: .byte 0 + +v_mov_b32_e32 v7, s10 + +.byte .amdgcn.next_free_vgpr +// ASM: .byte 8 +.byte .amdgcn.next_free_sgpr +// ASM: .byte 11 + +.set .amdgcn.next_free_vgpr, 0 +.set .amdgcn.next_free_sgpr, 0 + +.byte .amdgcn.next_free_vgpr +// ASM: .byte 0 +.byte .amdgcn.next_free_sgpr +// ASM: .byte 0 + +v_mov_b32_e32 v16, s3 + +.byte .amdgcn.next_free_vgpr +// ASM: .byte 17 +.byte .amdgcn.next_free_sgpr +// ASM: .byte 4 diff --git a/llvm/test/MC/AMDGPU/hsa-v3.s b/llvm/test/MC/AMDGPU/hsa-v3.s new file mode 100644 index 0000000000000..9f854986d7bc4 --- /dev/null +++ b/llvm/test/MC/AMDGPU/hsa-v3.s @@ -0,0 +1,304 @@ +// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx904 --amdhsa-code-object-version=3 -mattr=+xnack < %s | FileCheck --check-prefix=ASM %s +// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx904 --amdhsa-code-object-version=3 -mattr=+xnack -filetype=obj < %s > %t +// RUN: llvm-readelf -S -r -s %t | FileCheck --check-prefix=READOBJ %s +// RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s + +// READOBJ: Section Headers +// READOBJ: .text PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9]+}} AX {{[0-9]+}} {{[0-9]+}} 256 +// READOBJ: .rodata PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}} 000100 {{[0-9]+}} A {{[0-9]+}} {{[0-9]+}} 64 + +// READOBJ: Relocation section '.rela.rodata' at offset +// READOBJ: 0000000000000010 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 10 +// READOBJ: 0000000000000050 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 110 +// READOBJ: 0000000000000090 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 210 +// READOBJ: 00000000000000d0 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 310 + +// READOBJ: Symbol table '.symtab' contains {{[0-9]+}} entries: +// READOBJ: 0000000000000000 0 FUNC LOCAL PROTECTED 2 minimal +// READOBJ-NEXT: 0000000000000100 0 FUNC LOCAL PROTECTED 2 complete +// READOBJ-NEXT: 0000000000000200 0 FUNC LOCAL PROTECTED 2 special_sgpr +// READOBJ-NEXT: 0000000000000300 0 FUNC LOCAL PROTECTED 2 disabled_user_sgpr +// READOBJ-NEXT: 0000000000000000 64 OBJECT LOCAL DEFAULT 3 minimal.kd +// READOBJ-NEXT: 0000000000000040 64 OBJECT LOCAL DEFAULT 3 complete.kd +// READOBJ-NEXT: 0000000000000080 64 OBJECT LOCAL DEFAULT 3 special_sgpr.kd +// READOBJ-NEXT: 00000000000000c0 64 OBJECT LOCAL DEFAULT 3 disabled_user_sgpr.kd + +// OBJDUMP: Contents of section .rodata +// Note, relocation for KERNEL_CODE_ENTRY_BYTE_OFFSET is not resolved here. +// minimal +// OBJDUMP-NEXT: 0000 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0030 0000ac00 80000000 00000000 00000000 +// complete +// OBJDUMP-NEXT: 0040 01000000 01000000 08000000 00000000 +// OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0070 c2500104 1f0f007f 7f000000 00000000 +// special_sgpr +// OBJDUMP-NEXT: 0080 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0090 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 00a0 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 00b0 00010000 80000000 00000000 00000000 +// disabled_user_sgpr +// OBJDUMP-NEXT: 00c0 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 00d0 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 00e0 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 00f0 0000ac00 80000000 00000000 00000000 + +.text +// ASM: .text + +.amdgcn_target "amdgcn-amd-amdhsa--gfx904+xnack" +// ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx904+xnack" + +.p2align 8 +.type minimal,@function +minimal: + s_endpgm + +.p2align 8 +.type complete,@function +complete: + s_endpgm + +.p2align 8 +.type special_sgpr,@function +special_sgpr: + s_endpgm + +.p2align 8 +.type disabled_user_sgpr,@function +disabled_user_sgpr: + s_endpgm + +.rodata +// ASM: .rodata + +// Test that only specifying required directives is allowed, and that defaulted +// values are omitted. +.p2align 6 +.amdhsa_kernel minimal + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 +.end_amdhsa_kernel + +// ASM: .amdhsa_kernel minimal +// ASM: .amdhsa_next_free_vgpr 0 +// ASM-NEXT: .amdhsa_next_free_sgpr 0 +// ASM: .end_amdhsa_kernel + +// Test that we can specify all available directives with non-default values. +.p2align 6 +.amdhsa_kernel complete + .amdhsa_group_segment_fixed_size 1 + .amdhsa_private_segment_fixed_size 1 + .amdhsa_kernarg_size 8 + .amdhsa_user_sgpr_private_segment_buffer 1 + .amdhsa_user_sgpr_dispatch_ptr 1 + .amdhsa_user_sgpr_queue_ptr 1 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_user_sgpr_dispatch_id 1 + .amdhsa_user_sgpr_flat_scratch_init 1 + .amdhsa_user_sgpr_private_segment_size 1 + .amdhsa_system_sgpr_private_segment_wavefront_offset 1 + .amdhsa_system_sgpr_workgroup_id_x 0 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_sgpr_workgroup_id_z 1 + .amdhsa_system_sgpr_workgroup_info 1 + .amdhsa_system_vgpr_workitem_id 1 + .amdhsa_next_free_vgpr 9 + .amdhsa_next_free_sgpr 27 + .amdhsa_reserve_vcc 0 + .amdhsa_reserve_flat_scratch 0 + .amdhsa_reserve_xnack_mask 1 + .amdhsa_float_round_mode_32 1 + .amdhsa_float_round_mode_16_64 1 + .amdhsa_float_denorm_mode_32 1 + .amdhsa_float_denorm_mode_16_64 0 + .amdhsa_dx10_clamp 0 + .amdhsa_ieee_mode 0 + .amdhsa_fp16_overflow 1 + .amdhsa_exception_fp_ieee_invalid_op 1 + .amdhsa_exception_fp_denorm_src 1 + .amdhsa_exception_fp_ieee_div_zero 1 + .amdhsa_exception_fp_ieee_overflow 1 + .amdhsa_exception_fp_ieee_underflow 1 + .amdhsa_exception_fp_ieee_inexact 1 + .amdhsa_exception_int_div_zero 1 +.end_amdhsa_kernel + +// ASM: .amdhsa_kernel complete +// ASM-NEXT: .amdhsa_group_segment_fixed_size 1 +// ASM-NEXT: .amdhsa_private_segment_fixed_size 1 +// ASM-NEXT: .amdhsa_kernarg_size 8 +// ASM-NEXT: .amdhsa_user_sgpr_count 15 +// ASM-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1 +// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 1 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 1 +// ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1 +// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 1 +// ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 0 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info 1 +// ASM-NEXT: .amdhsa_system_vgpr_workitem_id 1 +// ASM-NEXT: .amdhsa_next_free_vgpr 9 +// ASM-NEXT: .amdhsa_next_free_sgpr 27 +// ASM-NEXT: .amdhsa_reserve_vcc 0 +// ASM-NEXT: .amdhsa_reserve_flat_scratch 0 +// ASM-NEXT: .amdhsa_reserve_xnack_mask 1 +// ASM-NEXT: .amdhsa_float_round_mode_32 1 +// ASM-NEXT: .amdhsa_float_round_mode_16_64 1 +// ASM-NEXT: .amdhsa_float_denorm_mode_32 1 +// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 0 +// ASM-NEXT: .amdhsa_dx10_clamp 0 +// ASM-NEXT: .amdhsa_ieee_mode 0 +// ASM-NEXT: .amdhsa_fp16_overflow 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 1 +// ASM-NEXT: .amdhsa_exception_fp_denorm_src 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact 1 +// ASM-NEXT: .amdhsa_exception_int_div_zero 1 +// ASM-NEXT: .end_amdhsa_kernel + +// Test that we are including special SGPR usage in the granulated count. +.p2align 6 +.amdhsa_kernel special_sgpr + // Same next_free_sgpr as "complete", but... + .amdhsa_next_free_sgpr 27 + // ...on GFX9 this should require an additional 6 SGPRs, pushing us from + // 3 granules to 4 + .amdhsa_reserve_flat_scratch 1 + + .amdhsa_reserve_vcc 0 + .amdhsa_reserve_xnack_mask 1 + + .amdhsa_float_denorm_mode_16_64 0 + .amdhsa_dx10_clamp 0 + .amdhsa_ieee_mode 0 + .amdhsa_next_free_vgpr 0 +.end_amdhsa_kernel + +// ASM: .amdhsa_kernel special_sgpr +// ASM: .amdhsa_next_free_vgpr 0 +// ASM-NEXT: .amdhsa_next_free_sgpr 27 +// ASM-NEXT: .amdhsa_reserve_vcc 0 +// ASM-NEXT: .amdhsa_reserve_xnack_mask 1 +// ASM: .amdhsa_float_denorm_mode_16_64 0 +// ASM-NEXT: .amdhsa_dx10_clamp 0 +// ASM-NEXT: .amdhsa_ieee_mode 0 +// ASM: .end_amdhsa_kernel + +// Test that explicitly disabling user_sgpr's does not affect the user_sgpr +// count, i.e. this should produce the same descriptor as minimal. +.p2align 6 +.amdhsa_kernel disabled_user_sgpr + .amdhsa_user_sgpr_private_segment_buffer 0 + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 +.end_amdhsa_kernel + +// ASM: .amdhsa_kernel disabled_user_sgpr +// ASM: .amdhsa_next_free_vgpr 0 +// ASM-NEXT: .amdhsa_next_free_sgpr 0 +// ASM: .end_amdhsa_kernel + +.section .foo + +.byte .amdgcn.gfx_generation_number +// ASM: .byte 9 + +.byte .amdgcn.gfx_generation_minor +// ASM: .byte 0 + +.byte .amdgcn.gfx_generation_stepping +// ASM: .byte 4 + +.byte .amdgcn.next_free_vgpr +// ASM: .byte 0 +.byte .amdgcn.next_free_sgpr +// ASM: .byte 0 + +v_mov_b32_e32 v7, s10 + +.byte .amdgcn.next_free_vgpr +// ASM: .byte 8 +.byte .amdgcn.next_free_sgpr +// ASM: .byte 11 + +.set .amdgcn.next_free_vgpr, 0 +.set .amdgcn.next_free_sgpr, 0 + +.byte .amdgcn.next_free_vgpr +// ASM: .byte 0 +.byte .amdgcn.next_free_sgpr +// ASM: .byte 0 + +v_mov_b32_e32 v16, s3 + +.byte .amdgcn.next_free_vgpr +// ASM: .byte 17 +.byte .amdgcn.next_free_sgpr +// ASM: .byte 4 + +// Metadata + +.amdgpu_metadata + amdhsa.version: + - 3 + - 0 + amdhsa.kernels: + - .name: amd_kernel_code_t_test_all + .symbol: amd_kernel_code_t_test_all@kd + .kernarg_segment_size: 8 + .group_segment_fixed_size: 16 + .private_segment_fixed_size: 32 + .kernarg_segment_align: 64 + .wavefront_size: 128 + .sgpr_count: 14 + .vgpr_count: 40 + .max_flat_workgroup_size: 256 + - .name: amd_kernel_code_t_minimal + .symbol: amd_kernel_code_t_minimal@kd + .kernarg_segment_size: 8 + .group_segment_fixed_size: 16 + .private_segment_fixed_size: 32 + .kernarg_segment_align: 64 + .wavefront_size: 128 + .sgpr_count: 14 + .vgpr_count: 40 + .max_flat_workgroup_size: 256 +.end_amdgpu_metadata + +// ASM: .amdgpu_metadata +// ASM: amdhsa.kernels: +// ASM: - .group_segment_fixed_size: 16 +// ASM: .kernarg_segment_align: 64 +// ASM: .kernarg_segment_size: 8 +// ASM: .max_flat_workgroup_size: 256 +// ASM: .name: amd_kernel_code_t_test_all +// ASM: .private_segment_fixed_size: 32 +// ASM: .sgpr_count: 14 +// ASM: .symbol: 'amd_kernel_code_t_test_all@kd' +// ASM: .vgpr_count: 40 +// ASM: .wavefront_size: 128 +// ASM: - .group_segment_fixed_size: 16 +// ASM: .kernarg_segment_align: 64 +// ASM: .kernarg_segment_size: 8 +// ASM: .max_flat_workgroup_size: 256 +// ASM: .name: amd_kernel_code_t_minimal +// ASM: .private_segment_fixed_size: 32 +// ASM: .sgpr_count: 14 +// ASM: .symbol: 'amd_kernel_code_t_minimal@kd' +// ASM: .vgpr_count: 40 +// ASM: .wavefront_size: 128 +// ASM: amdhsa.version: +// ASM-NEXT: - 3 +// ASM-NEXT: - 0 +// ASM: .end_amdgpu_metadata diff --git a/llvm/test/MC/AMDGPU/user-sgpr-count-diag.s b/llvm/test/MC/AMDGPU/user-sgpr-count-diag.s index 7e3ae8424cc7b..63e532e0ffa37 100644 --- a/llvm/test/MC/AMDGPU/user-sgpr-count-diag.s +++ b/llvm/test/MC/AMDGPU/user-sgpr-count-diag.s @@ -1,4 +1,4 @@ -// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx90a %s 2>&1 >/dev/null | FileCheck -check-prefix=ERR %s +// RUN: not llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd-amdhsa -mcpu=gfx90a %s 2>&1 >/dev/null | FileCheck -check-prefix=ERR %s .amdhsa_kernel implied_count_too_low_0 .amdhsa_user_sgpr_count 0 diff --git a/llvm/test/MC/AMDGPU/user-sgpr-count.s b/llvm/test/MC/AMDGPU/user-sgpr-count.s index 950c514f786b2..aa8970185eb04 100644 --- a/llvm/test/MC/AMDGPU/user-sgpr-count.s +++ b/llvm/test/MC/AMDGPU/user-sgpr-count.s @@ -1,10 +1,10 @@ -// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx90a --amdhsa-code-object-version=4 -mattr=+xnack < %s | FileCheck --check-prefix=ASM %s +// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx90a --amdhsa-code-object-version=3 -mattr=+xnack < %s | FileCheck --check-prefix=ASM %s .text // ASM: .text -.amdgcn_target "amdgcn-amd-amdhsa--gfx90a:xnack+" -// ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx90a:xnack+" +.amdgcn_target "amdgcn-amd-amdhsa--gfx90a+xnack+sram-ecc" +// ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx90a+xnack+sram-ecc" // ASM-LABEL: .amdhsa_kernel user_sgprs_implied_count