diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 22e8c6086f087..51bd9b63c127e 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2486,6 +2486,11 @@ def int_amdgcn_permlanex16_var : ClangBuiltin<"__builtin_amdgcn_permlanex16_var">,
   [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>;
 
+def int_amdgcn_flat_atomic_fmin_num : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
+def int_amdgcn_flat_atomic_fmax_num : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
+def int_amdgcn_global_atomic_fmin_num : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
+def int_amdgcn_global_atomic_fmax_num : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
+
 //===----------------------------------------------------------------------===//
 // Deep learning intrinsics.
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 98d90814f223c..b0eac567ec9f1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1166,6 +1166,11 @@ bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
   if (isNoUnsignedWrap(Addr))
     return true;
 
+  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
+  // values.
+  if (AMDGPU::isGFX12Plus(*Subtarget))
+    return true;
+
   auto LHS = Addr.getOperand(0);
   auto RHS = Addr.getOperand(1);
@@ -1701,7 +1706,7 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
   }
 
   VAddr = Addr;
-  Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i16);
+  Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
   return true;
 }
@@ -1769,7 +1774,7 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
             CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
         VOffset = SDValue(VMov, 0);
         SAddr = LHS;
-        Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
+        Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
         return true;
       }
     }
@@ -1809,7 +1814,7 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
     }
 
     if (SAddr) {
-      Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
+      Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
       return true;
     }
   }
@@ -1825,7 +1830,7 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
       CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
                              CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
   VOffset = SDValue(VMov, 0);
-  Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
+  Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
   return true;
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 121026aca6035..eaf72d7157ee2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -642,6 +642,10 @@ defm int_amdgcn_global_atomic_fmax : noret_op;
 defm int_amdgcn_global_atomic_csub : noret_op;
 defm int_amdgcn_flat_atomic_fadd : local_addr_space_atomic_op;
 defm int_amdgcn_ds_fadd_v2bf16 : noret_op;
+defm int_amdgcn_flat_atomic_fmin_num : noret_op;
+defm int_amdgcn_flat_atomic_fmax_num : noret_op;
+defm int_amdgcn_global_atomic_fmin_num : noret_op;
+defm int_amdgcn_global_atomic_fmax_num : noret_op;
 
 multiclass noret_binary_atomic_op<SDNode atomic_op, bit IsInt = 1> {
   let HasNoUse = true in
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index d8a41ae1800e3..66d0e807e0284 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4663,9 +4663,13 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     case Intrinsic::amdgcn_global_atomic_csub:
     case Intrinsic::amdgcn_global_atomic_fmin:
     case Intrinsic::amdgcn_global_atomic_fmax:
+    case Intrinsic::amdgcn_global_atomic_fmin_num:
+    case Intrinsic::amdgcn_global_atomic_fmax_num:
     case Intrinsic::amdgcn_flat_atomic_fadd:
    case Intrinsic::amdgcn_flat_atomic_fmin:
     case Intrinsic::amdgcn_flat_atomic_fmax:
+    case Intrinsic::amdgcn_flat_atomic_fmin_num:
+    case Intrinsic::amdgcn_flat_atomic_fmax_num:
     case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
     case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
       return getDefaultMappingAllVGPR(MI);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index fc05f14744c0f..beb670669581f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -241,9 +241,13 @@ def : SourceOfDivergence<int_amdgcn_global_atomic_csub>;
 def : SourceOfDivergence<int_amdgcn_global_atomic_fadd>;
 def : SourceOfDivergence<int_amdgcn_global_atomic_fmin>;
 def : SourceOfDivergence<int_amdgcn_global_atomic_fmax>;
+def : SourceOfDivergence<int_amdgcn_global_atomic_fmin_num>;
+def : SourceOfDivergence<int_amdgcn_global_atomic_fmax_num>;
 def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd>;
 def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin>;
 def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax>;
+def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin_num>;
+def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax_num>;
 def : SourceOfDivergence<int_amdgcn_global_atomic_fadd_v2bf16>;
 def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd_v2bf16>;
 def : SourceOfDivergence<int_amdgcn_ds_fadd_v2bf16>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index d754058d4f064..53a9fc2d8e7f6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1026,6 +1026,8 @@ bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
   case Intrinsic::amdgcn_flat_atomic_fadd:
   case Intrinsic::amdgcn_flat_atomic_fmax:
   case Intrinsic::amdgcn_flat_atomic_fmin:
+  case Intrinsic::amdgcn_flat_atomic_fmax_num:
+  case Intrinsic::amdgcn_flat_atomic_fmin_num:
     OpIndexes.push_back(0);
     return true;
   default:
@@ -1100,7 +1102,9 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
   }
   case Intrinsic::amdgcn_flat_atomic_fadd:
   case Intrinsic::amdgcn_flat_atomic_fmax:
-  case Intrinsic::amdgcn_flat_atomic_fmin: {
+  case Intrinsic::amdgcn_flat_atomic_fmin:
+  case Intrinsic::amdgcn_flat_atomic_fmax_num:
+  case Intrinsic::amdgcn_flat_atomic_fmin_num: {
    Type *DestTy = II->getType();
    Type *SrcTy = NewV->getType();
    unsigned NewAS = SrcTy->getPointerAddressSpace();
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index f3771bff247fd..0dd2b3f5c2c91 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1072,19 +1072,43 @@ class FlatStoreSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node,
   (inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)
 >;
 
-multiclass FlatAtomicPat <string inst, string node, ValueType vt,
-                          ValueType data_vt = vt> {
-  defvar rtnNode = !cast<SDPatternOperator>(node#"_"#vt.Size);
-  defvar noRtnNode = !cast<PatFrags>(node#"_noret_"#vt.Size);
-
-  def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)),
-    (!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
+multiclass FlatAtomicNoRtnPat <string inst, string node, ValueType vt,
+                               ValueType data_vt = vt, bit isIntr = 0> {
+  defvar noRtnNode = !cast<PatFrags>(node # "_noret" # !if(isIntr, "", "_"#vt.Size));
 
   let AddedComplexity = 1 in
   def : GCNPat <(vt (noRtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)),
     (!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
 }
 
+multiclass FlatAtomicRtnPat <string inst, string node, ValueType vt,
+                             ValueType data_vt = vt, bit isIntr = 0> {
+  defvar rtnNode = !cast<SDPatternOperator>(node # !if(isIntr, "", "_"#vt.Size));
+
+  def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)),
+    (!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
+}
+
+multiclass FlatAtomicPat <string inst, string node, ValueType vt,
+                          ValueType data_vt = vt, bit isIntr = 0> :
+  FlatAtomicRtnPat<inst, node, vt, data_vt, isIntr>,
+  FlatAtomicNoRtnPat<inst, node, vt, data_vt, isIntr>;
+
+multiclass FlatAtomicIntrNoRtnPat <string inst, string node, ValueType vt,
+                                   ValueType data_vt = vt> {
+  defm : FlatAtomicNoRtnPat<inst, node, vt, data_vt, 1>;
+}
+
+multiclass FlatAtomicIntrRtnPat <string inst, string node, ValueType vt,
+                                 ValueType data_vt = vt> {
+  defm : FlatAtomicRtnPat<inst, node, vt, data_vt, 1>;
+}
+
+multiclass FlatAtomicIntrPat <string inst, string node, ValueType vt,
+                              ValueType data_vt = vt> :
+  FlatAtomicRtnPat<inst, node, vt, data_vt, 1>,
+  FlatAtomicNoRtnPat<inst, node, vt, data_vt, 1>;
+
 class FlatSignedAtomicPatBase <FLAT_Pseudo inst, SDPatternOperator node,
                                ValueType vt, ValueType data_vt = vt> : GCNPat <
   (vt (node (GlobalOffset i64:$vaddr, i32:$offset), data_vt:$data)),
@@ -1305,10 +1329,10 @@ multiclass GlobalFLATStorePats {
   let AddedComplexity = 11 in
-  def : FlatSignedAtomicPatBase(inst), !cast(node), vt, data_vt>;
+  def : FlatSignedAtomicPatBase(inst), !cast(node), vt, data_vt>;
 
   let AddedComplexity = 13 in
-  def : GlobalAtomicSaddrPat(inst#"_SADDR"), !cast(node), vt, data_vt>;
+  def : GlobalAtomicSaddrPat(inst#"_SADDR"), !cast(node), vt, data_vt>;
 }
 
 multiclass GlobalFLATAtomicPatsRtnBase;
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>;
-defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin", f32>;
-defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax", f32>;
 defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMIN", "atomic_load_fmin_flat", f32>;
 defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMAX", "atomic_load_fmax_flat", f32>;
+}
+
+let OtherPredicates = [isGFX10GFX11] in {
+defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin", f32>;
+defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax", f32>;
+
 defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin", f32>;
 defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax", f32>;
 }
@@ -1527,6 +1555,13 @@ defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMIN_X2", "int_amdgcn_flat_atomic_fmin", f64>;
 defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX_X2", "int_amdgcn_flat_atomic_fmax", f64>;
 }
 
+let OtherPredicates = [isGFX12Only] in {
+  defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin_num", f32>;
+  defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax_num", f32>;
+  defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin_num", f32>;
+  defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax_num", f32>;
+}
+
 let OtherPredicates = [HasAtomicFaddNoRtnInsts] in {
 defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>;
 defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f32>;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ec82beaf37fb9..aced46bac9e51 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1230,9 +1230,13 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   case Intrinsic::amdgcn_global_atomic_fadd:
   case Intrinsic::amdgcn_global_atomic_fmin:
   case Intrinsic::amdgcn_global_atomic_fmax:
+  case Intrinsic::amdgcn_global_atomic_fmin_num:
+  case Intrinsic::amdgcn_global_atomic_fmax_num:
   case Intrinsic::amdgcn_flat_atomic_fadd:
   case Intrinsic::amdgcn_flat_atomic_fmin:
   case Intrinsic::amdgcn_flat_atomic_fmax:
+  case Intrinsic::amdgcn_flat_atomic_fmin_num:
+  case Intrinsic::amdgcn_flat_atomic_fmax_num:
   case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
   case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
     Info.opc = ISD::INTRINSIC_W_CHAIN;
@@ -1315,6 +1319,8 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
   case Intrinsic::amdgcn_flat_atomic_fadd:
   case Intrinsic::amdgcn_flat_atomic_fmin:
   case Intrinsic::amdgcn_flat_atomic_fmax:
+  case Intrinsic::amdgcn_flat_atomic_fmin_num:
+  case Intrinsic::amdgcn_flat_atomic_fmax_num:
   case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
   case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
   case Intrinsic::amdgcn_global_atomic_csub: {
@@ -8642,8 +8648,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
   }
   case Intrinsic::amdgcn_global_atomic_fmin:
   case Intrinsic::amdgcn_global_atomic_fmax:
+  case Intrinsic::amdgcn_global_atomic_fmin_num:
+  case Intrinsic::amdgcn_global_atomic_fmax_num:
   case Intrinsic::amdgcn_flat_atomic_fmin:
-  case Intrinsic::amdgcn_flat_atomic_fmax: {
+  case Intrinsic::amdgcn_flat_atomic_fmax:
+  case Intrinsic::amdgcn_flat_atomic_fmin_num:
+  case Intrinsic::amdgcn_flat_atomic_fmax_num: {
     MemSDNode *M = cast<MemSDNode>(Op);
     SDValue Ops[] = {
         M->getOperand(0), // Chain
@@ -8653,12 +8663,16 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
     unsigned Opcode = 0;
     switch (IntrID) {
     case Intrinsic::amdgcn_global_atomic_fmin:
-    case Intrinsic::amdgcn_flat_atomic_fmin: {
+    case Intrinsic::amdgcn_global_atomic_fmin_num:
+    case Intrinsic::amdgcn_flat_atomic_fmin:
+    case Intrinsic::amdgcn_flat_atomic_fmin_num: {
       Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN;
       break;
     }
     case Intrinsic::amdgcn_global_atomic_fmax:
-    case Intrinsic::amdgcn_flat_atomic_fmax: {
+    case Intrinsic::amdgcn_global_atomic_fmax_num:
+    case Intrinsic::amdgcn_flat_atomic_fmax:
+    case Intrinsic::amdgcn_flat_atomic_fmax_num: {
       Opcode = AMDGPUISD::ATOMIC_LOAD_FMAX;
       break;
     }
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index f819172b4ffbf..46ffc6100b292 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -8689,16 +8689,13 @@ bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
                 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
     return false;
 
-  bool AllowNegative = FlatVariant != SIInstrFlags::FLAT;
-  if (ST.hasNegativeScratchOffsetBug() &&
-      FlatVariant == SIInstrFlags::FlatScratch)
-    AllowNegative = false;
   if (ST.hasNegativeUnalignedScratchOffsetBug() &&
       FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
       (Offset % 4) != 0) {
     return false;
   }
 
+  bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
   unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
   return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
 }
@@ -8709,12 +8706,10 @@ SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
                              uint64_t FlatVariant) const {
   int64_t RemainderOffset = COffsetVal;
   int64_t ImmField = 0;
-  bool AllowNegative = FlatVariant != SIInstrFlags::FLAT;
-  if (ST.hasNegativeScratchOffsetBug() &&
-      FlatVariant == SIInstrFlags::FlatScratch)
-    AllowNegative = false;
 
+  bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
   const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
+
   if (AllowNegative) {
     // Use signed division by a power of two to truncate towards 0.
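    // Worked example of the signed split below (illustrative field width and
    // offset, not a real GFX12 encoding): with NumBits = 12, D = 1 << 12 = 4096
    // and COffsetVal = -9000:
    //   COffsetVal / D = -2           (C++ signed division truncates toward 0)
    //   RemainderOffset = -2 * 4096 = -8192
    //   ImmField = -9000 - (-8192) = -808
    // |ImmField| < D, so it fits the signed immediate field, while the
    // remainder stays a value that can be materialized in the address base.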
    int64_t D = 1LL << NumBits;
@@ -8738,6 +8733,14 @@ SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
   return {ImmField, RemainderOffset};
 }
 
+bool SIInstrInfo::allowNegativeFlatOffset(uint64_t FlatVariant) const {
+  if (ST.hasNegativeScratchOffsetBug() &&
+      FlatVariant == SIInstrFlags::FlatScratch)
+    return false;
+
+  return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
+}
+
 static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
   switch (ST.getGeneration()) {
   default:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 02b3422bccabf..affe520467520 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1294,6 +1294,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
                                unsigned AddrSpace,
                                uint64_t FlatVariant) const;
 
+  /// Returns true if negative offsets are allowed for the given \p FlatVariant.
+  bool allowNegativeFlatOffset(uint64_t FlatVariant) const;
+
   /// \brief Return a target-specific opcode if Opcode is a pseudo instruction.
   /// Return -1 if the target-specific opcode for the pseudo instruction does
   /// not exist. If Opcode is not a pseudo instruction, this is identity.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll
index 3f93141252730..83266f8d8386e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX940 %s
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX11 %s
 
 define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(ptr %ptr, float %data) {
   ; GFX940-LABEL: name: flat_atomic_fadd_f32_no_rtn_intrinsic
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index ec2cd43e5fb5d..4603fbcd525c7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -3,6 +3,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx1030 -global-isel -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx940 -global-isel -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -global-isel -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -global-isel -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
 
 define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
 ; GFX9-LABEL: store_load_sindex_kernel:
@@ -74,6 +75,22 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
 ; GFX11-NEXT:    scratch_load_b32 v0, v1, off offset:4 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: store_load_sindex_kernel:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_load_b32 s0, s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:
s_and_b32 s1, s0, 15 +; GFX12-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-NEXT: s_lshl_b32 s1, s1, 2 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_add_co_i32 s0, s0, 4 +; GFX12-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:4 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_endpgm bb: %i = alloca [32 x float], align 4, addrspace(5) %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx @@ -144,6 +161,19 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:124 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: store_load_vindex_kernel: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 +; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:4 th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: v_add_nc_u32_e32 v1, 4, v1 +; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_endpgm bb: %i = alloca [32 x float], align 4, addrspace(5) %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -214,6 +244,20 @@ define void @store_load_vindex_foo(i32 %idx) { ; GFX11-NEXT: scratch_load_b32 v0, v0, s32 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: store_load_vindex_foo: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0 +; GFX12-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_add_nc_u32_e32 v1, s32, v1 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: scratch_store_b32 v1, v2, off th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_load_b32 v0, v0, s32 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] bb: %i = alloca [32 x float], align 4, addrspace(5) %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx @@ -254,6 +298,13 @@ define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) { ; GFX11-NEXT: v_mov_b32_e32 v1, 0x41200000 ; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:4 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: private_ptr_foo: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v1, 0x41200000 +; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:4 +; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds float, ptr addrspace(5) %arg, i32 1 store float 1.000000e+01, ptr addrspace(5) %gep, align 4 ret void @@ -336,6 +387,23 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:260 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: store_load_sindex_small_offset_kernel: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-NEXT: scratch_load_b32 v2, off, off offset:4 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_and_b32 s1, s0, 15 +; GFX12-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-NEXT: 
s_lshl_b32 s1, s1, 2 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_addk_co_i32 s0, 0x104 +; GFX12-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:260 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_endpgm bb: %padding = alloca [64 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) @@ -417,6 +485,20 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:124 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: store_load_vindex_small_offset_kernel: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 +; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:260 th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: v_add_nc_u32_e32 v1, 0x104, v1 +; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_endpgm bb: %padding = alloca [64 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) @@ -503,6 +585,22 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; GFX11-NEXT: scratch_load_b32 v0, v0, s32 offset:256 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: store_load_vindex_small_offset_foo: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0 +; GFX12-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX12-NEXT: s_add_co_i32 s0, s32, 0x100 +; GFX12-NEXT: scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: scratch_store_b32 v1, v2, off th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_load_b32 v0, v0, s32 offset:256 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] bb: %padding = alloca [64 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) @@ -595,6 +693,23 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX11-NEXT: scratch_load_b32 v0, v1, s0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: store_load_sindex_large_offset_kernel: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-NEXT: scratch_load_b32 v2, off, off offset:4 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_and_b32 s1, s0, 15 +; GFX12-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-NEXT: s_lshl_b32 s1, s1, 2 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_addk_co_i32 s0, 0x4004 +; GFX12-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:16388 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_endpgm bb: %padding = alloca [4096 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) @@ -678,6 +793,20 @@ 
define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:124 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: store_load_vindex_large_offset_kernel: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 +; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:16388 th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: v_add_nc_u32_e32 v1, 0x4004, v1 +; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_endpgm bb: %padding = alloca [4096 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) @@ -766,6 +895,22 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; GFX11-NEXT: scratch_load_b32 v0, v0, s0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: store_load_vindex_large_offset_foo: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0 +; GFX12-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX12-NEXT: s_add_co_i32 s0, s32, 0x4000 +; GFX12-NEXT: scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: scratch_store_b32 v1, v2, off th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_load_b32 v0, v0, s32 offset:16384 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] bb: %padding = alloca [4096 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) @@ -839,6 +984,17 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: store_load_large_imm_offset_kernel: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15 +; GFX12-NEXT: scratch_store_b32 off, v0, off offset:4 th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_store_b32 off, v1, off offset:16004 th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_load_b32 v0, off, off offset:16004 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_endpgm bb: %i = alloca [4096 x i32], align 4, addrspace(5) %i1 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 undef @@ -909,6 +1065,18 @@ define void @store_load_large_imm_offset_foo() { ; GFX11-NEXT: scratch_load_b32 v0, v1, s32 offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: store_load_large_imm_offset_foo: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15 +; GFX12-NEXT: scratch_store_b32 off, v0, s32 th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_store_b32 off, v1, s32 offset:16000 th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_load_b32 v0, off, s32 offset:16000 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 
s[30:31] bb: %i = alloca [4096 x i32], align 4, addrspace(5) %i1 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 undef @@ -979,6 +1147,20 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX11-NEXT: scratch_load_b32 v0, v0, off offset:1024 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: store_load_vidx_sidx_offset: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 15 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_add_lshl_u32 v0, s0, v0, 2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_nc_u32_e32 v0, 4, v0 +; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:1024 th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_load_b32 v0, v0, off offset:1024 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_endpgm bb: %alloca = alloca [32 x i32], align 4, addrspace(5) %vidx = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -1033,6 +1215,17 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) { ; GFX11-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: store_load_i64_aligned: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v1, 15 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: scratch_store_b64 v0, v[1:2], off th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_load_b64 v[0:1], v0, off th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] bb: store volatile i64 15, ptr addrspace(5) %arg, align 8 %load = load volatile i64, ptr addrspace(5) %arg, align 8 @@ -1082,6 +1275,17 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) { ; GFX11-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: store_load_i64_unaligned: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v1, 15 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: scratch_store_b64 v0, v[1:2], off th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_load_b64 v[0:1], v0, off th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] bb: store volatile i64 15, ptr addrspace(5) %arg, align 1 %load = load volatile i64, ptr addrspace(5) %arg, align 1 @@ -1147,6 +1351,20 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) { ; GFX11-NEXT: scratch_load_b96 v[0:2], v0, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: store_load_v3i32_unaligned: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_mov_b32 s2, 3 +; GFX12-NEXT: s_mov_b32 s1, 2 +; GFX12-NEXT: s_mov_b32 s0, 1 +; GFX12-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v2, s1 +; GFX12-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-NEXT: scratch_store_b96 v0, v[1:3], off th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_load_b96 v[0:2], v0, off th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] bb: store volatile <3 x i32> , ptr addrspace(5) %arg, align 1 %load = load volatile <3 x i32>, ptr addrspace(5) %arg, align 1 @@ -1217,6 +1435,21 @@ define void @store_load_v4i32_unaligned(ptr 
addrspace(5) nocapture %arg) { ; GFX11-NEXT: scratch_load_b128 v[0:3], v0, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: store_load_v4i32_unaligned: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_mov_b32 s3, 4 +; GFX12-NEXT: s_mov_b32 s2, 3 +; GFX12-NEXT: s_mov_b32 s1, 2 +; GFX12-NEXT: s_mov_b32 s0, 1 +; GFX12-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 +; GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX12-NEXT: scratch_store_b128 v0, v[1:4], off th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_load_b128 v[0:3], v0, off th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] bb: store volatile <4 x i32> , ptr addrspace(5) %arg, align 1 %load = load volatile <4 x i32>, ptr addrspace(5) %arg, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll index 5516741e57e92..bfb2ecde783a6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll @@ -3,6 +3,7 @@ ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX908_GFX11 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX908_GFX11 %s define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_intrinsic(ptr addrspace(1) %ptr, float %data) { ; GFX908_GFX11-LABEL: name: global_atomic_fadd_f32_no_rtn_intrinsic diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll index 5787392a5901a..d2c42292a0364 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX90A_GFX940 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX90A_GFX940 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX11 %s define amdgpu_ps float @global_atomic_fadd_f32_rtn_intrinsic(ptr addrspace(1) %ptr, float %data) { ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_rtn_intrinsic diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-flat.mir 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-flat.mir index f1c3673ae29dd..bfe806f47c86e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-flat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-flat.mir @@ -4,6 +4,8 @@ # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s # RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s # RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX11 %s +# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX12 %s + --- name: amdgpu_atomic_cmpxchg_s32_flat @@ -53,6 +55,16 @@ body: | ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; GFX11-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] + ; + ; GFX12-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX12-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s32) = COPY $vgpr3 @@ -126,6 +138,16 @@ body: | ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; GFX11-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 4, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] + ; + ; GFX12-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_gep4 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX12-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 4, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s32) = COPY $vgpr3 @@ -185,6 +207,16 @@ body: | ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3 ; GFX11-NEXT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]] + ; + ; 
GFX12-LABEL: name: amdgpu_atomic_cmpxchg_s64_flat + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3 + ; GFX12-NEXT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = COPY $vgpr2_vgpr3 %2:vgpr(s64) = COPY $vgpr4_vgpr5 @@ -258,6 +290,16 @@ body: | ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3 ; GFX11-NEXT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 4, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]] + ; + ; GFX12-LABEL: name: amdgpu_atomic_cmpxchg_s64_flat_gep4 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3 + ; GFX12-NEXT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 4, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = COPY $vgpr2_vgpr3 %2:vgpr(s64) = COPY $vgpr4_vgpr5 @@ -349,6 +391,16 @@ body: | ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX11-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] + ; + ; GFX12-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_gepm4 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX12-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], -4, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s32) = COPY $vgpr3 @@ -404,6 +456,15 @@ body: | ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; GFX11-NEXT: FLAT_ATOMIC_CMPSWAP [[COPY]], [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; + ; GFX12-LABEL: name: 
amdgpu_atomic_cmpxchg_s32_flat_nortn + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX12-NEXT: FLAT_ATOMIC_CMPSWAP [[COPY]], [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s32) = COPY $vgpr3 @@ -456,6 +517,15 @@ body: | ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3 ; GFX11-NEXT: FLAT_ATOMIC_CMPSWAP_X2 [[COPY]], [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) + ; + ; GFX12-LABEL: name: amdgpu_atomic_cmpxchg_s64_flat_nortn + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3 + ; GFX12-NEXT: FLAT_ATOMIC_CMPSWAP_X2 [[COPY]], [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = COPY $vgpr2_vgpr3 %2:vgpr(s64) = COPY $vgpr4_vgpr5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir index e1ef96bec0fdb..4dba133bf110d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir @@ -6,6 +6,7 @@ # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s # RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s # RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s --- name: amdgpu_atomic_cmpxchg_s32_global diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-flat.mir index 868f08d805caa..5dbc5288e3824 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-flat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-flat.mir @@ -1,9 +1,10 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX7 %s -# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX7 %s -# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s -# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select 
-verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s -# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX11 %s +# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX7,GCN %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX7,GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX9,GCN,LARGE_IOFFSET %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX10,GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX11,GCN,LARGE_IOFFSET %s +# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX12,GCN,LARGE_IOFFSET %s --- name: flat_atomicrmw_add_s32 @@ -14,37 +15,13 @@ body: | bb.0: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX7-LABEL: name: flat_atomicrmw_add_s32 - ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) - ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] - ; - ; GFX9-LABEL: name: flat_atomicrmw_add_s32 - ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) - ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] - ; - ; GFX10-LABEL: name: flat_atomicrmw_add_s32 - ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) - ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] - ; - ; GFX11-LABEL: name: flat_atomicrmw_add_s32 - ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) - ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] + ; GCN-LABEL: name: flat_atomicrmw_add_s32 + ; GCN: liveins: $vgpr0_vgpr1, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GCN-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; GCN-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s32) = G_ATOMICRMW_ADD %0, %1 :: (load store seq_cst (s32), addrspace 0) @@ 
-61,33 +38,12 @@ body: | bb.0: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX7-LABEL: name: flat_atomicrmw_add_s32_nortn - ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) - ; - ; GFX9-LABEL: name: flat_atomicrmw_add_s32_nortn - ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) - ; - ; GFX10-LABEL: name: flat_atomicrmw_add_s32_nortn - ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) - ; - ; GFX11-LABEL: name: flat_atomicrmw_add_s32_nortn - ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX11-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; GCN-LABEL: name: flat_atomicrmw_add_s32_nortn + ; GCN: liveins: $vgpr0_vgpr1, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GCN-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s32) = G_ATOMICRMW_ADD %0, %1 :: (load store seq_cst (s32), addrspace 0) @@ -119,13 +75,13 @@ body: | ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] ; - ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset2047 - ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) - ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] + ; LARGE_IOFFSET-LABEL: name: flat_atomicrmw_add_s32_offset2047 + ; LARGE_IOFFSET: liveins: $vgpr0_vgpr1, $vgpr2 + ; LARGE_IOFFSET-NEXT: {{ $}} + ; LARGE_IOFFSET-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; LARGE_IOFFSET-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; LARGE_IOFFSET-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; LARGE_IOFFSET-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] ; ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset2047 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 @@ -142,14 +98,6 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX10-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN 
[[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] - ; - ; GFX11-LABEL: name: flat_atomicrmw_add_s32_offset2047 - ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) - ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s64) = G_CONSTANT i64 2047 @@ -183,12 +131,12 @@ body: | ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX7-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; - ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset2047_nortn - ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 2047, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; LARGE_IOFFSET-LABEL: name: flat_atomicrmw_add_s32_offset2047_nortn + ; LARGE_IOFFSET: liveins: $vgpr0_vgpr1, $vgpr2 + ; LARGE_IOFFSET-NEXT: {{ $}} + ; LARGE_IOFFSET-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; LARGE_IOFFSET-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; LARGE_IOFFSET-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 2047, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset2047_nortn ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 @@ -204,13 +152,6 @@ body: | ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX10-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) - ; - ; GFX11-LABEL: name: flat_atomicrmw_add_s32_offset2047_nortn - ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX11-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 2047, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s64) = G_CONSTANT i64 2047 @@ -244,13 +185,13 @@ body: | ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] ; - ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset2048 - ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2048, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) - ; GFX9-NEXT: $vgpr0 = COPY 
[[FLAT_ATOMIC_ADD_RTN]] + ; LARGE_IOFFSET-LABEL: name: flat_atomicrmw_add_s32_offset2048 + ; LARGE_IOFFSET: liveins: $vgpr0_vgpr1, $vgpr2 + ; LARGE_IOFFSET-NEXT: {{ $}} + ; LARGE_IOFFSET-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; LARGE_IOFFSET-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; LARGE_IOFFSET-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2048, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; LARGE_IOFFSET-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] ; ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset2048 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 @@ -267,14 +208,6 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX10-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] - ; - ; GFX11-LABEL: name: flat_atomicrmw_add_s32_offset2048 - ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2048, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) - ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s64) = G_CONSTANT i64 2048 @@ -308,12 +241,12 @@ body: | ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX7-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; - ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset2048_nortn - ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 2048, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; LARGE_IOFFSET-LABEL: name: flat_atomicrmw_add_s32_offset2048_nortn + ; LARGE_IOFFSET: liveins: $vgpr0_vgpr1, $vgpr2 + ; LARGE_IOFFSET-NEXT: {{ $}} + ; LARGE_IOFFSET-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; LARGE_IOFFSET-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; LARGE_IOFFSET-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 2048, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset2048_nortn ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 @@ -329,13 +262,6 @@ body: | ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX10-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) - ; - ; GFX11-LABEL: name: flat_atomicrmw_add_s32_offset2048_nortn - ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX11-NEXT: FLAT_ATOMIC_ADD [[COPY]], 
[[COPY1]], 2048, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s64) = G_CONSTANT i64 2048 @@ -369,13 +295,13 @@ body: | ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] ; - ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset4095 - ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 4095, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) - ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] + ; LARGE_IOFFSET-LABEL: name: flat_atomicrmw_add_s32_offset4095 + ; LARGE_IOFFSET: liveins: $vgpr0_vgpr1, $vgpr2 + ; LARGE_IOFFSET-NEXT: {{ $}} + ; LARGE_IOFFSET-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; LARGE_IOFFSET-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; LARGE_IOFFSET-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 4095, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; LARGE_IOFFSET-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] ; ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset4095 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 @@ -392,14 +318,6 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX10-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] - ; - ; GFX11-LABEL: name: flat_atomicrmw_add_s32_offset4095 - ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 4095, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) - ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s64) = G_CONSTANT i64 4095 @@ -433,12 +351,12 @@ body: | ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX7-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; - ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset4095_nortn - ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 4095, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; LARGE_IOFFSET-LABEL: name: flat_atomicrmw_add_s32_offset4095_nortn + ; LARGE_IOFFSET: liveins: $vgpr0_vgpr1, $vgpr2 + ; LARGE_IOFFSET-NEXT: {{ $}} + ; LARGE_IOFFSET-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; LARGE_IOFFSET-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; LARGE_IOFFSET-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 4095, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; ; 
GFX10-LABEL: name: flat_atomicrmw_add_s32_offset4095_nortn ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 @@ -454,13 +372,6 @@ body: | ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX10-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) - ; - ; GFX11-LABEL: name: flat_atomicrmw_add_s32_offset4095_nortn - ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX11-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 4095, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s64) = G_CONSTANT i64 4095 @@ -541,6 +452,14 @@ body: | ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] + ; + ; GFX12-LABEL: name: flat_atomicrmw_add_s32_offset4097 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 4097, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s64) = G_CONSTANT i64 4097 @@ -618,6 +537,13 @@ body: | ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX11-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) + ; + ; GFX12-LABEL: name: flat_atomicrmw_add_s32_offset4097_nortn + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 4097, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s64) = G_CONSTANT i64 4097 @@ -635,37 +561,13 @@ body: | bb.0: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX7-LABEL: name: flat_atomicrmw_add_s64 - ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) - ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]] - ; - ; GFX9-LABEL: name: flat_atomicrmw_add_s64 - ; 
GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX9-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]] - ; - ; GFX10-LABEL: name: flat_atomicrmw_add_s64 - ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX10-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) - ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]] - ; - ; GFX11-LABEL: name: flat_atomicrmw_add_s64 - ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) - ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]] + ; GCN-LABEL: name: flat_atomicrmw_add_s64 + ; GCN: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GCN-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) + ; GCN-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = COPY $vgpr2_vgpr3 %2:vgpr(s64) = G_ATOMICRMW_ADD %0, %1 :: (load store seq_cst (s64), addrspace 0) @@ -682,33 +584,12 @@ body: | bb.0: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX7-LABEL: name: flat_atomicrmw_add_s64_nortn - ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX7-NEXT: FLAT_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) - ; - ; GFX9-LABEL: name: flat_atomicrmw_add_s64_nortn - ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX9-NEXT: FLAT_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) - ; - ; GFX10-LABEL: name: flat_atomicrmw_add_s64_nortn - ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX10-NEXT: FLAT_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) - ; - ; GFX11-LABEL: name: flat_atomicrmw_add_s64_nortn - ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX11-NEXT: FLAT_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 
(s64)) + ; GCN-LABEL: name: flat_atomicrmw_add_s64_nortn + ; GCN: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GCN-NEXT: FLAT_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = COPY $vgpr2_vgpr3 %2:vgpr(s64) = G_ATOMICRMW_ADD %0, %1 :: (load store seq_cst (s64), addrspace 0) @@ -740,13 +621,13 @@ body: | ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]] ; - ; GFX9-LABEL: name: flat_atomicrmw_add_s64_offset4095 - ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX9-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 4095, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]] + ; LARGE_IOFFSET-LABEL: name: flat_atomicrmw_add_s64_offset4095 + ; LARGE_IOFFSET: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; LARGE_IOFFSET-NEXT: {{ $}} + ; LARGE_IOFFSET-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; LARGE_IOFFSET-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; LARGE_IOFFSET-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 4095, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) + ; LARGE_IOFFSET-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]] ; ; GFX10-LABEL: name: flat_atomicrmw_add_s64_offset4095 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 @@ -763,14 +644,6 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX10-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]] - ; - ; GFX11-LABEL: name: flat_atomicrmw_add_s64_offset4095 - ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 4095, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) - ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = COPY $vgpr2_vgpr3 %2:vgpr(s64) = G_CONSTANT i64 4095 @@ -804,12 +677,12 @@ body: | ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX7-NEXT: FLAT_ATOMIC_ADD_X2 [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) ; - ; GFX9-LABEL: name: flat_atomicrmw_add_s64_offset4095_nortn - ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX9-NEXT: FLAT_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 4095, 0, implicit $exec, 
implicit $flat_scr :: (load store seq_cst (s64)) + ; LARGE_IOFFSET-LABEL: name: flat_atomicrmw_add_s64_offset4095_nortn + ; LARGE_IOFFSET: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; LARGE_IOFFSET-NEXT: {{ $}} + ; LARGE_IOFFSET-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; LARGE_IOFFSET-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; LARGE_IOFFSET-NEXT: FLAT_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 4095, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) ; ; GFX10-LABEL: name: flat_atomicrmw_add_s64_offset4095_nortn ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 @@ -825,13 +698,6 @@ body: | ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX10-NEXT: FLAT_ATOMIC_ADD_X2 [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) - ; - ; GFX11-LABEL: name: flat_atomicrmw_add_s64_offset4095_nortn - ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX11-NEXT: FLAT_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 4095, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = COPY $vgpr2_vgpr3 %2:vgpr(s64) = G_CONSTANT i64 4095 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-global.mir index 43d9b911b9f33..637a7d080dd79 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-global.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-global.mir @@ -5,6 +5,7 @@ # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s # RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s # RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX11 %s +# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX12 %s --- name: global_atomicrmw_add_s32 @@ -59,6 +60,14 @@ body: | ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]] + ; + ; GFX12-LABEL: name: global_atomicrmw_add_s32 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s32) = G_ATOMICRMW_ADD %0, %1 :: (load store seq_cst (s32), addrspace 1) @@ -114,6 +123,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; 
GFX11-NEXT: GLOBAL_ATOMIC_ADD [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1) + ; + ; GFX12-LABEL: name: global_atomicrmw_add_s32_nortn + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: GLOBAL_ATOMIC_ADD [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s32) = G_ATOMICRMW_ADD %0, %1 :: (load store seq_cst (s32), addrspace 1) @@ -181,6 +197,14 @@ body: | ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]] + ; + ; GFX12-LABEL: name: global_atomicrmw_add_s32_offset2047 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s64) = G_CONSTANT i64 2047 @@ -246,6 +270,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX11-NEXT: GLOBAL_ATOMIC_ADD [[COPY]], [[COPY1]], 2047, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1) + ; + ; GFX12-LABEL: name: global_atomicrmw_add_s32_offset2047_nortn + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: GLOBAL_ATOMIC_ADD [[COPY]], [[COPY1]], 2047, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s64) = G_CONSTANT i64 2047 @@ -323,6 +354,14 @@ body: | ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2048, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]] + ; + ; GFX12-LABEL: name: global_atomicrmw_add_s32_offset2048 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2048, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s64) = G_CONSTANT i64 2048 @@ -396,6 +435,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX11-NEXT: GLOBAL_ATOMIC_ADD [[COPY]], [[COPY1]], 2048, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1) + ; + ; GFX12-LABEL: name: global_atomicrmw_add_s32_offset2048_nortn + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: 
[[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: GLOBAL_ATOMIC_ADD [[COPY]], [[COPY1]], 2048, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s64) = G_CONSTANT i64 2048 @@ -473,6 +519,14 @@ body: | ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 4095, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]] + ; + ; GFX12-LABEL: name: global_atomicrmw_add_s32_offset4095 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 4095, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s64) = G_CONSTANT i64 4095 @@ -546,6 +600,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX11-NEXT: GLOBAL_ATOMIC_ADD [[COPY]], [[COPY1]], 4095, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1) + ; + ; GFX12-LABEL: name: global_atomicrmw_add_s32_offset4095_nortn + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: GLOBAL_ATOMIC_ADD [[COPY]], [[COPY1]], 4095, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s64) = G_CONSTANT i64 4095 @@ -640,6 +701,14 @@ body: | ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]] + ; + ; GFX12-LABEL: name: global_atomicrmw_add_s32_offset4097 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 4097, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s64) = G_CONSTANT i64 4097 @@ -730,6 +799,13 @@ body: | ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX11-NEXT: GLOBAL_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1) + ; + ; GFX12-LABEL: name: global_atomicrmw_add_s32_offset4097_nortn + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: 
GLOBAL_ATOMIC_ADD [[COPY]], [[COPY1]], 4097, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s64) = G_CONSTANT i64 4097 @@ -791,6 +867,14 @@ body: | ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s64), addrspace 1) ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_ATOMIC_ADD_X2_RTN]] + ; + ; GFX12-LABEL: name: global_atomicrmw_add_s64 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GFX12-NEXT: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s64), addrspace 1) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_ATOMIC_ADD_X2_RTN]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = COPY $vgpr2_vgpr3 %2:vgpr(s64) = G_ATOMICRMW_ADD %0, %1 :: (load store seq_cst (s64), addrspace 1) @@ -846,6 +930,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX11-NEXT: GLOBAL_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s64), addrspace 1) + ; + ; GFX12-LABEL: name: global_atomicrmw_add_s64_nortn + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GFX12-NEXT: GLOBAL_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s64), addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = COPY $vgpr2_vgpr3 %2:vgpr(s64) = G_ATOMICRMW_ADD %0, %1 :: (load store seq_cst (s64), addrspace 1) @@ -921,6 +1012,14 @@ body: | ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 4095, 1, implicit $exec :: (load store seq_cst (s64), addrspace 1) ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_ATOMIC_ADD_X2_RTN]] + ; + ; GFX12-LABEL: name: global_atomicrmw_add_s64_offset4095 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GFX12-NEXT: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 4095, 1, implicit $exec :: (load store seq_cst (s64), addrspace 1) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_ATOMIC_ADD_X2_RTN]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = COPY $vgpr2_vgpr3 %2:vgpr(s64) = G_CONSTANT i64 4095 @@ -994,6 +1093,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX11-NEXT: GLOBAL_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 4095, 0, implicit $exec :: (load store seq_cst (s64), addrspace 1) + ; + ; GFX12-LABEL: name: global_atomicrmw_add_s64_offset4095_nortn + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GFX12-NEXT: GLOBAL_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 4095, 0, implicit $exec :: (load store seq_cst (s64), addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 
%1:vgpr(s64) = COPY $vgpr2_vgpr3 %2:vgpr(s64) = G_CONSTANT i64 4095 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir index de21788ff168f..a6aa1be8addc7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir @@ -3,6 +3,7 @@ # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s # RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s # RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX11 %s +# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX12 %s --- @@ -42,6 +43,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32)) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; + ; GFX12-LABEL: name: load_atomic_flat_s32_seq_cst + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32)) + ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = G_LOAD %0 :: (load seq_cst (s32), align 4, addrspace 0) $vgpr0 = COPY %1 @@ -86,6 +94,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<2 x s16>)) ; GFX11-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; + ; GFX12-LABEL: name: load_atomic_flat_v2s16_seq_cst + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<2 x s16>)) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x s16>) = G_LOAD %0 :: (load seq_cst (<2 x s16>), align 4, addrspace 0) $vgpr0 = COPY %1 @@ -130,6 +145,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(p3) = G_LOAD [[COPY]](p0) :: (load seq_cst (p3)) ; GFX11-NEXT: $vgpr0 = COPY [[LOAD]](p3) + ; + ; GFX12-LABEL: name: load_atomic_flat_p3_seq_cst + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(p3) = G_LOAD [[COPY]](p0) :: (load seq_cst (p3)) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p3) %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(p3) = G_LOAD %0 :: (load seq_cst (p3), align 4, addrspace 0) $vgpr0 = COPY %1 @@ -174,6 +196,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s64)) ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] + ; + ; GFX12-LABEL: name: load_atomic_flat_s64_seq_cst + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ 
$}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s64)) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_LOAD %0 :: (load seq_cst (s64), align 8, addrspace 0) $vgpr0_vgpr1 = COPY %1 @@ -218,6 +247,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<2 x s32>)) ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; + ; GFX12-LABEL: name: load_atomic_flat_v2s32_seq_cst + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<2 x s32>)) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x s32>) = G_LOAD %0 :: (load seq_cst (<2 x s32>), align 8, addrspace 0) $vgpr0_vgpr1 = COPY %1 @@ -262,6 +298,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<4 x s16>)) ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; + ; GFX12-LABEL: name: load_atomic_flat_v4s16_seq_cst + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<4 x s16>)) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(<4 x s16>) = G_LOAD %0 :: (load seq_cst (<4 x s16>), align 8, addrspace 0) $vgpr0_vgpr1 = COPY %1 @@ -306,6 +349,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p0) :: (load seq_cst (p1)) ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) + ; + ; GFX12-LABEL: name: load_atomic_flat_p1_seq_cst + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p0) :: (load seq_cst (p1)) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(p1) = G_LOAD %0 :: (load seq_cst (p1), align 8, addrspace 0) $vgpr0_vgpr1 = COPY %1 @@ -350,6 +400,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p0) = G_LOAD [[COPY]](p0) :: (load seq_cst (p0)) ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p0) + ; + ; GFX12-LABEL: name: load_atomic_flat_p0_seq_cst + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p0) = G_LOAD [[COPY]](p0) :: (load seq_cst (p0)) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p0) %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(p0) = G_LOAD %0 :: (load seq_cst (p0), align 8, addrspace 0) $vgpr0_vgpr1 = COPY %1 @@ -426,6 +483,13 @@ body: | ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX11-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32)) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; + ; GFX12-LABEL: name: 
load_atomic_flat_s32_seq_cst_gep_m2048 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], -2048, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32)) + ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -2048 %2:vgpr(p0) = G_PTR_ADD %0, %1 @@ -488,6 +552,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 4095, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32)) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; + ; GFX12-LABEL: name: load_atomic_flat_s32_seq_cst_gep_4095 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 4095, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32)) + ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 4095 %2:vgpr(p0) = G_PTR_ADD %0, %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir index b678966de8537..1705b74f6395a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir @@ -5,6 +5,7 @@ # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s # RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s # RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s --- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-local.mir index 11c0d8beab303..553e8c773b466 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-local.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-local.mir @@ -4,6 +4,7 @@ # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s # RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s # RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s --- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir index d7c3254398862..eb62d80533455 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir @@ -4,6 +4,7 @@ # RUN: llc 
-amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX11 %s +# RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX12 %s --- @@ -51,6 +52,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32)) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; + ; GFX12-LABEL: name: load_flat_s32_from_4 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32)) + ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = G_LOAD %0 :: (load (s32), align 4, addrspace 0) $vgpr0 = COPY %1 @@ -102,6 +110,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s16)) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_USHORT]] + ; + ; GFX12-LABEL: name: load_flat_s32_from_2 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s16)) + ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_USHORT]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = G_LOAD %0 :: (load (s16), align 2, addrspace 0) $vgpr0 = COPY %1 @@ -153,6 +168,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_flat_s32_from_1 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = G_LOAD %0 :: (load (s8), align 1, addrspace 0) $vgpr0 = COPY %1 @@ -204,6 +226,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s32>)) ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] + ; + ; GFX12-LABEL: name: load_flat_v2s32 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s32>)) + ; 
GFX12-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x s32>) = G_LOAD %0 :: (load (<2 x s32>), align 8, addrspace 0) $vgpr0_vgpr1 = COPY %1 @@ -255,6 +284,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = FLAT_LOAD_DWORDX3 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<3 x s32>), align 4) ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[FLAT_LOAD_DWORDX3_]] + ; + ; GFX12-LABEL: name: load_flat_v3s32 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = FLAT_LOAD_DWORDX3 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<3 x s32>), align 4) + ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[FLAT_LOAD_DWORDX3_]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<3 x s32>) = G_LOAD %0 :: (load (<3 x s32>), align 4, addrspace 0) $vgpr0_vgpr1_vgpr2 = COPY %1 @@ -306,6 +342,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s32>), align 4) ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] + ; + ; GFX12-LABEL: name: load_flat_v4s32 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s32>), align 4) + ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<4 x s32>) = G_LOAD %0 :: (load (<4 x s32>), align 4, addrspace 0) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -357,6 +400,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s64)) ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] + ; + ; GFX12-LABEL: name: load_flat_s64 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s64)) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_LOAD %0 :: (load (s64), align 8, addrspace 0) $vgpr0_vgpr1 = COPY %1 @@ -408,6 +458,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s64>), align 4) ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] + ; + ; GFX12-LABEL: name: load_flat_v2s64 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s64>), align 4) + ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x s64>) = G_LOAD %0 :: (load (<2 x s64>), align 4, addrspace 0) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -459,6 +516,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; 
GFX11-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load (<2 x p1>), align 4) ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; + ; GFX12-LABEL: name: load_flat_v2p1 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load (<2 x p1>), align 4) + ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x p1>) = G_LOAD %0 :: (load (<2 x p1>), align 4, addrspace 0) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -510,6 +574,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vreg_96(s96) = G_LOAD [[COPY]](p1) :: (load (s96), align 4) ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](s96) + ; + ; GFX12-LABEL: name: load_flat_s96 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vreg_96(s96) = G_LOAD [[COPY]](p1) :: (load (s96), align 4) + ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](s96) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s96) = G_LOAD %0 :: (load (s96), align 4, addrspace 0) $vgpr0_vgpr1_vgpr2 = COPY %1 @@ -561,6 +632,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4) ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128) + ; + ; GFX12-LABEL: name: load_flat_s128 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4) + ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s128) = G_LOAD %0 :: (load (s128), align 4, addrspace 0) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -612,6 +690,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p3)) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; + ; GFX12-LABEL: name: load_flat_p3_from_4 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p3)) + ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(p3) = G_LOAD %0 :: (load (p3), align 4, addrspace 0) $vgpr0 = COPY %1 @@ -663,6 +748,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p1)) ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] + ; + ; GFX12-LABEL: name: load_flat_p1_from_8 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p1)) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(p1) = G_LOAD %0 :: (load (p1), align 8, addrspace 0) $vgpr0_vgpr1 = COPY %1 @@ -714,6 +806,13 @@ body: | ; GFX11-NEXT: 
[[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999)) ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999) + ; + ; GFX12-LABEL: name: load_flat_p999_from_8 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999)) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(p999) = G_LOAD %0 :: (load (p999), align 8, addrspace 0) $vgpr0_vgpr1 = COPY %1 @@ -765,6 +864,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>)) ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) + ; + ; GFX12-LABEL: name: load_flat_v2p3 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>)) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x p3>) = G_LOAD %0 :: (load (<2 x p3>), align 8, addrspace 0) $vgpr0_vgpr1 = COPY %1 @@ -816,6 +922,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s16>)) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; + ; GFX12-LABEL: name: load_flat_v2s16 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s16>)) + ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 4, addrspace 0) $vgpr0 = COPY %1 @@ -867,6 +980,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s16>)) ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] + ; + ; GFX12-LABEL: name: load_flat_v4s16 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s16>)) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 8, addrspace 0) $vgpr0_vgpr1 = COPY %1 @@ -918,6 +1038,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vreg_96(<6 x s16>) = G_LOAD [[COPY]](p1) :: (load (<6 x s16>), align 4) ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<6 x s16>) + ; + ; GFX12-LABEL: name: load_flat_v6s16 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vreg_96(<6 x s16>) = G_LOAD [[COPY]](p1) :: (load (<6 x s16>), align 4) + ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<6 x s16>) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<6 x s16>) = G_LOAD %0 :: (load (<6 x s16>), align 4, addrspace 0) $vgpr0_vgpr1_vgpr2 = COPY %1 @@ -969,6 +1096,13 @@ 
body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<8 x s16>), align 4) ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] + ; + ; GFX12-LABEL: name: load_flat_v8s16 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<8 x s16>), align 4) + ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<8 x s16>) = G_LOAD %0 :: (load (<8 x s16>), align 4, addrspace 0) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -1048,6 +1182,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 2047, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_flat_s32_from_1_gep_2047 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 2047, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 2047 %2:vgpr(p1) = G_PTR_ADD %0, %1 @@ -1125,6 +1266,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 2048, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_flat_s32_from_1_gep_2048 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 2048, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 2048 %2:vgpr(p1) = G_PTR_ADD %0, %1 @@ -1218,6 +1366,13 @@ body: | ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_flat_s32_from_1_gep_m2047 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], -2047, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -2047 %2:vgpr(p1) = G_PTR_ADD %0, %1 @@ -1311,6 +1466,13 @@ body: | ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_flat_s32_from_1_gep_m2048 + ; GFX12: liveins: 
$vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], -2048, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -2048 %2:vgpr(p1) = G_PTR_ADD %0, %1 @@ -1388,6 +1550,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 4095, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_flat_s32_from_1_gep_4095 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 4095, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 4095 %2:vgpr(p1) = G_PTR_ADD %0, %1 @@ -1481,6 +1650,13 @@ body: | ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_flat_s32_from_1_gep_4096 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 4096, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 4096 %2:vgpr(p1) = G_PTR_ADD %0, %1 @@ -1574,6 +1750,13 @@ body: | ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_flat_s32_from_1_gep_m4095 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], -4095, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -4095 %2:vgpr(p1) = G_PTR_ADD %0, %1 @@ -1667,6 +1850,13 @@ body: | ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_flat_s32_from_1_gep_m4096 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], -4096, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -4096 %2:vgpr(p1) 
= G_PTR_ADD %0, %1 @@ -1760,6 +1950,13 @@ body: | ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_flat_s32_from_1_gep_8191 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 8191, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 8191 %2:vgpr(p1) = G_PTR_ADD %0, %1 @@ -1853,6 +2050,13 @@ body: | ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_flat_s32_from_1_gep_8192 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 8192, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 8192 %2:vgpr(p1) = G_PTR_ADD %0, %1 @@ -1946,6 +2150,13 @@ body: | ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_flat_s32_from_1_gep_m8191 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], -8191, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -8191 %2:vgpr(p1) = G_PTR_ADD %0, %1 @@ -2039,6 +2250,13 @@ body: | ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_flat_s32_from_1_gep_m8192 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], -8192, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -8192 %2:vgpr(p1) = G_PTR_ADD %0, %1 @@ -2046,3 +2264,419 @@ body: | $vgpr0 = COPY %3 ... 
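Everything up to this point probes offsets that pre-GFX12 targets cannot fold: the GFX11 checks above materialize a V_ADD_CO_U32/V_ADDC_U32 pair for 4096 and for every negative offset, while the new GFX12 lines fold each of these immediates (down to -8192) directly into FLAT_LOAD_UBYTE. The four tests appended below pin down exactly where GFX12's folding stops. A minimal sketch of the legality check these cases exercise — the helper name here is hypothetical, though llvm::isInt is the real MathExtras.h utility:

  #include "llvm/Support/MathExtras.h"

  // GFX12 flat instructions carry a signed 24-bit immediate offset, so any
  // value in [-8388608, 8388607] can fold into the instruction; for offsets
  // outside that window (and for most of these offsets on pre-GFX12
  // targets) selection falls back to materializing a 64-bit add.
  static bool isLegalGFX12FlatOffset(int64_t Offset) {
    return llvm::isInt<24>(Offset);
  }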
+ +--- + +name: load_flat_s32_from_1_gep_24bit_max +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; GFX7-LABEL: name: load_flat_s32_from_1_gep_24bit_max + ; GFX7: liveins: $vgpr0_vgpr1 + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec + ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX8-LABEL: name: load_flat_s32_from_1_gep_24bit_max + ; GFX8: liveins: $vgpr0_vgpr1 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec + ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX9-LABEL: name: load_flat_s32_from_1_gep_24bit_max + ; GFX9: liveins: $vgpr0_vgpr1 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX9-NEXT: 
[[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX10-LABEL: name: load_flat_s32_from_1_gep_24bit_max + ; GFX10: liveins: $vgpr0_vgpr1 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX11-LABEL: name: load_flat_s32_from_1_gep_24bit_max + ; GFX11: liveins: $vgpr0_vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_flat_s32_from_1_gep_24bit_max + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 8388607, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + %0:vgpr(p1) = COPY $vgpr0_vgpr1 + %1:vgpr(s64) = G_CONSTANT i64 8388607 + %2:vgpr(p1) = G_PTR_ADD %0, %1 + %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 0) + $vgpr0 = COPY %3 + +... 
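The constants in this test and the three that follow sit exactly on the edges of that signed 24-bit window:

  //   8388607 == (1 << 23) - 1    -> folds into the GFX12 load above
  //  -8388608 == -(1 << 23)       -> folds (the _24bit_min test below)
  //  16777214 == 2 * 8388607      -> out of range; GFX12 keeps the add
  // -16777215 == -((1 << 24) - 1) -> out of range; GFX12 keeps the add

So the 2x_24bit cases check that GFX12 still matches the older targets' V_ADD_CO_U32/V_ADDC_U32 lowering once the immediate overflows the field.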
+ +--- + +name: load_flat_s32_from_1_gep_2x_24bit_max +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; GFX7-LABEL: name: load_flat_s32_from_1_gep_2x_24bit_max + ; GFX7: liveins: $vgpr0_vgpr1 + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec + ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX8-LABEL: name: load_flat_s32_from_1_gep_2x_24bit_max + ; GFX8: liveins: $vgpr0_vgpr1 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec + ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX9-LABEL: name: load_flat_s32_from_1_gep_2x_24bit_max + ; GFX9: liveins: $vgpr0_vgpr1 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; 
GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX10-LABEL: name: load_flat_s32_from_1_gep_2x_24bit_max + ; GFX10: liveins: $vgpr0_vgpr1 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX11-LABEL: name: load_flat_s32_from_1_gep_2x_24bit_max + ; GFX11: liveins: $vgpr0_vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_flat_s32_from_1_gep_2x_24bit_max + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX12-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = 
REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX12-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + %0:vgpr(p1) = COPY $vgpr0_vgpr1 + %1:vgpr(s64) = G_CONSTANT i64 16777214 + %2:vgpr(p1) = G_PTR_ADD %0, %1 + %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 0) + $vgpr0 = COPY %3 + +... + +--- + +name: load_flat_s32_from_1_gep_24bit_min +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; GFX7-LABEL: name: load_flat_s32_from_1_gep_24bit_min + ; GFX7: liveins: $vgpr0_vgpr1 + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec + ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX8-LABEL: name: load_flat_s32_from_1_gep_24bit_min + ; GFX8: liveins: $vgpr0_vgpr1 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec + ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX9-LABEL: name: load_flat_s32_from_1_gep_24bit_min + ; GFX9: liveins: $vgpr0_vgpr1 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX9-NEXT: 
[[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX10-LABEL: name: load_flat_s32_from_1_gep_24bit_min + ; GFX10: liveins: $vgpr0_vgpr1 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX11-LABEL: name: load_flat_s32_from_1_gep_24bit_min + ; GFX11: liveins: $vgpr0_vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_flat_s32_from_1_gep_24bit_min + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], -8388608, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + %0:vgpr(p1) = COPY $vgpr0_vgpr1 + %1:vgpr(s64) = G_CONSTANT i64 -8388608 + %2:vgpr(p1) = G_PTR_ADD %0, 
%1 + %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 0) + $vgpr0 = COPY %3 + +... + +--- + +name: load_flat_s32_from_1_gep_2x_24bit_min +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; GFX7-LABEL: name: load_flat_s32_from_1_gep_2x_24bit_min + ; GFX7: liveins: $vgpr0_vgpr1 + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec + ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX8-LABEL: name: load_flat_s32_from_1_gep_2x_24bit_min + ; GFX8: liveins: $vgpr0_vgpr1 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec + ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX9-LABEL: name: load_flat_s32_from_1_gep_2x_24bit_min + ; GFX9: liveins: $vgpr0_vgpr1 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = 
REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX10-LABEL: name: load_flat_s32_from_1_gep_2x_24bit_min + ; GFX10: liveins: $vgpr0_vgpr1 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX11-LABEL: name: load_flat_s32_from_1_gep_2x_24bit_min + ; GFX11: liveins: $vgpr0_vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_flat_s32_from_1_gep_2x_24bit_min + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX12-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed 
[[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX12-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8)) + ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + %0:vgpr(p1) = COPY $vgpr0_vgpr1 + %1:vgpr(s64) = G_CONSTANT i64 -16777215 + %2:vgpr(p1) = G_PTR_ADD %0, %1 + %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 0) + $vgpr0 = COPY %3 + +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir index 0103bfc9d39c1..0fb9864b46007 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir @@ -2,6 +2,7 @@ # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s # RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s # RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX11 %s +# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX12 %s --- name: load_global_s32_from_sgpr @@ -36,6 +37,14 @@ body: | ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (load (s32), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] + ; + ; GFX12-LABEL: name: load_global_s32_from_sgpr + ; GFX12: liveins: $sgpr0_sgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (load (s32), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 %1:vgpr(p1) = COPY %0 %2:vgpr(s32) = G_LOAD %1 :: (load (s32), align 4, addrspace 1) @@ -78,6 +87,14 @@ body: | ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load (s32), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] + ; + ; GFX12-LABEL: name: load_global_s32_from_sgpr_zext_vgpr + ; GFX12: liveins: $sgpr0_sgpr1, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load (s32), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 %1:vgpr(s32) = COPY $vgpr0 %2:vgpr(p1) = COPY %0 @@ -123,6 +140,14 @@ body: | ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load (s32), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] 
+ ; + ; GFX12-LABEL: name: load_global_s32_from_sgpr_merge_zext_vgpr + ; GFX12: liveins: $sgpr0_sgpr1, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load (s32), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 %1:vgpr(s32) = COPY $vgpr0 %2:vgpr(p1) = COPY %0 @@ -198,6 +223,24 @@ body: | ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s32), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; + ; GFX12-LABEL: name: load_global_s32_from_sgpr_merge_not_0_vgpr + ; GFX12: liveins: $sgpr0_sgpr1, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[COPY]] + ; GFX12-NEXT: %notzero:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, %notzero, %subreg.sub1 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY2]].sub0 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY2]].sub1 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec + ; GFX12-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s32), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 %1:vgpr(s32) = COPY $vgpr0 %2:vgpr(p1) = COPY %0 @@ -261,6 +304,14 @@ body: | ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], 4095, 0, implicit $exec :: (load (s32), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] + ; + ; GFX12-LABEL: name: load_global_s32_from_sgpr_zext_vgpr_offset4095 + ; GFX12: liveins: $sgpr0_sgpr1, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], 4095, 0, implicit $exec :: (load (s32), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 %1:vgpr(s32) = COPY $vgpr0 %2:vgpr(p1) = COPY %0 @@ -326,6 +377,14 @@ body: | ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], -4096, 0, implicit $exec :: (load (s32), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY 
[[GLOBAL_LOAD_DWORD_SADDR]] + ; + ; GFX12-LABEL: name: load_global_s32_from_sgpr_zext_vgpr_offset_neg4096 + ; GFX12: liveins: $sgpr0_sgpr1, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], -4096, 0, implicit $exec :: (load (s32), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 %1:vgpr(s32) = COPY $vgpr0 %2:vgpr(p1) = COPY %0 @@ -371,6 +430,14 @@ body: | ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (load (s32), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] + ; + ; GFX12-LABEL: name: load_global_s32_from_sgpr_base_offset_4096 + ; GFX12: liveins: $sgpr0_sgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 4096, 0, implicit $exec :: (load (s32), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 %1:sgpr(s64) = G_CONSTANT i64 4096 %2:sgpr(p1) = G_PTR_ADD %0, %1 @@ -413,6 +480,14 @@ body: | ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 1, 0, implicit $exec :: (load (s32), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] + ; + ; GFX12-LABEL: name: load_global_s32_from_sgpr_base_offset_4097 + ; GFX12: liveins: $sgpr0_sgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 4097, 0, implicit $exec :: (load (s32), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 %1:sgpr(s64) = G_CONSTANT i64 4097 %2:sgpr(p1) = G_PTR_ADD %0, %1 @@ -479,6 +554,14 @@ body: | ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; + ; GFX12-LABEL: name: load_global_s32_from_sgpr_base_offset_neg4097 + ; GFX12: liveins: $sgpr0_sgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], -4097, 0, implicit $exec :: (load (s32), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 %1:sgpr(s64) = G_CONSTANT i64 -4097 %2:sgpr(p1) = G_PTR_ADD %0, %1 @@ -521,6 +604,14 @@ body: | ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 
2049, 0, implicit $exec :: (load (s32), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] + ; + ; GFX12-LABEL: name: load_global_s32_from_sgpr_base_offset_2049 + ; GFX12: liveins: $sgpr0_sgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 2049, 0, implicit $exec :: (load (s32), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 %1:sgpr(s64) = G_CONSTANT i64 2049 %2:sgpr(p1) = G_PTR_ADD %0, %1 @@ -571,6 +662,14 @@ body: | ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], -2049, 0, implicit $exec :: (load (s32), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] + ; + ; GFX12-LABEL: name: load_global_s32_from_sgpr_base_offset_neg2049 + ; GFX12: liveins: $sgpr0_sgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], -2049, 0, implicit $exec :: (load (s32), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 %1:sgpr(s64) = G_CONSTANT i64 -2049 %2:sgpr(p1) = G_PTR_ADD %0, %1 @@ -612,6 +711,14 @@ body: | ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 4095, 0, implicit $exec :: (load (s32), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] + ; + ; GFX12-LABEL: name: load_global_s32_from_sgpr_base_offset_4294967295 + ; GFX12: liveins: $sgpr0_sgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4286578688, implicit $exec + ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 8388607, 0, implicit $exec :: (load (s32), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 %1:sgpr(s64) = G_CONSTANT i64 4294967295 %2:sgpr(p1) = G_PTR_ADD %0, %1 @@ -683,6 +790,24 @@ body: | ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; + ; GFX12-LABEL: name: load_global_s32_from_sgpr_base_offset_4294967296 + ; GFX12: liveins: $sgpr0_sgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 + ; GFX12-NEXT: 
[[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX12-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc + ; GFX12-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 %1:sgpr(s64) = G_CONSTANT i64 4294967296 %2:sgpr(p1) = G_PTR_ADD %0, %1 @@ -755,6 +880,24 @@ body: | ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; + ; GFX12-LABEL: name: load_global_s32_from_sgpr_base_offset_4294971390 + ; GFX12: liveins: $sgpr0_sgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4094 + ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX12-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc + ; GFX12-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 %1:sgpr(s64) = G_CONSTANT i64 4294971390 %2:sgpr(p1) = G_PTR_ADD %0, %1 @@ -827,6 +970,24 @@ body: | ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; + ; GFX12-LABEL: name: load_global_s32_from_sgpr_base_offset_neg4294967295 + ; GFX12: liveins: $sgpr0_sgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX12-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc + ; 
GFX12-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 %1:sgpr(s64) = G_CONSTANT i64 -4294967295 %2:sgpr(p1) = G_PTR_ADD %0, %1 @@ -898,6 +1059,24 @@ body: | ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; + ; GFX12-LABEL: name: load_global_s32_from_sgpr_base_offset_neg4294967296 + ; GFX12: liveins: $sgpr0_sgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX12-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc + ; GFX12-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 %1:sgpr(s64) = G_CONSTANT i64 -4294967296 %2:sgpr(p1) = G_PTR_ADD %0, %1 @@ -932,6 +1111,12 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY [[DEF]] ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (s32), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; + ; GFX12-LABEL: name: load_global_s32_from_copy_undef_sgpr + ; GFX12: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY [[DEF]] + ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (s32), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] %0:sgpr(p1) = G_IMPLICIT_DEF %1:vgpr(p1) = COPY %0 %2:vgpr(s32) = G_LOAD %1 :: (load (s32), align 4, addrspace 1) @@ -961,6 +1146,11 @@ body: | ; GFX11: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 0, 0, implicit $exec :: (load (s32), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; + ; GFX12-LABEL: name: load_global_s32_from_undef_vgpr + ; GFX12: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 0, 0, implicit $exec :: (load (s32), addrspace 1) + ; 
GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] %0:vgpr(p1) = G_IMPLICIT_DEF %1:vgpr(s32) = G_LOAD %0 :: (load (s32), align 4, addrspace 1) $vgpr0 = COPY %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir index 27806edc91808..f8ef9426bc36e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir @@ -6,6 +6,7 @@ # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX11 %s +# RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX12 %s --- @@ -78,6 +79,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (s32), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; + ; GFX12-LABEL: name: load_global_s32_from_4 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (s32), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = G_LOAD %0 :: (load (s32), align 4, addrspace 1) $vgpr0 = COPY %1 @@ -153,6 +161,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[COPY]], 0, 0, implicit $exec :: (load (s16), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_USHORT]] + ; + ; GFX12-LABEL: name: load_global_s32_from_2 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[COPY]], 0, 0, implicit $exec :: (load (s16), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_USHORT]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = G_LOAD %0 :: (load (s16), align 2, addrspace 1) $vgpr0 = COPY %1 @@ -228,6 +243,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_global_s32_from_1 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = G_LOAD %0 :: (load (s8), align 1, addrspace 1) $vgpr0 = COPY %1 @@ -303,6 +325,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; 
GFX11-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s32>), addrspace 1) ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] + ; + ; GFX12-LABEL: name: load_global_v2s32 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s32>), addrspace 1) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x s32>) = G_LOAD %0 :: (load (<2 x s32>), align 8, addrspace 1) $vgpr0_vgpr1 = COPY %1 @@ -378,6 +407,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s32>), align 4, addrspace 1) ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]] + ; + ; GFX12-LABEL: name: load_global_v4s32 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s32>), align 4, addrspace 1) + ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<4 x s32>) = G_LOAD %0 :: (load (<4 x s32>), align 4, addrspace 1) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -453,6 +489,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (s64), addrspace 1) ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] + ; + ; GFX12-LABEL: name: load_global_s64 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (s64), addrspace 1) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_LOAD %0 :: (load (s64), align 8, addrspace 1) $vgpr0_vgpr1 = COPY %1 @@ -528,6 +571,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1) ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]] + ; + ; GFX12-LABEL: name: load_global_v2s64 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1) + ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x s64>) = G_LOAD %0 :: (load (<2 x s64>), align 4, addrspace 1) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -593,6 +643,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load (<2 x p1>), align 4, addrspace 1) ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; + ; GFX12-LABEL: name: load_global_v2p1 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: 
[[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load (<2 x p1>), align 4, addrspace 1) + ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x p1>) = G_LOAD %0 :: (load (<2 x p1>), align 4, addrspace 1) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -658,6 +715,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4, addrspace 1) ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128) + ; + ; GFX12-LABEL: name: load_global_s128 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4, addrspace 1) + ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s128) = G_LOAD %0 :: (load (s128), align 4, addrspace 1) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -733,6 +797,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (p3), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; + ; GFX12-LABEL: name: load_global_p3_from_4 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (p3), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(p3) = G_LOAD %0 :: (load (p3), align 4, addrspace 1) $vgpr0 = COPY %1 @@ -808,6 +879,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (p1), addrspace 1) ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] + ; + ; GFX12-LABEL: name: load_global_p1_from_8 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (p1), addrspace 1) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(p1) = G_LOAD %0 :: (load (p1), align 8, addrspace 1) $vgpr0_vgpr1 = COPY %1 @@ -873,6 +951,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999), addrspace 1) ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999) + ; + ; GFX12-LABEL: name: load_global_p999_from_8 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999), addrspace 1) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(p999) = G_LOAD %0 :: (load (p999), align 8, addrspace 1) $vgpr0_vgpr1 = COPY %1 @@ -938,6 +1023,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1) ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) + ; + ; GFX12-LABEL: name: load_global_v2p3 + 
; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x p3>) = G_LOAD %0 :: (load (<2 x p3>), align 8, addrspace 1) $vgpr0_vgpr1 = COPY %1 @@ -1013,6 +1105,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (<2 x s16>), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; + ; GFX12-LABEL: name: load_global_v2s16 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (<2 x s16>), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 4, addrspace 1) $vgpr0 = COPY %1 @@ -1088,6 +1187,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s16>), addrspace 1) ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] + ; + ; GFX12-LABEL: name: load_global_v4s16 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s16>), addrspace 1) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 8, addrspace 1) $vgpr0_vgpr1 = COPY %1 @@ -1163,6 +1269,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<8 x s16>), align 4, addrspace 1) ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]] + ; + ; GFX12-LABEL: name: load_global_v8s16 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<8 x s16>), align 4, addrspace 1) + ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<8 x s16>) = G_LOAD %0 :: (load (<8 x s16>), align 4, addrspace 1) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -1258,6 +1371,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2047, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_global_s32_from_1_gep_2047 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2047, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 2047 %2:vgpr(p1) = G_PTR_ADD %0, %1 @@ -1359,6 +1479,13 
@@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2048, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_global_s32_from_1_gep_2048 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2048, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 2048 %2:vgpr(p1) = G_PTR_ADD %0, %1 @@ -1468,6 +1595,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2047, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_global_s32_from_1_gep_m2047 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2047, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -2047 %2:vgpr(p1) = G_PTR_ADD %0, %1 @@ -1577,6 +1711,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2048, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_global_s32_from_1_gep_m2048 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2048, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -2048 %2:vgpr(p1) = G_PTR_ADD %0, %1 @@ -1678,6 +1819,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 4095, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_global_s32_from_1_gep_4095 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 4095, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 4095 %2:vgpr(p1) = G_PTR_ADD %0, %1 @@ -1797,6 +1945,13 @@ body: | ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_global_s32_from_1_gep_4096 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 4096, 0, 
implicit $exec :: (load (s8), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 4096 %2:vgpr(p1) = G_PTR_ADD %0, %1 @@ -1914,6 +2069,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -4095, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_global_s32_from_1_gep_m4095 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -4095, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -4095 %2:vgpr(p1) = G_PTR_ADD %0, %1 @@ -2031,6 +2193,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -4096, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_global_s32_from_1_gep_m4096 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -4096, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -4096 %2:vgpr(p1) = G_PTR_ADD %0, %1 @@ -2150,6 +2319,13 @@ body: | ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_global_s32_from_1_gep_8191 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 8191, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 8191 %2:vgpr(p1) = G_PTR_ADD %0, %1 @@ -2269,6 +2445,13 @@ body: | ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_global_s32_from_1_gep_8192 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 8192, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 8192 %2:vgpr(p1) = G_PTR_ADD %0, %1 @@ -2402,6 +2585,13 @@ body: | ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE 
[[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_global_s32_from_1_gep_m8191 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -8191, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -8191 %2:vgpr(p1) = G_PTR_ADD %0, %1 @@ -2535,6 +2725,13 @@ body: | ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_global_s32_from_1_gep_m8192 + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -8192, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -8192 %2:vgpr(p1) = G_PTR_ADD %0, %1 @@ -2542,3 +2739,551 @@ body: | $vgpr0 = COPY %3 ... + +--- + +name: load_global_s32_from_1_gep_24bit_max +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; GFX6-LABEL: name: load_global_s32_from_1_gep_24bit_max + ; GFX6: liveins: $vgpr0_vgpr1 + ; GFX6-NEXT: {{ $}} + ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8388607 + ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] + ; + ; GFX7-LABEL: name: load_global_s32_from_1_gep_24bit_max + ; GFX7: liveins: $vgpr0_vgpr1 + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX7-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8388607 + ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] + ; + ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_24bit_max + ; 
GFX7-FLAT: liveins: $vgpr0_vgpr1 + ; GFX7-FLAT-NEXT: {{ $}} + ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec + ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) + ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX8-LABEL: name: load_global_s32_from_1_gep_24bit_max + ; GFX8: liveins: $vgpr0_vgpr1 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec + ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) + ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX9-LABEL: name: load_global_s32_from_1_gep_24bit_max + ; GFX9: liveins: $vgpr0_vgpr1 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX9-NEXT: 
$vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; + ; GFX10-LABEL: name: load_global_s32_from_1_gep_24bit_max + ; GFX10: liveins: $vgpr0_vgpr1 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; + ; GFX11-LABEL: name: load_global_s32_from_1_gep_24bit_max + ; GFX11: liveins: $vgpr0_vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_global_s32_from_1_gep_24bit_max + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 8388607, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + %0:vgpr(p1) = COPY $vgpr0_vgpr1 + %1:vgpr(s64) = G_CONSTANT i64 8388607 + %2:vgpr(p1) = G_PTR_ADD %0, %1 + %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1) + $vgpr0 = COPY %3 + +... 
+ +--- + +name: load_global_s32_from_1_gep_2x_24bit_max +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; GFX6-LABEL: name: load_global_s32_from_1_gep_2x_24bit_max + ; GFX6: liveins: $vgpr0_vgpr1 + ; GFX6-NEXT: {{ $}} + ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 16777214 + ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] + ; + ; GFX7-LABEL: name: load_global_s32_from_1_gep_2x_24bit_max + ; GFX7: liveins: $vgpr0_vgpr1 + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX7-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 16777214 + ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] + ; + ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_2x_24bit_max + ; GFX7-FLAT: liveins: $vgpr0_vgpr1 + ; GFX7-FLAT-NEXT: {{ $}} + ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec + ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) + ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX8-LABEL: name: load_global_s32_from_1_gep_2x_24bit_max + ; GFX8: liveins: $vgpr0_vgpr1 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 
16777214, implicit $exec + ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) + ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX9-LABEL: name: load_global_s32_from_1_gep_2x_24bit_max + ; GFX9: liveins: $vgpr0_vgpr1 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; + ; GFX10-LABEL: name: load_global_s32_from_1_gep_2x_24bit_max + ; GFX10: liveins: $vgpr0_vgpr1 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; + ; GFX11-LABEL: name: load_global_s32_from_1_gep_2x_24bit_max + ; GFX11: liveins: $vgpr0_vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 
+ ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_global_s32_from_1_gep_2x_24bit_max + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX12-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + %0:vgpr(p1) = COPY $vgpr0_vgpr1 + %1:vgpr(s64) = G_CONSTANT i64 16777214 + %2:vgpr(p1) = G_PTR_ADD %0, %1 + %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1) + $vgpr0 = COPY %3 + +... 
+ +--- + +name: load_global_s32_from_1_gep_24bit_min +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; GFX6-LABEL: name: load_global_s32_from_1_gep_24bit_min + ; GFX6: liveins: $vgpr0_vgpr1 + ; GFX6-NEXT: {{ $}} + ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec + ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 + ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] + ; + ; GFX7-LABEL: name: load_global_s32_from_1_gep_24bit_min + ; GFX7: liveins: $vgpr0_vgpr1 + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec + ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 + ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] + ; + ; GFX7-FLAT-LABEL: 
name: load_global_s32_from_1_gep_24bit_min + ; GFX7-FLAT: liveins: $vgpr0_vgpr1 + ; GFX7-FLAT-NEXT: {{ $}} + ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec + ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) + ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX8-LABEL: name: load_global_s32_from_1_gep_24bit_min + ; GFX8: liveins: $vgpr0_vgpr1 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec + ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) + ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX9-LABEL: name: load_global_s32_from_1_gep_24bit_min + ; GFX9: liveins: $vgpr0_vgpr1 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit 
$exec :: (load (s8), addrspace 1) + ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; + ; GFX10-LABEL: name: load_global_s32_from_1_gep_24bit_min + ; GFX10: liveins: $vgpr0_vgpr1 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; + ; GFX11-LABEL: name: load_global_s32_from_1_gep_24bit_min + ; GFX11: liveins: $vgpr0_vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_global_s32_from_1_gep_24bit_min + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -8388608, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + %0:vgpr(p1) = COPY $vgpr0_vgpr1 + %1:vgpr(s64) = G_CONSTANT i64 -8388608 + %2:vgpr(p1) = G_PTR_ADD %0, %1 + %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1) + $vgpr0 = COPY %3 + +... 
+ +--- + +name: load_global_s32_from_1_gep_2x_24bit_min +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; GFX6-LABEL: name: load_global_s32_from_1_gep_2x_24bit_min + ; GFX6: liveins: $vgpr0_vgpr1 + ; GFX6-NEXT: {{ $}} + ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec + ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 + ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] + ; + ; GFX7-LABEL: name: load_global_s32_from_1_gep_2x_24bit_min + ; GFX7: liveins: $vgpr0_vgpr1 + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec + ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 + ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] + ; + ; 
GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_2x_24bit_min + ; GFX7-FLAT: liveins: $vgpr0_vgpr1 + ; GFX7-FLAT-NEXT: {{ $}} + ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec + ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) + ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX8-LABEL: name: load_global_s32_from_1_gep_2x_24bit_min + ; GFX8: liveins: $vgpr0_vgpr1 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec + ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) + ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; + ; GFX9-LABEL: name: load_global_s32_from_1_gep_2x_24bit_min + ; GFX9: liveins: $vgpr0_vgpr1 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE 
[[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; + ; GFX10-LABEL: name: load_global_s32_from_1_gep_2x_24bit_min + ; GFX10: liveins: $vgpr0_vgpr1 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; + ; GFX11-LABEL: name: load_global_s32_from_1_gep_2x_24bit_min + ; GFX11: liveins: $vgpr0_vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_global_s32_from_1_gep_2x_24bit_min + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX12-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, 
[[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + %0:vgpr(p1) = COPY $vgpr0_vgpr1 + %1:vgpr(s64) = G_CONSTANT i64 -16777215 + %2:vgpr(p1) = G_PTR_ADD %0, %1 + %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1) + $vgpr0 = COPY %3 + +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.s96.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.s96.mir index 286b4daa79b93..ef1bb842cd354 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.s96.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.s96.mir @@ -5,6 +5,7 @@ # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s +# RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s --- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir index ea320cec9991d..c8c2197806240 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir @@ -4,6 +4,7 @@ # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s # RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s # RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX11 %s +# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX12 %s --- @@ -39,6 +40,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[SCRATCH_LOAD_DWORD:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 5) ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_DWORD]] + ; + ; GFX12-LABEL: name: load_private_s32_from_4 + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[SCRATCH_LOAD_DWORD:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 5) + ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_DWORD]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_LOAD %0 :: (load (s32), align 4, addrspace 5) $vgpr0 = COPY %1 @@ -79,6 +87,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[SCRATCH_LOAD_USHORT:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_USHORT [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s16), addrspace 5) ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_USHORT]] + ; + ; 
GFX12-LABEL: name: load_private_s32_from_2 + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[SCRATCH_LOAD_USHORT:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_USHORT [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s16), addrspace 5) + ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_USHORT]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_LOAD %0 :: (load (s16), align 2, addrspace 5) $vgpr0 = COPY %1 @@ -119,6 +134,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_private_s32_from_1 + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) + ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_LOAD %0 :: (load (s8), align 1, addrspace 5) $vgpr0 = COPY %1 @@ -159,6 +181,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[SCRATCH_LOAD_DWORD:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p3), addrspace 5) ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_DWORD]] + ; + ; GFX12-LABEL: name: load_private_p3_from_4 + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[SCRATCH_LOAD_DWORD:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p3), addrspace 5) + ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_DWORD]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(p3) = G_LOAD %0 :: (load (p3), align 4, addrspace 5) $vgpr0 = COPY %1 @@ -199,6 +228,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[SCRATCH_LOAD_DWORD:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p5), addrspace 5) ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_DWORD]] + ; + ; GFX12-LABEL: name: load_private_p5_from_4 + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[SCRATCH_LOAD_DWORD:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p5), addrspace 5) + ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_DWORD]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(p5) = G_LOAD %0 :: (load (p5), align 4, addrspace 5) $vgpr0 = COPY %1 @@ -240,6 +276,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[SCRATCH_LOAD_DWORD:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s16>), addrspace 5) ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_DWORD]] + ; + ; GFX12-LABEL: name: load_private_v2s16 + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[SCRATCH_LOAD_DWORD:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s16>), addrspace 5) + ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_DWORD]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 4, addrspace 5) $vgpr0 = COPY %1 @@ -288,6 +331,13 @@ body: | ; GFX11-NEXT: 
[[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_private_s32_from_1_gep_2047 + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], 2047, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) + ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 2047 %2:vgpr(p5) = G_PTR_ADD %0, %1 @@ -327,6 +377,15 @@ body: | ; GFX11-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_AND_B32_e64_]], 2047, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_private_s32_from_1_gep_2047_known_bits + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2147483647, implicit $exec + ; GFX12-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec + ; GFX12-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_AND_B32_e64_]], 2047, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) + ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 2147483647 %2:vgpr(s32) = G_AND %0, %1 @@ -376,6 +435,13 @@ body: | ; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_private_s32_from_1_gep_2048 + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], 2048, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) + ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 2048 %2:vgpr(p5) = G_PTR_ADD %0, %1 @@ -422,6 +488,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], -2047, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_private_s32_from_1_gep_m2047 + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], -2047, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) + ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 -2047 %2:vgpr(p5) = G_PTR_ADD %0, %1 @@ -468,6 +541,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: 
[[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], -2048, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_private_s32_from_1_gep_m2048 + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], -2048, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) + ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 -2048 %2:vgpr(p5) = G_PTR_ADD %0, %1 @@ -514,6 +594,13 @@ body: | ; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_private_s32_from_1_gep_4095 + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], 4095, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) + ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 4095 %2:vgpr(p5) = G_PTR_ADD %0, %1 @@ -562,6 +649,13 @@ body: | ; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_private_s32_from_1_gep_4096 + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], 4096, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) + ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 4096 %2:vgpr(p5) = G_PTR_ADD %0, %1 @@ -608,6 +702,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], -4095, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_private_s32_from_1_gep_m4095 + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], -4095, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) + ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 -4095 %2:vgpr(p5) = G_PTR_ADD %0, %1 @@ -654,6 +755,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], -4096, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_private_s32_from_1_gep_m4096 + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: 
[[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], -4096, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) + ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 -4096 %2:vgpr(p5) = G_PTR_ADD %0, %1 @@ -702,6 +810,13 @@ body: | ; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_private_s32_from_1_gep_8191 + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], 8191, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) + ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 8191 %2:vgpr(p5) = G_PTR_ADD %0, %1 @@ -750,6 +865,13 @@ body: | ; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_private_s32_from_1_gep_8192 + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], 8192, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) + ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 8192 %2:vgpr(p5) = G_PTR_ADD %0, %1 @@ -798,6 +920,13 @@ body: | ; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_private_s32_from_1_gep_m8191 + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], -8191, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) + ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 -8191 %2:vgpr(p5) = G_PTR_ADD %0, %1 @@ -846,6 +975,13 @@ body: | ; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_private_s32_from_1_gep_m8192 + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], -8192, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) + ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 
-8192 %2:vgpr(p5) = G_PTR_ADD %0, %1 @@ -856,6 +992,230 @@ body: | --- +name: load_private_s32_from_1_gep_24bit_max +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: + scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 + stackPtrOffsetReg: $sgpr32 + +body: | + bb.0: + liveins: $vgpr0 + + ; GFX6-LABEL: name: load_private_s32_from_1_gep_24bit_max + ; GFX6: liveins: $vgpr0 + ; GFX6-NEXT: {{ $}} + ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8388607, implicit $exec + ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_CO_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5) + ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] + ; + ; GFX9-LABEL: name: load_private_s32_from_1_gep_24bit_max + ; GFX9: liveins: $vgpr0 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8388607, implicit $exec + ; GFX9-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX9-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] + ; + ; GFX11-LABEL: name: load_private_s32_from_1_gep_24bit_max + ; GFX11: liveins: $vgpr0 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8388607, implicit $exec + ; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) + ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_private_s32_from_1_gep_24bit_max + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], 8388607, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) + ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] + %0:vgpr(p5) = COPY $vgpr0 + %1:vgpr(s32) = G_CONSTANT i32 8388607 + %2:vgpr(p5) = G_PTR_ADD %0, %1 + %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 5) + $vgpr0 = COPY %3 + +... 
+ +--- + +name: load_private_s32_from_1_gep_2x_24bit_max +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: + scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 + stackPtrOffsetReg: $sgpr32 + +body: | + bb.0: + liveins: $vgpr0 + + ; GFX6-LABEL: name: load_private_s32_from_1_gep_2x_24bit_max + ; GFX6: liveins: $vgpr0 + ; GFX6-NEXT: {{ $}} + ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16777214, implicit $exec + ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_CO_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5) + ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] + ; + ; GFX9-LABEL: name: load_private_s32_from_1_gep_2x_24bit_max + ; GFX9: liveins: $vgpr0 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16777214, implicit $exec + ; GFX9-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX9-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] + ; + ; GFX11-LABEL: name: load_private_s32_from_1_gep_2x_24bit_max + ; GFX11: liveins: $vgpr0 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16777214, implicit $exec + ; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) + ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_private_s32_from_1_gep_2x_24bit_max + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16777214, implicit $exec + ; GFX12-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX12-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) + ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] + %0:vgpr(p5) = COPY $vgpr0 + %1:vgpr(s32) = G_CONSTANT i32 16777214 + %2:vgpr(p5) = G_PTR_ADD %0, %1 + %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 5) + $vgpr0 = COPY %3 + +... 
+ +--- + +name: load_private_s32_from_1_gep_24bit_min +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: + scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 + stackPtrOffsetReg: $sgpr32 + +body: | + bb.0: + liveins: $vgpr0 + + ; GFX6-LABEL: name: load_private_s32_from_1_gep_24bit_min + ; GFX6: liveins: $vgpr0 + ; GFX6-NEXT: {{ $}} + ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -8388608, implicit $exec + ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_CO_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5) + ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] + ; + ; GFX9-LABEL: name: load_private_s32_from_1_gep_24bit_min + ; GFX9: liveins: $vgpr0 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -8388608, implicit $exec + ; GFX9-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX9-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] + ; + ; GFX11-LABEL: name: load_private_s32_from_1_gep_24bit_min + ; GFX11: liveins: $vgpr0 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -8388608, implicit $exec + ; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) + ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_private_s32_from_1_gep_24bit_min + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], -8388608, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) + ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] + %0:vgpr(p5) = COPY $vgpr0 + %1:vgpr(s32) = G_CONSTANT i32 -8388608 + %2:vgpr(p5) = G_PTR_ADD %0, %1 + %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 5) + $vgpr0 = COPY %3 + +... 
+ +--- + +name: load_private_s32_from_1_gep_2x_24bit_min +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: + scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 + stackPtrOffsetReg: $sgpr32 + +body: | + bb.0: + liveins: $vgpr0 + + ; GFX6-LABEL: name: load_private_s32_from_1_gep_2x_24bit_min + ; GFX6: liveins: $vgpr0 + ; GFX6-NEXT: {{ $}} + ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16777215, implicit $exec + ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_CO_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5) + ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] + ; + ; GFX9-LABEL: name: load_private_s32_from_1_gep_2x_24bit_min + ; GFX9: liveins: $vgpr0 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16777215, implicit $exec + ; GFX9-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX9-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] + ; + ; GFX11-LABEL: name: load_private_s32_from_1_gep_2x_24bit_min + ; GFX11: liveins: $vgpr0 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16777215, implicit $exec + ; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) + ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_private_s32_from_1_gep_2x_24bit_min + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16777215, implicit $exec + ; GFX12-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX12-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) + ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] + %0:vgpr(p5) = COPY $vgpr0 + %1:vgpr(s32) = G_CONSTANT i32 -16777215 + %2:vgpr(p5) = G_PTR_ADD %0, %1 + %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 5) + $vgpr0 = COPY %3 + +... 
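
[Editor's aside, not part of the patch: the four scratch-offset tests above pin down the GFX12 immediate range. The boundary values 8388607 and -8388608 fold straight into the SCRATCH_LOAD_UBYTE offset operand, while the doubled values 16777214 and -16777215 still go through a V_ADD_U32, which suggests a 24-bit signed immediate offset field on GFX12 scratch instructions. A minimal standalone sketch of that range check follows; fitsInSigned24 is my own hypothetical helper, mirroring what llvm::isInt<24>() would report, and is not code from this patch.]

#include <cstdint>
#include <cstdio>

// Does Offset fit in a signed 24-bit field, i.e. [-2^23, 2^23 - 1]?
static bool fitsInSigned24(int64_t Offset) {
  return Offset >= -(int64_t(1) << 23) && Offset <= (int64_t(1) << 23) - 1;
}

int main() {
  // The four offsets exercised by the tests above.
  const int64_t Offsets[] = {8388607, -8388608, 16777214, -16777215};
  for (int64_t Off : Offsets)
    std::printf("%lld -> %s\n", static_cast<long long>(Off),
                fitsInSigned24(Off) ? "folds into the immediate field"
                                    : "needs a V_ADD_U32 first");
  return 0;
}

[Running this prints that only 8388607 and -8388608 are in range, matching the GFX12 check lines: in-range offsets become immediates, out-of-range ones materialize the address with an add, exactly as on GFX11. End of aside.]
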
+ +--- + name: load_private_s32_from_4_constant_0 legalized: true regBankSelected: true @@ -879,6 +1239,11 @@ body: | ; GFX11: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX11-NEXT: [[SCRATCH_LOAD_DWORD:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 5) ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_DWORD]] + ; + ; GFX12-LABEL: name: load_private_s32_from_4_constant_0 + ; GFX12: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX12-NEXT: [[SCRATCH_LOAD_DWORD:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 5) + ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_DWORD]] %0:vgpr(p5) = G_CONSTANT i32 0 %1:vgpr(s32) = G_LOAD %0 :: (load (s32), align 4, addrspace 5) $vgpr0 = COPY %1 @@ -910,6 +1275,11 @@ body: | ; GFX11: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 16 ; GFX11-NEXT: [[SCRATCH_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD_SADDR [[S_MOV_B32_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 5) ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_DWORD_SADDR]] + ; + ; GFX12-LABEL: name: load_private_s32_from_4_constant_sgpr_16 + ; GFX12: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 16 + ; GFX12-NEXT: [[SCRATCH_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD_SADDR [[S_MOV_B32_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 5) + ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_DWORD_SADDR]] %0:sgpr(p5) = G_CONSTANT i32 16 %1:vgpr(s32) = G_LOAD %0 :: (load (s32), align 4, addrspace 5) $vgpr0 = COPY %1 @@ -941,6 +1311,11 @@ body: | ; GFX11: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_private_s32_from_1_constant_4095 + ; GFX12: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec + ; GFX12-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) + ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] %0:vgpr(p5) = G_CONSTANT i32 4095 %1:vgpr(s32) = G_LOAD %0 :: (load (s8), align 1, addrspace 5) $vgpr0 = COPY %1 @@ -974,6 +1349,11 @@ body: | ; GFX11: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] + ; + ; GFX12-LABEL: name: load_private_s32_from_1_constant_4096 + ; GFX12: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; GFX12-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) + ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]] %0:vgpr(p5) = G_CONSTANT i32 4096 %1:vgpr(s32) = G_LOAD %0 :: (load (s8), align 1, addrspace 5) $vgpr0 = COPY %1 @@ -1006,6 +1386,10 @@ body: | ; GFX11-LABEL: name: load_private_s32_from_fi ; GFX11: [[SCRATCH_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD_SADDR %stack.0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 5) ; GFX11-NEXT: $vgpr0 = 
COPY [[SCRATCH_LOAD_DWORD_SADDR]] + ; + ; GFX12-LABEL: name: load_private_s32_from_fi + ; GFX12: [[SCRATCH_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD_SADDR %stack.0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 5) + ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_DWORD_SADDR]] %0:vgpr(p5) = G_FRAME_INDEX %stack.0 %1:vgpr(s32) = G_LOAD %0 :: (load (s32), align 4, addrspace 5) $vgpr0 = COPY %1 @@ -1037,6 +1421,10 @@ body: | ; GFX11-LABEL: name: load_private_s32_from_1_fi_offset_4095 ; GFX11: [[SCRATCH_LOAD_UBYTE_SADDR:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE_SADDR %stack.0, 4095, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE_SADDR]] + ; + ; GFX12-LABEL: name: load_private_s32_from_1_fi_offset_4095 + ; GFX12: [[SCRATCH_LOAD_UBYTE_SADDR:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE_SADDR %stack.0, 4095, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) + ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE_SADDR]] %0:vgpr(p5) = G_FRAME_INDEX %stack.0 %1:vgpr(s32) = G_CONSTANT i32 4095 %2:vgpr(p5) = G_PTR_ADD %0, %1 @@ -1071,6 +1459,10 @@ body: | ; GFX11-LABEL: name: load_private_s32_from_1_fi_offset_sgpr_4095 ; GFX11: [[SCRATCH_LOAD_UBYTE_SADDR:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE_SADDR %stack.0, 4095, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE_SADDR]] + ; + ; GFX12-LABEL: name: load_private_s32_from_1_fi_offset_sgpr_4095 + ; GFX12: [[SCRATCH_LOAD_UBYTE_SADDR:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE_SADDR %stack.0, 4095, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) + ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE_SADDR]] %0:vgpr(p5) = G_FRAME_INDEX %stack.0 %1:sgpr(s32) = G_CONSTANT i32 4095 %2:vgpr(s32) = COPY %1 @@ -1113,6 +1505,10 @@ body: | ; GFX11: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE_SVS:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE_SVS [[V_MOV_B32_e32_]], %stack.0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE_SVS]] + ; + ; GFX12-LABEL: name: load_private_s32_from_1_fi_offset_4096 + ; GFX12: [[SCRATCH_LOAD_UBYTE_SADDR:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE_SADDR %stack.0, 4096, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5) + ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE_SADDR]] %0:vgpr(p5) = G_FRAME_INDEX %stack.0 %1:vgpr(s32) = G_CONSTANT i32 4096 %2:vgpr(p5) = G_PTR_ADD %0, %1 @@ -1149,6 +1545,11 @@ body: | ; GFX11: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec ; GFX11-NEXT: [[SCRATCH_LOAD_DWORD:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 5) ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_DWORD]] + ; + ; GFX12-LABEL: name: load_private_s32_from_neg1 + ; GFX12: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec + ; GFX12-NEXT: [[SCRATCH_LOAD_DWORD:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 5) + ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_DWORD]] %0:vgpr(p5) = G_CONSTANT i32 -1 %1:vgpr(s32) = G_LOAD %0 :: (load (s32), align 4, addrspace 5) $vgpr0 = COPY %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-flat.mir index 
289d70be83c01..1d708ef0c7951 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-flat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-flat.mir @@ -3,6 +3,7 @@ # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s # RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s # RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s --- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-local.mir index 827eb13e3e4e8..247be2c85b39f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-local.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-local.mir @@ -4,6 +4,7 @@ # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s # RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s # RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s --- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir index fc6925ee5709c..9c13135c0bf17 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir @@ -4,6 +4,7 @@ # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX11 %s +# RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX12 %s --- @@ -50,6 +51,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX11-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; + ; GFX12-LABEL: name: store_flat_s32_to_4 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 G_STORE %1, 
%0 :: (store (s32), align 4, addrspace 0) @@ -100,6 +108,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX11-NEXT: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16)) + ; + ; GFX12-LABEL: name: store_flat_s32_to_2 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16)) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 G_STORE %1, %0 :: (store (s16), align 2, addrspace 0) @@ -150,6 +165,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX11-NEXT: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8)) + ; + ; GFX12-LABEL: name: store_flat_s32_to_1 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8)) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 G_STORE %1, %0 :: (store (s8), align 1, addrspace 0) @@ -201,6 +223,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX11-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) + ; + ; GFX12-LABEL: name: store_flat_s64 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GFX12-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = COPY $vgpr2_vgpr3 G_STORE %1, %0 :: (store (s64), align 8, addrspace 0) @@ -251,6 +280,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr(s96) = COPY $vgpr2_vgpr3_vgpr4 ; GFX11-NEXT: G_STORE [[COPY1]](s96), [[COPY]](p1) :: (store (s96), align 16) + ; + ; GFX12-LABEL: name: store_flat_s96 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s96) = COPY $vgpr2_vgpr3_vgpr4 + ; GFX12-NEXT: G_STORE [[COPY1]](s96), [[COPY]](p1) :: (store (s96), align 16) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s96) = COPY $vgpr2_vgpr3_vgpr4 G_STORE %1, %0 :: (store (s96), align 16, addrspace 0) @@ -301,6 +337,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX11-NEXT: G_STORE [[COPY1]](s128), [[COPY]](p1) :: (store (s128)) + ; + ; GFX12-LABEL: name: store_flat_s128 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 + ; GFX12-NEXT: G_STORE [[COPY1]](s128), [[COPY]](p1) :: (store (s128)) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 G_STORE %1, %0 :: (store (s128), align 
16, addrspace 0) @@ -352,6 +395,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX11-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s32>)) + ; + ; GFX12-LABEL: name: store_flat_v2s32 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GFX12-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s32>)) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x s32>) = COPY $vgpr2_vgpr3 G_STORE %1, %0 :: (store (<2 x s32>), align 8, addrspace 0) @@ -402,6 +452,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_96 = COPY $vgpr2_vgpr3_vgpr4 ; GFX11-NEXT: FLAT_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<3 x s32>), align 16) + ; + ; GFX12-LABEL: name: store_flat_v3s32 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_96 = COPY $vgpr2_vgpr3_vgpr4 + ; GFX12-NEXT: FLAT_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<3 x s32>), align 16) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<3 x s32>) = COPY $vgpr2_vgpr3_vgpr4 G_STORE %1, %0 :: (store (<3 x s32>), align 16, addrspace 0) @@ -452,6 +509,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX11-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s32>)) + ; + ; GFX12-LABEL: name: store_flat_v4s32 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 + ; GFX12-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s32>)) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<4 x s32>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 G_STORE %1, %0 :: (store (<4 x s32>), align 16, addrspace 0) @@ -503,6 +567,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX11-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s16>)) + ; + ; GFX12-LABEL: name: store_flat_v2s16 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s16>)) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x s16>) = COPY $vgpr2 G_STORE %1, %0 :: (store (<2 x s16>), align 4, addrspace 0) @@ -554,6 +625,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX11-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s16>)) + ; + ; GFX12-LABEL: name: store_flat_v4s16 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; 
GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GFX12-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s16>)) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<4 x s16>) = COPY $vgpr2_vgpr3 G_STORE %1, %0 :: (store (<4 x s16>), align 8, addrspace 0) @@ -605,6 +683,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr(<6 x s16>) = COPY $vgpr2_vgpr3_vgpr4 ; GFX11-NEXT: G_STORE [[COPY1]](<6 x s16>), [[COPY]](p1) :: (store (<6 x s16>), align 16) + ; + ; GFX12-LABEL: name: store_flat_v6s16 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(<6 x s16>) = COPY $vgpr2_vgpr3_vgpr4 + ; GFX12-NEXT: G_STORE [[COPY1]](<6 x s16>), [[COPY]](p1) :: (store (<6 x s16>), align 16) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<6 x s16>) = COPY $vgpr2_vgpr3_vgpr4 G_STORE %1, %0 :: (store (<6 x s16>), align 16, addrspace 0) @@ -655,6 +740,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX11-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<8 x s16>)) + ; + ; GFX12-LABEL: name: store_flat_v8s16 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 + ; GFX12-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<8 x s16>)) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<8 x s16>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 G_STORE %1, %0 :: (store (<8 x s16>), align 16, addrspace 0) @@ -706,6 +798,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX11-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s64>)) + ; + ; GFX12-LABEL: name: store_flat_v2s64 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 + ; GFX12-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s64>)) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x s64>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 G_STORE %1, %0 :: (store (<2 x s64>), align 16, addrspace 0) @@ -757,6 +856,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX11-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p1)) + ; + ; GFX12-LABEL: name: store_flat_p1 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GFX12-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p1)) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(p1) = COPY $vgpr2_vgpr3 G_STORE %1, %0 :: (store (p1), align 8, addrspace 0) @@ -808,6 +914,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p1>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; 
GFX11-NEXT: G_STORE [[COPY1]](<2 x p1>), [[COPY]](p1) :: (store (<2 x p1>)) + ; + ; GFX12-LABEL: name: store_flat_v2p1 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p1>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 + ; GFX12-NEXT: G_STORE [[COPY1]](<2 x p1>), [[COPY]](p1) :: (store (<2 x p1>)) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x p1>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 G_STORE %1, %0 :: (store (<2 x p1>), align 16, addrspace 0) @@ -859,6 +972,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX11-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p3)) + ; + ; GFX12-LABEL: name: store_flat_p3 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p3)) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(p3) = COPY $vgpr2 G_STORE %1, %0 :: (store (p3), align 4, addrspace 0) @@ -910,6 +1030,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p3>) = COPY $vgpr2_vgpr3 ; GFX11-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>)) + ; + ; GFX12-LABEL: name: store_flat_v2p3 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p3>) = COPY $vgpr2_vgpr3 + ; GFX12-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>)) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x p3>) = COPY $vgpr2_vgpr3 G_STORE %1, %0 :: (store (<2 x p3>), align 8, addrspace 0) @@ -960,6 +1087,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX11-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s32)) + ; + ; GFX12-LABEL: name: store_atomic_flat_s32 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s32)) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 G_STORE %1, %0 :: (store monotonic (s32), align 4, addrspace 0) @@ -1011,6 +1145,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX11-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s64)) + ; + ; GFX12-LABEL: name: store_atomic_flat_s64 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GFX12-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s64)) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = COPY $vgpr2_vgpr3 G_STORE %1, %0 :: (store monotonic (s64), align 8, addrspace 0) @@ -1086,6 +1227,13 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; 
GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX11-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 2047, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; + ; GFX12-LABEL: name: store_flat_s32_gep_2047 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 2047, 0, implicit $exec, implicit $flat_scr :: (store (s32)) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s64) = G_CONSTANT i64 2047 @@ -1093,3 +1241,418 @@ body: | G_STORE %1, %3 :: (store (s32), align 4, addrspace 0) ... + +--- + +name: store_flat_s32_to_1_gep_24bit_max +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; GFX7-LABEL: name: store_flat_s32_to_1_gep_24bit_max + ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec + ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; + ; GFX8-LABEL: name: store_flat_s32_to_1_gep_24bit_max + ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; + ; GFX9-LABEL: name: store_flat_s32_to_1_gep_24bit_max + ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec + ; GFX9-NEXT: 
[[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX9-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; + ; GFX10-LABEL: name: store_flat_s32_to_1_gep_24bit_max + ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; + ; GFX11-LABEL: name: store_flat_s32_to_1_gep_24bit_max + ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX11-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; + ; GFX12-LABEL: name: store_flat_s32_to_1_gep_24bit_max + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 8388607, 0, implicit 
$exec, implicit $flat_scr :: (store (s32)) + %0:vgpr(p1) = COPY $vgpr0_vgpr1 + %1:vgpr(s32) = COPY $vgpr2 + %2:vgpr(s64) = G_CONSTANT i64 8388607 + %3:vgpr(p1) = G_PTR_ADD %0, %2 + G_STORE %1, %3 :: (store (s32), align 4, addrspace 0) + +... + +--- + +name: store_flat_s32_to_1_gep_2x_24bit_max +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; GFX7-LABEL: name: store_flat_s32_to_1_gep_2x_24bit_max + ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec + ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; + ; GFX8-LABEL: name: store_flat_s32_to_1_gep_2x_24bit_max + ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; + ; GFX9-LABEL: name: store_flat_s32_to_1_gep_2x_24bit_max + ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX9-NEXT: 
[[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX9-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; + ; GFX10-LABEL: name: store_flat_s32_to_1_gep_2x_24bit_max + ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; + ; GFX11-LABEL: name: store_flat_s32_to_1_gep_2x_24bit_max + ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX11-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; + ; GFX12-LABEL: name: store_flat_s32_to_1_gep_2x_24bit_max + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], 
[[COPY3]], 0, implicit $exec + ; GFX12-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX12-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + %0:vgpr(p1) = COPY $vgpr0_vgpr1 + %1:vgpr(s32) = COPY $vgpr2 + %2:vgpr(s64) = G_CONSTANT i64 16777214 + %3:vgpr(p1) = G_PTR_ADD %0, %2 + G_STORE %1, %3 :: (store (s32), align 4, addrspace 0) + +... + +--- + +name: store_flat_s32_to_1_gep_24bit_min +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; GFX7-LABEL: name: store_flat_s32_to_1_gep_24bit_min + ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec + ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; + ; GFX8-LABEL: name: store_flat_s32_to_1_gep_24bit_min + ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; + ; GFX9-LABEL: name: store_flat_s32_to_1_gep_24bit_min + ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec + ; GFX9-NEXT: 
[[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX9-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; + ; GFX10-LABEL: name: store_flat_s32_to_1_gep_24bit_min + ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; + ; GFX11-LABEL: name: store_flat_s32_to_1_gep_24bit_min + ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX11-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; + ; GFX12-LABEL: name: store_flat_s32_to_1_gep_24bit_min + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], -8388608, 0, implicit 
$exec, implicit $flat_scr :: (store (s32)) + %0:vgpr(p1) = COPY $vgpr0_vgpr1 + %1:vgpr(s32) = COPY $vgpr2 + %2:vgpr(s64) = G_CONSTANT i64 -8388608 + %3:vgpr(p1) = G_PTR_ADD %0, %2 + G_STORE %1, %3 :: (store (s32), align 4, addrspace 0) + +... + +--- + +name: store_flat_s32_to_1_gep_2x_24bit_min +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; GFX7-LABEL: name: store_flat_s32_to_1_gep_2x_24bit_min + ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec + ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; + ; GFX8-LABEL: name: store_flat_s32_to_1_gep_2x_24bit_min + ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; + ; GFX9-LABEL: name: store_flat_s32_to_1_gep_2x_24bit_min + ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX9-NEXT: 
[[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX9-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; + ; GFX10-LABEL: name: store_flat_s32_to_1_gep_2x_24bit_min + ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; + ; GFX11-LABEL: name: store_flat_s32_to_1_gep_2x_24bit_min + ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX11-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; + ; GFX12-LABEL: name: store_flat_s32_to_1_gep_2x_24bit_min + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 
[[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX12-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX12-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + %0:vgpr(p1) = COPY $vgpr0_vgpr1 + %1:vgpr(s32) = COPY $vgpr2 + %2:vgpr(s64) = G_CONSTANT i64 -16777215 + %3:vgpr(p1) = G_PTR_ADD %0, %2 + G_STORE %1, %3 :: (store (s32), align 4, addrspace 0) +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir index 95e01aa413a37..79cd2a07883a6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir @@ -6,6 +6,7 @@ # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s +# RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX12 %s --- @@ -69,6 +70,13 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX10-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (s32), addrspace 1) + ; + ; GFX12-LABEL: name: store_global_s32_to_4 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (s32), addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 G_STORE %1, %0 :: (store (s32), align 4, addrspace 1) @@ -136,6 +144,13 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX10-NEXT: GLOBAL_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (s16), addrspace 1) + ; + ; GFX12-LABEL: name: store_global_s32_to_2 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: GLOBAL_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (s16), addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 G_STORE %1, %0 :: (store (s16), align 2, addrspace 1) @@ -203,6 +218,13 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX10-NEXT: GLOBAL_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (s8), addrspace 1) + ; + ; GFX12-LABEL: name: store_global_s32_to_1 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: 
[[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: GLOBAL_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (s8), addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 G_STORE %1, %0 :: (store (s8), align 1, addrspace 1) @@ -271,6 +293,13 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX10-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1) + ; + ; GFX12-LABEL: name: store_global_s64 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GFX12-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = COPY $vgpr2_vgpr3 G_STORE %1, %0 :: (store (s64), align 8, addrspace 1) @@ -328,6 +357,13 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX10-NEXT: G_STORE [[COPY1]](s128), [[COPY]](p1) :: (store (s128), addrspace 1) + ; + ; GFX12-LABEL: name: store_global_s128 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 + ; GFX12-NEXT: G_STORE [[COPY1]](s128), [[COPY]](p1) :: (store (s128), addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 G_STORE %1, %0 :: (store (s128), align 16, addrspace 1) @@ -396,6 +432,13 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX10-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<2 x s32>), addrspace 1) + ; + ; GFX12-LABEL: name: store_global_v2s32 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GFX12-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<2 x s32>), addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x s32>) = COPY $vgpr2_vgpr3 G_STORE %1, %0 :: (store (<2 x s32>), align 8, addrspace 1) @@ -463,6 +506,13 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX10-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<4 x s32>), addrspace 1) + ; + ; GFX12-LABEL: name: store_global_v4s32 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 + ; GFX12-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<4 x s32>), addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<4 x s32>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 G_STORE %1, %0 :: (store (<4 x s32>), align 16, addrspace 1) @@ -531,6 +581,13 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX10-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<2 x s16>), 
addrspace 1) + ; + ; GFX12-LABEL: name: store_global_v2s16 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<2 x s16>), addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x s16>) = COPY $vgpr2 G_STORE %1, %0 :: (store (<2 x s16>), align 4, addrspace 1) @@ -599,6 +656,13 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX10-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<4 x s16>), addrspace 1) + ; + ; GFX12-LABEL: name: store_global_v4s16 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GFX12-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<4 x s16>), addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<4 x s16>) = COPY $vgpr2_vgpr3 G_STORE %1, %0 :: (store (<4 x s16>), align 8, addrspace 1) @@ -667,6 +731,13 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX10-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<8 x s16>), addrspace 1) + ; + ; GFX12-LABEL: name: store_global_v8s16 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 + ; GFX12-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<8 x s16>), addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<8 x s16>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 G_STORE %1, %0 :: (store (<8 x s16>), align 16, addrspace 1) @@ -735,6 +806,13 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX10-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<2 x s64>), addrspace 1) + ; + ; GFX12-LABEL: name: store_global_v2s64 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 + ; GFX12-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<2 x s64>), addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x s64>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 G_STORE %1, %0 :: (store (<2 x s64>), align 16, addrspace 1) @@ -803,6 +881,13 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX10-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (p1), addrspace 1) + ; + ; GFX12-LABEL: name: store_global_p1 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GFX12-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (p1), addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(p1) = COPY $vgpr2_vgpr3 G_STORE %1, %0 :: (store (p1), align 8, addrspace 1) @@ -861,6 +946,13 @@ 
body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p1>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX10-NEXT: G_STORE [[COPY1]](<2 x p1>), [[COPY]](p1) :: (store (<2 x p1>), addrspace 1) + ; + ; GFX12-LABEL: name: store_global_v2p1 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p1>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 + ; GFX12-NEXT: G_STORE [[COPY1]](<2 x p1>), [[COPY]](p1) :: (store (<2 x p1>), addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x p1>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 G_STORE %1, %0 :: (store (<2 x p1>), align 16, addrspace 1) @@ -929,6 +1021,13 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX10-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (p3), addrspace 1) + ; + ; GFX12-LABEL: name: store_global_p3 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (p3), addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(p3) = COPY $vgpr2 G_STORE %1, %0 :: (store (p3), align 4, addrspace 1) @@ -987,6 +1086,13 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p3>) = COPY $vgpr2_vgpr3 ; GFX10-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), addrspace 1) + ; + ; GFX12-LABEL: name: store_global_v2p3 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p3>) = COPY $vgpr2_vgpr3 + ; GFX12-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x p3>) = COPY $vgpr2_vgpr3 G_STORE %1, %0 :: (store (<2 x p3>), align 8, addrspace 1) @@ -1054,6 +1160,13 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX10-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store monotonic (s32), addrspace 1) + ; + ; GFX12-LABEL: name: store_atomic_global_s32 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store monotonic (s32), addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 G_STORE %1, %0 :: (store monotonic (s32), align 4, addrspace 1) @@ -1122,6 +1235,13 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX10-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store monotonic (s64), addrspace 1) + ; + ; GFX12-LABEL: name: store_atomic_global_s64 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GFX12-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store monotonic (s64), addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = 
COPY $vgpr2_vgpr3 G_STORE %1, %0 :: (store monotonic (s64), align 8, addrspace 1) @@ -1206,6 +1326,13 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX10-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 2047, 0, implicit $exec :: (store (s32), addrspace 1) + ; + ; GFX12-LABEL: name: store_global_s32_gep_2047 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 2047, 0, implicit $exec :: (store (s32), addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s64) = G_CONSTANT i64 2047 @@ -1213,3 +1340,490 @@ body: | G_STORE %1, %3 :: (store (s32), align 4, addrspace 1) ... + +--- + +name: store_global_s32_to_1_gep_24bit_max +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; GFX6-LABEL: name: store_global_s32_to_1_gep_24bit_max + ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX6-NEXT: {{ $}} + ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8388607 + ; GFX6-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (store (s32), addrspace 1) + ; + ; GFX7-LABEL: name: store_global_s32_to_1_gep_24bit_max + ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX7-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8388607 + ; GFX7-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (store (s32), addrspace 1) + ; + ; GFX7-FLAT-LABEL: name: store_global_s32_to_1_gep_24bit_max + ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX7-FLAT-NEXT: {{ $}} + ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec + ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX7-FLAT-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, 
[[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-FLAT-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) + ; + ; GFX8-LABEL: name: store_global_s32_to_1_gep_24bit_max + ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) + ; + ; GFX9-LABEL: name: store_global_s32_to_1_gep_24bit_max + ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX9-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec :: (store (s32), addrspace 1) + ; + ; GFX10-LABEL: name: store_global_s32_to_1_gep_24bit_max + ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX10-NEXT: 
[[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec :: (store (s32), addrspace 1) + ; + ; GFX12-LABEL: name: store_global_s32_to_1_gep_24bit_max + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 8388607, 0, implicit $exec :: (store (s32), addrspace 1) + %0:vgpr(p1) = COPY $vgpr0_vgpr1 + %1:vgpr(s32) = COPY $vgpr2 + %2:vgpr(s64) = G_CONSTANT i64 8388607 + %3:vgpr(p1) = G_PTR_ADD %0, %2 + G_STORE %1, %3 :: (store (s32), align 4, addrspace 1) +... + +--- + +name: store_global_s32_to_1_gep_24bit_min +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; GFX6-LABEL: name: store_global_s32_to_1_gep_24bit_min + ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX6-NEXT: {{ $}} + ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec + ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 + ; GFX6-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 1) + ; + ; GFX7-LABEL: name: store_global_s32_to_1_gep_24bit_min + ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec + ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; 
GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 + ; GFX7-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 1) + ; + ; GFX7-FLAT-LABEL: name: store_global_s32_to_1_gep_24bit_min + ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX7-FLAT-NEXT: {{ $}} + ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec + ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX7-FLAT-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-FLAT-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) + ; + ; GFX8-LABEL: name: store_global_s32_to_1_gep_24bit_min + ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) + ; + ; GFX9-LABEL: name: 
store_global_s32_to_1_gep_24bit_min + ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX9-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec :: (store (s32), addrspace 1) + ; + ; GFX10-LABEL: name: store_global_s32_to_1_gep_24bit_min + ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec :: (store (s32), addrspace 1) + ; + ; GFX12-LABEL: name: store_global_s32_to_1_gep_24bit_min + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], -8388608, 0, implicit $exec :: (store (s32), addrspace 1) + %0:vgpr(p1) = COPY $vgpr0_vgpr1 + %1:vgpr(s32) = COPY $vgpr2 + %2:vgpr(s64) = G_CONSTANT i64 -8388608 + %3:vgpr(p1) = G_PTR_ADD %0, %2 + G_STORE %1, %3 :: (store (s32), align 4, addrspace 1) + +... 
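# Note: on GFX12 the FLAT/GLOBAL immediate offset field is a signed 24-bit
# value, i.e. the range [-2^23, 2^23 - 1] = [-8388608, 8388607]. The
# _24bit_max/_24bit_min tests above sit exactly on those bounds, so GFX12
# folds the offset into a single GLOBAL_STORE_DWORD, while the _2x_ variants
# below (16777214 = 2 * 8388607, and -16777215) are out of range and must
# still materialize the address with V_ADD_CO_U32 / V_ADDC_U32 before the
# store.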
+ +--- + +name: store_global_s32_to_1_gep_2x_24bit_max +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; GFX6-LABEL: name: store_global_s32_to_1_gep_2x_24bit_max + ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX6-NEXT: {{ $}} + ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 16777214 + ; GFX6-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (store (s32), addrspace 1) + ; + ; GFX7-LABEL: name: store_global_s32_to_1_gep_2x_24bit_max + ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX7-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 16777214 + ; GFX7-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (store (s32), addrspace 1) + ; + ; GFX7-FLAT-LABEL: name: store_global_s32_to_1_gep_2x_24bit_max + ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX7-FLAT-NEXT: {{ $}} + ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec + ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX7-FLAT-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-FLAT-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) + ; + ; GFX8-LABEL: name: store_global_s32_to_1_gep_2x_24bit_max + ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 
16777214, implicit $exec + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) + ; + ; GFX9-LABEL: name: store_global_s32_to_1_gep_2x_24bit_max + ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX9-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec :: (store (s32), addrspace 1) + ; + ; GFX10-LABEL: name: store_global_s32_to_1_gep_2x_24bit_max + ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec :: (store (s32), addrspace 1) + ; + ; GFX12-LABEL: name: store_global_s32_to_1_gep_2x_24bit_max + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: 
[[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX12-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec :: (store (s32), addrspace 1) + %0:vgpr(p1) = COPY $vgpr0_vgpr1 + %1:vgpr(s32) = COPY $vgpr2 + %2:vgpr(s64) = G_CONSTANT i64 16777214 + %3:vgpr(p1) = G_PTR_ADD %0, %2 + G_STORE %1, %3 :: (store (s32), align 4, addrspace 1) + +... + +--- + +name: store_global_s32_to_1_gep_2x_24bit_min +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; GFX6-LABEL: name: store_global_s32_to_1_gep_2x_24bit_min + ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX6-NEXT: {{ $}} + ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec + ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 + ; GFX6-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 1) + ; + ; GFX7-LABEL: name: store_global_s32_to_1_gep_2x_24bit_min + ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec + ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; 
GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 + ; GFX7-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 1) + ; + ; GFX7-FLAT-LABEL: name: store_global_s32_to_1_gep_2x_24bit_min + ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX7-FLAT-NEXT: {{ $}} + ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec + ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX7-FLAT-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX7-FLAT-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) + ; + ; GFX8-LABEL: name: store_global_s32_to_1_gep_2x_24bit_min + ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX8-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) + ; + ; GFX9-LABEL: 
name: store_global_s32_to_1_gep_2x_24bit_min + ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX9-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec :: (store (s32), addrspace 1) + ; + ; GFX10-LABEL: name: store_global_s32_to_1_gep_2x_24bit_min + ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX10-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec :: (store (s32), addrspace 1) + ; + ; GFX12-LABEL: name: store_global_s32_to_1_gep_2x_24bit_min + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 + ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec + ; GFX12-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 + ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec :: (store (s32), addrspace 
1) + %0:vgpr(p1) = COPY $vgpr0_vgpr1 + %1:vgpr(s32) = COPY $vgpr2 + %2:vgpr(s64) = G_CONSTANT i64 -16777215 + %3:vgpr(p1) = G_PTR_ADD %0, %2 + G_STORE %1, %3 :: (store (s32), align 4, addrspace 1) + +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.s96.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.s96.mir index 54774ff512cb6..557fedb5d7371 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.s96.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.s96.mir @@ -5,6 +5,7 @@ # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s # RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s # RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s +# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s # 96-bit load/store is only legal for SI, so split the test out. diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir index 4f801759c38e1..ed14d0598c8d9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir @@ -4,6 +4,7 @@ # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s # RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s # RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX11 %s +# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX12 %s --- @@ -26,18 +27,27 @@ body: | ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX6-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5) + ; ; GFX9-LABEL: name: function_store_private_s32_to_4 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5) + ; ; GFX11-LABEL: name: function_store_private_s32_to_4 ; GFX11: liveins: $vgpr0, $vgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX11-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 5) + ; + ; GFX12-LABEL: name: function_store_private_s32_to_4 + ; GFX12: liveins: $vgpr0, $vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr 
:: (store (s32), addrspace 5) %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store (s32), align 4, addrspace 5) @@ -65,18 +75,27 @@ body: | ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX6-NEXT: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s16), addrspace 5) + ; ; GFX9-LABEL: name: function_store_private_s32_to_2 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9-NEXT: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s16), addrspace 5) + ; ; GFX11-LABEL: name: function_store_private_s32_to_2 ; GFX11: liveins: $vgpr0, $vgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX11-NEXT: SCRATCH_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16), addrspace 5) + ; + ; GFX12-LABEL: name: function_store_private_s32_to_2 + ; GFX12: liveins: $vgpr0, $vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: SCRATCH_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16), addrspace 5) %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store (s16), align 2, addrspace 5) @@ -104,18 +123,27 @@ body: | ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX6-NEXT: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s8), addrspace 5) + ; ; GFX9-LABEL: name: function_store_private_s32_to_1 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9-NEXT: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s8), addrspace 5) + ; ; GFX11-LABEL: name: function_store_private_s32_to_1 ; GFX11: liveins: $vgpr0, $vgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX11-NEXT: SCRATCH_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8), addrspace 5) + ; + ; GFX12-LABEL: name: function_store_private_s32_to_1 + ; GFX12: liveins: $vgpr0, $vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: SCRATCH_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8), addrspace 5) %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store (s8), align 1, addrspace 5) @@ -143,18 +171,27 @@ body: | ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX6-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (<2 x s16>), addrspace 5) + ; ; GFX9-LABEL: name: function_store_private_v2s16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], 
[[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (<2 x s16>), addrspace 5) + ; ; GFX11-LABEL: name: function_store_private_v2s16 ; GFX11: liveins: $vgpr0, $vgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX11-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s16>), addrspace 5) + ; + ; GFX12-LABEL: name: function_store_private_v2s16 + ; GFX12: liveins: $vgpr0, $vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s16>), addrspace 5) %0:vgpr(<2 x s16>) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store (<2 x s16>), align 4, addrspace 5) @@ -182,18 +219,27 @@ body: | ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX6-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (p3), addrspace 5) + ; ; GFX9-LABEL: name: function_store_private_p3 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (p3), addrspace 5) + ; ; GFX11-LABEL: name: function_store_private_p3 ; GFX11: liveins: $vgpr0, $vgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX11-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p3), addrspace 5) + ; + ; GFX12-LABEL: name: function_store_private_p3 + ; GFX12: liveins: $vgpr0, $vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p3), addrspace 5) %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store (p3), align 4, addrspace 5) @@ -221,18 +267,27 @@ body: | ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX6-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (p5), addrspace 5) + ; ; GFX9-LABEL: name: function_store_private_p5 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (p5), addrspace 5) + ; ; GFX11-LABEL: name: function_store_private_p5 ; GFX11: liveins: $vgpr0, $vgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX11-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p5), addrspace 5) + ; + ; GFX12-LABEL: name: function_store_private_p5 + ; GFX12: liveins: $vgpr0, $vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: 
SCRATCH_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p5), addrspace 5) %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store (p5), align 4, addrspace 5) @@ -258,12 +313,18 @@ body: | ; GFX6-LABEL: name: function_store_private_s32_to_1_fi_offset_4095 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX6-NEXT: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, implicit $exec :: (store (s8), addrspace 5) + ; ; GFX9-LABEL: name: function_store_private_s32_to_1_fi_offset_4095 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX9-NEXT: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, implicit $exec :: (store (s8), addrspace 5) + ; ; GFX11-LABEL: name: function_store_private_s32_to_1_fi_offset_4095 ; GFX11: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX11-NEXT: SCRATCH_STORE_BYTE_SADDR [[V_MOV_B32_e32_]], %stack.0, 4095, 0, implicit $exec, implicit $flat_scr :: (store (s8), addrspace 5) + ; + ; GFX12-LABEL: name: function_store_private_s32_to_1_fi_offset_4095 + ; GFX12: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX12-NEXT: SCRATCH_STORE_BYTE_SADDR [[V_MOV_B32_e32_]], %stack.0, 4095, 0, implicit $exec, implicit $flat_scr :: (store (s8), addrspace 5) %0:vgpr(p5) = G_FRAME_INDEX %stack.0 %1:vgpr(s32) = G_CONSTANT i32 4095 %2:vgpr(p5) = G_PTR_ADD %0, %1 @@ -291,13 +352,20 @@ body: | ; GFX6-LABEL: name: function_store_private_s32_to_1_constant_4095 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX6-NEXT: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, implicit $exec :: (store (s8), addrspace 5) + ; ; GFX9-LABEL: name: function_store_private_s32_to_1_constant_4095 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX9-NEXT: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, implicit $exec :: (store (s8), addrspace 5) + ; ; GFX11-LABEL: name: function_store_private_s32_to_1_constant_4095 ; GFX11: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX11-NEXT: SCRATCH_STORE_BYTE [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8), addrspace 5) + ; + ; GFX12-LABEL: name: function_store_private_s32_to_1_constant_4095 + ; GFX12: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec + ; GFX12-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX12-NEXT: SCRATCH_STORE_BYTE [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8), addrspace 5) %0:vgpr(p5) = G_CONSTANT i32 4095 %1:vgpr(s32) = G_CONSTANT i32 0 G_STORE %1, %0 :: (store (s8), align 1, addrspace 5) @@ -324,14 +392,21 @@ body: | ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX6-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; GFX6-NEXT: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s8), addrspace 5) + ; ; GFX9-LABEL: name: function_store_private_s32_to_1_constant_4096 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX9-NEXT: 
[[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; GFX9-NEXT: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s8), addrspace 5) + ; ; GFX11-LABEL: name: function_store_private_s32_to_1_constant_4096 ; GFX11: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX11-NEXT: SCRATCH_STORE_BYTE [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8), addrspace 5) + ; + ; GFX12-LABEL: name: function_store_private_s32_to_1_constant_4096 + ; GFX12: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; GFX12-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX12-NEXT: SCRATCH_STORE_BYTE [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8), addrspace 5) %0:vgpr(p5) = G_CONSTANT i32 4096 %1:vgpr(s32) = G_CONSTANT i32 0 G_STORE %1, %0 :: (store (s8), align 1, addrspace 5) @@ -358,18 +433,27 @@ body: | ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX6-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5) + ; ; GFX9-LABEL: name: kernel_store_private_s32_to_4 ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5) + ; ; GFX11-LABEL: name: kernel_store_private_s32_to_4 ; GFX11: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX11-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 5) + ; + ; GFX12-LABEL: name: kernel_store_private_s32_to_4 + ; GFX12: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 5) %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store (s32), align 4, addrspace 5) @@ -396,18 +480,27 @@ body: | ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX6-NEXT: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s16), addrspace 5) + ; ; GFX9-LABEL: name: kernel_store_private_s32_to_2 ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9-NEXT: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s16), addrspace 5) + ; ; GFX11-LABEL: name: kernel_store_private_s32_to_2 ; GFX11: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX11-NEXT: 
SCRATCH_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16), addrspace 5) + ; + ; GFX12-LABEL: name: kernel_store_private_s32_to_2 + ; GFX12: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: SCRATCH_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16), addrspace 5) %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store (s16), align 2, addrspace 5) @@ -434,18 +527,27 @@ body: | ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX6-NEXT: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s8), addrspace 5) + ; ; GFX9-LABEL: name: kernel_store_private_s32_to_1 ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9-NEXT: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s8), addrspace 5) + ; ; GFX11-LABEL: name: kernel_store_private_s32_to_1 ; GFX11: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX11-NEXT: SCRATCH_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8), addrspace 5) + ; + ; GFX12-LABEL: name: kernel_store_private_s32_to_1 + ; GFX12: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: SCRATCH_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8), addrspace 5) %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store (s8), align 1, addrspace 5) @@ -473,18 +575,27 @@ body: | ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX6-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (<2 x s16>), addrspace 5) + ; ; GFX9-LABEL: name: kernel_store_private_v2s16 ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (<2 x s16>), addrspace 5) + ; ; GFX11-LABEL: name: kernel_store_private_v2s16 ; GFX11: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX11-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s16>), addrspace 5) + ; + ; GFX12-LABEL: name: kernel_store_private_v2s16 + ; GFX12: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s16>), addrspace 5) %0:vgpr(<2 x s16>) = COPY $vgpr0 
%1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store (<2 x s16>), align 4, addrspace 5) @@ -511,18 +622,27 @@ body: | ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX6-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (p3), addrspace 5) + ; ; GFX9-LABEL: name: kernel_store_private_p3 ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (p3), addrspace 5) + ; ; GFX11-LABEL: name: kernel_store_private_p3 ; GFX11: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX11-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p3), addrspace 5) + ; + ; GFX12-LABEL: name: kernel_store_private_p3 + ; GFX12: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p3), addrspace 5) %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store (p3), align 4, addrspace 5) @@ -549,18 +669,27 @@ body: | ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX6-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (p5), addrspace 5) + ; ; GFX9-LABEL: name: kernel_store_private_p5 ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (p5), addrspace 5) + ; ; GFX11-LABEL: name: kernel_store_private_p5 ; GFX11: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX11-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p5), addrspace 5) + ; + ; GFX12-LABEL: name: kernel_store_private_p5 + ; GFX12: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p5), addrspace 5) %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store (p5), align 4, addrspace 5) @@ -588,16 +717,24 @@ body: | ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX6-NEXT: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, implicit $exec :: (store (s8), addrspace 5) + ; ; GFX9-LABEL: name: kernel_store_private_s32_to_1_fi_offset_4095 ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit 
$exec ; GFX9-NEXT: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, implicit $exec :: (store (s8), addrspace 5) + ; ; GFX11-LABEL: name: kernel_store_private_s32_to_1_fi_offset_4095 ; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX11-NEXT: SCRATCH_STORE_BYTE_SADDR [[V_MOV_B32_e32_]], %stack.0, 4095, 0, implicit $exec, implicit $flat_scr :: (store (s8), addrspace 5) + ; + ; GFX12-LABEL: name: kernel_store_private_s32_to_1_fi_offset_4095 + ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX12-NEXT: SCRATCH_STORE_BYTE_SADDR [[V_MOV_B32_e32_]], %stack.0, 4095, 0, implicit $exec, implicit $flat_scr :: (store (s8), addrspace 5) %0:vgpr(p5) = G_FRAME_INDEX %stack.0 %1:vgpr(s32) = G_CONSTANT i32 4095 %2:vgpr(p5) = G_PTR_ADD %0, %1 @@ -627,17 +764,26 @@ body: | ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX6-NEXT: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, implicit $exec :: (store (s8), addrspace 5) + ; ; GFX9-LABEL: name: kernel_store_private_s32_to_1_constant_4095 ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX9-NEXT: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, implicit $exec :: (store (s8), addrspace 5) + ; ; GFX11-LABEL: name: kernel_store_private_s32_to_1_constant_4095 ; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX11-NEXT: SCRATCH_STORE_BYTE [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8), addrspace 5) + ; + ; GFX12-LABEL: name: kernel_store_private_s32_to_1_constant_4095 + ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec + ; GFX12-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX12-NEXT: SCRATCH_STORE_BYTE [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8), addrspace 5) %0:vgpr(p5) = G_CONSTANT i32 4095 %1:vgpr(s32) = G_CONSTANT i32 0 G_STORE %1, %0 :: (store (s8), align 1, addrspace 5) @@ -666,18 +812,27 @@ body: | ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX6-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; GFX6-NEXT: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s8), addrspace 5) + ; ; GFX9-LABEL: name: kernel_store_private_s32_to_1_constant_4096 ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; GFX9-NEXT: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s8), addrspace 5) + ; ; GFX11-LABEL: name: kernel_store_private_s32_to_1_constant_4096 ; GFX11: liveins: 
$sgpr0_sgpr1_sgpr2_sgpr3 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX11-NEXT: SCRATCH_STORE_BYTE [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8), addrspace 5) + ; + ; GFX12-LABEL: name: kernel_store_private_s32_to_1_constant_4096 + ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; GFX12-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX12-NEXT: SCRATCH_STORE_BYTE [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8), addrspace 5) %0:vgpr(p5) = G_CONSTANT i32 4096 %1:vgpr(s32) = G_CONSTANT i32 0 G_STORE %1, %0 :: (store (s8), align 1, addrspace 5) @@ -703,17 +858,26 @@ body: | ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6-NEXT: BUFFER_STORE_DWORD_OFFSET [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5) + ; ; GFX9-LABEL: name: function_store_private_s32_to_4_wave_address ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5) + ; ; GFX11-LABEL: name: function_store_private_s32_to_4_wave_address ; GFX11: liveins: $vgpr0, $vgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 5, $sgpr32, implicit $exec ; GFX11-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[V_LSHRREV_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 5) + ; + ; GFX12-LABEL: name: function_store_private_s32_to_4_wave_address + ; GFX12: liveins: $vgpr0, $vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 5, $sgpr32, implicit $exec + ; GFX12-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[V_LSHRREV_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 5) %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32 G_STORE %0, %1 :: (store (s32), align 4, addrspace 5) @@ -740,11 +904,13 @@ body: | ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6-NEXT: BUFFER_STORE_DWORD_OFFSET [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4095, 0, 0, implicit $exec :: (store (s32), addrspace 5) + ; ; GFX9-LABEL: name: function_store_private_s32_to_4_wave_address_offset_copy_constant_4095 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4095, 0, 0, implicit $exec :: (store (s32), addrspace 5) + ; ; GFX11-LABEL: name: function_store_private_s32_to_4_wave_address_offset_copy_constant_4095 ; GFX11: liveins: $vgpr0, $vgpr1 ; GFX11-NEXT: {{ $}} @@ -754,6 +920,13 @@ body: | ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_LSHRREV_B32_e64_]], [[COPY1]], 0, implicit $exec ; GFX11-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 5) + ; + ; 
GFX12-LABEL: name: function_store_private_s32_to_4_wave_address_offset_copy_constant_4095 + ; GFX12: liveins: $vgpr0, $vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 5, $sgpr32, implicit $exec + ; GFX12-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[V_LSHRREV_B32_e64_]], 4095, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 5) %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32 %2:sgpr(s32) = G_CONSTANT i32 4095 @@ -797,6 +970,13 @@ body: | ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec ; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_LSHRREV_B32_e64_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX11-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 5) + ; + ; GFX12-LABEL: name: function_store_private_s32_to_4_wave_address_offset_4095 + ; GFX12: liveins: $vgpr0, $vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 5, $sgpr32, implicit $exec + ; GFX12-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[V_LSHRREV_B32_e64_]], 4095, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 5) %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32 %2:vgpr(s32) = G_CONSTANT i32 4095 @@ -838,6 +1018,7 @@ body: | ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GFX9-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_LSHRREV_B32_e64_]], [[COPY1]], 0, implicit $exec ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5) + ; ; GFX11-LABEL: name: function_store_private_s32_to_4_wave_address_offset_copy_constant_4096 ; GFX11: liveins: $vgpr0, $vgpr1 ; GFX11-NEXT: {{ $}} @@ -847,6 +1028,13 @@ body: | ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_LSHRREV_B32_e64_]], [[COPY1]], 0, implicit $exec ; GFX11-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 5) + ; + ; GFX12-LABEL: name: function_store_private_s32_to_4_wave_address_offset_copy_constant_4096 + ; GFX12: liveins: $vgpr0, $vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 5, $sgpr32, implicit $exec + ; GFX12-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[V_LSHRREV_B32_e64_]], 4096, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 5) %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32 %2:sgpr(s32) = G_CONSTANT i32 4096 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir index 324839482976e..e67f3620d013c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir @@ -4,6 +4,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -O0 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -O0 -run-pass=legalizer %s -o - | FileCheck 
-check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -O0 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_load_flat_s1_align1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-global.mir index c0fb02fb184cf..d6acc6ecdfc66 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-global.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-global.mir @@ -16,6 +16,8 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefixes=GFX9-MESA %s # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefixes=GFX9-HSA %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefixes=GFX9-MESA %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefixes=GFX9-HSA %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefixes=GFX9-MESA %s # ERR-NOT: remark # ERR: remark: :0:0: unable to legalize instruction: %{{[0-9]+}}:_(<2 x s32>) = G_LOAD %{{[0-9]+}}:_(p1) :: (load (<2 x s16>), align 1, addrspace 1) (in function: test_extload_global_v2s32_from_v2s16_align1) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir index a94df99206304..1249de647bb75 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir @@ -9,6 +9,8 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -O0 -run-pass=legalizer -mattr=+unaligned-access-mode -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX10-UNALIGNED %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -O0 -run-pass=legalizer -mattr=-unaligned-access-mode -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX11 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -O0 -run-pass=legalizer -mattr=+unaligned-access-mode -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX11-UNALIGNED %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -O0 -run-pass=legalizer -mattr=-unaligned-access-mode -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX11 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -O0 -run-pass=legalizer -mattr=+unaligned-access-mode -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX11-UNALIGNED %s --- name: test_load_local_s1_align1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir index 40884567f9962..23a0524b69ffa 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir @@ -5,6 +5,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX10 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX11 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -O0 -run-pass=legalizer 
-global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX11 %s --- name: test_load_private_s1_align1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir index c2fdbff77868f..f20e481ccd4b7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir @@ -5,6 +5,7 @@ # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -O0 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -O0 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -O0 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -O0 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_store_global_s1_align1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll index 4ab963ed85cca..d76c9e4c1c134 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX10 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX10 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX11 +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX12 define i32 @global_atomic_csub(ptr addrspace(1) %ptr, i32 %data) { ; GFX10-LABEL: global_atomic_csub: @@ -17,6 +18,13 @@ define i32 @global_atomic_csub(ptr addrspace(1) %ptr, i32 %data) { ; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: global_atomic_csub: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %data) ret i32 %ret } @@ -39,6 +47,13 @@ define i32 @global_atomic_csub_offset(ptr addrspace(1) %ptr, i32 %data) { ; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: global_atomic_csub_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024 %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data) ret i32 %ret @@ -58,6 +73,13 @@ define void @global_atomic_csub_nortn(ptr addrspace(1) %ptr, i32 %data) { ; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: global_atomic_csub_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, 
v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %data) ret void } @@ -80,6 +102,13 @@ define void @global_atomic_csub_offset_nortn(ptr addrspace(1) %ptr, i32 %data) { ; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: global_atomic_csub_offset_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024 %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data) ret void @@ -112,6 +141,18 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1) ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_csub_sgpr_base_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_store_b32 v[0:1], v0, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024 %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data) store i32 %ret, ptr addrspace(1) undef @@ -139,6 +180,14 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset_nortn(ptr addrspa ; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: global_atomic_csub_u32 v0, v1, v0, s[0:1] glc ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_csub_sgpr_base_offset_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024 %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data) ret void diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll index f45d6d0696498..1df9a250a3159 100644 --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll @@ -3,6 +3,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefix=GFX90A %s ; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX1100 %s +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX1200 %s define float @syncscope_system(ptr %addr, float %val) #0 { ; GFX908-LABEL: syncscope_system: @@ -85,6 +86,32 @@ define float @syncscope_system(ptr %addr, float %val) #0 { ; GFX1100-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1100-NEXT: v_mov_b32_e32 v0, v3 ; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-LABEL: syncscope_system: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX1200-NEXT: flat_load_b32 v3, v[0:1] +; GFX1200-NEXT: s_mov_b32 s0, 0 +; GFX1200-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1200-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1200-NEXT: v_mov_b32_e32 v4, v3 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX1200-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1200-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX1200-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1200-NEXT: buffer_gl0_inv +; GFX1200-NEXT: buffer_gl1_inv +; GFX1200-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX1200-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1200-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1200-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1200-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1200-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1200-NEXT: v_mov_b32_e32 v0, v3 +; GFX1200-NEXT: s_setpc_b64 s[30:31] %res = atomicrmw fadd ptr %addr, float %val seq_cst ret float %res } @@ -175,6 +202,15 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 { ; GFX1100-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1100-NEXT: buffer_gl0_inv ; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-LABEL: syncscope_workgroup_rtn: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1200-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1200-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX1200-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1200-NEXT: buffer_gl0_inv +; GFX1200-NEXT: s_setpc_b64 s[30:31] %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst ret float %res } @@ -296,6 +332,16 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) #0 { ; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1100-NEXT: buffer_gl0_inv ; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-LABEL: syncscope_workgroup_nortn: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1200-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1200-NEXT: flat_atomic_add_f32 v[0:1], v2 +; GFX1200-NEXT: s_waitcnt lgkmcnt(0) +; GFX1200-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1200-NEXT: buffer_gl0_inv +; GFX1200-NEXT: s_setpc_b64 s[30:31] %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst ret void } @@ -374,6 +420,31 @@ define float @no_unsafe(ptr %addr, float %val) { ; GFX1100-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1100-NEXT: v_mov_b32_e32 v0, v3 ; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-LABEL: no_unsafe: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1200-NEXT: flat_load_b32 v3, v[0:1] +; GFX1200-NEXT: s_mov_b32 s0, 0 +; GFX1200-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1200-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1200-NEXT: v_mov_b32_e32 v4, v3 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX1200-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1200-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX1200-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1200-NEXT: buffer_gl0_inv +; GFX1200-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX1200-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1200-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1200-NEXT: s_cbranch_execnz .LBB3_1 +; GFX1200-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1200-NEXT: s_or_b32 exec_lo, exec_lo, s0 
+; GFX1200-NEXT: v_mov_b32_e32 v0, v3 +; GFX1200-NEXT: s_setpc_b64 s[30:31] %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst ret float %res } diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll index 2456da6218773..a8dcb74c7d5f9 100644 --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll @@ -4,6 +4,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX10 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX12 %s ; Should not merge this to a dword load define i32 @global_load_2xi16_align2(ptr addrspace(1) %p) #0 { @@ -47,6 +48,13 @@ define i32 @global_load_2xi16_align2(ptr addrspace(1) %p) #0 { ; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: global_load_2xi16_align2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: global_load_b32 v0, v[0:1], off +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, ptr addrspace(1) %p, i64 1 %p.0 = load i16, ptr addrspace(1) %p, align 2 %p.1 = load i16, ptr addrspace(1) %gep.p, align 2 @@ -112,6 +120,16 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_2xi16_align2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %gep.r = getelementptr i16, ptr addrspace(1) %r, i64 1 store i16 1, ptr addrspace(1) %r, align 2 store i16 2, ptr addrspace(1) %gep.r, align 2 @@ -172,6 +190,13 @@ define i32 @global_load_2xi16_align1(ptr addrspace(1) %p) #0 { ; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: global_load_2xi16_align1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: global_load_b32 v0, v[0:1], off +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, ptr addrspace(1) %p, i64 1 %p.0 = load i16, ptr addrspace(1) %p, align 1 %p.1 = load i16, ptr addrspace(1) %gep.p, align 1 @@ -248,6 +273,16 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_2xi16_align1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %gep.r = getelementptr i16, ptr 
addrspace(1) %r, i64 1 store i16 1, ptr addrspace(1) %r, align 1 store i16 2, ptr addrspace(1) %gep.r, align 1 @@ -290,6 +325,13 @@ define i32 @global_load_2xi16_align4(ptr addrspace(1) %p) #0 { ; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: global_load_2xi16_align4: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: global_load_b32 v0, v[0:1], off +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, ptr addrspace(1) %p, i64 1 %p.0 = load i16, ptr addrspace(1) %p, align 4 %p.1 = load i16, ptr addrspace(1) %gep.p, align 2 @@ -349,6 +391,16 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_2xi16_align4: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %gep.r = getelementptr i16, ptr addrspace(1) %r, i64 1 store i16 1, ptr addrspace(1) %r, align 4 store i16 2, ptr addrspace(1) %gep.r, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll index 08e36413322b6..61f0e5e2a8892 100644 --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll @@ -7,6 +7,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+unaligned-scratch-access -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX10-FLASTSCR %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+unaligned-scratch-access < %s | FileCheck --check-prefix=GFX11 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+unaligned-scratch-access -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX11-FLASTSCR %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+unaligned-scratch-access < %s | FileCheck --check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+unaligned-scratch-access -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX12-FLASTSCR %s ; Should not merge this to a dword load define i32 @private_load_2xi16_align2(ptr addrspace(5) %p) #0 { @@ -70,6 +72,20 @@ define i32 @private_load_2xi16_align2(ptr addrspace(5) %p) #0 { ; GFX11-FLASTSCR-NEXT: scratch_load_b32 v0, v0, off ; GFX11-FLASTSCR-NEXT: s_waitcnt vmcnt(0) ; GFX11-FLASTSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: private_load_2xi16_align2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: scratch_load_b32 v0, v0, off +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FLASTSCR-LABEL: private_load_2xi16_align2: +; GFX12-FLASTSCR: ; %bb.0: +; GFX12-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-FLASTSCR-NEXT: scratch_load_b32 v0, v0, off +; GFX12-FLASTSCR-NEXT: s_waitcnt vmcnt(0) +; GFX12-FLASTSCR-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, ptr addrspace(5) %p, i64 1 %p.0 = load i16, ptr addrspace(5) %p, align 2 %p.1 = load i16, ptr addrspace(5) %gep.p, align 2 @@ -144,6 +160,20 @@ define void @private_store_2xi16_align2(ptr addrspace(5) %p, 
ptr addrspace(5) %r ; GFX11-FLASTSCR-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX11-FLASTSCR-NEXT: scratch_store_b32 v1, v0, off ; GFX11-FLASTSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: private_store_2xi16_align2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX12-NEXT: scratch_store_b32 v1, v0, off +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FLASTSCR-LABEL: private_store_2xi16_align2: +; GFX12-FLASTSCR: ; %bb.0: +; GFX12-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-FLASTSCR-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX12-FLASTSCR-NEXT: scratch_store_b32 v1, v0, off +; GFX12-FLASTSCR-NEXT: s_setpc_b64 s[30:31] %gep.r = getelementptr i16, ptr addrspace(5) %r, i64 1 store i16 1, ptr addrspace(5) %r, align 2 store i16 2, ptr addrspace(5) %gep.r, align 2 @@ -222,6 +252,20 @@ define i32 @private_load_2xi16_align1(ptr addrspace(5) %p) #0 { ; GFX11-FLASTSCR-NEXT: scratch_load_b32 v0, v0, off ; GFX11-FLASTSCR-NEXT: s_waitcnt vmcnt(0) ; GFX11-FLASTSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: private_load_2xi16_align1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: scratch_load_b32 v0, v0, off +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FLASTSCR-LABEL: private_load_2xi16_align1: +; GFX12-FLASTSCR: ; %bb.0: +; GFX12-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-FLASTSCR-NEXT: scratch_load_b32 v0, v0, off +; GFX12-FLASTSCR-NEXT: s_waitcnt vmcnt(0) +; GFX12-FLASTSCR-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, ptr addrspace(5) %p, i64 1 %p.0 = load i16, ptr addrspace(5) %p, align 1 %p.1 = load i16, ptr addrspace(5) %gep.p, align 1 @@ -301,6 +345,20 @@ define void @private_store_2xi16_align1(ptr addrspace(5) %p, ptr addrspace(5) %r ; GFX11-FLASTSCR-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX11-FLASTSCR-NEXT: scratch_store_b32 v1, v0, off ; GFX11-FLASTSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: private_store_2xi16_align1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX12-NEXT: scratch_store_b32 v1, v0, off +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FLASTSCR-LABEL: private_store_2xi16_align1: +; GFX12-FLASTSCR: ; %bb.0: +; GFX12-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-FLASTSCR-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX12-FLASTSCR-NEXT: scratch_store_b32 v1, v0, off +; GFX12-FLASTSCR-NEXT: s_setpc_b64 s[30:31] %gep.r = getelementptr i16, ptr addrspace(5) %r, i64 1 store i16 1, ptr addrspace(5) %r, align 1 store i16 2, ptr addrspace(5) %gep.r, align 1 @@ -364,6 +422,20 @@ define i32 @private_load_2xi16_align4(ptr addrspace(5) %p) #0 { ; GFX11-FLASTSCR-NEXT: scratch_load_b32 v0, v0, off ; GFX11-FLASTSCR-NEXT: s_waitcnt vmcnt(0) ; GFX11-FLASTSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: private_load_2xi16_align4: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: scratch_load_b32 v0, v0, off +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FLASTSCR-LABEL: private_load_2xi16_align4: +; GFX12-FLASTSCR: ; %bb.0: +; GFX12-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-FLASTSCR-NEXT: scratch_load_b32 v0, v0, off +; GFX12-FLASTSCR-NEXT: s_waitcnt vmcnt(0) +; GFX12-FLASTSCR-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, ptr addrspace(5) %p, i64 1 %p.0 = load i16, ptr addrspace(5) %p, align 4 %p.1 = load i16, ptr 
addrspace(5) %gep.p, align 2 @@ -435,6 +507,20 @@ define void @private_store_2xi16_align4(ptr addrspace(5) %p, ptr addrspace(5) %r ; GFX11-FLASTSCR-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX11-FLASTSCR-NEXT: scratch_store_b32 v1, v0, off ; GFX11-FLASTSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: private_store_2xi16_align4: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX12-NEXT: scratch_store_b32 v1, v0, off +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FLASTSCR-LABEL: private_store_2xi16_align4: +; GFX12-FLASTSCR: ; %bb.0: +; GFX12-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-FLASTSCR-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX12-FLASTSCR-NEXT: scratch_store_b32 v1, v0, off +; GFX12-FLASTSCR-NEXT: s_setpc_b64 s[30:31] %gep.r = getelementptr i16, ptr addrspace(5) %r, i64 1 store i16 1, ptr addrspace(5) %r, align 4 store i16 2, ptr addrspace(5) %gep.r, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll index ad4d4a4a30fc6..93e8630dc7f56 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12 %s ; vgpr offset @@ -22,6 +23,13 @@ define amdgpu_ps void @test_scratch_load_i8_zext_v(ptr addrspace(5) %in, ptr %ou ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_scratch_load_i8_zext_v: +; GFX12: ; %bb.0: +; GFX12-NEXT: scratch_load_u8 v0, v0, off offset:1 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: flat_store_b32 v[1:2], v0 +; GFX12-NEXT: s_endpgm %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i32 1 %load = load i8, ptr addrspace(5) %gep, align 4 %ext = zext i8 %load to i32 @@ -47,6 +55,13 @@ define amdgpu_ps void @test_scratch_load_i8_sext_v(ptr addrspace(5) %in, ptr %ou ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_scratch_load_i8_sext_v: +; GFX12: ; %bb.0: +; GFX12-NEXT: scratch_load_i8 v0, v0, off offset:1 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: flat_store_b32 v[1:2], v0 +; GFX12-NEXT: s_endpgm %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i32 1 %load = load i8, ptr addrspace(5) %gep, align 4 %ext = sext i8 %load to i32 @@ -72,6 +87,13 @@ define amdgpu_ps void @test_scratch_load_i16_zext_v(ptr addrspace(5) %in, ptr %o ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_scratch_load_i16_zext_v: +; GFX12: ; %bb.0: +; GFX12-NEXT: scratch_load_u16 v0, v0, off offset:2 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: flat_store_b32 v[1:2], v0 +; GFX12-NEXT: s_endpgm %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i32 1 %load = load i16, ptr addrspace(5) %gep, align 4 %ext = zext i16 %load to i32 @@ -97,6 +119,13 @@ define amdgpu_ps void @test_scratch_load_i16_sext_v(ptr addrspace(5) %in, ptr %o ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: 
flat_store_b32 v[1:2], v0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_scratch_load_i16_sext_v: +; GFX12: ; %bb.0: +; GFX12-NEXT: scratch_load_i16 v0, v0, off offset:2 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: flat_store_b32 v[1:2], v0 +; GFX12-NEXT: s_endpgm %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i32 1 %load = load i16, ptr addrspace(5) %gep, align 4 %ext = sext i16 %load to i32 @@ -125,6 +154,14 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_lo_v(ptr addrspace(5) %i ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v3 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_lo_v: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v3, 0xffff0000 +; GFX12-NEXT: scratch_load_d16_u8 v3, v0, off offset:1 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: flat_store_b32 v[1:2], v3 +; GFX12-NEXT: s_endpgm bb: %gep = getelementptr i8, ptr addrspace(5) %in, i64 1 %load_lo = load i8, ptr addrspace(5) %gep @@ -155,6 +192,14 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_lo_v(ptr addrspace(5) %i ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v3 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_lo_v: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v3, 0xffff0000 +; GFX12-NEXT: scratch_load_d16_i8 v3, v0, off offset:1 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: flat_store_b32 v[1:2], v3 +; GFX12-NEXT: s_endpgm bb: %gep = getelementptr i8, ptr addrspace(5) %in, i64 1 %load_lo = load i8, ptr addrspace(5) %gep @@ -185,6 +230,14 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_lo_v(ptr addrspace(5) %in, p ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v3 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_scratch_load_i16_to_d16_lo_v: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v3, 0xffff0000 +; GFX12-NEXT: scratch_load_d16_b16 v3, v0, off offset:2 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: flat_store_b32 v[1:2], v3 +; GFX12-NEXT: s_endpgm bb: %gep = getelementptr i16, ptr addrspace(5) %in, i64 1 %load_lo = load i16, ptr addrspace(5) %gep @@ -215,6 +268,14 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_v(ptr addrspace(5) %i ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v3 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_hi_v: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v3, -1 +; GFX12-NEXT: scratch_load_d16_hi_u8 v3, v0, off offset:1 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: flat_store_b32 v[1:2], v3 +; GFX12-NEXT: s_endpgm bb: %gep = getelementptr i8, ptr addrspace(5) %in, i64 1 %load_lo = load i8, ptr addrspace(5) %gep @@ -245,6 +306,14 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_v(ptr addrspace(5) %i ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v3 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_hi_v: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v3, -1 +; GFX12-NEXT: scratch_load_d16_hi_i8 v3, v0, off offset:1 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: flat_store_b32 v[1:2], v3 +; GFX12-NEXT: s_endpgm bb: %gep = getelementptr i8, ptr addrspace(5) %in, i64 1 %load_lo = load i8, ptr addrspace(5) %gep @@ -275,6 +344,14 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_v(ptr addrspace(5) %in, p ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v3 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_scratch_load_i16_to_d16_hi_v: +; 
GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v3, -1 +; GFX12-NEXT: scratch_load_d16_hi_b16 v3, v0, off offset:2 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: flat_store_b32 v[1:2], v3 +; GFX12-NEXT: s_endpgm bb: %gep = getelementptr i16, ptr addrspace(5) %in, i64 1 %load_lo = load i16, ptr addrspace(5) %gep @@ -303,6 +380,13 @@ define amdgpu_ps void @test_scratch_store_b8_from_d16_hi_v(ptr %in, ptr addrspac ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: scratch_store_d16_hi_b8 v1, v0, off ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_scratch_store_b8_from_d16_hi_v: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: flat_load_b32 v0, v[0:1] +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: scratch_store_d16_hi_b8 v2, v0, off offset:4 +; GFX12-NEXT: s_endpgm bb: %load = load <4 x i8>, ptr %in %element = extractelement <4 x i8> %load, i32 2 @@ -331,6 +415,13 @@ define amdgpu_ps void @test_scratch_store_b16_from_d16_hi_v(ptr %in, ptr addrspa ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: scratch_store_d16_hi_b16 v1, v0, off ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_scratch_store_b16_from_d16_hi_v: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: flat_load_b32 v0, v[0:1] +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: scratch_store_d16_hi_b16 v2, v0, off offset:2 +; GFX12-NEXT: s_endpgm bb: %load = load <2 x i16>, ptr %in %element = extractelement <2 x i16> %load, i32 1 @@ -362,6 +453,13 @@ define amdgpu_ps void @test_scratch_load_i8_zext_s(ptr addrspace(5) inreg %in, p ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_scratch_load_i8_zext_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: scratch_load_u8 v2, off, s0 offset:1 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-NEXT: s_endpgm %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i32 1 %load = load i8, ptr addrspace(5) %gep, align 4 %ext = zext i8 %load to i32 @@ -387,6 +485,13 @@ define amdgpu_ps void @test_scratch_load_i8_sext_s(ptr addrspace(5) inreg %in, p ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_scratch_load_i8_sext_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: scratch_load_i8 v2, off, s0 offset:1 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-NEXT: s_endpgm %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i32 1 %load = load i8, ptr addrspace(5) %gep, align 4 %ext = sext i8 %load to i32 @@ -412,6 +517,13 @@ define amdgpu_ps void @test_scratch_load_i16_zext_s(ptr addrspace(5) inreg %in, ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_scratch_load_i16_zext_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: scratch_load_u16 v2, off, s0 offset:2 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-NEXT: s_endpgm %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i32 1 %load = load i16, ptr addrspace(5) %gep, align 4 %ext = zext i16 %load to i32 @@ -437,6 +549,13 @@ define amdgpu_ps void @test_scratch_load_i16_sext_s(ptr addrspace(5) inreg %in, ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_scratch_load_i16_sext_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: scratch_load_i16 v2, off, s0 offset:2 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-NEXT: s_endpgm %gep = getelementptr inbounds i16, ptr 
addrspace(5) %in, i32 1 %load = load i16, ptr addrspace(5) %gep, align 4 %ext = sext i16 %load to i32 @@ -466,6 +585,14 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_lo_s(ptr addrspace(5) in ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_lo_s: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v2, 0xffff0000 +; GFX12-NEXT: scratch_load_d16_u8 v2, off, s0 offset:1 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-NEXT: s_endpgm bb: %gep = getelementptr i8, ptr addrspace(5) %in, i64 1 %load_lo = load i8, ptr addrspace(5) %gep @@ -497,6 +624,14 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_lo_s(ptr addrspace(5) in ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_lo_s: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v2, 0xffff0000 +; GFX12-NEXT: scratch_load_d16_i8 v2, off, s0 offset:1 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-NEXT: s_endpgm bb: %gep = getelementptr i8, ptr addrspace(5) %in, i64 1 %load_lo = load i8, ptr addrspace(5) %gep @@ -528,6 +663,14 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_lo_s(ptr addrspace(5) inreg ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_scratch_load_i16_to_d16_lo_s: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v2, 0xffff0000 +; GFX12-NEXT: scratch_load_d16_b16 v2, off, s0 offset:2 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-NEXT: s_endpgm bb: %gep = getelementptr i16, ptr addrspace(5) %in, i64 1 %load_lo = load i16, ptr addrspace(5) %gep @@ -559,6 +702,14 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_s(ptr addrspace(5) in ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_hi_s: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v2, -1 +; GFX12-NEXT: scratch_load_d16_hi_u8 v2, off, s0 offset:1 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-NEXT: s_endpgm bb: %gep = getelementptr i8, ptr addrspace(5) %in, i64 1 %load_lo = load i8, ptr addrspace(5) %gep @@ -590,6 +741,14 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_s(ptr addrspace(5) in ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_hi_s: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v2, -1 +; GFX12-NEXT: scratch_load_d16_hi_i8 v2, off, s0 offset:1 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-NEXT: s_endpgm bb: %gep = getelementptr i8, ptr addrspace(5) %in, i64 1 %load_lo = load i8, ptr addrspace(5) %gep @@ -621,6 +780,14 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_s(ptr addrspace(5) inreg ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_scratch_load_i16_to_d16_hi_s: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v2, -1 +; GFX12-NEXT: scratch_load_d16_hi_b16 v2, off, s0 offset:2 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-NEXT: s_endpgm bb: %gep = getelementptr i16, ptr addrspace(5) %in, i64 1 %load_lo = load i16, ptr addrspace(5) 
%gep @@ -649,6 +816,13 @@ define amdgpu_ps void @test_scratch_store_b8_from_d16_hi_s(ptr %in, ptr addrspac ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: scratch_store_d16_hi_b8 off, v0, s0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_scratch_store_b8_from_d16_hi_s: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: flat_load_b32 v0, v[0:1] +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: scratch_store_d16_hi_b8 off, v0, s0 offset:4 +; GFX12-NEXT: s_endpgm bb: %load = load <4 x i8>, ptr %in %element = extractelement <4 x i8> %load, i32 2 @@ -677,6 +851,13 @@ define amdgpu_ps void @test_scratch_store_b16_from_d16_hi_s(ptr %in, ptr addrspa ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: scratch_store_d16_hi_b16 off, v0, s0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_scratch_store_b16_from_d16_hi_s: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: flat_load_b32 v0, v[0:1] +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: scratch_store_d16_hi_b16 off, v0, s0 offset:2 +; GFX12-NEXT: s_endpgm bb: %load = load <2 x i16>, ptr %in %element = extractelement <2 x i16> %load, i32 1 @@ -710,6 +891,14 @@ define amdgpu_ps void @test_scratch_load_i8_zext_svs(ptr addrspace(5) inreg %in, ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_scratch_load_i8_zext_svs: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0 +; GFX12-NEXT: scratch_load_u8 v0, v0, off offset:1 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: flat_store_b32 v[1:2], v0 +; GFX12-NEXT: s_endpgm %voffset4 = mul i32 %voffset, 4 %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4 %gep = getelementptr inbounds i8, ptr addrspace(5) %gep0, i32 1 @@ -739,6 +928,14 @@ define amdgpu_ps void @test_scratch_load_i8_sext_svs(ptr addrspace(5) inreg %in, ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_scratch_load_i8_sext_svs: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0 +; GFX12-NEXT: scratch_load_i8 v0, v0, off offset:1 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: flat_store_b32 v[1:2], v0 +; GFX12-NEXT: s_endpgm %voffset4 = mul i32 %voffset, 4 %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4 %gep = getelementptr inbounds i8, ptr addrspace(5) %gep0, i32 1 @@ -768,6 +965,14 @@ define amdgpu_ps void @test_scratch_load_i16_zext_svs(ptr addrspace(5) inreg %in ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_scratch_load_i16_zext_svs: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0 +; GFX12-NEXT: scratch_load_u16 v0, v0, off offset:2 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: flat_store_b32 v[1:2], v0 +; GFX12-NEXT: s_endpgm %voffset4 = mul i32 %voffset, 4 %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4 %gep = getelementptr inbounds i16, ptr addrspace(5) %gep0, i32 1 @@ -797,6 +1002,14 @@ define amdgpu_ps void @test_scratch_load_i16_sext_svs(ptr addrspace(5) inreg %in ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_scratch_load_i16_sext_svs: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0 +; GFX12-NEXT: scratch_load_i16 v0, v0, off offset:2 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: flat_store_b32 v[1:2], v0 +; GFX12-NEXT: s_endpgm %voffset4 = mul i32 %voffset, 4 %gep0 = getelementptr inbounds i8, ptr addrspace(5) 
%in, i32 %voffset4 %gep = getelementptr inbounds i16, ptr addrspace(5) %gep0, i32 1 @@ -830,6 +1043,15 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_lo_svs(ptr addrspace(5) ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v3 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_lo_svs: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0 +; GFX12-NEXT: v_mov_b32_e32 v3, 0xffff0000 +; GFX12-NEXT: scratch_load_d16_u8 v3, v0, off offset:1 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: flat_store_b32 v[1:2], v3 +; GFX12-NEXT: s_endpgm bb: %voffset4 = mul i32 %voffset, 4 %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4 @@ -865,6 +1087,15 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_lo_svs(ptr addrspace(5) ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v3 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_lo_svs: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0 +; GFX12-NEXT: v_mov_b32_e32 v3, 0xffff0000 +; GFX12-NEXT: scratch_load_d16_i8 v3, v0, off offset:1 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: flat_store_b32 v[1:2], v3 +; GFX12-NEXT: s_endpgm bb: %voffset4 = mul i32 %voffset, 4 %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4 @@ -900,6 +1131,15 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_lo_svs(ptr addrspace(5) inre ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v3 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_scratch_load_i16_to_d16_lo_svs: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0 +; GFX12-NEXT: v_mov_b32_e32 v3, 0xffff0000 +; GFX12-NEXT: scratch_load_d16_b16 v3, v0, off offset:2 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: flat_store_b32 v[1:2], v3 +; GFX12-NEXT: s_endpgm bb: %voffset4 = mul i32 %voffset, 4 %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4 @@ -935,6 +1175,15 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_svs(ptr addrspace(5) ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v3 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_hi_svs: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0 +; GFX12-NEXT: v_mov_b32_e32 v3, -1 +; GFX12-NEXT: scratch_load_d16_hi_u8 v3, v0, off offset:1 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: flat_store_b32 v[1:2], v3 +; GFX12-NEXT: s_endpgm bb: %voffset4 = mul i32 %voffset, 4 %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4 @@ -970,6 +1219,15 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_svs(ptr addrspace(5) ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v3 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_hi_svs: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0 +; GFX12-NEXT: v_mov_b32_e32 v3, -1 +; GFX12-NEXT: scratch_load_d16_hi_i8 v3, v0, off offset:1 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: flat_store_b32 v[1:2], v3 +; GFX12-NEXT: s_endpgm bb: %voffset4 = mul i32 %voffset, 4 %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4 @@ -1005,6 +1263,15 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_svs(ptr addrspace(5) inre ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v3 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_scratch_load_i16_to_d16_hi_svs: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_lshl_add_u32 v0, 
v0, 2, s0 +; GFX12-NEXT: v_mov_b32_e32 v3, -1 +; GFX12-NEXT: scratch_load_d16_hi_b16 v3, v0, off offset:2 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: flat_store_b32 v[1:2], v3 +; GFX12-NEXT: s_endpgm bb: %voffset4 = mul i32 %voffset, 4 %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4 @@ -1038,6 +1305,14 @@ define amdgpu_ps void @test_scratch_store_b8_from_d16_hi_svs(ptr %in, ptr addrsp ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: scratch_store_d16_hi_b8 v1, v0, off ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_scratch_store_b8_from_d16_hi_svs: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: flat_load_b32 v0, v[0:1] +; GFX12-NEXT: v_lshl_add_u32 v1, v2, 2, s0 +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: scratch_store_d16_hi_b8 v1, v0, off offset:4 +; GFX12-NEXT: s_endpgm bb: %load = load <4 x i8>, ptr %in %element = extractelement <4 x i8> %load, i32 2 @@ -1071,6 +1346,14 @@ define amdgpu_ps void @test_scratch_store_b16_from_d16_hi_svs(ptr %in, ptr addrs ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: scratch_store_d16_hi_b16 v1, v0, off ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_scratch_store_b16_from_d16_hi_svs: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: flat_load_b32 v0, v[0:1] +; GFX12-NEXT: v_lshl_add_u32 v1, v2, 2, s0 +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: scratch_store_d16_hi_b16 v1, v0, off offset:2 +; GFX12-NEXT: s_endpgm bb: %load = load <2 x i16>, ptr %in %element = extractelement <2 x i16> %load, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll index 8d28c7845167a..04eb6dcff4632 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll @@ -3,6 +3,8 @@ ; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx940 < %s | FileCheck %s -check-prefixes=GFX940-GISEL ; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GFX11-SDAG ; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GFX11-GISEL +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck %s -check-prefixes=GFX12-SDAG +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck %s -check-prefixes=GFX12-GISEL ; Test flat scratch SVS addressing mode with various combinations of alignment ; of soffset, voffset and inst_offset. 
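; To make the shape above concrete, here is a minimal sketch of the kind of
; input the soffN_voffM tests exercise. It is illustrative only and not part
; of this patch: the function name @svs_example and the exact IR are
; assumptions modeled on the tests that follow. A uniform kernel argument
; contributes the scalar (soffset) part of the address, the workitem id
; contributes the divergent (voffset) part, and the trailing constant GEP is
; a candidate for the instruction's immediate inst_offset field; the backend
; decides how to split the sum across those three components.

define amdgpu_kernel void @svs_example(i32 %soff) {
bb:
  %a = alloca i8, i32 64, align 4, addrspace(5)
  %voff = tail call i32 @llvm.amdgcn.workitem.id.x()
  %p.s = getelementptr inbounds i8, ptr addrspace(5) %a, i32 %soff
  %p.sv = getelementptr inbounds i8, ptr addrspace(5) %p.s, i32 %voff
  %p.svi = getelementptr inbounds i8, ptr addrspace(5) %p.sv, i32 1
  store volatile i8 1, ptr addrspace(5) %p.svi, align 1
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()

; The volatile stores keep the accesses from being merged or reordered; in the
; checks below they surface as "dlc" on GFX11 and as the "th:TH_STORE_NT_RT"
; hint on GFX12.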
@@ -86,6 +88,36 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v3, off dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: soff1_voff1: +; GFX12-SDAG: ; %bb.0: ; %bb +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 4 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: soff1_voff1: +; GFX12-GISEL: ; %bb.0: ; %bb +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT +; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT +; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT +; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-GISEL-NEXT: s_endpgm bb: %soff1 = mul i32 %soff, 1 %a = alloca i8, i32 64, align 4, addrspace(5) @@ -182,6 +214,38 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v3, off dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: soff1_voff2: +; GFX12-SDAG: ; %bb.0: ; %bb +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: soff1_voff2: +; GFX12-GISEL: ; %bb.0: ; %bb +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT +; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT +; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT +; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-GISEL-NEXT: s_endpgm bb: %soff1 = mul i32 %soff, 1 %a = alloca i8, i32 64, align 4, addrspace(5) 
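; Reading the soffN_voffM names, judging from the checks in this file: the IR
; scales the uniform offset by N (e.g. "%soff1 = mul i32 %soff, 1" above) and
; the workitem offset by M, so the effective scratch address is roughly
;
;   addr = 4 + N * soff + M * voff + inst_offset
;
; where 4 appears to be the object's base offset in the scratch frame (the
; constant in v_add3_u32) and inst_offset is the small per-store constant
; (1, 2, 4). Power-of-two scales show up directly in the selected GFX12 code:
; s_lshl_b32 for the SGPR factor and v_lshlrev_b32 (fused here into
; v_dual_lshlrev_b32) for the VGPR factor.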
@@ -278,6 +342,38 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v3, off dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: soff1_voff4: +; GFX12-SDAG: ; %bb.0: ; %bb +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: soff1_voff4: +; GFX12-GISEL: ; %bb.0: ; %bb +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT +; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT +; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT +; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-GISEL-NEXT: s_endpgm bb: %soff1 = mul i32 %soff, 1 %a = alloca i8, i32 64, align 4, addrspace(5) @@ -375,6 +471,40 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v3, off dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: soff2_voff1: +; GFX12-SDAG: ; %bb.0: ; %bb +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 4 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: soff2_voff1: +; GFX12-GISEL: ; %bb.0: ; %bb +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT +; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT +; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-GISEL-NEXT: 
scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT +; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-GISEL-NEXT: s_endpgm bb: %soff2 = mul i32 %soff, 2 %a = alloca i8, i32 64, align 4, addrspace(5) @@ -475,6 +605,40 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v3, off dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: soff2_voff2: +; GFX12-SDAG: ; %bb.0: ; %bb +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: soff2_voff2: +; GFX12-GISEL: ; %bb.0: ; %bb +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT +; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT +; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT +; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-GISEL-NEXT: s_endpgm bb: %soff2 = mul i32 %soff, 2 %a = alloca i8, i32 64, align 4, addrspace(5) @@ -575,6 +739,40 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v3, off dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: soff2_voff4: +; GFX12-SDAG: ; %bb.0: ; %bb +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: soff2_voff4: +; GFX12-GISEL: ; %bb.0: ; %bb +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-GISEL-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT +; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT +; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT +; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-GISEL-NEXT: s_endpgm bb: %soff2 = mul i32 %soff, 2 %a = alloca i8, i32 64, align 4, addrspace(5) @@ -672,6 +870,40 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v3, off dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: soff4_voff1: +; GFX12-SDAG: ; %bb.0: ; %bb +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 4 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: soff4_voff1: +; GFX12-GISEL: ; %bb.0: ; %bb +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT +; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT +; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT +; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-GISEL-NEXT: s_endpgm bb: %soff4 = mul i32 %soff, 4 %a = alloca i8, i32 64, align 4, addrspace(5) @@ -772,6 +1004,40 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v3, off dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: soff4_voff2: +; GFX12-SDAG: ; %bb.0: ; %bb +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: s_waitcnt_vscnt 
null, 0x0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: soff4_voff2: +; GFX12-GISEL: ; %bb.0: ; %bb +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT +; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT +; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT +; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-GISEL-NEXT: s_endpgm bb: %soff4 = mul i32 %soff, 4 %a = alloca i8, i32 64, align 4, addrspace(5) @@ -870,6 +1136,40 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v3, off dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: soff4_voff4: +; GFX12-SDAG: ; %bb.0: ; %bb +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT +; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: soff4_voff4: +; GFX12-GISEL: ; %bb.0: ; %bb +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT +; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT +; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT +; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-GISEL-NEXT: s_endpgm bb: %soff4 = mul i32 %soff, 4 %a = alloca i8, i32 64, align 4, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index f1a2fe8867753..98379f5e3c68b 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -2,11 +2,13 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s ; RUN: llc 
-mtriple=amdgcn -mcpu=gfx1100 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX12 %s ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9-PAL %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940 %s ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1010-PAL %s ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1030-PAL %s ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-PAL %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12-PAL %s define amdgpu_kernel void @zero_init_kernel() { ; GFX9-LABEL: zero_init_kernel: @@ -63,6 +65,22 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:4 ; GFX11-NEXT: s_endpgm ; +; GFX12-LABEL: zero_init_kernel: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:52 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:36 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:20 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:4 +; GFX12-NEXT: s_endpgm +; ; GFX9-PAL-LABEL: zero_init_kernel: ; GFX9-PAL: ; %bb.0: ; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] @@ -165,6 +183,22 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:20 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:4 ; GFX11-PAL-NEXT: s_endpgm +; +; GFX12-PAL-LABEL: zero_init_kernel: +; GFX12-PAL: ; %bb.0: +; GFX12-PAL-NEXT: s_mov_b32 s0, 0 +; GFX12-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-PAL-NEXT: s_mov_b32 s1, s0 +; GFX12-PAL-NEXT: s_mov_b32 s2, s0 +; GFX12-PAL-NEXT: s_mov_b32 s3, s0 +; GFX12-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-PAL-NEXT: s_clause 0x3 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:52 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:36 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:20 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:4 +; GFX12-PAL-NEXT: s_endpgm %alloca = alloca [32 x i16], align 2, addrspace(5) call void @llvm.memset.p5.i64(ptr addrspace(5) align 2 dereferenceable(64) %alloca, i8 0, i64 64, i1 false) ret void @@ -223,6 +257,23 @@ define void @zero_init_foo() { ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: zero_init_foo: +; GFX12: ; 
%bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:48 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:32 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-PAL-LABEL: zero_init_foo: ; GFX9-PAL: ; %bb.0: ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -290,6 +341,23 @@ define void @zero_init_foo() { ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-PAL-LABEL: zero_init_foo: +; GFX12-PAL: ; %bb.0: +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-PAL-NEXT: s_mov_b32 s0, 0 +; GFX12-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-PAL-NEXT: s_mov_b32 s1, s0 +; GFX12-PAL-NEXT: s_mov_b32 s2, s0 +; GFX12-PAL-NEXT: s_mov_b32 s3, s0 +; GFX12-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-PAL-NEXT: s_clause 0x3 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:48 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:32 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 +; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] %alloca = alloca [32 x i16], align 2, addrspace(5) call void @llvm.memset.p5.i64(ptr addrspace(5) align 2 dereferenceable(64) %alloca, i8 0, i64 64, i1 false) ret void @@ -350,6 +418,22 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm ; +; GFX12-LABEL: store_load_sindex_kernel: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 15 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_and_b32 s1, s0, 15 +; GFX12-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-NEXT: s_lshl_b32 s1, s1, 2 +; GFX12-NEXT: s_add_co_i32 s0, s0, 4 +; GFX12-NEXT: s_add_co_i32 s1, s1, 4 +; GFX12-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_endpgm +; ; GFX9-PAL-LABEL: store_load_sindex_kernel: ; GFX9-PAL: ; %bb.0: ; %bb ; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] @@ -428,6 +512,22 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_endpgm +; +; GFX12-PAL-LABEL: store_load_sindex_kernel: +; GFX12-PAL: ; %bb.0: ; %bb +; GFX12-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX12-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15 +; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-PAL-NEXT: s_lshl_b32 s1, s1, 2 +; GFX12-PAL-NEXT: s_add_co_i32 s0, s0, 4 +; GFX12-PAL-NEXT: s_add_co_i32 s1, s1, 4 +; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT +; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT +; 
GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: s_endpgm bb: %i = alloca [32 x float], align 4, addrspace(5) %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx @@ -487,6 +587,20 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm ; +; GFX12-LABEL: store_load_sindex_foo: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v0, 15 +; GFX12-NEXT: s_and_b32 s1, s0, 15 +; GFX12-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-NEXT: s_lshl_b32 s1, s1, 2 +; GFX12-NEXT: s_add_co_i32 s0, s0, 4 +; GFX12-NEXT: s_add_co_i32 s1, s1, 4 +; GFX12-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_endpgm +; ; GFX9-PAL-LABEL: store_load_sindex_foo: ; GFX9-PAL: ; %bb.0: ; %bb ; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] @@ -558,6 +672,20 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_endpgm +; +; GFX12-PAL-LABEL: store_load_sindex_foo: +; GFX12-PAL: ; %bb.0: ; %bb +; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15 +; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-PAL-NEXT: s_lshl_b32 s1, s1, 2 +; GFX12-PAL-NEXT: s_add_co_i32 s0, s0, 4 +; GFX12-PAL-NEXT: s_add_co_i32 s1, s1, 4 +; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT +; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: s_endpgm bb: %i = alloca [32 x float], align 4, addrspace(5) %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx @@ -610,6 +738,17 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm ; +; GFX12-LABEL: store_load_vindex_kernel: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_sub_nc_u32_e32 v2, 4, v0 +; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:4 th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:124 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_endpgm +; ; GFX9-PAL-LABEL: store_load_vindex_kernel: ; GFX9-PAL: ; %bb.0: ; %bb ; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] @@ -671,6 +810,17 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX11-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_endpgm +; +; GFX12-PAL-LABEL: store_load_vindex_kernel: +; GFX12-PAL: ; %bb.0: ; %bb +; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, 4, v0 +; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:4 th:TH_STORE_NT_RT +; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 th:TH_LOAD_RT_NT +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: s_endpgm bb: %i = alloca [32 x float], align 4, addrspace(5) %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -724,6 +874,19 @@ define void @store_load_vindex_foo(i32 %idx) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: 
store_load_vindex_foo: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 +; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s32 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX12-NEXT: scratch_store_b32 v0, v2, off th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_load_b32 v0, v1, s32 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-PAL-LABEL: store_load_vindex_foo: ; GFX9-PAL: ; %bb.0: ; %bb ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -777,6 +940,19 @@ define void @store_load_vindex_foo(i32 %idx) { ; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, s32 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-PAL-LABEL: store_load_vindex_foo: +; GFX12-PAL: ; %bb.0: ; %bb +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 +; GFX12-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s32 +; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX12-PAL-NEXT: scratch_store_b32 v0, v2, off th:TH_STORE_NT_RT +; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-PAL-NEXT: scratch_load_b32 v0, v1, s32 th:TH_LOAD_RT_NT +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] bb: %i = alloca [32 x float], align 4, addrspace(5) %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx @@ -810,6 +986,13 @@ define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) { ; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: private_ptr_foo: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v1, 0x41200000 +; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:4 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-PAL-LABEL: private_ptr_foo: ; GFX9-PAL: ; %bb.0: ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -839,6 +1022,13 @@ define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) { ; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 0x41200000 ; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:4 ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-PAL-LABEL: private_ptr_foo: +; GFX12-PAL: ; %bb.0: +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-PAL-NEXT: v_mov_b32_e32 v1, 0x41200000 +; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:4 +; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds float, ptr addrspace(5) %arg, i32 1 store float 1.000000e+01, ptr addrspace(5) %gep, align 4 ret void @@ -905,6 +1095,24 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:308 ; GFX11-NEXT: s_endpgm ; +; GFX12-LABEL: zero_init_small_offset_kernel: +; GFX12: ; %bb.0: +; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:260 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], off 
offset:276 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:292 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:308 +; GFX12-NEXT: s_endpgm +; ; GFX9-PAL-LABEL: zero_init_small_offset_kernel: ; GFX9-PAL: ; %bb.0: ; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] @@ -1017,6 +1225,24 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:292 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:308 ; GFX11-PAL-NEXT: s_endpgm +; +; GFX12-PAL-LABEL: zero_init_small_offset_kernel: +; GFX12-PAL: ; %bb.0: +; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: s_mov_b32 s0, 0 +; GFX12-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-PAL-NEXT: s_mov_b32 s1, s0 +; GFX12-PAL-NEXT: s_mov_b32 s2, s0 +; GFX12-PAL-NEXT: s_mov_b32 s3, s0 +; GFX12-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-PAL-NEXT: s_clause 0x3 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:260 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:276 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:292 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:308 +; GFX12-PAL-NEXT: s_endpgm %padding = alloca [64 x i32], align 4, addrspace(5) %alloca = alloca [32 x i16], align 2, addrspace(5) %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef @@ -1084,6 +1310,25 @@ define void @zero_init_small_offset_foo() { ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:304 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: zero_init_small_offset_foo: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: scratch_load_b32 v0, off, s32 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:256 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:272 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:288 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:304 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-PAL-LABEL: zero_init_small_offset_foo: ; GFX9-PAL: ; %bb.0: ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1159,6 +1404,25 @@ define void @zero_init_small_offset_foo() { ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:288 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:304 ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-PAL-LABEL: zero_init_small_offset_foo: +; GFX12-PAL: ; %bb.0: +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s32 th:TH_LOAD_RT_NT +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: s_mov_b32 s0, 0 +; GFX12-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-PAL-NEXT: s_mov_b32 s1, s0 +; GFX12-PAL-NEXT: s_mov_b32 s2, s0 +; GFX12-PAL-NEXT: s_mov_b32 s3, s0 +; GFX12-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-PAL-NEXT: s_clause 0x3 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 
offset:256 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:272 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:288 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:304 +; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] %padding = alloca [64 x i32], align 4, addrspace(5) %alloca = alloca [32 x i16], align 2, addrspace(5) %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef @@ -1228,6 +1492,24 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm ; +; GFX12-LABEL: store_load_sindex_small_offset_kernel: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v0, 15 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_and_b32 s1, s0, 15 +; GFX12-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-NEXT: s_lshl_b32 s1, s1, 2 +; GFX12-NEXT: s_addk_co_i32 s0, 0x104 +; GFX12-NEXT: s_addk_co_i32 s1, 0x104 +; GFX12-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_endpgm +; ; GFX9-PAL-LABEL: store_load_sindex_small_offset_kernel: ; GFX9-PAL: ; %bb.0: ; %bb ; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] @@ -1343,6 +1625,24 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_endpgm +; +; GFX12-PAL-LABEL: store_load_sindex_small_offset_kernel: +; GFX12-PAL: ; %bb.0: ; %bb +; GFX12-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX12-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15 +; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-PAL-NEXT: s_lshl_b32 s1, s1, 2 +; GFX12-PAL-NEXT: s_addk_co_i32 s0, 0x104 +; GFX12-PAL-NEXT: s_addk_co_i32 s1, 0x104 +; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT +; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: s_endpgm bb: %padding = alloca [64 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) @@ -1412,6 +1712,22 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm ; +; GFX12-LABEL: store_load_sindex_small_offset_foo: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v0, 15 +; GFX12-NEXT: s_and_b32 s1, s0, 15 +; GFX12-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-NEXT: s_lshl_b32 s1, s1, 2 +; GFX12-NEXT: s_addk_co_i32 s0, 0x104 +; GFX12-NEXT: s_addk_co_i32 s1, 0x104 +; GFX12-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_endpgm +; ; GFX9-PAL-LABEL: store_load_sindex_small_offset_foo: ; GFX9-PAL: ; %bb.0: ; %bb ; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] @@ -1518,6 +1834,22 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { ; GFX11-PAL-NEXT: 
scratch_load_b32 v0, off, s1 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_endpgm +; +; GFX12-PAL-LABEL: store_load_sindex_small_offset_foo: +; GFX12-PAL: ; %bb.0: ; %bb +; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15 +; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-PAL-NEXT: s_lshl_b32 s1, s1, 2 +; GFX12-PAL-NEXT: s_addk_co_i32 s0, 0x104 +; GFX12-PAL-NEXT: s_addk_co_i32 s1, 0x104 +; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT +; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: s_endpgm bb: %padding = alloca [64 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) @@ -1579,6 +1911,18 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm ; +; GFX12-LABEL: store_load_vindex_small_offset_kernel: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_sub_nc_u32_e32 v2, 0x104, v0 +; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:260 th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:124 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_endpgm +; ; GFX9-PAL-LABEL: store_load_vindex_small_offset_kernel: ; GFX9-PAL: ; %bb.0: ; %bb ; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] @@ -1672,6 +2016,18 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX11-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_endpgm +; +; GFX12-PAL-LABEL: store_load_vindex_small_offset_kernel: +; GFX12-PAL: ; %bb.0: ; %bb +; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-PAL-NEXT: scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x104, v0 +; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:260 th:TH_STORE_NT_RT +; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 th:TH_LOAD_RT_NT +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: s_endpgm bb: %padding = alloca [64 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) @@ -1737,6 +2093,21 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: store_load_vindex_small_offset_foo: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 +; GFX12-NEXT: s_add_co_i32 s0, s32, 0x100 +; GFX12-NEXT: scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0 +; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX12-NEXT: scratch_store_b32 v0, v2, off th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_load_b32 v0, v1, s32 offset:256 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo: ; GFX9-PAL: ; %bb.0: ; %bb ; GFX9-PAL-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1802,6 +2173,21 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:256 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-PAL-LABEL: store_load_vindex_small_offset_foo: +; GFX12-PAL: ; %bb.0: ; %bb +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 +; GFX12-PAL-NEXT: s_add_co_i32 s0, s32, 0x100 +; GFX12-PAL-NEXT: scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s0 +; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX12-PAL-NEXT: scratch_store_b32 v0, v2, off th:TH_STORE_NT_RT +; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:256 th:TH_LOAD_RT_NT +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] bb: %padding = alloca [64 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) @@ -1879,6 +2265,24 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 offset:48 ; GFX11-NEXT: s_endpgm ; +; GFX12-LABEL: zero_init_large_offset_kernel: +; GFX12: ; %bb.0: +; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:16388 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:16404 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:16420 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:16436 +; GFX12-NEXT: s_endpgm +; ; GFX9-PAL-LABEL: zero_init_large_offset_kernel: ; GFX9-PAL: ; %bb.0: ; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] @@ -1996,6 +2400,24 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s0 offset:32 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s0 offset:48 ; GFX11-PAL-NEXT: s_endpgm +; +; GFX12-PAL-LABEL: zero_init_large_offset_kernel: +; GFX12-PAL: ; %bb.0: +; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: s_mov_b32 s0, 0 +; GFX12-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-PAL-NEXT: s_mov_b32 s1, s0 +; GFX12-PAL-NEXT: s_mov_b32 s2, s0 +; GFX12-PAL-NEXT: s_mov_b32 s3, s0 +; GFX12-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-PAL-NEXT: s_clause 0x3 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16388 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16404 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16420 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16436 +; GFX12-PAL-NEXT: s_endpgm %padding = alloca [4096 x i32], align 4, addrspace(5) %alloca = alloca [32 x i16], align 2, addrspace(5) %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef @@ -2074,6 +2496,25 @@ define void @zero_init_large_offset_foo() { ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 
offset:48 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: zero_init_large_offset_foo: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: scratch_load_b32 v0, off, s32 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16384 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16400 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16416 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16432 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-PAL-LABEL: zero_init_large_offset_foo: ; GFX9-PAL: ; %bb.0: ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2190,6 +2631,25 @@ define void @zero_init_large_offset_foo() { ; GFX11-PAL-NEXT: s_add_i32 s0, s32, 0x4004 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s0 offset:48 ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-PAL-LABEL: zero_init_large_offset_foo: +; GFX12-PAL: ; %bb.0: +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s32 th:TH_LOAD_RT_NT +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: s_mov_b32 s0, 0 +; GFX12-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-PAL-NEXT: s_mov_b32 s1, s0 +; GFX12-PAL-NEXT: s_mov_b32 s2, s0 +; GFX12-PAL-NEXT: s_mov_b32 s3, s0 +; GFX12-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-PAL-NEXT: s_clause 0x3 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16384 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16400 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16416 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16432 +; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] %padding = alloca [4096 x i32], align 4, addrspace(5) %alloca = alloca [32 x i16], align 2, addrspace(5) %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef @@ -2259,6 +2719,24 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm ; +; GFX12-LABEL: store_load_sindex_large_offset_kernel: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v0, 15 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_and_b32 s1, s0, 15 +; GFX12-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-NEXT: s_lshl_b32 s1, s1, 2 +; GFX12-NEXT: s_addk_co_i32 s0, 0x4004 +; GFX12-NEXT: s_addk_co_i32 s1, 0x4004 +; GFX12-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_endpgm +; ; GFX9-PAL-LABEL: store_load_sindex_large_offset_kernel: ; GFX9-PAL: ; %bb.0: ; %bb ; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] @@ -2374,6 +2852,24 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_endpgm +; +; 
GFX12-PAL-LABEL: store_load_sindex_large_offset_kernel: +; GFX12-PAL: ; %bb.0: ; %bb +; GFX12-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX12-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15 +; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-PAL-NEXT: s_lshl_b32 s1, s1, 2 +; GFX12-PAL-NEXT: s_addk_co_i32 s0, 0x4004 +; GFX12-PAL-NEXT: s_addk_co_i32 s1, 0x4004 +; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT +; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: s_endpgm bb: %padding = alloca [4096 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) @@ -2443,6 +2939,22 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm ; +; GFX12-LABEL: store_load_sindex_large_offset_foo: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v0, 15 +; GFX12-NEXT: s_and_b32 s1, s0, 15 +; GFX12-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-NEXT: s_lshl_b32 s1, s1, 2 +; GFX12-NEXT: s_addk_co_i32 s0, 0x4004 +; GFX12-NEXT: s_addk_co_i32 s1, 0x4004 +; GFX12-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_endpgm +; ; GFX9-PAL-LABEL: store_load_sindex_large_offset_foo: ; GFX9-PAL: ; %bb.0: ; %bb ; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] @@ -2549,6 +3061,22 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) { ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_endpgm +; +; GFX12-PAL-LABEL: store_load_sindex_large_offset_foo: +; GFX12-PAL: ; %bb.0: ; %bb +; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15 +; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-PAL-NEXT: s_lshl_b32 s1, s1, 2 +; GFX12-PAL-NEXT: s_addk_co_i32 s0, 0x4004 +; GFX12-PAL-NEXT: s_addk_co_i32 s1, 0x4004 +; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT +; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: s_endpgm bb: %padding = alloca [4096 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) @@ -2611,6 +3139,18 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm ; +; GFX12-LABEL: store_load_vindex_large_offset_kernel: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_sub_nc_u32_e32 v2, 0x4004, v0 +; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:16388 th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:124 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_endpgm +; ; GFX9-PAL-LABEL: 
store_load_vindex_large_offset_kernel: ; GFX9-PAL: ; %bb.0: ; %bb ; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] @@ -2706,6 +3246,18 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX11-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_endpgm +; +; GFX12-PAL-LABEL: store_load_vindex_large_offset_kernel: +; GFX12-PAL: ; %bb.0: ; %bb +; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-PAL-NEXT: scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x4004, v0 +; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:16388 th:TH_STORE_NT_RT +; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 th:TH_LOAD_RT_NT +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: s_endpgm bb: %padding = alloca [4096 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) @@ -2772,6 +3324,21 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: store_load_vindex_large_offset_foo: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 +; GFX12-NEXT: s_add_co_i32 s0, s32, 0x4000 +; GFX12-NEXT: scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0 +; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX12-NEXT: scratch_store_b32 v0, v2, off th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_load_b32 v0, v1, s32 offset:16384 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo: ; GFX9-PAL: ; %bb.0: ; %bb ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2839,6 +3406,21 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, s0 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-PAL-LABEL: store_load_vindex_large_offset_foo: +; GFX12-PAL: ; %bb.0: ; %bb +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 +; GFX12-PAL-NEXT: s_add_co_i32 s0, s32, 0x4000 +; GFX12-PAL-NEXT: scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s0 +; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX12-PAL-NEXT: scratch_store_b32 v0, v2, off th:TH_STORE_NT_RT +; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:16384 th:TH_LOAD_RT_NT +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] bb: %padding = alloca [4096 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) @@ -2900,6 +3482,17 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm ; +; GFX12-LABEL: store_load_large_imm_offset_kernel: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15 +; GFX12-NEXT: scratch_store_b32 off, v0, off offset:4 th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_store_b32 off, v1, off offset:16004 th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt 
null, 0x0 +; GFX12-NEXT: scratch_load_b32 v0, off, off offset:16004 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_endpgm +; ; GFX9-PAL-LABEL: store_load_large_imm_offset_kernel: ; GFX9-PAL: ; %bb.0: ; %bb ; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] @@ -2993,6 +3586,17 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, off offset:3716 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_endpgm +; +; GFX12-PAL-LABEL: store_load_large_imm_offset_kernel: +; GFX12-PAL: ; %bb.0: ; %bb +; GFX12-PAL-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15 +; GFX12-PAL-NEXT: scratch_store_b32 off, v0, off offset:4 th:TH_STORE_NT_RT +; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-PAL-NEXT: scratch_store_b32 off, v1, off offset:16004 th:TH_STORE_NT_RT +; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:16004 th:TH_LOAD_RT_NT +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: s_endpgm bb: %i = alloca [4096 x i32], align 4, addrspace(5) %i1 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 undef @@ -3050,6 +3654,18 @@ define void @store_load_large_imm_offset_foo() { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: store_load_large_imm_offset_foo: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15 +; GFX12-NEXT: scratch_store_b32 off, v0, s32 th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_store_b32 off, v1, s32 offset:16000 th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_load_b32 v0, off, s32 offset:16000 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-PAL-LABEL: store_load_large_imm_offset_foo: ; GFX9-PAL: ; %bb.0: ; %bb ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -3108,6 +3724,18 @@ define void @store_load_large_imm_offset_foo() { ; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:3716 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-PAL-LABEL: store_load_large_imm_offset_foo: +; GFX12-PAL: ; %bb.0: ; %bb +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-PAL-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15 +; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s32 th:TH_STORE_NT_RT +; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-PAL-NEXT: scratch_store_b32 off, v1, s32 offset:16000 th:TH_STORE_NT_RT +; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s32 offset:16000 th:TH_LOAD_RT_NT +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] bb: %i = alloca [4096 x i32], align 4, addrspace(5) %i1 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 undef @@ -3166,6 +3794,19 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm ; +; GFX12-LABEL: store_load_vidx_sidx_offset: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, 4 +; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:1024 th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: 
scratch_load_b32 v0, v0, off offset:1024 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_endpgm +; ; GFX9-PAL-LABEL: store_load_vidx_sidx_offset: ; GFX9-PAL: ; %bb.0: ; %bb ; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] @@ -3234,6 +3875,19 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX11-PAL-NEXT: scratch_load_b32 v0, v0, off offset:1024 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_endpgm +; +; GFX12-PAL-LABEL: store_load_vidx_sidx_offset: +; GFX12-PAL: ; %bb.0: ; %bb +; GFX12-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX12-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s0, v0 +; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, 4 +; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:1024 th:TH_STORE_NT_RT +; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-PAL-NEXT: scratch_load_b32 v0, v0, off offset:1024 th:TH_LOAD_RT_NT +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: s_endpgm bb: %alloca = alloca [32 x i32], align 4, addrspace(5) %vidx = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -3278,6 +3932,16 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: store_load_i64_aligned: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 +; GFX12-NEXT: scratch_store_b64 v0, v[1:2], off th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_load_b64 v[0:1], v0, off th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-PAL-LABEL: store_load_i64_aligned: ; GFX9-PAL: ; %bb.0: ; %bb ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -3320,6 +3984,16 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) { ; GFX11-PAL-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-PAL-LABEL: store_load_i64_aligned: +; GFX12-PAL: ; %bb.0: ; %bb +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 +; GFX12-PAL-NEXT: scratch_store_b64 v0, v[1:2], off th:TH_STORE_NT_RT +; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-PAL-NEXT: scratch_load_b64 v[0:1], v0, off th:TH_LOAD_RT_NT +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] bb: store volatile i64 15, ptr addrspace(5) %arg, align 8 %load = load volatile i64, ptr addrspace(5) %arg, align 8 @@ -3359,6 +4033,16 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: store_load_i64_unaligned: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 +; GFX12-NEXT: scratch_store_b64 v0, v[1:2], off th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_load_b64 v[0:1], v0, off th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-PAL-LABEL: store_load_i64_unaligned: ; GFX9-PAL: ; %bb.0: ; %bb ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -3401,6 +4085,16 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) { ; GFX11-PAL-NEXT: scratch_load_b64 v[0:1], v0, 
off glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-PAL-LABEL: store_load_i64_unaligned: +; GFX12-PAL: ; %bb.0: ; %bb +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 +; GFX12-PAL-NEXT: scratch_store_b64 v0, v[1:2], off th:TH_STORE_NT_RT +; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-PAL-NEXT: scratch_load_b64 v[0:1], v0, off th:TH_LOAD_RT_NT +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] bb: store volatile i64 15, ptr addrspace(5) %arg, align 1 %load = load volatile i64, ptr addrspace(5) %arg, align 1 @@ -3443,6 +4137,17 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: store_load_v3i32_unaligned: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX12-NEXT: v_mov_b32_e32 v3, 3 +; GFX12-NEXT: scratch_store_b96 v0, v[1:3], off th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_load_b96 v[0:2], v0, off th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-PAL-LABEL: store_load_v3i32_unaligned: ; GFX9-PAL: ; %bb.0: ; %bb ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -3489,6 +4194,17 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) { ; GFX11-PAL-NEXT: scratch_load_b96 v[0:2], v0, off glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-PAL-LABEL: store_load_v3i32_unaligned: +; GFX12-PAL: ; %bb.0: ; %bb +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX12-PAL-NEXT: v_mov_b32_e32 v3, 3 +; GFX12-PAL-NEXT: scratch_store_b96 v0, v[1:3], off th:TH_STORE_NT_RT +; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-PAL-NEXT: scratch_load_b96 v[0:2], v0, off th:TH_LOAD_RT_NT +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] bb: store volatile <3 x i32> , ptr addrspace(5) %arg, align 1 %load = load volatile <3 x i32>, ptr addrspace(5) %arg, align 1 @@ -3533,6 +4249,17 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: store_load_v4i32_unaligned: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX12-NEXT: v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4 +; GFX12-NEXT: scratch_store_b128 v0, v[1:4], off th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_load_b128 v[0:3], v0, off th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-PAL-LABEL: store_load_v4i32_unaligned: ; GFX9-PAL: ; %bb.0: ; %bb ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -3582,6 +4309,17 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) { ; GFX11-PAL-NEXT: scratch_load_b128 v[0:3], v0, off glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-PAL-LABEL: store_load_v4i32_unaligned: +; GFX12-PAL: ; %bb.0: ; %bb +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX12-PAL-NEXT: 
v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4 +; GFX12-PAL-NEXT: scratch_store_b128 v0, v[1:4], off th:TH_STORE_NT_RT +; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-PAL-NEXT: scratch_load_b128 v[0:3], v0, off th:TH_LOAD_RT_NT +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] bb: store volatile <4 x i32> , ptr addrspace(5) %arg, align 1 %load = load volatile <4 x i32>, ptr addrspace(5) %arg, align 1 @@ -3620,6 +4358,16 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg) ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: store_load_i32_negative_unaligned: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v1, 1 +; GFX12-NEXT: scratch_store_b8 v0, v1, off offset:-1 th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_load_u8 v0, v0, off offset:-1 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-PAL-LABEL: store_load_i32_negative_unaligned: ; GFX9-PAL: ; %bb.0: ; %bb ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -3672,6 +4420,16 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg) ; GFX11-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-1 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-PAL-LABEL: store_load_i32_negative_unaligned: +; GFX12-PAL: ; %bb.0: ; %bb +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-PAL-NEXT: v_mov_b32_e32 v1, 1 +; GFX12-PAL-NEXT: scratch_store_b8 v0, v1, off offset:-1 th:TH_STORE_NT_RT +; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-1 th:TH_LOAD_RT_NT +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] bb: %ptr = getelementptr inbounds i8, ptr addrspace(5) %arg, i32 -1 store volatile i8 1, ptr addrspace(5) %ptr, align 1 @@ -3712,6 +4470,16 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: store_load_i32_large_negative_unaligned: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v1, 1 +; GFX12-NEXT: scratch_store_b8 v0, v1, off offset:-4225 th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_load_u8 v0, v0, off offset:-4225 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-PAL-LABEL: store_load_i32_large_negative_unaligned: ; GFX9-PAL: ; %bb.0: ; %bb ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -3765,6 +4533,16 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture ; GFX11-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-129 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-PAL-LABEL: store_load_i32_large_negative_unaligned: +; GFX12-PAL: ; %bb.0: ; %bb +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-PAL-NEXT: v_mov_b32_e32 v1, 1 +; GFX12-PAL-NEXT: scratch_store_b8 v0, v1, off offset:-4225 th:TH_STORE_NT_RT +; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-4225 th:TH_LOAD_RT_NT +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] bb: %ptr = getelementptr inbounds i8, ptr addrspace(5) %arg, i32 -4225 store volatile i8 1, ptr 
addrspace(5) %ptr, align 1 @@ -3842,6 +4620,25 @@ define amdgpu_ps void @large_offset() { ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_endpgm ; +; GFX12-LABEL: large_offset: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:3024 th:TH_STORE_NT_RT +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_load_b128 v[0:3], off, off offset:3024 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, 16 :: v_dual_mov_b32 v1, 0x810 +; GFX12-NEXT: ;;#ASMSTART +; GFX12-NEXT: ; use v0 +; GFX12-NEXT: ;;#ASMEND +; GFX12-NEXT: ;;#ASMSTART +; GFX12-NEXT: ; use v1 +; GFX12-NEXT: ;;#ASMEND +; GFX12-NEXT: s_endpgm +; ; GFX9-PAL-LABEL: large_offset: ; GFX9-PAL: ; %bb.0: ; %bb ; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] @@ -3940,6 +4737,25 @@ define amdgpu_ps void @large_offset() { ; GFX11-PAL-NEXT: ; use v1 ; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_endpgm +; +; GFX12-PAL-LABEL: large_offset: +; GFX12-PAL: ; %bb.0: ; %bb +; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-PAL-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0 +; GFX12-PAL-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:3024 th:TH_STORE_NT_RT +; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-PAL-NEXT: scratch_load_b128 v[0:3], off, off offset:3024 th:TH_LOAD_RT_NT +; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: v_dual_mov_b32 v0, 16 :: v_dual_mov_b32 v1, 0x810 +; GFX12-PAL-NEXT: ;;#ASMSTART +; GFX12-PAL-NEXT: ; use v0 +; GFX12-PAL-NEXT: ;;#ASMEND +; GFX12-PAL-NEXT: ;;#ASMSTART +; GFX12-PAL-NEXT: ; use v1 +; GFX12-PAL-NEXT: ;;#ASMEND +; GFX12-PAL-NEXT: s_endpgm bb: %alloca = alloca [128 x <4 x i32>], align 16, addrspace(5) %alloca2 = alloca [128 x <4 x i32>], align 16, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll index 9eb1b484e461a..f1879f2876678 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN1 %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN2 %s +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_add_i64_offset: @@ -32,6 +33,19 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") 
seq_cst @@ -76,6 +90,22 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst @@ -121,6 +151,24 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_add_u64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 @@ -170,6 +218,23 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 @@ -204,6 +269,19 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; 
GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile add ptr %out, i64 %in syncscope("agent") seq_cst ret void @@ -243,6 +321,22 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile add ptr %out, i64 %in syncscope("agent") seq_cst store i64 %tmp0, ptr %out2 @@ -283,6 +377,24 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_add_u64 v[2:3], v[0:1] +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile add ptr %ptr, i64 %in syncscope("agent") seq_cst @@ -327,6 +439,23 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile add ptr %ptr, i64 %in syncscope("agent") seq_cst @@ -364,6 +493,19 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; 
GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst @@ -408,6 +550,22 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst @@ -453,6 +611,24 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 @@ -502,6 +678,23 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: 
v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 @@ -536,6 +729,19 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile and ptr %out, i64 %in syncscope("agent") seq_cst ret void @@ -575,6 +781,22 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile and ptr %out, i64 %in syncscope("agent") seq_cst store i64 %tmp0, ptr %out2 @@ -615,6 +837,24 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile and ptr %ptr, i64 %in syncscope("agent") seq_cst @@ -659,6 +899,23 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 
v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile and ptr %ptr, i64 %in syncscope("agent") seq_cst @@ -696,6 +953,19 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst @@ -740,6 +1010,22 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst @@ -785,6 +1071,24 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 @@ -834,6 +1138,23 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: 
atomic_sub_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 @@ -868,6 +1189,19 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile sub ptr %out, i64 %in syncscope("agent") seq_cst ret void @@ -907,6 +1241,22 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile sub ptr %out, i64 %in syncscope("agent") seq_cst store i64 %tmp0, ptr %out2 @@ -947,6 +1297,24 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], v[0:1] +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile 
sub ptr %ptr, i64 %in syncscope("agent") seq_cst @@ -991,6 +1359,23 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile sub ptr %ptr, i64 %in syncscope("agent") seq_cst @@ -1026,6 +1411,18 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_max_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst @@ -1070,6 +1467,21 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_max_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst @@ -1113,6 +1525,23 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_max_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_i64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 @@ -1162,6 +1591,22 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_max_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 @@ -1194,6 +1639,18 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { ; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_max_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile max ptr %out, i64 %in syncscope("workgroup") seq_cst ret void @@ -1233,6 +1690,21 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_max_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile max ptr %out, i64 %in syncscope("workgroup") seq_cst store i64 %tmp0, ptr %out2 @@ -1271,6 +1743,23 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_max_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; 
GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_i64 v[2:3], v[0:1] +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile max ptr %ptr, i64 %in syncscope("workgroup") seq_cst @@ -1315,6 +1804,22 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_max_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile max ptr %ptr, i64 %in syncscope("workgroup") seq_cst @@ -1350,6 +1855,18 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst @@ -1394,6 +1911,21 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst @@ -1437,6 +1969,23 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN2-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) 
; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 @@ -1486,6 +2035,22 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 @@ -1518,6 +2083,18 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { ; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile umax ptr %out, i64 %in syncscope("workgroup") seq_cst ret void @@ -1557,6 +2134,21 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile umax ptr %out, i64 %in syncscope("workgroup") seq_cst store i64 %tmp0, ptr %out2 @@ -1595,6 
+2187,23 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1] +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile umax ptr %ptr, i64 %in syncscope("workgroup") seq_cst @@ -1639,6 +2248,22 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile umax ptr %ptr, i64 %in syncscope("workgroup") seq_cst @@ -1674,6 +2299,18 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_min_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst @@ -1718,6 +2355,21 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_min_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst @@ -1761,6 +2413,23 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_min_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 @@ -1810,6 +2479,22 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_min_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 @@ -1842,6 +2527,18 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_min_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile min ptr %out, i64 %in syncscope("workgroup") seq_cst ret void @@ -1881,6 +2578,21 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_min_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; 
GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile min ptr %out, i64 %in syncscope("workgroup") seq_cst store i64 %tmp0, ptr %out2 @@ -1919,6 +2631,23 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_min_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1] +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile min ptr %ptr, i64 %in syncscope("workgroup") seq_cst @@ -1963,6 +2692,22 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_min_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile min ptr %ptr, i64 %in syncscope("workgroup") seq_cst @@ -1998,6 +2743,18 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst @@ -2042,6 +2799,21 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: 
atomic_umin_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst @@ -2085,6 +2857,23 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN2-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_u64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 @@ -2134,6 +2923,22 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 @@ -2166,6 +2971,18 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) { ; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile umin ptr %out, i64 %in syncscope("workgroup") seq_cst ret void @@ -2205,6 +3022,21 @@ 
define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile umin ptr %out, i64 %in syncscope("workgroup") seq_cst store i64 %tmp0, ptr %out2 @@ -2243,6 +3075,23 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_u64 v[2:3], v[0:1] +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile umin ptr %ptr, i64 %in syncscope("workgroup") seq_cst @@ -2287,6 +3136,22 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile umin ptr %ptr, i64 %in syncscope("workgroup") seq_cst @@ -2324,6 +3189,19 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst @@ -2368,6 +3246,22 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst @@ -2413,6 +3307,24 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_or_b64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 @@ -2462,6 +3374,23 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 @@ -2496,6 +3425,19 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: 
atomic_or_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile or ptr %out, i64 %in syncscope("agent") seq_cst ret void @@ -2535,6 +3477,22 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile or ptr %out, i64 %in syncscope("agent") seq_cst store i64 %tmp0, ptr %out2 @@ -2575,6 +3533,24 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_or_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile or ptr %ptr, i64 %in syncscope("agent") seq_cst @@ -2619,6 +3595,23 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 
%index %tmp0 = atomicrmw volatile or ptr %ptr, i64 %in syncscope("agent") seq_cst @@ -2656,6 +3649,19 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst @@ -2692,6 +3698,19 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_f64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr double, ptr %out, i64 4 %tmp0 = atomicrmw volatile xchg ptr %gep, double %in syncscope("agent") seq_cst @@ -2728,6 +3747,19 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_pointer_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr ptr, ptr %out, i32 4 %val = atomicrmw volatile xchg ptr %gep, ptr %in syncscope("agent") seq_cst @@ -2772,6 +3804,22 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in 
syncscope("agent") seq_cst @@ -2817,6 +3865,24 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 @@ -2866,6 +3932,23 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 @@ -2900,6 +3983,19 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile xchg ptr %out, i64 %in syncscope("agent") seq_cst ret void @@ -2939,6 +4035,22 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
+; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile xchg ptr %out, i64 %in syncscope("agent") seq_cst store i64 %tmp0, ptr %out2 @@ -2979,6 +4091,24 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile xchg ptr %ptr, i64 %in syncscope("agent") seq_cst @@ -3023,6 +4153,23 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile xchg ptr %ptr, i64 %in syncscope("agent") seq_cst @@ -3060,6 +4207,19 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst @@ -3104,6 +4264,22 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; 
GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst @@ -3149,6 +4325,24 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_xor_b64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 @@ -3198,6 +4392,23 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 @@ -3232,6 +4443,19 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile xor ptr %out, i64 %in syncscope("agent") seq_cst ret void @@ -3271,6 
+4495,22 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile xor ptr %out, i64 %in syncscope("agent") seq_cst store i64 %tmp0, ptr %out2 @@ -3311,6 +4551,24 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_xor_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in syncscope("agent") seq_cst @@ -3355,6 +4613,23 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in syncscope("agent") seq_cst @@ -3394,6 +4669,19 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) { ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, 
s3 +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %in, i64 4 %val = load atomic i64, ptr %gep seq_cst, align 8 @@ -3429,6 +4717,19 @@ define amdgpu_kernel void @atomic_load_i64(ptr %in, ptr %out) { ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %val = load atomic i64, ptr %in syncscope("agent") seq_cst, align 8 store i64 %val, ptr %out @@ -3475,6 +4776,24 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %in, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 @@ -3519,6 +4838,24 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index) ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %in, i64 %index %val = load atomic i64, ptr %ptr seq_cst, align 8 @@ -3552,6 +4889,15 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr %out) { ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; 
GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 store atomic i64 %in, ptr %gep seq_cst, align 8 @@ -3580,6 +4926,15 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr %out) { ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: store atomic i64 %in, ptr %out seq_cst, align 8 ret void @@ -3619,6 +4974,20 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 @@ -3656,6 +5025,20 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index store atomic i64 %in, ptr %ptr seq_cst, align 8 @@ -3698,6 +5081,22 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %gep = 
getelementptr i64, ptr %out, i64 4 %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst @@ -3740,6 +5139,22 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64_soffset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:72000 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 9000 %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst @@ -3786,6 +5201,21 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst @@ -3834,6 +5264,22 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 @@ -3889,6 +5335,25 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX12-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], s[2:3] +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 @@ -3930,6 +5395,22 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) { ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %val = cmpxchg volatile ptr %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst ret void @@ -3971,6 +5452,21 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %val = cmpxchg volatile ptr %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst %extract0 = extractvalue { i64, i1 } %val, 0 @@ -4014,6 +5510,22 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %val = 
cmpxchg volatile ptr %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst @@ -4064,6 +5576,25 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], s[2:3] +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %val = cmpxchg volatile ptr %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst @@ -4104,6 +5635,19 @@ define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) { ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_f64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr double, ptr %in, i64 4 %val = load atomic double, ptr %gep seq_cst, align 8 @@ -4139,6 +5683,19 @@ define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) { ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_f64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %val = load atomic double, ptr %in syncscope("agent") seq_cst, align 8 store double %val, ptr %out @@ -4185,6 +5742,24 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_f64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: 
v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr double, ptr %in, i64 %index %gep = getelementptr double, ptr %ptr, i64 4 @@ -4229,6 +5804,24 @@ define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index) ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_f64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr double, ptr %in, i64 %index %val = load atomic double, ptr %ptr seq_cst, align 8 @@ -4262,6 +5855,15 @@ define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) { ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_f64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr double, ptr %out, i64 4 store atomic double %in, ptr %gep seq_cst, align 8 @@ -4290,6 +5892,15 @@ define amdgpu_kernel void @atomic_store_f64(double %in, ptr %out) { ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_f64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: store atomic double %in, ptr %out seq_cst, align 8 ret void @@ -4329,6 +5940,20 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out, ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_f64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr double, ptr %out, i64 %index %gep 
= getelementptr double, ptr %ptr, i64 4 @@ -4366,6 +5991,20 @@ define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %in ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_f64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr double, ptr %out, i64 %index store atomic double %in, ptr %ptr seq_cst, align 8 @@ -4402,6 +6041,19 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst @@ -4446,6 +6098,22 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst @@ -4491,6 +6159,24 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_i64_incr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 @@ -4540,6 +6226,23 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_i64_ret_incr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 @@ -4574,6 +6277,19 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile uinc_wrap ptr %out, i64 %in syncscope("agent") seq_cst ret void @@ -4613,6 +6329,22 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile uinc_wrap ptr %out, i64 %in syncscope("agent") seq_cst store i64 %tmp0, ptr %out2 @@ -4653,6 +6385,24 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_i64_incr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: 
v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst @@ -4697,6 +6447,23 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_i64_ret_incr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst @@ -4734,6 +6501,19 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_dec_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst @@ -4778,6 +6558,22 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_dec_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst @@ -4823,6 +6619,24 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 
% ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_dec_i64_decr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 @@ -4872,6 +6686,23 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_dec_i64_ret_decr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 @@ -4906,6 +6737,19 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_dec_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile udec_wrap ptr %out, i64 %in syncscope("agent") seq_cst ret void @@ -4945,6 +6789,22 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_dec_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; 
GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile udec_wrap ptr %out, i64 %in syncscope("agent") seq_cst store i64 %tmp0, ptr %out2 @@ -4985,6 +6845,24 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_dec_i64_decr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile udec_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst @@ -5029,6 +6907,23 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_dec_i64_ret_decr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile udec_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-num-flat-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-num-flat-atomics.ll new file mode 100644 index 0000000000000..5c3e83c4b9cfe --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-num-flat-atomics.ll @@ -0,0 +1,53 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -global-isel=0 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG +; RUN: llc < %s -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL + +declare float @llvm.amdgcn.flat.atomic.fmin.num.f32.p1.f32(ptr %ptr, float %data) +declare float @llvm.amdgcn.flat.atomic.fmax.num.f32.p1.f32(ptr %ptr, float %data) + +define amdgpu_cs void @flat_atomic_fmin_num_f32_noret(ptr %ptr, float %data) { +; GFX12-LABEL: flat_atomic_fmin_num_f32_noret: +; GFX12: ; %bb.0: +; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 +; GFX12-NEXT: s_endpgm + %ret = call float @llvm.amdgcn.flat.atomic.fmin.num.f32.p1.f32(ptr %ptr, float %data) + ret void +} + +define amdgpu_cs void @flat_atomic_fmax_num_f32_noret(ptr %ptr, float %data) { +; GFX12-LABEL: flat_atomic_fmax_num_f32_noret: +; 
GFX12: ; %bb.0: +; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 +; GFX12-NEXT: s_endpgm + %ret = call float @llvm.amdgcn.flat.atomic.fmax.num.f32.p1.f32(ptr %ptr, float %data) + ret void +} + +define amdgpu_cs float @flat_atomic_fmin_num_f32_rtn(ptr %ptr, float %data, ptr %out) { +; GFX12-LABEL: flat_atomic_fmin_num_f32_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: flat_store_b32 v[3:4], v0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: ; return to shader part epilog + %ret = call float @llvm.amdgcn.flat.atomic.fmin.num.f32.p1.f32(ptr %ptr, float %data) + store float %ret, ptr %out + ret float %ret +} + +define amdgpu_cs float @flat_atomic_fmax_num_f32_rtn(ptr %ptr, float %data, ptr %out) { +; GFX12-LABEL: flat_atomic_fmax_num_f32_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: flat_store_b32 v[3:4], v0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: ; return to shader part epilog + %ret = call float @llvm.amdgcn.flat.atomic.fmax.num.f32.p1.f32(ptr %ptr, float %data) + store float %ret, ptr %out + ret float %ret +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX12-GISEL: {{.*}} +; GFX12-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-num-global-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-num-global-atomics.ll new file mode 100644 index 0000000000000..cf069ee92639f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-num-global-atomics.ll @@ -0,0 +1,59 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -global-isel=0 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG +; RUN: llc < %s -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL + +declare float @llvm.amdgcn.global.atomic.fmin.num.f32.p1.f32(ptr addrspace(1) %ptr, float %data) +declare float @llvm.amdgcn.global.atomic.fmax.num.f32.p1.f32(ptr addrspace(1) %ptr, float %data) + +define amdgpu_cs void @global_atomic_fmin_num_f32_noret(ptr addrspace(1) %ptr, float %data) { +; GFX12-LABEL: global_atomic_fmin_num_f32_noret: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %ret = call float @llvm.amdgcn.global.atomic.fmin.num.f32.p1.f32(ptr addrspace(1) %ptr, float %data) + ret void +} + +define amdgpu_cs void @global_atomic_fmax_num_f32_noret(ptr addrspace(1) %ptr, float %data) { +; GFX12-LABEL: global_atomic_fmax_num_f32_noret: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %ret = call float @llvm.amdgcn.global.atomic.fmax.num.f32.p1.f32(ptr addrspace(1) %ptr, float %data) + ret void +} + +define amdgpu_cs void @global_atomic_fmax_num_f32_rtn(ptr addrspace(1) %ptr, float %data, ptr addrspace(1) %out) { +; GFX12-LABEL: global_atomic_fmax_num_f32_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_store_b32 v[3:4], v0, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + 
%ret = call float @llvm.amdgcn.global.atomic.fmax.num.f32.p1.f32(ptr addrspace(1) %ptr, float %data) + store float %ret, ptr addrspace(1) %out + ret void +} + +define amdgpu_cs void @global_atomic_fmin_num_f32_rtn(ptr addrspace(1) %ptr, float %data, ptr addrspace(1) %out) { +; GFX12-LABEL: global_atomic_fmin_num_f32_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_store_b32 v[3:4], v0, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %ret = call float @llvm.amdgcn.global.atomic.fmin.num.f32.p1.f32(ptr addrspace(1) %ptr, float %data) + store float %ret, ptr addrspace(1) %out + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX12-GISEL: {{.*}} +; GFX12-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll index c2b51467edd08..8cd25298d7473 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll @@ -1,21 +1,22 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX908_GFX11 %s -; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX908_GFX11 %s +; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX908_GFX11_GFX12,GFX908 %s +; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX90A_GFX940 %s +; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX90A_GFX940 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX908_GFX11_GFX12,GFX11_GFX12 %s +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX908_GFX11_GFX12,GFX11_GFX12 %s define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_intrinsic(ptr addrspace(1) %ptr, float %data) { - ; GFX908_GFX11-LABEL: name: global_atomic_fadd_f32_no_rtn_intrinsic - ; GFX908_GFX11: bb.0 (%ir-block.0): - ; GFX908_GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX908_GFX11-NEXT: {{ $}} - ; GFX908_GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX908_GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX908_GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX908_GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = 
COPY [[REG_SEQUENCE]] - ; GFX908_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX908_GFX11-NEXT: S_ENDPGM 0 + ; GFX908_GFX11_GFX12-LABEL: name: global_atomic_fadd_f32_no_rtn_intrinsic + ; GFX908_GFX11_GFX12: bb.0 (%ir-block.0): + ; GFX908_GFX11_GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX908_GFX11_GFX12-NEXT: {{ $}} + ; GFX908_GFX11_GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX908_GFX11_GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX908_GFX11_GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX908_GFX11_GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX908_GFX11_GFX12-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; GFX908_GFX11_GFX12-NEXT: GLOBAL_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GFX908_GFX11_GFX12-NEXT: S_ENDPGM 0 ; ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_no_rtn_intrinsic ; GFX90A_GFX940: bb.0 (%ir-block.0): @@ -33,17 +34,17 @@ define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_intrinsic(ptr addrspace(1) } define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_intrinsic(ptr addrspace(1) inreg %ptr, float %data) { - ; GFX908_GFX11-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_intrinsic - ; GFX908_GFX11: bb.0 (%ir-block.0): - ; GFX908_GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX908_GFX11-NEXT: {{ $}} - ; GFX908_GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908_GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX908_GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX908_GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX908_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX908_GFX11-NEXT: S_ENDPGM 0 + ; GFX908_GFX11_GFX12-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_intrinsic + ; GFX908_GFX11_GFX12: bb.0 (%ir-block.0): + ; GFX908_GFX11_GFX12-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX908_GFX11_GFX12-NEXT: {{ $}} + ; GFX908_GFX11_GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX908_GFX11_GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX908_GFX11_GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX908_GFX11_GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX908_GFX11_GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX908_GFX11_GFX12-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GFX908_GFX11_GFX12-NEXT: S_ENDPGM 0 ; ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_intrinsic ; GFX90A_GFX940: bb.0 (%ir-block.0): @@ -61,17 +62,17 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_intrinsic(ptr addrspa } define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_flat_intrinsic(ptr addrspace(1) %ptr, float %data) { - ; GFX908_GFX11-LABEL: name: global_atomic_fadd_f32_no_rtn_flat_intrinsic - ; GFX908_GFX11: bb.0 
(%ir-block.0): - ; GFX908_GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX908_GFX11-NEXT: {{ $}} - ; GFX908_GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX908_GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX908_GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX908_GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; GFX908_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX908_GFX11-NEXT: S_ENDPGM 0 + ; GFX908_GFX11_GFX12-LABEL: name: global_atomic_fadd_f32_no_rtn_flat_intrinsic + ; GFX908_GFX11_GFX12: bb.0 (%ir-block.0): + ; GFX908_GFX11_GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX908_GFX11_GFX12-NEXT: {{ $}} + ; GFX908_GFX11_GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX908_GFX11_GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX908_GFX11_GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX908_GFX11_GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX908_GFX11_GFX12-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; GFX908_GFX11_GFX12-NEXT: GLOBAL_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GFX908_GFX11_GFX12-NEXT: S_ENDPGM 0 ; ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_no_rtn_flat_intrinsic ; GFX90A_GFX940: bb.0 (%ir-block.0): @@ -89,17 +90,17 @@ define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_flat_intrinsic(ptr addrspac } define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, float %data) { - ; GFX908_GFX11-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic - ; GFX908_GFX11: bb.0 (%ir-block.0): - ; GFX908_GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX908_GFX11-NEXT: {{ $}} - ; GFX908_GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908_GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX908_GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX908_GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX908_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX908_GFX11-NEXT: S_ENDPGM 0 + ; GFX908_GFX11_GFX12-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic + ; GFX908_GFX11_GFX12: bb.0 (%ir-block.0): + ; GFX908_GFX11_GFX12-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX908_GFX11_GFX12-NEXT: {{ $}} + ; GFX908_GFX11_GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX908_GFX11_GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX908_GFX11_GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX908_GFX11_GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX908_GFX11_GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX908_GFX11_GFX12-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on 
%ir.ptr, addrspace 1) + ; GFX908_GFX11_GFX12-NEXT: S_ENDPGM 0 ; ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic ; GFX90A_GFX940: bb.0 (%ir-block.0): @@ -117,17 +118,17 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic(ptr ad } define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_atomicrmw(ptr addrspace(1) %ptr, float %data) #0 { - ; GFX908_GFX11-LABEL: name: global_atomic_fadd_f32_no_rtn_atomicrmw - ; GFX908_GFX11: bb.0 (%ir-block.0): - ; GFX908_GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX908_GFX11-NEXT: {{ $}} - ; GFX908_GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX908_GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX908_GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX908_GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; GFX908_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) - ; GFX908_GFX11-NEXT: S_ENDPGM 0 + ; GFX908_GFX11_GFX12-LABEL: name: global_atomic_fadd_f32_no_rtn_atomicrmw + ; GFX908_GFX11_GFX12: bb.0 (%ir-block.0): + ; GFX908_GFX11_GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX908_GFX11_GFX12-NEXT: {{ $}} + ; GFX908_GFX11_GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX908_GFX11_GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX908_GFX11_GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX908_GFX11_GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX908_GFX11_GFX12-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; GFX908_GFX11_GFX12-NEXT: GLOBAL_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX908_GFX11_GFX12-NEXT: S_ENDPGM 0 ; ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_no_rtn_atomicrmw ; GFX90A_GFX940: bb.0 (%ir-block.0): @@ -145,6 +146,68 @@ define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_atomicrmw(ptr addrspace(1) } define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, float %data) #0 { + ; GFX908-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw + ; GFX908: bb.0 (%ir-block.0): + ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000) + ; GFX908-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX908-NEXT: {{ $}} + ; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX908-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE]] + ; GFX908-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE + ; GFX908-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX908-NEXT: S_BRANCH %bb.1 + ; GFX908-NEXT: {{ $}} + ; GFX908-NEXT: bb.1 (%ir-block.5): + ; GFX908-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GFX908-NEXT: {{ $}} + ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1 + ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX908-NEXT: 
[[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX908-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY6]], [[COPY7]], implicit $exec + ; GFX908-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY5]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX908-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 + ; GFX908-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], killed [[S_MOV_B32_1]], implicit-def dead $scc, implicit $exec + ; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec + ; GFX908-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec + ; GFX908-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec + ; GFX908-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX908-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, killed [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec + ; GFX908-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX908-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, killed [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec + ; GFX908-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX908-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, killed [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec + ; GFX908-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX908-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, killed [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec + ; GFX908-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec + ; GFX908-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, killed [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec + ; GFX908-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX908-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_2]] + ; GFX908-NEXT: early-clobber %1:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec + ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec + ; GFX908-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX908-NEXT: S_BRANCH %bb.2 + ; GFX908-NEXT: {{ $}} + ; GFX908-NEXT: bb.2 (%ir-block.35): + ; GFX908-NEXT: successors: %bb.3(0x80000000) + ; GFX908-NEXT: {{ $}} + ; GFX908-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY %1 + ; GFX908-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_1]], [[COPY8]], 
[[COPY3]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX908-NEXT: {{ $}} + ; GFX908-NEXT: bb.3.Flow: + ; GFX908-NEXT: successors: %bb.4(0x80000000) + ; GFX908-NEXT: {{ $}} + ; GFX908-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX908-NEXT: {{ $}} + ; GFX908-NEXT: bb.4 (%ir-block.37): + ; GFX908-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX908-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000) @@ -206,6 +269,60 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX90A_GFX940-NEXT: bb.4 (%ir-block.37): ; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; + ; GFX11_GFX12-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw + ; GFX11_GFX12: bb.0 (%ir-block.0): + ; GFX11_GFX12-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000) + ; GFX11_GFX12-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX11_GFX12-NEXT: {{ $}} + ; GFX11_GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11_GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11_GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11_GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11_GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE]] + ; GFX11_GFX12-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_32 = SI_PS_LIVE + ; GFX11_GFX12-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[SI_PS_LIVE]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX11_GFX12-NEXT: S_BRANCH %bb.1 + ; GFX11_GFX12-NEXT: {{ $}} + ; GFX11_GFX12-NEXT: bb.1 (%ir-block.5): + ; GFX11_GFX12-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GFX11_GFX12-NEXT: {{ $}} + ; GFX11_GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $exec_lo + ; GFX11_GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX11_GFX12-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY4]], [[S_MOV_B32_]], implicit $exec + ; GFX11_GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 + ; GFX11_GFX12-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], killed [[S_MOV_B32_1]], implicit-def dead $scc, implicit $exec + ; GFX11_GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec + ; GFX11_GFX12-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 353, 15, 15, 0, implicit $exec + ; GFX11_GFX12-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec + ; GFX11_GFX12-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 354, 15, 15, 0, implicit $exec + ; GFX11_GFX12-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, killed [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec + ; GFX11_GFX12-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_1]], 356, 15, 15, 0, implicit $exec + ; GFX11_GFX12-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 
[[V_ADD_F32_e64_1]], 0, killed [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec + ; GFX11_GFX12-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_2]], 360, 15, 15, 0, implicit $exec + ; GFX11_GFX12-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, killed [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec + ; GFX11_GFX12-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GFX11_GFX12-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_2]], 0, [[S_MOV_B32_2]], [[V_ADD_F32_e64_3]], 0, implicit $exec + ; GFX11_GFX12-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, killed [[V_PERMLANEX16_B32_e64_]], 0, 0, implicit $mode, implicit $exec + ; GFX11_GFX12-NEXT: early-clobber %1:vgpr_32 = STRICT_WWM killed [[V_ADD_F32_e64_4]], implicit $exec + ; GFX11_GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_LO_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec + ; GFX11_GFX12-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX11_GFX12-NEXT: S_BRANCH %bb.2 + ; GFX11_GFX12-NEXT: {{ $}} + ; GFX11_GFX12-NEXT: bb.2 (%ir-block.28): + ; GFX11_GFX12-NEXT: successors: %bb.3(0x80000000) + ; GFX11_GFX12-NEXT: {{ $}} + ; GFX11_GFX12-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX11_GFX12-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_1]], %1, [[COPY3]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX11_GFX12-NEXT: {{ $}} + ; GFX11_GFX12-NEXT: bb.3.Flow: + ; GFX11_GFX12-NEXT: successors: %bb.4(0x80000000) + ; GFX11_GFX12-NEXT: {{ $}} + ; GFX11_GFX12-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX11_GFX12-NEXT: {{ $}} + ; GFX11_GFX12-NEXT: bb.4 (%ir-block.30): + ; GFX11_GFX12-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX11_GFX12-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic ret void } diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll index c559750d4f1b4..77be7f506aaf2 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll @@ -2,6 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX90A_GFX940 %s ; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX90A_GFX940 %s ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX11 %s define amdgpu_ps float @global_atomic_fadd_f32_rtn_intrinsic(ptr addrspace(1) %ptr, float %data) { ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_rtn_intrinsic
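A note on the GFX908 and GFX11_GFX12 bodies above: with -amdgpu-atomic-optimizer-strategy=DPP, the no-return atomicrmw is rewritten so the wave first folds every lane's %data into one value (row_shr shifts plus V_ADD_F32 and a final V_READLANE on wave64 GFX908; row_xmask shifts plus V_PERMLANEX16_B32 on wave32 GFX11/GFX12), and only the lane whose mbcnt is 0 executes the single GLOBAL_ATOMIC_ADD_F32_SADDR. A scalar model of that shift-and-accumulate ladder, assuming 32 active lanes; this is an illustrative sketch, not compiler code:

```cpp
#include <array>
#include <cstdio>

// Scalar stand-in for one wave32's lanes; on hardware each element lives in
// the same VGPR of a different lane.
constexpr int kWaveSize = 32;

// Hillis-Steele style reduction mirroring the DPP shift-and-add ladder:
// after log2(kWaveSize) strides the last lane holds the sum of all lanes.
float waveReduceAdd(std::array<float, kWaveSize> lanes) {
  for (int stride = 1; stride < kWaveSize; stride *= 2)
    for (int lane = kWaveSize - 1; lane >= stride; --lane) // descending so we
      lanes[lane] += lanes[lane - stride];                 // read pre-step values
  return lanes[kWaveSize - 1];
}

int main() {
  std::array<float, kWaveSize> data;
  data.fill(2.0f);
  // In the MIR above, only the lane with mbcnt == 0 reaches bb.2 and issues
  // one GLOBAL_ATOMIC_ADD_F32_SADDR carrying this combined value.
  std::printf("one atomic add of %.1f replaces %d per-lane atomics\n",
              waveReduceAdd(data), kWaveSize);
  return 0;
}
```

diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll index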
2099ed7dafc37..de4f748413e6c 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll @@ -2,6 +2,8 @@ ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s ; Test using saddr addressing mode of global_*load_* flat instructions. @@ -24,6 +26,13 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0(ptr addrspace(1) inreg %sb ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_load_saddr_i8_offset_0: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %load = load i8, ptr addrspace(1) %sbase %zext = zext i8 %load to i32 %to.vgpr = bitcast i32 %zext to float @@ -52,6 +61,13 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4095(ptr addrspace(1) inreg ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_load_saddr_i8_offset_4095: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4095 %load = load i8, ptr addrspace(1) %gep0 %zext = zext i8 %load to i32 @@ -74,6 +90,13 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4096(ptr addrspace(1) inreg ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_load_saddr_i8_offset_4096: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:4096 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4096 %load = load i8, ptr addrspace(1) %gep0 %zext = zext i8 %load to i32 @@ -96,6 +119,13 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4097(ptr addrspace(1) inreg ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_load_saddr_i8_offset_4097: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:4097 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4097 %load = load i8, ptr addrspace(1) %gep0 %zext = zext i8 %load to i32 @@ -126,6 +156,13 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4096(ptr addrspace(1) inr ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-4096 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_load_saddr_i8_offset_neg4096: +; GFX12: ; %bb.0: +; 
GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-4096 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4096 %load = load i8, ptr addrspace(1) %gep0 %zext = zext i8 %load to i32 @@ -160,6 +197,13 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4097(ptr addrspace(1) inr ; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_load_saddr_i8_offset_neg4097: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-4097 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4097 %load = load i8, ptr addrspace(1) %gep0 %zext = zext i8 %load to i32 @@ -194,6 +238,13 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4098(ptr addrspace(1) inr ; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_load_saddr_i8_offset_neg4098: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-4098 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4098 %load = load i8, ptr addrspace(1) %gep0 %zext = zext i8 %load to i32 @@ -223,6 +274,13 @@ define amdgpu_ps float @global_load_saddr_i8_offset_2048(ptr addrspace(1) inreg ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:2048 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_load_saddr_i8_offset_2048: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:2048 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 2048 %load = load i8, ptr addrspace(1) %gep0 %zext = zext i8 %load to i32 @@ -252,6 +310,13 @@ define amdgpu_ps float @global_load_saddr_i8_offset_2049(ptr addrspace(1) inreg ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:2049 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_load_saddr_i8_offset_2049: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:2049 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 2049 %load = load i8, ptr addrspace(1) %gep0 %zext = zext i8 %load to i32 @@ -281,6 +346,13 @@ define amdgpu_ps float @global_load_saddr_i8_offset_2050(ptr addrspace(1) inreg ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:2050 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_load_saddr_i8_offset_2050: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:2050 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 2050 %load = load i8, ptr addrspace(1) %gep0 %zext = zext i8 %load to i32 @@ -303,6 +375,13 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg2048(ptr addrspace(1) inr ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] 
offset:-2048 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_load_saddr_i8_offset_neg2048: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2048 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -2048 %load = load i8, ptr addrspace(1) %gep0 %zext = zext i8 %load to i32 @@ -333,6 +412,13 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg2049(ptr addrspace(1) inr ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2049 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_load_saddr_i8_offset_neg2049: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2049 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -2049 %load = load i8, ptr addrspace(1) %gep0 %zext = zext i8 %load to i32 @@ -363,6 +449,13 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg2050(ptr addrspace(1) inr ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2050 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_load_saddr_i8_offset_neg2050: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2050 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -2050 %load = load i8, ptr addrspace(1) %gep0 %zext = zext i8 %load to i32 @@ -370,27 +463,109 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg2050(ptr addrspace(1) inr ret float %to.vgpr } -define amdgpu_ps float @global_load_saddr_i8_offset_4294967295(ptr addrspace(1) inreg %sbase) { -; GFX9-LABEL: global_load_saddr_i8_offset_4294967295: +define amdgpu_ps float @global_load_saddr_i8_offset_0x7FFFFF(ptr addrspace(1) inreg %sbase) { +; GFX9-LABEL: global_load_saddr_i8_offset_0x7FFFFF: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, 0x7ff000 +; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i8_offset_0x7FFFFF: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v0, 0x7ff800 +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_offset_0x7FFFFF: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v0, 0x7ff000 +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_load_saddr_i8_offset_0x7FFFFF: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:8388607 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 8388607 + %load = load i8, ptr addrspace(1) %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +define amdgpu_ps float @global_load_saddr_i8_offset_0xFFFFFF(ptr addrspace(1) inreg %sbase) { +; GFX9-LABEL: global_load_saddr_i8_offset_0xFFFFFF: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_add_u32 s0, s2, 0xff800000 +; GFX9-NEXT: 
s_addc_u32 s1, s3, -1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i8_offset_0xFFFFFF: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xff800000, s2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_offset_0xFFFFFF: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0xff800000, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1] +; GFX11-NEXT: global_load_u8 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_load_saddr_i8_offset_0xFFFFFF: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-8388608 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -8388608 + %load = load i8, ptr addrspace(1) %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +define amdgpu_ps float @global_load_saddr_i8_offset_0xFFFFFFFF(ptr addrspace(1) inreg %sbase) { +; GFX9-LABEL: global_load_saddr_i8_offset_0xFFFFFFFF: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, 0xfffff000 ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: global_load_saddr_i8_offset_4294967295: +; GFX10-LABEL: global_load_saddr_i8_offset_0xFFFFFFFF: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v0, 0xfffff800 ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: global_load_saddr_i8_offset_4294967295: +; GFX11-LABEL: global_load_saddr_i8_offset_0xFFFFFFFF: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v0, 0xfffff000 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_load_saddr_i8_offset_0xFFFFFFFF: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v0, 0xff800000 +; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:8388607 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294967295 %load = load i8, ptr addrspace(1) %gep0 %zext = zext i8 %load to i32 @@ -398,8 +573,8 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294967295(ptr addrspace(1) ret float %to.vgpr } -define amdgpu_ps float @global_load_saddr_i8_offset_4294967296(ptr addrspace(1) inreg %sbase) { -; GFX9-LABEL: global_load_saddr_i8_offset_4294967296: +define amdgpu_ps float @global_load_saddr_i8_offset_0x100000000(ptr addrspace(1) inreg %sbase) { +; GFX9-LABEL: global_load_saddr_i8_offset_0x100000000: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2 @@ -408,7 +583,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294967296(ptr addrspace(1) ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: global_load_saddr_i8_offset_4294967296: +; GFX10-LABEL: global_load_saddr_i8_offset_0x100000000: ; GFX10: ; %bb.0: ; GFX10-NEXT: 
v_add_co_u32 v0, s[0:1], 0, s2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1] @@ -416,7 +591,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294967296(ptr addrspace(1) ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: global_load_saddr_i8_offset_4294967296: +; GFX11-LABEL: global_load_saddr_i8_offset_0x100000000: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -424,6 +599,25 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294967296(ptr addrspace(1) ; GFX11-NEXT: global_load_u8 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_0x100000000: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1] +; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_0x100000000: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294967296 %load = load i8, ptr addrspace(1) %gep0 %zext = zext i8 %load to i32 @@ -431,8 +625,8 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294967296(ptr addrspace(1) ret float %to.vgpr } -define amdgpu_ps float @global_load_saddr_i8_offset_4294967297(ptr addrspace(1) inreg %sbase) { -; GFX9-LABEL: global_load_saddr_i8_offset_4294967297: +define amdgpu_ps float @global_load_saddr_i8_offset_0x100000001(ptr addrspace(1) inreg %sbase) { +; GFX9-LABEL: global_load_saddr_i8_offset_0x100000001: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2 @@ -441,7 +635,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294967297(ptr addrspace(1) ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: global_load_saddr_i8_offset_4294967297: +; GFX10-LABEL: global_load_saddr_i8_offset_0x100000001: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1] @@ -449,7 +643,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294967297(ptr addrspace(1) ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: global_load_saddr_i8_offset_4294967297: +; GFX11-LABEL: global_load_saddr_i8_offset_0x100000001: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -457,6 +651,25 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294967297(ptr addrspace(1) ; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_0x100000001: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1] +; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off 
offset:1 +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_0x100000001: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 1 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294967297 %load = load i8, ptr addrspace(1) %gep0 %zext = zext i8 %load to i32 @@ -464,8 +677,8 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294967297(ptr addrspace(1) ret float %to.vgpr } -define amdgpu_ps float @global_load_saddr_i8_offset_4294971391(ptr addrspace(1) inreg %sbase) { -; GFX9-LABEL: global_load_saddr_i8_offset_4294971391: +define amdgpu_ps float @global_load_saddr_i8_offset_0x100000FFF(ptr addrspace(1) inreg %sbase) { +; GFX9-LABEL: global_load_saddr_i8_offset_0x100000FFF: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s0, s2, 0xfff ; GFX9-NEXT: s_addc_u32 s1, s3, 1 @@ -474,7 +687,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294971391(ptr addrspace(1) ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: global_load_saddr_i8_offset_4294971391: +; GFX10-LABEL: global_load_saddr_i8_offset_0x100000FFF: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0x800, s2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1] @@ -482,7 +695,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294971391(ptr addrspace(1) ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: global_load_saddr_i8_offset_4294971391: +; GFX11-LABEL: global_load_saddr_i8_offset_0x100000FFF: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -490,6 +703,25 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294971391(ptr addrspace(1) ; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:4095 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_0x100000FFF: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1] +; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095 +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_0x100000FFF: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0xfff +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294971391 %load = load i8, ptr addrspace(1) %gep0 %zext = zext i8 %load to i32 @@ -497,8 +729,8 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294971391(ptr addrspace(1) ret float %to.vgpr } -define amdgpu_ps float @global_load_saddr_i8_offset_4294971392(ptr addrspace(1) inreg %sbase) { -; GFX9-LABEL: global_load_saddr_i8_offset_4294971392: +define amdgpu_ps float @global_load_saddr_i8_offset_0x100001000(ptr addrspace(1) inreg 
%sbase) { +; GFX9-LABEL: global_load_saddr_i8_offset_0x100001000: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s0, s2, 0x1000 ; GFX9-NEXT: s_addc_u32 s1, s3, 1 @@ -507,7 +739,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294971392(ptr addrspace(1) ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: global_load_saddr_i8_offset_4294971392: +; GFX10-LABEL: global_load_saddr_i8_offset_0x100001000: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0x1000, s2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1] @@ -515,7 +747,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294971392(ptr addrspace(1) ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: global_load_saddr_i8_offset_4294971392: +; GFX11-LABEL: global_load_saddr_i8_offset_0x100001000: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0x1000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -523,6 +755,25 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294971392(ptr addrspace(1) ; GFX11-NEXT: global_load_u8 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_0x100001000: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1] +; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4096 +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_0x100001000: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x1000 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294971392 %load = load i8, ptr addrspace(1) %gep0 %zext = zext i8 %load to i32 @@ -530,8 +781,8 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294971392(ptr addrspace(1) ret float %to.vgpr } -define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967295(ptr addrspace(1) inreg %sbase) { -; GFX9-LABEL: global_load_saddr_i8_offset_neg4294967295: +define amdgpu_ps float @global_load_saddr_i8_offset_neg0xFFFFFFFF(ptr addrspace(1) inreg %sbase) { +; GFX9-LABEL: global_load_saddr_i8_offset_neg0xFFFFFFFF: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 @@ -541,7 +792,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967295(ptr addrspace( ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: global_load_saddr_i8_offset_neg4294967295: +; GFX10-LABEL: global_load_saddr_i8_offset_neg0xFFFFFFFF: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0x800, s2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] @@ -549,7 +800,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967295(ptr addrspace( ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: global_load_saddr_i8_offset_neg4294967295: +; GFX11-LABEL: global_load_saddr_i8_offset_neg0xFFFFFFFF: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0x1000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ 
-557,6 +808,25 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967295(ptr addrspace( ; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-4095 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_neg0xFFFFFFFF: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_add_co_u32 v0, s[0:1], 0x800000, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1] +; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8388607 +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_neg0xFFFFFFFF: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 1 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, -1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4294967295 %load = load i8, ptr addrspace(1) %gep0 %zext = zext i8 %load to i32 @@ -564,8 +834,8 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967295(ptr addrspace( ret float %to.vgpr } -define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967296(ptr addrspace(1) inreg %sbase) { -; GFX9-LABEL: global_load_saddr_i8_offset_neg4294967296: +define amdgpu_ps float @global_load_saddr_i8_offset_neg0x100000000(ptr addrspace(1) inreg %sbase) { +; GFX9-LABEL: global_load_saddr_i8_offset_neg0x100000000: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2 @@ -574,7 +844,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967296(ptr addrspace( ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: global_load_saddr_i8_offset_neg4294967296: +; GFX10-LABEL: global_load_saddr_i8_offset_neg0x100000000: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] @@ -582,7 +852,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967296(ptr addrspace( ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: global_load_saddr_i8_offset_neg4294967296: +; GFX11-LABEL: global_load_saddr_i8_offset_neg0x100000000: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -590,6 +860,25 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967296(ptr addrspace( ; GFX11-NEXT: global_load_u8 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_neg0x100000000: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1] +; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_neg0x100000000: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, -1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off +; GFX12-GISEL-NEXT: 
s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4294967296 %load = load i8, ptr addrspace(1) %gep0 %zext = zext i8 %load to i32 @@ -597,8 +886,8 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967296(ptr addrspace( ret float %to.vgpr } -define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967297(ptr addrspace(1) inreg %sbase) { -; GFX9-LABEL: global_load_saddr_i8_offset_neg4294967297: +define amdgpu_ps float @global_load_saddr_i8_offset_neg0x100000001(ptr addrspace(1) inreg %sbase) { +; GFX9-LABEL: global_load_saddr_i8_offset_neg0x100000001: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2 @@ -607,7 +896,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967297(ptr addrspace( ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: global_load_saddr_i8_offset_neg4294967297: +; GFX10-LABEL: global_load_saddr_i8_offset_neg0x100000001: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] @@ -615,7 +904,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967297(ptr addrspace( ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: global_load_saddr_i8_offset_neg4294967297: +; GFX11-LABEL: global_load_saddr_i8_offset_neg0x100000001: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -623,6 +912,25 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967297(ptr addrspace( ; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_neg0x100000001: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1] +; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-1 +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_neg0x100000001: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, -1 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, -2 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4294967297 %load = load i8, ptr addrspace(1) %gep0 %zext = zext i8 %load to i32 @@ -647,6 +955,12 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr(ptr addrspace(1) inreg %s ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_load_saddr_i8_zext_vgpr: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %load = load i8, ptr addrspace(1) %gep0 @@ -678,6 +992,12 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095(ptr addrspace ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 ; GFX11-NEXT: s_waitcnt 
vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 4095 @@ -721,6 +1041,12 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4096(ptr addrspace ; GFX11-NEXT: global_load_u8 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_4096: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:4096 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 4096 @@ -753,6 +1079,12 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4096(ptr addrsp ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-4096 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4096: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-4096 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -4096 @@ -796,6 +1128,12 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4097(ptr addrsp ; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4097: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-4097 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -4097 @@ -818,6 +1156,12 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_2047(ptr addrspace ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:2047 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_2047: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:2047 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 2047 @@ -850,6 +1194,12 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_2048(ptr addrspace ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:2048 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_2048: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:2048 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, 
ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 2048 @@ -872,6 +1222,12 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2048(ptr addrsp ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2048 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2048: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2048 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -2048 @@ -904,6 +1260,12 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2049(ptr addrsp ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2049 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2049: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2049 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -2049 @@ -913,6 +1275,103 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2049(ptr addrsp ret float %to.vgpr } +; Maximum positive offset on gfx12. +define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_0x7FFFFF(ptr addrspace(1) inreg %sbase, i32 %voffset) { %zext.offset = zext i32 %voffset to i64 +; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_0x7FFFFF: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x7ff000, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_0x7FFFFF: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1] +; GFX10-NEXT: v_add_co_u32 v0, vcc, 0x7ff800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_0x7FFFFF: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_co_u32 v0, s[0:1], s2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s[0:1] +; GFX11-NEXT: v_add_co_u32 v0, vcc, 0x7ff000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:4095 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_0x7FFFFF: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:8388607 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr addrspace(1) 
%gep0, i64 8388607 + %load = load i8, ptr addrspace(1) %gep1 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; Minimum negative offset on gfx12. +define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_0xFFFFFF(ptr addrspace(1) inreg %sbase, i32 %voffset) { %zext.offset = zext i32 %voffset to i64 +; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_0xFFFFFF: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xff800000, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_0xFFFFFF: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1] +; GFX10-NEXT: v_add_co_u32 v0, vcc, 0xff800000, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_0xFFFFFF: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_co_u32 v0, s[0:1], s2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s[0:1] +; GFX11-NEXT: v_add_co_u32 v0, vcc, 0xff800000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc +; GFX11-NEXT: global_load_u8 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_0xFFFFFF: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-8388608 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -8388608 + %load = load i8, ptr addrspace(1) %gep1 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +
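The pair of tests just added bracket the new range: GFX9 and GFX11 accept a 13-bit signed immediate ([-4096, 4095]), GFX10 a 12-bit one ([-2048, 2047]), while GFX12 folds a full 24-bit signed offset ([-8388608, 8388607]) straight into global_load_u8, as the offset:8388607 and offset:-8388608 checks show. Below is a self-contained sketch of the legality test and of the base/immediate split visible in the earlier 0xFFFFFFFF checks; the helper names are illustrative, not the in-tree AMDGPU API:

```cpp
#include <cstdint>
#include <cstdio>

// GFX12 widens the signed immediate offset of GLOBAL/FLAT addressing to
// 24 bits, as the offset:8388607 / offset:-8388608 checks above pin down.
constexpr int64_t kGFX12MinOffset = -8388608; // -0x800000
constexpr int64_t kGFX12MaxOffset = 8388607;  //  0x7FFFFF

// Hypothetical helper (not the in-tree API): an isInt<24>-style legality test.
constexpr bool isLegalGFX12GlobalOffset(int64_t Offset) {
  return Offset >= kGFX12MinOffset && Offset <= kGFX12MaxOffset;
}

// When a byte offset does not fit, the selector keeps a legal remainder in
// the immediate field and folds the rest into the 64-bit base address.
struct SplitOffset {
  int64_t BaseAdjust; // added into the address computation
  int64_t ImmOffset;  // emitted as offset: on the instruction
};

constexpr SplitOffset splitGFX12Offset(int64_t Offset) {
  if (isLegalGFX12GlobalOffset(Offset))
    return {0, Offset};
  int64_t Imm = Offset % (kGFX12MaxOffset + 1);
  return {Offset - Imm, Imm};
}

// Matches the 0xFFFFFFFF test above: base += 0xff800000, immediate 8388607.
static_assert(splitGFX12Offset(4294967295).BaseAdjust == 0xff800000);
static_assert(splitGFX12Offset(4294967295).ImmOffset == 8388607);

int main() {
  SplitOffset S = splitGFX12Offset(4294967295);
  std::printf("base += 0x%llx, offset:%lld\n",
              (unsigned long long)S.BaseAdjust, (long long)S.ImmOffset);
  return 0;
}
```

The split keeps the immediate inside the encodable window, which is why the GFX12 run for the 0xFFFFFFFF test still materializes 0xff800000 in a VGPR while every in-range test folds the whole offset against a zero VGPR.

; Maximum positive offset on gfx9, and immediate needs to be moved lower.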
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095_gep_order(ptr addrspace(1) inreg %sbase, i32 %voffset) { ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order: @@ -936,6 +1395,12 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095_gep_order(ptr ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4095 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 %zext.offset @@ -958,6 +1423,12 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint(ptr addrspace(1) ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64 %add = add i64 %sbase.as.int, %zext.offset @@ -981,6 +1452,12 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add(ptr ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64 %add = add i64 %zext.offset, %sbase.as.int @@ -1004,6 +1481,12 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_ ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:128 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:128 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64 %add = add i64 %zext.offset, %sbase.as.int @@ -1028,6 +1511,12 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_ ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:128 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:128 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64 %add.immoffset = add i64 %sbase.as.int, 128 @@ -1080,6 +1569,28 @@ define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs(i32 %voffset) { ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-SDAG-LABEL: global_load_saddr_uniform_ptr_in_vgprs: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: 
+; GFX12-SDAG-NEXT: ds_load_b64 v[1:2], v1
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v1
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v2
+; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1]
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_uniform_ptr_in_vgprs:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-GISEL-NEXT: ds_load_b64 v[1:2], v1
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc, v1, v0
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v2, vcc
+; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: ; return to shader part epilog
 %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1124,6 +1635,28 @@ define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs_immoffset(i32 %vo
 ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:42
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-SDAG-NEXT: ds_load_b64 v[1:2], v1
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v1
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v2
+; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] offset:42
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-GISEL-NEXT: ds_load_b64 v[1:2], v1
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc, v1, v0
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v2, vcc
+; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off offset:42
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: ; return to shader part epilog
 %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1149,6 +1682,13 @@ define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset(ptr addrspace(1
 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3]
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_zext_uniform_offset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v0, s4
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %soffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %load = load i8, ptr addrspace(1) %gep0
@@ -1172,6 +1712,13 @@ define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset_immoffset(ptr a
 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-24
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_zext_uniform_offset_immoffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v0, s4
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-24
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %soffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -24
@@ -1196,6 +1743,13 @@ define amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add(ptr
 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3]
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v0, s4
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %soffset to i64
 %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64
 %add = add i64 %zext.offset, %sbase.as.int
@@ -1221,6 +1775,13 @@ define amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_
 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:128
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v0, s4
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:128
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %soffset to i64
 %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64
 %add = add i64 %zext.offset, %sbase.as.int
@@ -1257,6 +1818,26 @@ define amdgpu_ps float @global_load_i8_vgpr64_sgpr32(ptr addrspace(1) %vbase, i3
 ; GFX11-NEXT: global_load_u8 v0, v[0:1], off
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_i8_vgpr64_sgpr32:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc, v0, s2
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
+; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_i8_vgpr64_sgpr32:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_mov_b32 s3, 0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s3
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc, v0, v2
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
+; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %soffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %vbase, i64 %zext.offset
 %load = load i8, ptr addrspace(1) %gep0
@@ -1292,6 +1873,26 @@ define amdgpu_ps float @global_load_i8_vgpr64_sgpr32_offset_4095(ptr addrspace(1
 ; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:4095
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc, v0, s2
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
+; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_mov_b32 s3, 0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s3
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc, v0, v2
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
+; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off offset:4095
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %soffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %vbase, i64 %zext.offset
 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 4095
@@ -1345,6 +1946,36 @@ define amdgpu_ps float @global_load_saddr_f32_natural_addressing(ptr addrspace(1
 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_f32_natural_addressing:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc, s2, v0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
+; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_f32_natural_addressing:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: global_load_b32 v0, v[0:1], off
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s3
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc, v2, v0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc, v3, v1, vcc
+; GFX12-GISEL-NEXT: global_load_b32 v0, v[0:1], off
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: ; return to shader part epilog
 %voffset = load i32, ptr addrspace(1) %voffset.ptr
 %zext.offset = zext i32 %voffset to i64
 %gep = getelementptr inbounds float, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1369,6 +2000,14 @@ define amdgpu_ps float @global_load_saddr_f32_natural_addressing_immoffset(ptr a
 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:128
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_f32_natural_addressing_immoffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b32 v0, v[0:1], off
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:128
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %voffset = load i32, ptr addrspace(1) %voffset.ptr
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1396,6 +2035,15 @@ define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range(ptr addrspace(1) i
 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_f32_saddr_zext_vgpr_range:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b32 v0, v[0:1], off
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX12-NEXT: global_load_b32 v0, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %voffset = load i32, ptr addrspace(1) %voffset.ptr, !range !0, !noundef !{}
 %zext.offset = zext i32 %voffset to i64
 %gep = getelementptr inbounds float, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1422,6 +2070,15 @@ define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_imm_offset(ptr add
 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:400
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_f32_saddr_zext_vgpr_range_imm_offset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b32 v0, v[0:1], off
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:400
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %voffset = load i32, ptr addrspace(1) %voffset.ptr, !range !0, !noundef !{}
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds float, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1470,6 +2127,36 @@ define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_too_large(ptr addr
 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc, s2, v0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
+; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: global_load_b32 v0, v[0:1], off
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s3
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc, v2, v0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc, v3, v1, vcc
+; GFX12-GISEL-NEXT: global_load_b32 v0, v[0:1], off
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: ; return to shader part epilog
 %voffset = load i32, ptr addrspace(1) %voffset.ptr, !range !1, !noundef !{}
 %zext.offset = zext i32 %voffset to i64
 %gep = getelementptr inbounds float, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1493,6 +2180,12 @@ define amdgpu_ps half @global_load_saddr_i16(ptr addrspace(1) inreg %sbase, i32
 ; GFX11-NEXT: global_load_u16 v0, v0, s[2:3]
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_u16 v0, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %load = load i16, ptr addrspace(1) %gep0
@@ -1512,6 +2205,12 @@ define amdgpu_ps half @global_load_saddr_i16_immneg128(ptr addrspace(1) inreg %s
 ; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i16_immneg128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -1532,6 +2231,12 @@ define amdgpu_ps half @global_load_saddr_f16(ptr addrspace(1) inreg %sbase, i32
 ; GFX11-NEXT: global_load_u16 v0, v0, s[2:3]
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_u16 v0, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %load = load half, ptr addrspace(1) %gep0
@@ -1550,6 +2255,12 @@ define amdgpu_ps half @global_load_saddr_f16_immneg128(ptr addrspace(1) inreg %s
 ; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_f16_immneg128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -1569,6 +2280,12 @@ define amdgpu_ps float @global_load_saddr_i32(ptr addrspace(1) inreg %sbase, i32
 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b32 v0, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %load = load i32, ptr addrspace(1) %gep0
@@ -1588,6 +2305,12 @@ define amdgpu_ps float @global_load_saddr_i32_immneg128(ptr addrspace(1) inreg %
 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i32_immneg128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -1608,6 +2331,12 @@ define amdgpu_ps float @global_load_saddr_f32(ptr addrspace(1) inreg %sbase, i32
 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b32 v0, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %load = load float, ptr addrspace(1) %gep0
@@ -1626,6 +2355,12 @@ define amdgpu_ps float @global_load_saddr_f32_immneg128(ptr addrspace(1) inreg %
 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_f32_immneg128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -1645,6 +2380,12 @@ define amdgpu_ps <2 x half> @global_load_saddr_v2i16(ptr addrspace(1) inreg %sba
 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v2i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b32 v0, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %load = load <2 x i16>, ptr addrspace(1) %gep0
@@ -1664,6 +2405,12 @@ define amdgpu_ps <2 x half> @global_load_saddr_v2i16_immneg128(ptr addrspace(1)
 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v2i16_immneg128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -1684,6 +2431,12 @@ define amdgpu_ps <2 x half> @global_load_saddr_v2f16(ptr addrspace(1) inreg %sba
 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v2f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b32 v0, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %load = load <2 x half>, ptr addrspace(1) %gep0
@@ -1702,6 +2455,12 @@ define amdgpu_ps <2 x half> @global_load_saddr_v2f16_immneg128(ptr addrspace(1)
 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v2f16_immneg128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -1721,6 +2480,12 @@ define amdgpu_ps <2 x half> @global_load_saddr_p3(ptr addrspace(1) inreg %sbase,
 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_p3:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b32 v0, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %load = load ptr addrspace(3), ptr addrspace(1) %gep0
@@ -1741,6 +2506,12 @@ define amdgpu_ps <2 x half> @global_load_saddr_p3_immneg128(ptr addrspace(1) inr
 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_p3_immneg128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -1762,6 +2533,12 @@ define amdgpu_ps <2 x float> @global_load_saddr_f64(ptr addrspace(1) inreg %sbas
 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_f64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %load = load double, ptr addrspace(1) %gep0
@@ -1781,6 +2558,12 @@ define amdgpu_ps <2 x float> @global_load_saddr_f64_immneg128(ptr addrspace(1) i
 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_f64_immneg128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -1801,6 +2584,12 @@ define amdgpu_ps <2 x float> @global_load_saddr_i64(ptr addrspace(1) inreg %sbas
 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %load = load i64, ptr addrspace(1) %gep0
@@ -1820,6 +2609,12 @@ define amdgpu_ps <2 x float> @global_load_saddr_i64_immneg128(ptr addrspace(1) i
 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i64_immneg128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -1840,6 +2635,12 @@ define amdgpu_ps <2 x float> @global_load_saddr_v2f32(ptr addrspace(1) inreg %sb
 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v2f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %load = load <2 x float>, ptr addrspace(1) %gep0
@@ -1858,6 +2659,12 @@ define amdgpu_ps <2 x float> @global_load_saddr_v2f32_immneg128(ptr addrspace(1)
 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v2f32_immneg128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -1877,6 +2684,12 @@ define amdgpu_ps <2 x float> @global_load_saddr_v2i32(ptr addrspace(1) inreg %sb
 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v2i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %load = load <2 x i32>, ptr addrspace(1) %gep0
@@ -1896,6 +2709,12 @@ define amdgpu_ps <2 x float> @global_load_saddr_v2i32_immneg128(ptr addrspace(1)
 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v2i32_immneg128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -1916,6 +2735,12 @@ define amdgpu_ps <2 x float> @global_load_saddr_v4i16(ptr addrspace(1) inreg %sb
 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v4i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %load = load <4 x i16>, ptr addrspace(1) %gep0
@@ -1935,6 +2760,12 @@ define amdgpu_ps <2 x float> @global_load_saddr_v4i16_immneg128(ptr addrspace(1)
 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v4i16_immneg128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -1955,6 +2786,12 @@ define amdgpu_ps <2 x float> @global_load_saddr_v4f16(ptr addrspace(1) inreg %sb
 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v4f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %load = load <4 x half>, ptr addrspace(1) %gep0
@@ -1974,6 +2811,12 @@ define amdgpu_ps <2 x float> @global_load_saddr_v4f16_immneg128(ptr addrspace(1)
 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v4f16_immneg128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -1994,6 +2837,12 @@ define amdgpu_ps <2 x float> @global_load_saddr_p1(ptr addrspace(1) inreg %sbase
 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_p1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %load = load ptr addrspace(1), ptr addrspace(1) %gep0
@@ -2014,6 +2863,12 @@ define amdgpu_ps <2 x float> @global_load_saddr_p1_immneg128(ptr addrspace(1) in
 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_p1_immneg128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2035,6 +2890,12 @@ define amdgpu_ps <3 x float> @global_load_saddr_v3f32(ptr addrspace(1) inreg %sb
 ; GFX11-NEXT: global_load_b96 v[0:2], v0, s[2:3]
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v3f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b96 v[0:2], v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %load = load <3 x float>, ptr addrspace(1) %gep0
@@ -2053,6 +2914,12 @@ define amdgpu_ps <3 x float> @global_load_saddr_v3f32_immneg128(ptr addrspace(1)
 ; GFX11-NEXT: global_load_b96 v[0:2], v0, s[2:3] offset:-128
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v3f32_immneg128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b96 v[0:2], v0, s[2:3] offset:-128
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2072,6 +2939,12 @@ define amdgpu_ps <3 x float> @global_load_saddr_v3i32(ptr addrspace(1) inreg %sb
 ; GFX11-NEXT: global_load_b96 v[0:2], v0, s[2:3]
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v3i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b96 v[0:2], v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %load = load <3 x i32>, ptr addrspace(1) %gep0
@@ -2091,6 +2964,12 @@ define amdgpu_ps <3 x float> @global_load_saddr_v3i32_immneg128(ptr addrspace(1)
 ; GFX11-NEXT: global_load_b96 v[0:2], v0, s[2:3] offset:-128
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v3i32_immneg128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b96 v[0:2], v0, s[2:3] offset:-128
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2111,6 +2990,12 @@ define amdgpu_ps <6 x half> @global_load_saddr_v6f16(ptr addrspace(1) inreg %sba
 ; GFX11-NEXT: global_load_b96 v[0:2], v0, s[2:3]
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v6f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b96 v[0:2], v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %load = load <6 x half>, ptr addrspace(1) %gep0
@@ -2129,6 +3014,12 @@ define amdgpu_ps <6 x half> @global_load_saddr_v6f16_immneg128(ptr addrspace(1)
 ; GFX11-NEXT: global_load_b96 v[0:2], v0, s[2:3] offset:-128
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v6f16_immneg128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b96 v[0:2], v0, s[2:3] offset:-128
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2148,6 +3039,12 @@ define amdgpu_ps <4 x float> @global_load_saddr_v4f32(ptr addrspace(1) inreg %sb
 ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3]
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v4f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %load = load <4 x float>, ptr addrspace(1) %gep0
@@ -2166,6 +3063,12 @@ define amdgpu_ps <4 x float> @global_load_saddr_v4f32_immneg128(ptr addrspace(1)
 ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v4f32_immneg128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2185,6 +3088,12 @@ define amdgpu_ps <4 x float> @global_load_saddr_v4i32(ptr addrspace(1) inreg %sb
 ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3]
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v4i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %load = load <4 x i32>, ptr addrspace(1) %gep0
@@ -2204,6 +3113,12 @@ define amdgpu_ps <4 x float> @global_load_saddr_v4i32_immneg128(ptr addrspace(1)
 ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v4i32_immneg128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2224,6 +3139,12 @@ define amdgpu_ps <4 x float> @global_load_saddr_v2i64(ptr addrspace(1) inreg %sb
 ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3]
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v2i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %load = load <2 x i64>, ptr addrspace(1) %gep0
@@ -2243,6 +3164,12 @@ define amdgpu_ps <4 x float> @global_load_saddr_v2i64_immneg128(ptr addrspace(1)
 ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v2i64_immneg128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2263,6 +3190,12 @@ define amdgpu_ps <4 x float> @global_load_saddr_i128(ptr addrspace(1) inreg %sba
 ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3]
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %load = load i128, ptr addrspace(1) %gep0
@@ -2282,6 +3215,12 @@ define amdgpu_ps <4 x float> @global_load_saddr_i128_immneg128(ptr addrspace(1)
 ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i128_immneg128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2302,6 +3241,12 @@ define amdgpu_ps <4 x float> @global_load_saddr_v2p1(ptr addrspace(1) inreg %sba
 ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3]
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v2p1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %load = load <2 x ptr addrspace(1)>, ptr addrspace(1) %gep0
@@ -2322,6 +3267,12 @@ define amdgpu_ps <4 x float> @global_load_saddr_v2p1_immneg128(ptr addrspace(1)
 ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v2p1_immneg128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2343,6 +3294,12 @@ define amdgpu_ps <4 x float> @global_load_saddr_v4p3(ptr addrspace(1) inreg %sba
 ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3]
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v4p3:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %load = load <4 x ptr addrspace(3)>, ptr addrspace(1) %gep0
@@ -2363,6 +3320,12 @@ define amdgpu_ps <4 x float> @global_load_saddr_v4p3_immneg128(ptr addrspace(1)
 ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v4p3_immneg128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2388,6 +3351,12 @@ define amdgpu_ps float @global_sextload_saddr_i8(ptr addrspace(1) inreg %sbase,
 ; GFX11-NEXT: global_load_i8 v0, v0, s[2:3]
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_sextload_saddr_i8:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_i8 v0, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %load = load i8, ptr addrspace(1) %gep0
@@ -2408,6 +3377,12 @@ define amdgpu_ps float @global_sextload_saddr_i8_immneg128(ptr addrspace(1) inre
 ; GFX11-NEXT: global_load_i8 v0, v0, s[2:3] offset:-128
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_sextload_saddr_i8_immneg128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_i8 v0, v0, s[2:3] offset:-128
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2429,6 +3404,12 @@ define amdgpu_ps float @global_sextload_saddr_i16(ptr addrspace(1) inreg %sbase,
 ; GFX11-NEXT: global_load_i16 v0, v0, s[2:3]
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_sextload_saddr_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_i16 v0, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %load = load i16, ptr addrspace(1) %gep0
@@ -2449,6 +3430,12 @@ define amdgpu_ps float @global_sextload_saddr_i16_immneg128(ptr addrspace(1) inr
 ; GFX11-NEXT: global_load_i16 v0, v0, s[2:3] offset:-128
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_sextload_saddr_i16_immneg128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_i16 v0, v0, s[2:3] offset:-128
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2470,6 +3457,12 @@ define amdgpu_ps float @global_zextload_saddr_i8(ptr addrspace(1) inreg %sbase,
 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3]
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_zextload_saddr_i8:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %load = load i8, ptr addrspace(1) %gep0
@@ -2490,6 +3483,12 @@ define amdgpu_ps float @global_zextload_saddr_i8_immneg128(ptr addrspace(1) inre
 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-128
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_zextload_saddr_i8_immneg128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-128
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2511,6 +3510,12 @@ define amdgpu_ps float @global_zextload_saddr_i16(ptr addrspace(1) inreg %sbase,
 ; GFX11-NEXT: global_load_u16 v0, v0, s[2:3]
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_zextload_saddr_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_u16 v0, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %load = load i16, ptr addrspace(1) %gep0
@@ -2531,6 +3536,12 @@ define amdgpu_ps float @global_zextload_saddr_i16_immneg128(ptr addrspace(1) inr
 ; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_zextload_saddr_i16_immneg128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2567,6 +3578,14 @@ define amdgpu_ps float @atomic_global_load_saddr_i32(ptr addrspace(1) inreg %sba
 ; GFX11-NEXT: buffer_gl0_inv
 ; GFX11-NEXT: buffer_gl1_inv
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: atomic_global_load_saddr_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] th:TH_LOAD_NT
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: buffer_gl0_inv
+; GFX12-NEXT: buffer_gl1_inv
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %load = load atomic i32, ptr addrspace(1) %gep0 seq_cst, align 4
@@ -2597,6 +3616,14 @@ define amdgpu_ps float @atomic_global_load_saddr_i32_immneg128(ptr addrspace(1)
 ; GFX11-NEXT: buffer_gl0_inv
 ; GFX11-NEXT: buffer_gl1_inv
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: atomic_global_load_saddr_i32_immneg128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 th:TH_LOAD_NT
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: buffer_gl0_inv
+; GFX12-NEXT: buffer_gl1_inv
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2628,6 +3655,14 @@ define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64(ptr addrspace(1) inre
 ; GFX11-NEXT: buffer_gl0_inv
 ; GFX11-NEXT: buffer_gl1_inv
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: atomic_global_load_saddr_i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] th:TH_LOAD_NT
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: buffer_gl0_inv
+; GFX12-NEXT: buffer_gl1_inv
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %load = load atomic i64, ptr addrspace(1) %gep0 seq_cst, align 8
@@ -2658,6 +3693,14 @@ define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64_immneg128(ptr addrspa
 ; GFX11-NEXT: buffer_gl0_inv
 ; GFX11-NEXT: buffer_gl1_inv
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: atomic_global_load_saddr_i64_immneg128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 th:TH_LOAD_NT
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: buffer_gl0_inv
+; GFX12-NEXT: buffer_gl1_inv
+; GFX12-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2682,6 +3725,18 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi(ptr addrspace(
 ; GFX11-NEXT: global_load_d16_b16 v0, v0, s[2:3]
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_undef_hi:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: global_load_d16_b16 v0, v0, s[2:3]
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_undef_hi:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3]
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %load = load i16, ptr addrspace(1) %gep0
@@ -2702,6 +3757,18 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi_immneg128(ptr
 ; GFX11-NEXT: global_load_d16_b16 v0, v0, s[2:3] offset:-128
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_undef_hi_immneg128:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: global_load_d16_b16 v0, v0, s[2:3] offset:-128
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_undef_hi_immneg128:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2727,6 +3794,21 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi(ptr addrspace(1
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_mov_b32_e32 v0, v1
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_zero_hi:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-SDAG-NEXT: global_load_d16_b16 v1, v0, s[2:3]
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_zero_hi:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3]
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-GISEL-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %load = load i16, ptr addrspace(1) %gep0
@@ -2751,6 +3833,21 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi_immneg128(ptr a
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_mov_b32_e32 v0, v1
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_zero_hi_immneg128:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-SDAG-NEXT: global_load_d16_b16 v1, v0, s[2:3] offset:-128
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_zero_hi_immneg128:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-GISEL-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2774,6 +3871,20 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi(ptr addrspace(1)
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_mov_b32_e32 v0, v1
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_reg_hi:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: global_load_d16_b16 v1, v0, s[2:3]
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_reg_hi:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3]
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
+; GFX12-GISEL-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %load = load i16, ptr addrspace(1) %gep0
@@ -2796,6 +3907,20 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi_immneg128(ptr ad
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_mov_b32_e32 v0, v1
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_reg_hi_immneg128:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: global_load_d16_b16 v1, v0, s[2:3] offset:-128
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_reg_hi_immneg128:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
+; GFX12-GISEL-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2819,6 +3944,20 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi(ptr addrs
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_mov_b32_e32 v0, v1
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: global_load_d16_u8 v1, v0, s[2:3]
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[2:3]
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
+; GFX12-GISEL-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %load = load i8, ptr addrspace(1) %gep0
@@ -2842,6 +3981,20 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_mov_b32_e32 v0, v1
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: global_load_d16_u8 v1, v0, s[2:3] offset:-128
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[2:3] offset:-128
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
+; GFX12-GISEL-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2866,6 +4019,22 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi(ptr addrs
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_mov_b32_e32 v0, v1
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: global_load_d16_i8 v1, v0, s[2:3]
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: global_load_i8 v0, v0, s[2:3]
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
+; GFX12-GISEL-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %load = load i8, ptr addrspace(1) %gep0
@@ -2889,6 +4058,22 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_mov_b32_e32 v0, v1
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: global_load_d16_i8 v1, v0, s[2:3] offset:-128
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: global_load_i8 v0, v0, s[2:3] offset:-128
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
+; GFX12-GISEL-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2915,6 +4100,19 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi(ptr addrspace(
 ; GFX11-NEXT: global_load_d16_hi_b16 v0, v0, s[2:3]
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_undef_hi:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: global_load_d16_hi_b16 v0, v0, s[2:3]
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_undef_hi:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3]
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-GISEL-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %load = load i16, ptr addrspace(1) %gep0
@@ -2935,6 +4133,19 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi_immneg128(ptr
 ; GFX11-NEXT: global_load_d16_hi_b16 v0, v0, s[2:3] offset:-128
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: global_load_d16_hi_b16 v0, v0, s[2:3] offset:-128
GFX12-SDAG-NEXT: global_load_d16_hi_b16 v0, v0, s[2:3] offset:-128 +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128 +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2960,6 +4171,21 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi(ptr addrspace(1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_zero_hi: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-SDAG-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_zero_hi: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3] +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %load = load i16, ptr addrspace(1) %gep0 @@ -2984,6 +4210,21 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi_immneg128(ptr a ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-SDAG-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] offset:-128 +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128 +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -3007,6 +4248,22 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi(ptr addrspace(1) ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_reg_hi: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_reg_hi: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3] +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX12-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext 
i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %load = load i16, ptr addrspace(1) %gep0 @@ -3029,6 +4286,22 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi_immneg128(ptr ad ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_reg_hi_immneg128: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] offset:-128 +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_reg_hi_immneg128: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128 +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX12-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -3052,6 +4325,22 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi(ptr addrs ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: global_load_d16_hi_u8 v1, v0, s[2:3] +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[2:3] +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX12-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %load = load i8, ptr addrspace(1) %gep0 @@ -3075,6 +4364,22 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: global_load_d16_hi_u8 v1, v0, s[2:3] offset:-128 +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[2:3] offset:-128 +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX12-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -3099,6 +4404,23 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi(ptr addrs ; GFX11-NEXT: s_waitcnt vmcnt(0) ; 
GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: global_load_d16_hi_i8 v1, v0, s[2:3] +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: global_load_i8 v0, v0, s[2:3] +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX12-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %load = load i8, ptr addrspace(1) %gep0 @@ -3122,6 +4444,23 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: global_load_d16_hi_i8 v1, v0, s[2:3] offset:-128 +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: global_load_i8 v0, v0, s[2:3] offset:-128 +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX12-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -3153,6 +4492,14 @@ define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_16(ptr add ; GFX11-NEXT: global_load_u8 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_16: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_or_b32_e32 v0, 16, v0 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: global_load_u8 v0, v[0:1], off +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %zext.idx = zext i32 %idx to i64 %or = or i64 %zext.idx, 16 %addr = inttoptr i64 %or to ptr addrspace(1) @@ -3178,6 +4525,14 @@ define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_4160(ptr a ; GFX11-NEXT: global_load_u8 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_4160: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_or_b32_e32 v0, 0x1040, v0 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: global_load_u8 v0, v[0:1], off +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %zext.idx = zext i32 %idx to i64 %or = or i64 %zext.idx, 4160 %addr = inttoptr i64 %or to ptr addrspace(1) @@ -3196,7 +4551,7 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) { ; 
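
The d16 load hunks above all share one shape: a uniform base pointer passed inreg (an SGPR pair in the checks), a divergent zero-extended 32-bit VGPR offset, and a 16-bit or sub-byte extending load whose result is merged into one half of a <2 x half> return value. The GFX12-SDAG checks keep the dedicated global_load_d16_* forms, which write only half of the destination VGPR, while the GFX12-GISEL checks load into a full register and merge with v_lshlrev_b32/v_and_or_b32. The exact test bodies are truncated in this diff; the following is a minimal sketch of the "reg_hi" variant, where the function name and the insertelement/bitcast idiom are assumptions, not copied from the tests.

; NOTE: hedged sketch of the d16hi reg_hi pattern; name and merge idiom assumed.
define amdgpu_ps <2 x half> @sketch_d16hi_reg_hi(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %load = load i16, ptr addrspace(1) %gep0   ; loaded half ends up in the high 16 bits
  %lo = extractelement <2 x i16> %reg, i32 0 ; low 16 bits are preserved from a register
  %build0 = insertelement <2 x i16> undef, i16 %lo, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  %cast = bitcast <2 x i16> %build1 to <2 x half>
  ret <2 x half> %cast
}

On this shape SelectionDAG can select global_load_d16_hi_b16 v1, v0, s[2:3] directly, as the GFX12-SDAG checks show.
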
GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: .LBB128_1: ; %bb3 +; GFX9-NEXT: .LBB132_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_add_u32 s4, s2, s0 ; GFX9-NEXT: s_addc_u32 s5, s3, s1 @@ -3205,7 +4560,7 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) { ; GFX9-NEXT: s_add_u32 s0, s0, 4 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: s_cmpk_eq_i32 s0, 0x400 -; GFX9-NEXT: s_cbranch_scc0 .LBB128_1 +; GFX9-NEXT: s_cbranch_scc0 .LBB132_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm ; @@ -3213,7 +4568,7 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) { ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-NEXT: .LBB128_1: ; %bb3 +; GFX10-NEXT: .LBB132_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_add_u32 s4, s2, s0 @@ -3223,7 +4578,7 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) { ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_cmpk_eq_i32 s0, 0x400 -; GFX10-NEXT: s_cbranch_scc0 .LBB128_1 +; GFX10-NEXT: s_cbranch_scc0 .LBB132_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm ; @@ -3231,7 +4586,7 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) { ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_mov_b64 s[0:1], 0 -; GFX11-NEXT: .LBB128_1: ; %bb3 +; GFX11-NEXT: .LBB132_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_add_u32 s4, s2, s0 @@ -3241,9 +4596,46 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_addc_u32 s1, s1, 0 ; GFX11-NEXT: s_cmpk_eq_i32 s0, 0x400 -; GFX11-NEXT: s_cbranch_scc0 .LBB128_1 +; GFX11-NEXT: s_cbranch_scc0 .LBB132_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: global_addr_64bit_lsr_iv: +; GFX12-SDAG: ; %bb.0: ; %bb +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-SDAG-NEXT: .LBB132_1: ; %bb3 +; GFX12-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1] +; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4 +; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[4:5] th:TH_LOAD_RT_NT +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_cmp_eq_u32 s0, 0x400 +; GFX12-SDAG-NEXT: s_cbranch_scc0 .LBB132_1 +; GFX12-SDAG-NEXT: ; %bb.2: ; %bb2 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: global_addr_64bit_lsr_iv: +; GFX12-GISEL: ; %bb.0: ; %bb +; GFX12-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-GISEL-NEXT: .LBB132_1: ; %bb3 +; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_add_co_u32 v4, vcc, v0, v2 +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v1, v3, vcc +; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc, v2, 4 +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc, 0, v3, vcc +; GFX12-GISEL-NEXT: global_load_b32 v4, v[4:5], off th:TH_LOAD_RT_NT +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; 
GFX12-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0x400, v2 +; GFX12-GISEL-NEXT: s_cbranch_vccz .LBB132_1 +; GFX12-GISEL-NEXT: ; %bb.2: ; %bb2 +; GFX12-GISEL-NEXT: s_endpgm bb: br label %bb3 @@ -3267,7 +4659,7 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: .LBB129_1: ; %bb3 +; GFX9-NEXT: .LBB133_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_add_u32 s4, s2, s0 ; GFX9-NEXT: s_addc_u32 s5, s3, s1 @@ -3279,7 +4671,7 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg ; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: s_cmpk_eq_i32 s0, 0x400 ; GFX9-NEXT: ; kill: killed $sgpr4 killed $sgpr5 -; GFX9-NEXT: s_cbranch_scc0 .LBB129_1 +; GFX9-NEXT: s_cbranch_scc0 .LBB133_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm ; @@ -3287,7 +4679,7 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-NEXT: .LBB129_1: ; %bb3 +; GFX10-NEXT: .LBB133_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_add_u32 s4, s2, s0 @@ -3300,7 +4692,7 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_cmpk_eq_i32 s0, 0x400 ; GFX10-NEXT: ; kill: killed $sgpr4 killed $sgpr5 -; GFX10-NEXT: s_cbranch_scc0 .LBB129_1 +; GFX10-NEXT: s_cbranch_scc0 .LBB133_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm ; @@ -3308,7 +4700,7 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_mov_b64 s[0:1], 0 -; GFX11-NEXT: .LBB129_1: ; %bb3 +; GFX11-NEXT: .LBB133_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_add_u32 s4, s2, s0 @@ -3320,9 +4712,50 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_addc_u32 s1, s1, 0 ; GFX11-NEXT: s_cmpk_eq_i32 s0, 0x400 -; GFX11-NEXT: s_cbranch_scc0 .LBB129_1 +; GFX11-NEXT: s_cbranch_scc0 .LBB133_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: global_addr_64bit_lsr_iv_multiload: +; GFX12-SDAG: ; %bb.0: ; %bb +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-SDAG-NEXT: .LBB133_1: ; %bb3 +; GFX12-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1] +; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4 +; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[4:5] th:TH_LOAD_RT_NT +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[4:5] th:TH_LOAD_RT_NT +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_cmp_eq_u32 s0, 0x400 +; GFX12-SDAG-NEXT: s_cbranch_scc0 .LBB133_1 +; GFX12-SDAG-NEXT: ; %bb.2: ; %bb2 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: global_addr_64bit_lsr_iv_multiload: +; GFX12-GISEL: ; %bb.0: ; %bb +; GFX12-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-GISEL-NEXT: .LBB133_1: ; %bb3 +; GFX12-GISEL-NEXT: ; =>This 
Inner Loop Header: Depth=1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-NEXT: v_add_co_u32 v4, vcc, v0, v2
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v1, v3, vcc
+; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc, v2, 4
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc, 0, v3, vcc
+; GFX12-GISEL-NEXT: global_load_b32 v6, v[4:5], off th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: global_load_b32 v4, v[4:5], off th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0x400, v2
+; GFX12-GISEL-NEXT: s_cbranch_vccz .LBB133_1
+; GFX12-GISEL-NEXT: ; %bb.2: ; %bb2
+; GFX12-GISEL-NEXT: s_endpgm
 bb:
 br label %bb3
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll
index 9cf5b6b9273f3..2a4a3ad2e3bca 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12 %s

 ; Test using saddr addressing mode of global_*store_* flat instructions.

@@ -21,6 +22,15 @@ define amdgpu_ps void @global_store_saddr_i8_zext_vgpr(ptr addrspace(1) inreg %s
 ; GFX11-NEXT: s_nop 0
 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_i8_zext_vgpr:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b32 v0, v[0:1], off
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
 %voffset = load i32, ptr addrspace(1) %voffset.ptr
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -45,6 +55,15 @@ define amdgpu_ps void @global_store_saddr_i8_zext_vgpr_offset_2047(ptr addrspace
 ; GFX11-NEXT: s_nop 0
 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_i8_zext_vgpr_offset_2047:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b32 v0, v[0:1], off
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_store_b8 v0, v2, s[2:3] offset:2047
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
 %voffset = load i32, ptr addrspace(1) %voffset.ptr
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -70,6 +89,15 @@ define amdgpu_ps void @global_store_saddr_i8_zext_vgpr_offset_neg2048(ptr addrsp
 ; GFX11-NEXT: s_nop 0
 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_i8_zext_vgpr_offset_neg2048:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b32 v0, v[0:1], off
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_store_b8 v0, v2, s[2:3] offset:-2048
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
 %voffset = load i32, ptr addrspace(1) %voffset.ptr
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr
addrspace(1) %sbase, i64 %zext.offset @@ -118,6 +146,18 @@ define amdgpu_ps void @global_store_saddr_uniform_ptr_in_vgprs(i32 %voffset, i8 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_uniform_ptr_in_vgprs: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: ds_load_b64 v[2:3], v2 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_readfirstlane_b32 s0, v2 +; GFX12-NEXT: v_readfirstlane_b32 s1, v3 +; GFX12-NEXT: global_store_b8 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -159,6 +199,18 @@ define amdgpu_ps void @global_store_saddr_uniform_ptr_in_vgprs_immoffset(i32 %vo ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_uniform_ptr_in_vgprs_immoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: ds_load_b64 v[2:3], v2 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_readfirstlane_b32 s0, v2 +; GFX12-NEXT: v_readfirstlane_b32 s1, v3 +; GFX12-NEXT: global_store_b8 v0, v1, s[0:1] offset:-120 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -183,6 +235,13 @@ define amdgpu_ps void @global_store_saddr_i16_zext_vgpr(ptr addrspace(1) inreg % ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_i16_zext_vgpr: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b16 v0, v1, s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset store i16 %data, ptr addrspace(1) %gep0 @@ -201,6 +260,13 @@ define amdgpu_ps void @global_store_saddr_i16_zext_vgpr_offset_neg128(ptr addrsp ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_i16_zext_vgpr_offset_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b16 v0, v1, s[2:3] offset:-128 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -220,6 +286,13 @@ define amdgpu_ps void @global_store_saddr_f16_zext_vgpr(ptr addrspace(1) inreg % ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_f16_zext_vgpr: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b16 v0, v1, s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset store half %data, ptr addrspace(1) %gep0 @@ -238,6 +311,13 @@ define amdgpu_ps void @global_store_saddr_f16_zext_vgpr_offset_neg128(ptr addrsp ; GFX11-NEXT: s_nop 0 ; 
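
Every store test in this file reuses the same addressing recipe, so the GFX12 deltas that follow are mechanical: the scalar base arrives inreg (the SGPR pair s[2:3] in the checks), the divergent 32-bit offset is zero-extended in a VGPR, and any small signed displacement folds into the immediate offset field (offset:2047, offset:-2048, offset:-128). A representative input, distilled directly from the patterns visible in these hunks; only the function name is illustrative:

define amdgpu_ps void @sketch_saddr_store_offset_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
  store i32 %data, ptr addrspace(1) %gep1
  ret void
}

On GFX12 this selects to a single global_store_b32 v0, v1, s[2:3] offset:-128, matching the checks, with the s_nop 0 plus s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) epilogue appended for amdgpu_ps entry points.
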
GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_f16_zext_vgpr_offset_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b16 v0, v1, s[2:3] offset:-128 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -257,6 +337,13 @@ define amdgpu_ps void @global_store_saddr_i32_zext_vgpr(ptr addrspace(1) inreg % ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_i32_zext_vgpr: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset store i32 %data, ptr addrspace(1) %gep0 @@ -275,6 +362,13 @@ define amdgpu_ps void @global_store_saddr_i32_zext_vgpr_offset_neg128(ptr addrsp ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_i32_zext_vgpr_offset_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] offset:-128 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -294,6 +388,13 @@ define amdgpu_ps void @global_store_saddr_f32_zext_vgpr(ptr addrspace(1) inreg % ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_f32_zext_vgpr: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset store float %data, ptr addrspace(1) %gep0 @@ -312,6 +413,13 @@ define amdgpu_ps void @global_store_saddr_f32_zext_vgpr_offset_neg128(ptr addrsp ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_f32_zext_vgpr_offset_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] offset:-128 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -331,6 +439,13 @@ define amdgpu_ps void @global_store_saddr_p3_zext_vgpr(ptr addrspace(1) inreg %s ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_p3_zext_vgpr: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset store ptr addrspace(3) %data, ptr addrspace(1) %gep0 @@ -349,6 +464,13 @@ define amdgpu_ps void 
@global_store_saddr_p3_zext_vgpr_offset_neg128(ptr addrspa ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_p3_zext_vgpr_offset_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] offset:-128 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -368,6 +490,13 @@ define amdgpu_ps void @global_store_saddr_i64_zext_vgpr(ptr addrspace(1) inreg % ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_i64_zext_vgpr: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset store i64 %data, ptr addrspace(1) %gep0 @@ -386,6 +515,13 @@ define amdgpu_ps void @global_store_saddr_i64_zext_vgpr_offset_neg128(ptr addrsp ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_i64_zext_vgpr_offset_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -405,6 +541,13 @@ define amdgpu_ps void @global_store_saddr_f64_zext_vgpr(ptr addrspace(1) inreg % ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_f64_zext_vgpr: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset store double %data, ptr addrspace(1) %gep0 @@ -423,6 +566,13 @@ define amdgpu_ps void @global_store_saddr_f64_zext_vgpr_offset_neg128(ptr addrsp ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_f64_zext_vgpr_offset_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -442,6 +592,13 @@ define amdgpu_ps void @global_store_saddr_v2i32_zext_vgpr(ptr addrspace(1) inreg ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_v2i32_zext_vgpr: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset store <2 x 
i32> %data, ptr addrspace(1) %gep0 @@ -460,6 +617,13 @@ define amdgpu_ps void @global_store_saddr_v2i32_zext_vgpr_offset_neg128(ptr addr ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_v2i32_zext_vgpr_offset_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -479,6 +643,13 @@ define amdgpu_ps void @global_store_saddr_v2f32_zext_vgpr(ptr addrspace(1) inreg ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_v2f32_zext_vgpr: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset store <2 x float> %data, ptr addrspace(1) %gep0 @@ -497,6 +668,13 @@ define amdgpu_ps void @global_store_saddr_v2f32_zext_vgpr_offset_neg128(ptr addr ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_v2f32_zext_vgpr_offset_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -516,6 +694,13 @@ define amdgpu_ps void @global_store_saddr_v4i16_zext_vgpr(ptr addrspace(1) inreg ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_v4i16_zext_vgpr: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset store <4 x i16> %data, ptr addrspace(1) %gep0 @@ -534,6 +719,13 @@ define amdgpu_ps void @global_store_saddr_v4i16_zext_vgpr_offset_neg128(ptr addr ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_v4i16_zext_vgpr_offset_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -553,6 +745,13 @@ define amdgpu_ps void @global_store_saddr_v4f16_zext_vgpr(ptr addrspace(1) inreg ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_v4f16_zext_vgpr: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 
%voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset store <4 x half> %data, ptr addrspace(1) %gep0 @@ -571,6 +770,13 @@ define amdgpu_ps void @global_store_saddr_v4f16_zext_vgpr_offset_neg128(ptr addr ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_v4f16_zext_vgpr_offset_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -590,6 +796,13 @@ define amdgpu_ps void @global_store_saddr_p1_zext_vgpr(ptr addrspace(1) inreg %s ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_p1_zext_vgpr: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset store ptr addrspace(1) %data, ptr addrspace(1) %gep0 @@ -608,6 +821,13 @@ define amdgpu_ps void @global_store_saddr_p1_zext_vgpr_offset_neg128(ptr addrspa ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_p1_zext_vgpr_offset_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -627,6 +847,13 @@ define amdgpu_ps void @global_store_saddr_v3i32_zext_vgpr(ptr addrspace(1) inreg ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_v3i32_zext_vgpr: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b96 v0, v[1:3], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset store <3 x i32> %data, ptr addrspace(1) %gep0 @@ -645,6 +872,13 @@ define amdgpu_ps void @global_store_saddr_v3i32_zext_vgpr_offset_neg128(ptr addr ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_v3i32_zext_vgpr_offset_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b96 v0, v[1:3], s[2:3] offset:-128 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -664,6 +898,13 @@ define amdgpu_ps void @global_store_saddr_v3f32_zext_vgpr(ptr addrspace(1) inreg ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_v3f32_zext_vgpr: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b96 v0, v[1:3], s[2:3] +; GFX12-NEXT: s_nop 
0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset store <3 x float> %data, ptr addrspace(1) %gep0 @@ -682,6 +923,13 @@ define amdgpu_ps void @global_store_saddr_v3f32_zext_vgpr_offset_neg128(ptr addr ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_v3f32_zext_vgpr_offset_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b96 v0, v[1:3], s[2:3] offset:-128 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -701,6 +949,13 @@ define amdgpu_ps void @global_store_saddr_v6i16_zext_vgpr(ptr addrspace(1) inreg ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_v6i16_zext_vgpr: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b96 v0, v[1:3], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset store <6 x i16> %data, ptr addrspace(1) %gep0 @@ -719,6 +974,13 @@ define amdgpu_ps void @global_store_saddr_v6i16_zext_vgpr_offset_neg128(ptr addr ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_v6i16_zext_vgpr_offset_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b96 v0, v[1:3], s[2:3] offset:-128 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -738,6 +1000,13 @@ define amdgpu_ps void @global_store_saddr_v6f16_zext_vgpr(ptr addrspace(1) inreg ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_v6f16_zext_vgpr: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b96 v0, v[1:3], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset store <6 x half> %data, ptr addrspace(1) %gep0 @@ -756,6 +1025,13 @@ define amdgpu_ps void @global_store_saddr_v6f16_zext_vgpr_offset_neg128(ptr addr ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_v6f16_zext_vgpr_offset_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b96 v0, v[1:3], s[2:3] offset:-128 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -775,6 +1051,13 @@ define amdgpu_ps void @global_store_saddr_v4i32_zext_vgpr(ptr addrspace(1) inreg ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: 
global_store_saddr_v4i32_zext_vgpr: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b128 v0, v[1:4], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset store <4 x i32> %data, ptr addrspace(1) %gep0 @@ -793,6 +1076,13 @@ define amdgpu_ps void @global_store_saddr_v4i32_zext_vgpr_offset_neg128(ptr addr ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_v4i32_zext_vgpr_offset_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b128 v0, v[1:4], s[2:3] offset:-128 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -812,6 +1102,13 @@ define amdgpu_ps void @global_store_saddr_v4f32_zext_vgpr(ptr addrspace(1) inreg ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_v4f32_zext_vgpr: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b128 v0, v[1:4], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset store <4 x float> %data, ptr addrspace(1) %gep0 @@ -830,6 +1127,13 @@ define amdgpu_ps void @global_store_saddr_v4f32_zext_vgpr_offset_neg128(ptr addr ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_v4f32_zext_vgpr_offset_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b128 v0, v[1:4], s[2:3] offset:-128 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -849,6 +1153,13 @@ define amdgpu_ps void @global_store_saddr_v2i64_zext_vgpr(ptr addrspace(1) inreg ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_v2i64_zext_vgpr: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b128 v0, v[1:4], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset store <2 x i64> %data, ptr addrspace(1) %gep0 @@ -867,6 +1178,13 @@ define amdgpu_ps void @global_store_saddr_v2i64_zext_vgpr_offset_neg128(ptr addr ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_v2i64_zext_vgpr_offset_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b128 v0, v[1:4], s[2:3] offset:-128 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -886,6 +1204,13 @@ define amdgpu_ps void @global_store_saddr_v2f64_zext_vgpr(ptr addrspace(1) inreg ; 
GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_v2f64_zext_vgpr: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b128 v0, v[1:4], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset store <2 x double> %data, ptr addrspace(1) %gep0 @@ -904,6 +1229,13 @@ define amdgpu_ps void @global_store_saddr_v2f64_zext_vgpr_offset_neg128(ptr addr ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_v2f64_zext_vgpr_offset_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b128 v0, v[1:4], s[2:3] offset:-128 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -923,6 +1255,13 @@ define amdgpu_ps void @global_store_saddr_v8i16_zext_vgpr(ptr addrspace(1) inreg ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_v8i16_zext_vgpr: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b128 v0, v[1:4], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset store <8 x i16> %data, ptr addrspace(1) %gep0 @@ -941,6 +1280,13 @@ define amdgpu_ps void @global_store_saddr_v8i16_zext_vgpr_offset_neg128(ptr addr ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_v8i16_zext_vgpr_offset_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b128 v0, v[1:4], s[2:3] offset:-128 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -960,6 +1306,13 @@ define amdgpu_ps void @global_store_saddr_v8f16_zext_vgpr(ptr addrspace(1) inreg ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_v8f16_zext_vgpr: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b128 v0, v[1:4], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset store <8 x half> %data, ptr addrspace(1) %gep0 @@ -978,6 +1331,13 @@ define amdgpu_ps void @global_store_saddr_v8f16_zext_vgpr_offset_neg128(ptr addr ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_v8f16_zext_vgpr_offset_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b128 v0, v[1:4], s[2:3] offset:-128 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) 
%gep0, i64 -128 @@ -997,6 +1357,13 @@ define amdgpu_ps void @global_store_saddr_v2p1_zext_vgpr(ptr addrspace(1) inreg ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_v2p1_zext_vgpr: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b128 v0, v[1:4], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset store <2 x ptr addrspace(1)> %data, ptr addrspace(1) %gep0 @@ -1015,6 +1382,13 @@ define amdgpu_ps void @global_store_saddr_v2p1_zext_vgpr_offset_neg128(ptr addrs ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_v2p1_zext_vgpr_offset_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b128 v0, v[1:4], s[2:3] offset:-128 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1034,6 +1408,13 @@ define amdgpu_ps void @global_store_saddr_v4p3_zext_vgpr(ptr addrspace(1) inreg ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_v4p3_zext_vgpr: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b128 v0, v[1:4], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset store <4 x ptr addrspace(3)> %data, ptr addrspace(1) %gep0 @@ -1052,6 +1433,13 @@ define amdgpu_ps void @global_store_saddr_v4p3_zext_vgpr_offset_neg128(ptr addrs ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_v4p3_zext_vgpr_offset_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b128 v0, v[1:4], s[2:3] offset:-128 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1075,6 +1463,13 @@ define amdgpu_ps void @atomic_global_store_saddr_i32_zext_vgpr(ptr addrspace(1) ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_global_store_saddr_i32_zext_vgpr: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset store atomic i32 %data, ptr addrspace(1) %gep0 seq_cst, align 4 @@ -1093,6 +1488,13 @@ define amdgpu_ps void @atomic_global_store_saddr_i32_zext_vgpr_offset_neg128(ptr ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_global_store_saddr_i32_zext_vgpr_offset_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] offset:-128 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = 
zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1112,6 +1514,13 @@ define amdgpu_ps void @atomic_global_store_saddr_i64_zext_vgpr(ptr addrspace(1) ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_global_store_saddr_i64_zext_vgpr: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset store atomic i64 %data, ptr addrspace(1) %gep0 seq_cst, align 8 @@ -1130,6 +1539,13 @@ define amdgpu_ps void @atomic_global_store_saddr_i64_zext_vgpr_offset_neg128(ptr ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_global_store_saddr_i64_zext_vgpr_offset_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1153,6 +1569,13 @@ define amdgpu_ps void @global_store_saddr_i16_d16hi_zext_vgpr(ptr addrspace(1) i ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_i16_d16hi_zext_vgpr: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_d16_hi_b16 v0, v1, s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %data.hi = extractelement <2 x i16> %data, i32 1 @@ -1172,6 +1595,13 @@ define amdgpu_ps void @global_store_saddr_i16_d16hi_zext_vgpr_offset_neg128(ptr ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_i16_d16hi_zext_vgpr_offset_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_d16_hi_b16 v0, v1, s[2:3] offset:-128 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1192,6 +1622,13 @@ define amdgpu_ps void @global_store_saddr_i16_d16hi_trunci8_zext_vgpr(ptr addrsp ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_i16_d16hi_trunci8_zext_vgpr: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_d16_hi_b8 v0, v1, s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %data.hi = extractelement <2 x i16> %data, i32 1 @@ -1212,6 +1649,13 @@ define amdgpu_ps void @global_store_saddr_i16_d16hi_trunci8_zext_vgpr_offset_neg ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_store_saddr_i16_d16hi_trunci8_zext_vgpr_offset_neg128: +; GFX12: ; 
%bb.0:
+; GFX12-NEXT: global_store_d16_hi_b8 v0, v1, s[2:3] offset:-128
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
 %zext.offset = zext i32 %voffset to i64
 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
index 31de604647a13..3d11c8bc499e2 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=bonaire -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CI %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=None -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s

 define amdgpu_kernel void @atomic_add_i64_offset(ptr addrspace(1) %out, i64 %in) {
 ; CI-LABEL: atomic_add_i64_offset:
@@ -41,6 +42,18 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
 ; GFX9-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_add_i64_offset:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT: buffer_gl0_inv
+; GFX12-NEXT: buffer_gl1_inv
+; GFX12-NEXT: s_endpgm
 entry:
 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
 %tmp0 = atomicrmw volatile add ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
@@ -99,6 +112,23 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT: buffer_wbinvl1_vol
 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_add_i64_ret_offset:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_mov_b32_e32 v0, s4
+; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: buffer_gl0_inv
+; GFX12-NEXT: buffer_gl1_inv
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
 entry:
 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
 %tmp0 = atomicrmw volatile add ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
@@ -158,6 +188,23 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr addrspace(1) %out, i
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
 ; GFX9-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_add_i64_addr64_offset:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+;
GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 @@ -225,6 +272,24 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 @@ -275,6 +340,18 @@ define amdgpu_kernel void @atomic_add_i64(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile add ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst ret void @@ -332,6 +409,23 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile add ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst store i64 %tmp0, ptr addrspace(1) %out2 @@ -388,6 +482,23 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX9-NEXT: s_waitcnt vmcnt(0) ; 
GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %tmp0 = atomicrmw volatile add ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst @@ -452,6 +563,24 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %tmp0 = atomicrmw volatile add ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst @@ -497,6 +626,18 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw volatile and ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst @@ -555,6 +696,23 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw volatile and ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst @@ -614,6 +772,23 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 @@ -681,6 +856,24 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 @@ -731,6 +924,18 @@ define amdgpu_kernel void @atomic_and_i64(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile and ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst ret void @@ -788,6 +993,23 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: global_atomic_and_b64 v[0:1], 
v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile and ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst store i64 %tmp0, ptr addrspace(1) %out2 @@ -844,6 +1066,23 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %tmp0 = atomicrmw volatile and ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst @@ -908,6 +1147,24 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %tmp0 = atomicrmw volatile and ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst @@ -953,6 +1210,18 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst @@ -1011,6 +1280,23 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_i64_ret_offset: +; GFX12: 
; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst @@ -1070,6 +1356,23 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 @@ -1137,6 +1440,24 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 @@ -1187,6 +1508,18 @@ define amdgpu_kernel void @atomic_sub_i64(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile sub ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst ret void @@ -1244,6 +1577,23 
@@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile sub ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst store i64 %tmp0, ptr addrspace(1) %out2 @@ -1300,6 +1650,23 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %tmp0 = atomicrmw volatile sub ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst @@ -1364,6 +1731,24 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %tmp0 = atomicrmw volatile sub ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst @@ -1403,6 +1788,17 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_smax_x2 v2, v[0:1], s[0:1] offset:32 ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_max_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; 
GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw volatile max ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst @@ -1458,6 +1854,22 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_max_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw volatile max ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst @@ -1511,6 +1923,22 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_atomic_smax_x2 v2, v[0:1], s[0:1] offset:32 ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_max_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 @@ -1575,6 +2003,23 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_max_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 @@ -1619,6 +2064,17 @@ define amdgpu_kernel void @atomic_max_i64(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_smax_x2 v2, v[0:1], s[0:1] ; GFX9-NEXT: 
s_endpgm +; +; GFX12-LABEL: atomic_max_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile max ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst ret void @@ -1673,6 +2129,22 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_max_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile max ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst store i64 %tmp0, ptr addrspace(1) %out2 @@ -1723,6 +2195,22 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_atomic_smax_x2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_max_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %tmp0 = atomicrmw volatile max ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst @@ -1784,6 +2272,23 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_max_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %tmp0 = atomicrmw volatile max ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst @@ 
-1823,6 +2328,17 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr addrspace(1) %out, i64 %in ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_umax_x2 v2, v[0:1], s[0:1] offset:32 ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw volatile umax ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst @@ -1878,6 +2394,22 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw volatile umax ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst @@ -1931,6 +2463,22 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_atomic_umax_x2 v2, v[0:1], s[0:1] offset:32 ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 @@ -1995,6 +2543,23 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: 
buffer_gl0_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 @@ -2039,6 +2604,17 @@ define amdgpu_kernel void @atomic_umax_i64(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_umax_x2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile umax ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst ret void @@ -2093,6 +2669,22 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr addrspace(1) %out, ptr addrsp ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile umax ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst store i64 %tmp0, ptr addrspace(1) %out2 @@ -2143,6 +2735,22 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr addrspace(1) %out, i64 %in ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_atomic_umax_x2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %tmp0 = atomicrmw volatile umax ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst @@ -2204,6 +2812,23 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: 
s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %tmp0 = atomicrmw volatile umax ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst @@ -2243,6 +2868,17 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_smin_x2 v2, v[0:1], s[0:1] offset:32 ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_min_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw volatile min ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst @@ -2298,6 +2934,22 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_min_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw volatile min ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst @@ -2351,6 +3003,22 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_atomic_smin_x2 v2, v[0:1], s[0:1] offset:32 ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_min_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 @@ -2415,6 +3083,23 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_min_i64_ret_addr64_offset: +; GFX12: ; 
%bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 @@ -2459,6 +3144,17 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_smin_x2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_min_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile min ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst ret void @@ -2513,6 +3209,22 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_min_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile min ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst store i64 %tmp0, ptr addrspace(1) %out2 @@ -2563,6 +3275,22 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_atomic_smin_x2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_min_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %tmp0 = atomicrmw volatile min ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst @@ -2624,6 +3352,23 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr 
; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_min_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %tmp0 = atomicrmw volatile min ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst @@ -2663,6 +3408,17 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr addrspace(1) %out, i64 %in ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_umin_x2 v2, v[0:1], s[0:1] offset:32 ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw volatile umin ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst @@ -2718,6 +3474,22 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw volatile umin ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst @@ -2771,6 +3543,22 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr addrspace(1) %out, ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_atomic_umin_x2 v2, v[0:1], s[0:1] offset:32 ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: 
s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 @@ -2835,6 +3623,23 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %o ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 @@ -2879,6 +3684,17 @@ define amdgpu_kernel void @atomic_umin_i64(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_umin_x2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile umin ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst ret void @@ -2933,6 +3749,22 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr addrspace(1) %out, ptr addrsp ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile umin ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst store i64 %tmp0, ptr addrspace(1) %out2 @@ -2983,6 +3815,22 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr addrspace(1) %out, i64 %in ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_atomic_umin_x2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %tmp0 = atomicrmw volatile umin ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst @@ -3044,6 +3892,23 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %tmp0 = atomicrmw volatile umin ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst @@ -3089,6 +3954,18 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw volatile or ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst @@ -3147,6 +4024,23 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr addrspace(1) %out, ptr a ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw volatile or ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst @@ -3206,6 +4100,23 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr addrspace(1) %out, i6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 @@ -3273,6 +4184,24 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 @@ -3323,6 +4252,18 @@ define amdgpu_kernel void @atomic_or_i64(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile or ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst ret void @@ -3380,6 +4321,23 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr addrspace(1) %out, ptr addrspac ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile or ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst store i64 %tmp0, ptr addrspace(1) %out2 @@ -3436,6 +4394,23 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr addrspace(1) 
%out, i64 %in, ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %tmp0 = atomicrmw volatile or ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst @@ -3500,6 +4475,24 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr a ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %tmp0 = atomicrmw volatile or ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst @@ -3545,6 +4538,18 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr addrspace(1) %out, i64 %in ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst @@ -3589,6 +4594,18 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr addrspace(1) %out, double ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_f64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr double, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, double %in 
syncscope("agent") seq_cst @@ -3633,6 +4650,18 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr addrspace(1) %out, ptr ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_pointer_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr ptr, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, ptr %in syncscope("agent") seq_cst @@ -3691,6 +4720,23 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst @@ -3750,6 +4796,23 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr addrspace(1) %out, ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 @@ -3817,6 +4880,24 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %o ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; 
GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 @@ -3867,6 +4948,18 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst ret void @@ -3924,6 +5017,23 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr addrspace(1) %out, ptr addrsp ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst store i64 %tmp0, ptr addrspace(1) %out2 @@ -3980,6 +5090,23 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr addrspace(1) %out, i64 %in ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst @@ -4044,6 +5171,24 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; 
GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst @@ -4089,6 +5234,18 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw volatile xor ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst @@ -4147,6 +5304,23 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw volatile xor ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst @@ -4206,6 +5380,23 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 @@ -4273,6 +5464,24 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: buffer_wbinvl1_vol 
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 @@ -4323,6 +5532,18 @@ define amdgpu_kernel void @atomic_xor_i64(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile xor ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst ret void @@ -4380,6 +5601,23 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile xor ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst store i64 %tmp0, ptr addrspace(1) %out2 @@ -4436,6 +5674,23 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 
%index %tmp0 = atomicrmw volatile xor ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst @@ -4500,6 +5755,24 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %tmp0 = atomicrmw volatile xor ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst @@ -4558,6 +5831,21 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr addrspace(1) %out, i64 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[4:5] offset:32 +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %val = cmpxchg volatile ptr addrspace(1) %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst @@ -4617,6 +5905,21 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr addrspace(1) %out, i64 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64_soffset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[4:5] offset:72000 +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 9000 %val = cmpxchg volatile ptr addrspace(1) %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst @@ -4678,6 +5981,22 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr addrspace(1) %out, ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: 
v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: global_store_b64 v4, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %val = cmpxchg volatile ptr addrspace(1) %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst @@ -4742,6 +6061,22 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] +; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[0:1] offset:32 +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 @@ -4818,6 +6153,27 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s9 +; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: global_store_b64 v4, v[0:1], s[6:7] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 @@ -4878,6 +6234,21 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr addrspace(1) %out, i64 %in, i6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[4:5] +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %val = cmpxchg volatile 
ptr addrspace(1) %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst ret void @@ -4938,6 +6309,22 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr addrspace(1) %out, ptr add ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: global_store_b64 v4, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %val = cmpxchg volatile ptr addrspace(1) %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst %extract0 = extractvalue { i64, i1 } %val, 0 @@ -4999,6 +6386,22 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] +; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[0:1] +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %val = cmpxchg volatile ptr addrspace(1) %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst @@ -5072,6 +6475,27 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out, ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s9 +; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: global_store_b64 v4, v[0:1], s[6:7] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %val = cmpxchg volatile ptr addrspace(1) %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst @@ -5125,6 +6549,20 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr addrspace(1) %in, ptr addr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: 
s_endpgm +; +; GFX12-LABEL: atomic_load_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %in, i64 4 %val = load atomic i64, ptr addrspace(1) %gep seq_cst, align 8 @@ -5179,6 +6617,20 @@ define amdgpu_kernel void @atomic_load_i64_neg_offset(ptr addrspace(1) %in, ptr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_i64_neg_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:-32 th:TH_LOAD_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %in, i64 -4 %val = load atomic i64, ptr addrspace(1) %gep seq_cst, align 8 @@ -5229,6 +6681,20 @@ define amdgpu_kernel void @atomic_load_i64(ptr addrspace(1) %in, ptr addrspace(1 ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] th:TH_LOAD_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %val = load atomic i64, ptr addrspace(1) %in syncscope("agent") seq_cst, align 8 store i64 %val, ptr addrspace(1) %out @@ -5292,6 +6758,25 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr addrspace(1) %in, p ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %in, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 @@ -5355,6 +6840,25 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr addrspace(1) %in, ptr addr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dwordx2 
v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] th:TH_LOAD_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %in, i64 %index %val = load atomic i64, ptr addrspace(1) %ptr seq_cst, align 8 @@ -5419,6 +6923,25 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr addrspace(1) %in, p ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_f64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr double, ptr addrspace(1) %in, i64 %index %gep = getelementptr double, ptr addrspace(1) %ptr, i64 4 @@ -5463,6 +6986,17 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr addrspace(1) %ou ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] offset:32 ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] offset:32 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 store atomic i64 %in, ptr addrspace(1) %gep seq_cst, align 8 @@ -5503,6 +7037,17 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr addrspace(1) %out) { ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: store atomic i64 %in, ptr addrspace(1) %out seq_cst, align 8 ret void @@ -5555,6 +7100,22 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr addrspace ; GFX9-NEXT: s_addc_u32 s1, s7, s1 ; GFX9-NEXT: 
global_store_dwordx2 v2, v[0:1], s[0:1] offset:32 ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1] +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 @@ -5607,6 +7168,22 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr addrspace(1) %ou ; GFX9-NEXT: s_addc_u32 s1, s7, s1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1] +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index store atomic i64 %in, ptr addrspace(1) %ptr seq_cst, align 8 @@ -5660,6 +7237,22 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr addrsp ; GFX9-NEXT: s_addc_u32 s1, s7, s1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] offset:32 ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_f64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1] +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr double, ptr addrspace(1) %out, i64 %index %gep = getelementptr double, ptr addrspace(1) %ptr, i64 4 @@ -5705,6 +7298,18 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i64 %in 
syncscope("agent") seq_cst @@ -5763,6 +7368,23 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst @@ -5822,6 +7444,23 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_i64_incr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 @@ -5867,6 +7506,18 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_dec_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst @@ -5925,6 +7576,23 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_dec_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: global_atomic_dec_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: 
buffer_gl1_inv +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst @@ -5984,6 +7652,23 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_dec_i64_decr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll index a046179636cd3..5c8845bca5141 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll @@ -1,11 +1,13 @@ -; RUN: llc < %s -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s -check-prefix=GCN -; RUN: llc < %s -march=amdgcn -mcpu=gfx1031 -verify-machineinstrs | FileCheck %s -check-prefix=GCN +; RUN: llc < %s -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,PREGFX12 +; RUN: llc < %s -march=amdgcn -mcpu=gfx1031 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,PREGFX12 +; RUN: llc < %s -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,GFX12PLUS declare i32 @llvm.amdgcn.buffer.atomic.csub(i32, <4 x i32>, i32, i32, i1) declare i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1), i32) ; GCN-LABEL: {{^}}buffer_atomic_csub_rtn: -; GCN: buffer_atomic_csub v0, v1, s[0:3], 0 idxen glc +; PREGFX12: buffer_atomic_csub v0, v1, s[0:3], 0 idxen glc +; GFX12PLUS: buffer_atomic_sub_clamp_u32 v0, v1, s[0:3], null idxen th:TH_ATOMIC_RETURN define amdgpu_ps void @buffer_atomic_csub_rtn(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) { main_body: %ret = call i32 @llvm.amdgcn.buffer.atomic.csub(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) @@ -13,7 +15,8 @@ main_body: } ; GCN-LABEL: {{^}}buffer_atomic_csub_no_rtn: -; GCN: buffer_atomic_csub v0, v1, s[0:3], 0 idxen +; PREGFX12: buffer_atomic_csub v0, v1, s[0:3], 0 idxen +; GFX12PLUS: buffer_atomic_csub_u32 v0, v1, s[0:3], null idxen define amdgpu_ps void @buffer_atomic_csub_no_rtn(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) #0 { main_body: %ret = call i32 @llvm.amdgcn.buffer.atomic.csub(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) @@ -21,7 +24,8 @@ main_body: } ; GCN-LABEL: {{^}}buffer_atomic_csub_off4_slc_rtn: -; GCN: buffer_atomic_csub v0, v1, s[0:3], 0 idxen offset:4 glc slc +; PREGFX12: buffer_atomic_csub v0, v1, s[0:3], 0 idxen offset:4 glc slc +; GFX12PLUS: buffer_atomic_sub_clamp_u32 v0, v1, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN define amdgpu_ps void 
@buffer_atomic_csub_off4_slc_rtn(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) { main_body: %ret = call i32 @llvm.amdgcn.buffer.atomic.csub(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1) @@ -29,7 +33,8 @@ main_body: } ; GCN-LABEL: {{^}}buffer_atomic_csub_off4_slc_no_rtn: -; GCN: buffer_atomic_csub v0, v1, s[0:3], 0 idxen offset:4 slc +; PREGFX12: buffer_atomic_csub v0, v1, s[0:3], 0 idxen offset:4 slc +; GFX12PLUS: buffer_atomic_csub_u32 v0, v1, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT define amdgpu_ps void @buffer_atomic_csub_off4_slc_no_rtn(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) #0 { main_body: %ret = call i32 @llvm.amdgcn.buffer.atomic.csub(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1) @@ -37,7 +42,8 @@ main_body: } ; GCN-LABEL: {{^}}global_atomic_csub_rtn: -; GCN: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9:]+}}, s{{\[[0-9]+:[0-9]+\]}} glc +; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9:]+}}, s{{\[[0-9]+:[0-9]+\]}} glc +; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v0, v1, s[0:1] th:TH_ATOMIC_RETURN define amdgpu_kernel void @global_atomic_csub_rtn(ptr addrspace(1) %ptr, i32 %data) { main_body: %ret = call i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1) %ptr, i32 %data) @@ -45,7 +51,8 @@ main_body: } ; GCN-LABEL: {{^}}global_atomic_csub_no_rtn: -; GCN: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v1, s[0:1] define amdgpu_kernel void @global_atomic_csub_no_rtn(ptr addrspace(1) %ptr, i32 %data) #0 { main_body: %ret = call i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1) %ptr, i32 %data) @@ -53,7 +60,8 @@ main_body: } ; GCN-LABEL: {{^}}global_atomic_csub_off4_rtn: -; GCN: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 glc +; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 glc +; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v0, v1, s[0:1] offset:4 th:TH_ATOMIC_RETURN define amdgpu_kernel void @global_atomic_csub_off4_rtn(ptr addrspace(1) %ptr, i32 %data) { main_body: %p = getelementptr i32, ptr addrspace(1) %ptr, i64 1 @@ -62,7 +70,8 @@ main_body: } ; GCN-LABEL: {{^}}global_atomic_csub_off4_no_rtn: -; GCN: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 +; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 +; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v1, s[0:1] offset:4 define amdgpu_kernel void @global_atomic_csub_off4_no_rtn(ptr addrspace(1) %ptr, i32 %data) #0 { main_body: %p = getelementptr i32, ptr addrspace(1) %ptr, i64 1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll index 5921fe1524720..047cb3ab40008 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll @@ -10,6 +10,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 
-verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12-CU %s define amdgpu_kernel void @private_nontemporal_load_0( ; GFX6-LABEL: private_nontemporal_load_0: @@ -179,6 +181,34 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_nontemporal_load_0: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_clause 0x1 +; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s2 th:TH_LOAD_NT_HT +; GFX12-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX12-WGP-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-WGP-NEXT: s_nop 0 +; GFX12-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_nontemporal_load_0: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_clause 0x1 +; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s2 th:TH_LOAD_NT_HT +; GFX12-CU-NEXT: s_waitcnt vmcnt(0) +; GFX12-CU-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-CU-NEXT: s_nop 0 +; GFX12-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-CU-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(1) %out) { entry: %val = load i32, ptr addrspace(5) %in, align 4, !nontemporal !0 @@ -361,6 +391,36 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_nontemporal_load_1: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_clause 0x1 +; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX12-WGP-NEXT: scratch_load_b32 v0, v0, off th:TH_LOAD_NT_HT +; GFX12-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX12-WGP-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-WGP-NEXT: s_nop 0 +; GFX12-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_nontemporal_load_1: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_clause 0x1 +; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX12-CU-NEXT: scratch_load_b32 v0, v0, off th:TH_LOAD_NT_HT +; GFX12-CU-NEXT: s_waitcnt vmcnt(0) +; GFX12-CU-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-CU-NEXT: s_nop 0 +; GFX12-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-CU-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(1) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -532,6 +592,26 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 glc slc dlc ; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_nontemporal_store_0: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-WGP-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX12-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s2 th:TH_STORE_NT_WB +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_nontemporal_store_0: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX12-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s2 th:TH_STORE_NT_WB +; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(5) %out) { entry: %val = load i32, ptr addrspace(1) %in, align 4 @@ -708,6 +788,28 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: scratch_store_b32 v0, v1, off glc slc dlc ; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_nontemporal_store_1: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX12-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: scratch_store_b32 v0, v1, off th:TH_STORE_NT_WB +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_nontemporal_store_1: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX12-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-CU-NEXT: scratch_store_b32 v0, v1, off th:TH_STORE_NT_WB +; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(5) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll index 5b4a1059aee04..4b1fb295adec2 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll @@ -6,6 +6,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12-CU %s define amdgpu_kernel void @private_volatile_load_0( ; GFX6-LABEL: private_volatile_load_0: @@ -123,6 +125,34 @@ define amdgpu_kernel void @private_volatile_load_0( ; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_volatile_load_0: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_clause 0x1 +; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s2 th:TH_LOAD_RT_NT +; GFX12-WGP-NEXT: s_waitcnt vmcnt(0) +; 
GFX12-WGP-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-WGP-NEXT: s_nop 0 +; GFX12-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_volatile_load_0: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_clause 0x1 +; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s2 th:TH_LOAD_RT_NT +; GFX12-CU-NEXT: s_waitcnt vmcnt(0) +; GFX12-CU-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-CU-NEXT: s_nop 0 +; GFX12-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-CU-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(1) %out) { entry: %val = load volatile i32, ptr addrspace(5) %in, align 4 @@ -251,6 +281,36 @@ define amdgpu_kernel void @private_volatile_load_1( ; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_volatile_load_1: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_clause 0x1 +; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX12-WGP-NEXT: scratch_load_b32 v0, v0, off th:TH_LOAD_RT_NT +; GFX12-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX12-WGP-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-WGP-NEXT: s_nop 0 +; GFX12-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_volatile_load_1: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_clause 0x1 +; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX12-CU-NEXT: scratch_load_b32 v0, v0, off th:TH_LOAD_RT_NT +; GFX12-CU-NEXT: s_waitcnt vmcnt(0) +; GFX12-CU-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-CU-NEXT: s_nop 0 +; GFX12-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-CU-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(1) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -377,6 +437,28 @@ define amdgpu_kernel void @private_volatile_store_0( ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_volatile_store_0: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX12-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s2 th:TH_STORE_NT_RT +; GFX12-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_volatile_store_0: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX12-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s2 th:TH_STORE_NT_RT +; GFX12-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(5) %out) { entry: %val = load i32, ptr addrspace(1) %in, align 4 @@ -506,6 +588,30 @@ define amdgpu_kernel void @private_volatile_store_1( ; GFX11-CU-NEXT: scratch_store_b32 v0, v1, off dlc 
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_volatile_store_1: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX12-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: scratch_store_b32 v0, v1, off th:TH_STORE_NT_RT +; GFX12-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_volatile_store_1: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX12-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-CU-NEXT: scratch_store_b32 v0, v1, off th:TH_STORE_NT_RT +; GFX12-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(5) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll index dcaf7664d5b5a..5a8814e7b20fa 100644 --- a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll +++ b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll @@ -2,9 +2,11 @@ ; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s ; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s ; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s ; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s ; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s ; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s ; Test splitting flat instruction offsets into the low and high bits ; when the offset doesn't fit in the offset field. 
@@ -32,6 +34,13 @@ define i8 @flat_inst_valu_offset_1(ptr %p) {
; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_inst_valu_offset_1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:1
+; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 1
%load = load i8, ptr %gep, align 4
ret i8 %load
@@ -60,6 +69,13 @@ define i8 @flat_inst_valu_offset_11bit_max(ptr %p) {
; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:2047
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_inst_valu_offset_11bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:2047
+; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 2047
%load = load i8, ptr %gep, align 4
ret i8 %load
@@ -88,6 +104,13 @@ define i8 @flat_inst_valu_offset_12bit_max(ptr %p) {
; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:4095
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_inst_valu_offset_12bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:4095
+; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 4095
%load = load i8, ptr %gep, align 4
ret i8 %load
@@ -121,6 +144,13 @@ define i8 @flat_inst_valu_offset_13bit_max(ptr %p) {
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX12-LABEL: flat_inst_valu_offset_13bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:8191
+; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_13bit_max:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -143,6 +173,63 @@ define i8 @flat_inst_valu_offset_13bit_max(ptr %p) {
ret i8 %load
}
+define i8 @flat_inst_valu_offset_24bit_max(ptr %p) {
+; GFX9-SDAG-LABEL: flat_inst_valu_offset_24bit_max:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x7ff000, v0
+; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] offset:4095
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_inst_valu_offset_24bit_max:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fffff, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: flat_inst_valu_offset_24bit_max:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff000, v0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_inst_valu_offset_24bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:8388607
+; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: flat_inst_valu_offset_24bit_max:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fffff, v0
+; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: flat_inst_valu_offset_24bit_max:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fffff, v0
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr i8, ptr %p, i64 8388607
+ %load = load i8, ptr %gep, align 4
+ ret i8 %load
+}
+
define i8 @flat_inst_valu_offset_neg_11bit_max(ptr %p) {
; GFX9-LABEL: flat_inst_valu_offset_neg_11bit_max:
; GFX9: ; %bb.0:
@@ -170,6 +257,13 @@ define i8 @flat_inst_valu_offset_neg_11bit_max(ptr %p) {
; GFX11-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_inst_valu_offset_neg_11bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-2048
+; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -2048
%load = load i8, ptr %gep, align 4
ret i8 %load
@@ -202,6 +296,13 @@ define i8 @flat_inst_valu_offset_neg_12bit_max(ptr %p) {
; GFX11-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_inst_valu_offset_neg_12bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-4096
+; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -4096
%load = load i8, ptr %gep, align 4
ret i8 %load
@@ -234,11 +335,58 @@ define i8 @flat_inst_valu_offset_neg_13bit_max(ptr %p) {
; GFX11-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_inst_valu_offset_neg_13bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8192
+; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -8192
%load = load i8, ptr %gep, align 4
ret i8 %load
}
+define i8 @flat_inst_valu_offset_neg_24bit_max(ptr %p) {
+; GFX9-LABEL: flat_inst_valu_offset_neg_24bit_max:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xff800000, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_inst_valu_offset_neg_24bit_max:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xff800000, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_inst_valu_offset_neg_24bit_max:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xff800000, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: flat_load_u8 v0, v[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_inst_valu_offset_neg_24bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8388608
+; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr i8, ptr %p, i64 -8388608
+ %load = load i8, ptr %gep, align 4
+ ret i8 %load
+}
+
+
define i8 @flat_inst_valu_offset_2x_11bit_max(ptr %p) {
; GFX9-LABEL: flat_inst_valu_offset_2x_11bit_max:
; GFX9: ; %bb.0:
@@ -262,6 +410,13 @@ define i8 @flat_inst_valu_offset_2x_11bit_max(ptr %p) {
; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:4095
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_inst_valu_offset_2x_11bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:4095
+; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 4095
%load = load i8, ptr %gep, align 4
ret i8 %load
@@ -295,6 +450,13 @@ define i8 @flat_inst_valu_offset_2x_12bit_max(ptr %p) {
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX12-LABEL: flat_inst_valu_offset_2x_12bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:8191
+; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_12bit_max:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -345,6 +507,13 @@ define i8 @flat_inst_valu_offset_2x_13bit_max(ptr %p) {
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX12-LABEL: flat_inst_valu_offset_2x_13bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:16383
+; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_13bit_max:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -367,6 +536,74 @@ define i8 @flat_inst_valu_offset_2x_13bit_max(ptr %p) {
ret i8 %load
}
+define i8 @flat_inst_valu_offset_2x_24bit_max(ptr %p) {
+; GFX9-SDAG-LABEL: flat_inst_valu_offset_2x_24bit_max:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfff000, v0
+; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] offset:4094
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_inst_valu_offset_2x_24bit_max:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffffe, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: flat_inst_valu_offset_2x_24bit_max:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff000, v0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4094
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_2x_24bit_max:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8388606
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_24bit_max:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffe, v0
+; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: flat_inst_valu_offset_2x_24bit_max:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffffe, v0
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_2x_24bit_max:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffffe, v0
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr i8, ptr %p, i64 16777214
+ %load = load i8, ptr %gep, align 4
+ ret i8 %load
+}
+
define i8 @flat_inst_valu_offset_2x_neg_11bit_max(ptr %p) {
; GFX9-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
; GFX9: ; %bb.0:
@@ -394,6 +631,13 @@ define i8 @flat_inst_valu_offset_2x_neg_11bit_max(ptr %p) {
; GFX11-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-4096
+; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -4096
%load = load i8, ptr %gep, align 4
ret i8 %load
@@ -426,6 +670,13 @@ define i8 @flat_inst_valu_offset_2x_neg_12bit_max(ptr %p) {
; GFX11-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8192
+; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -8192
%load = load i8, ptr %gep, align 4
ret i8 %load
@@ -458,11 +709,68 @@ define i8 @flat_inst_valu_offset_2x_neg_13bit_max(ptr %p) {
; GFX11-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-16384
+; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -16384
%load = load i8, ptr %gep, align 4
ret i8 %load
}
+define i8 @flat_inst_valu_offset_2x_neg_24bit_max(ptr %p) {
+; GFX9-LABEL: flat_inst_valu_offset_2x_neg_24bit_max:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xff000001, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_inst_valu_offset_2x_neg_24bit_max:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xff000001, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_inst_valu_offset_2x_neg_24bit_max:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xff000001, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: flat_load_u8 v0, v[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_2x_neg_24bit_max:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xff800000, v0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8388607
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_2x_neg_24bit_max:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xff000001, v0
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr i8, ptr %p, i64 -16777215
+ %load = load i8, ptr %gep, align 4
+ ret i8 %load
+}
+
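The 64-bit cases that follow pin down how the offset is split around the immediate field. For (1ull << 33) | 2047 = 8589936639, the GFX12 SDAG checks add 2 to the high dword of the base and keep 2047 in the instruction offset, while the GISel checks still materialize the full constant and do a 64-bit add. A companion decomposition in the same spirit, with a hypothetical constant that is not part of the patch:

  define i8 @flat_inst_valu_offset_64bit_hi34_low12(ptr %p) {
    ; 17179873279 = (1 << 34) | 4095: the low 12 bits fit the
    ; immediate, the 1 << 34 part can only live in the 64-bit base
    %gep = getelementptr i8, ptr %p, i64 17179873279
    %load = load i8, ptr %gep, align 4
    ret i8 %load
  }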
; Fill 11-bit low-bits (1ull << 33) | 2047
define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
@@ -492,6 +800,15 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) {
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2047
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -531,6 +848,20 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) {
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x7ff
+; GFX12-GISEL-NEXT: s_mov_b32 s1, 2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 8589936639
%load = load i8, ptr %gep, align 4
ret i8 %load
@@ -565,6 +896,15 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split1(ptr %p) {
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2048
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -604,6 +944,20 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split1(ptr %p) {
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x800
+; GFX12-GISEL-NEXT: s_mov_b32 s1, 2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 8589936640
%load = load i8, ptr %gep, align 4
ret i8 %load
@@ -638,6 +992,15 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) {
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -677,6 +1040,20 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) {
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_movk_i32 s0, 0xfff
+; GFX12-GISEL-NEXT: s_mov_b32 s1, 2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 8589938687
%load = load i8, ptr %gep, align 4
ret i8 %load
@@ -711,6 +1088,15 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split1(ptr %p) {
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4096
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -750,6 +1136,20 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split1(ptr %p) {
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x1000
+; GFX12-GISEL-NEXT: s_mov_b32 s1, 2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 8589938688
%load = load i8, ptr %gep, align 4
ret i8 %load
@@ -784,6 +1184,15 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) {
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8191
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -823,6 +1232,20 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) {
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x1fff
+; GFX12-GISEL-NEXT: s_mov_b32 s1, 2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 8589942783
%load = load i8, ptr %gep, align 4
ret i8 %load
@@ -857,6 +1280,15 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split1(ptr %p) {
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8192
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -896,6 +1328,20 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split1(ptr %p) {
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x2000
+; GFX12-GISEL-NEXT: s_mov_b32 s1, 2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 8589942784
%load = load i8, ptr %gep, align 4
ret i8 %load
@@ -931,6 +1377,15 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) {
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8386561
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -970,6 +1425,20 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) {
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x7ff
+; GFX12-GISEL-NEXT: s_brev_b32 s1, 1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -9223372036854773761
%load = load i8, ptr %gep, align 4
ret i8 %load
@@ -1005,6 +1474,15 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) {
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8386560
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1044,6 +1522,20 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) {
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x800
+; GFX12-GISEL-NEXT: s_brev_b32 s1, 1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -9223372036854773760
%load = load i8, ptr %gep, align 4
ret i8 %load
@@ -1079,6 +1571,15 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) {
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8384513
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1118,6 +1619,20 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) {
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_movk_i32 s0, 0xfff
+; GFX12-GISEL-NEXT: s_brev_b32 s1, 1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -9223372036854771713
%load = load i8, ptr %gep, align 4
ret i8 %load
@@ -1153,6 +1668,15 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) {
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8384512
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1192,6 +1716,20 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) {
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x1000
+; GFX12-GISEL-NEXT: s_brev_b32 s1, 1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -9223372036854771712
%load = load i8, ptr %gep, align 4
ret i8 %load
@@ -1227,6 +1765,15 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) {
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8380417
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1266,6 +1813,20 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) {
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x1fff
+; GFX12-GISEL-NEXT: s_brev_b32 s1, 1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -9223372036854767617
%load = load i8, ptr %gep, align 4
ret i8 %load
@@ -1301,6 +1862,15 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) {
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8380416
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1340,6 +1910,20 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) {
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x2000
+; GFX12-GISEL-NEXT: s_brev_b32 s1, 1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -9223372036854767616
%load = load i8, ptr %gep, align 4
ret i8 %load
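The remaining tests drive the same offset ladder through a scalar base: the pointer arrives in SGPRs as a kernel argument, and the loads are volatile so they survive to codegen (picking up th:TH_LOAD_RT_NT on GFX12). The common shape, condensed from the functions that follow (name hypothetical):

  define amdgpu_kernel void @flat_salu_offset_sketch(ptr %p) {
    ; the SGPR pair holding %p is copied into VGPRs (v_dual_mov_b32 on
    ; GFX12) because flat loads take a VGPR address; the immediate
    ; offset rules match the VALU tests above
    %gep = getelementptr i8, ptr %p, i64 8191
    %load = load volatile i8, ptr %gep, align 1
    store i8 %load, ptr undef
    ret void
  }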
@@ -1379,6 +1963,16 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) {
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_store_b8 v[0:1], v0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: flat_inst_salu_offset_1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:1 th:TH_LOAD_RT_NT
+; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-NEXT: s_endpgm
%gep = getelementptr i8, ptr %p, i64 1
%load = load volatile i8, ptr %gep, align 1
store i8 %load, ptr undef
@@ -1419,6 +2013,16 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) {
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_store_b8 v[0:1], v0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: flat_inst_salu_offset_11bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:2047 th:TH_LOAD_RT_NT
+; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-NEXT: s_endpgm
%gep = getelementptr i8, ptr %p, i64 2047
%load = load volatile i8, ptr %gep, align 1
store i8 %load, ptr undef
@@ -1459,6 +2063,16 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) {
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_store_b8 v[0:1], v0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: flat_inst_salu_offset_12bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:4095 th:TH_LOAD_RT_NT
+; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-NEXT: s_endpgm
%gep = getelementptr i8, ptr %p, i64 4095
%load = load volatile i8, ptr %gep, align 1
store i8 %load, ptr undef
@@ -1504,6 +2118,16 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) {
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
; GFX11-SDAG-NEXT: s_endpgm
;
+; GFX12-LABEL: flat_inst_salu_offset_13bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:8191 th:TH_LOAD_RT_NT
+; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-NEXT: s_endpgm
+;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_13bit_max:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1574,6 +2198,16 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) {
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
; GFX11-SDAG-NEXT: s_endpgm
;
+; GFX12-LABEL: flat_inst_salu_offset_neg_11bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-2048 th:TH_LOAD_RT_NT
+; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-NEXT: s_endpgm
+;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_neg_11bit_max:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1644,6 +2278,16 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) {
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
; GFX11-SDAG-NEXT: s_endpgm
;
+; GFX12-LABEL: flat_inst_salu_offset_neg_12bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-4096 th:TH_LOAD_RT_NT
+; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-NEXT: s_endpgm
+;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_neg_12bit_max:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1714,6 +2358,16 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) {
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
; GFX11-SDAG-NEXT: s_endpgm
;
+; GFX12-LABEL: flat_inst_salu_offset_neg_13bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8192 th:TH_LOAD_RT_NT
+; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-NEXT: s_endpgm
+;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_neg_13bit_max:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1779,6 +2433,16 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) {
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_store_b8 v[0:1], v0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: flat_inst_salu_offset_2x_11bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:4095 th:TH_LOAD_RT_NT
+; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-NEXT: s_endpgm
%gep = getelementptr i8, ptr %p, i64 4095
%load = load volatile i8, ptr %gep, align 1
store i8 %load, ptr undef
@@ -1824,6 +2488,16 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) {
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
; GFX11-SDAG-NEXT: s_endpgm
;
+; GFX12-LABEL: flat_inst_salu_offset_2x_12bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:8191 th:TH_LOAD_RT_NT
+; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-NEXT: s_endpgm
+;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_12bit_max:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1894,6 +2568,16 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) {
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
; GFX11-SDAG-NEXT: s_endpgm
;
+; GFX12-LABEL: flat_inst_salu_offset_2x_13bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:16383 th:TH_LOAD_RT_NT
+; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-NEXT: s_endpgm
+;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_13bit_max:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1964,6 +2648,16 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) {
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
; GFX11-SDAG-NEXT: s_endpgm
;
+; GFX12-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-4096 th:TH_LOAD_RT_NT
+; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-NEXT: s_endpgm
+;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2034,6 +2728,16 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) {
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
; GFX11-SDAG-NEXT: s_endpgm
;
+; GFX12-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8192 th:TH_LOAD_RT_NT
+; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-NEXT: s_endpgm
+;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2104,6 +2808,16 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) {
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
; GFX11-SDAG-NEXT: s_endpgm
;
+; GFX12-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-16384 th:TH_LOAD_RT_NT
+; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-NEXT: s_endpgm
+;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2174,6 +2888,18 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) {
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
; GFX11-SDAG-NEXT: s_endpgm
;
+; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2047 th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-SDAG-NEXT: s_endpgm
+;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2199,6 +2925,19 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) {
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: flat_store_b8 v[0:1], v0
; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x7ff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-GISEL-NEXT: s_endpgm
%gep = getelementptr i8, ptr %p, i64 8589936639
%load = load volatile i8, ptr %gep, align 1
store i8 %load, ptr undef
@@ -2244,6 +2983,18 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) {
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
; GFX11-SDAG-NEXT: s_endpgm
;
+; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2048 th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-SDAG-NEXT: s_endpgm
+;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2269,6 +3020,19 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) {
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: flat_store_b8 v[0:1], v0
; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-GISEL-NEXT: s_endpgm
%gep = getelementptr i8, ptr %p, i64 8589936640
%load = load volatile i8, ptr %gep, align 1
store i8 %load, ptr undef
@@ -2314,6 +3078,18 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) {
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
; GFX11-SDAG-NEXT: s_endpgm
;
+; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095 th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-SDAG-NEXT: s_endpgm
+;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2339,6 +3115,19 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) {
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: flat_store_b8 v[0:1], v0
; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-GISEL-NEXT: s_endpgm
%gep = getelementptr i8, ptr %p, i64 8589938687
%load = load volatile i8, ptr %gep, align 1
store i8 %load, ptr undef
@@ -2385,6 +3174,18 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) {
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
; GFX11-SDAG-NEXT: s_endpgm
;
+; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4096 th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-SDAG-NEXT: s_endpgm
+;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2410,6 +3211,19 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) {
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: flat_store_b8 v[0:1], v0
; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1000
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-GISEL-NEXT: s_endpgm
%gep = getelementptr i8, ptr %p, i64 8589938688
%load = load volatile i8, ptr %gep, align 1
store i8 %load, ptr undef
@@ -2456,6 +3270,18 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) {
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
; GFX11-SDAG-NEXT: s_endpgm
;
+; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8191 th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-SDAG-NEXT: s_endpgm
+;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2481,6 +3307,19 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) {
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: flat_store_b8 v[0:1], v0
; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1fff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-GISEL-NEXT: s_endpgm
%gep = getelementptr i8, ptr %p, i64 8589942783
%load = load volatile i8, ptr %gep, align 1
store i8 %load, ptr undef
@@ -2527,6 +3366,18 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) {
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
; GFX11-SDAG-NEXT: s_endpgm
;
+; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8192 th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-SDAG-NEXT: s_endpgm
+;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2552,6 +3403,19 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) {
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: flat_store_b8 v[0:1], v0
; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x2000
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-GISEL-NEXT: s_endpgm
%gep = getelementptr i8, ptr %p, i64 8589942784
%load = load volatile i8, ptr %gep, align 1
store i8 %load, ptr undef
@@ -2600,6 +3464,19 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
; GFX11-SDAG-NEXT: s_endpgm
;
+; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8386561 th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-SDAG-NEXT: s_endpgm
+;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2625,6 +3502,19 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: flat_store_b8 v[0:1], v0
; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x7ff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-GISEL-NEXT: s_endpgm
%gep = getelementptr i8, ptr %p, i64 -9223372036854773761
%load = load volatile i8, ptr %gep, align 1
store i8 %load, ptr undef
@@ -2673,6 +3563,19 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
; GFX11-SDAG-NEXT: s_endpgm
;
+; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8386560 th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-SDAG-NEXT: s_endpgm
+;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2698,6 +3601,19 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: flat_store_b8 v[0:1], v0
; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-GISEL-NEXT: s_endpgm
%gep = getelementptr i8, ptr %p, i64 -9223372036854773760
%load = load volatile i8, ptr %gep, align 1
store i8 %load, ptr undef
@@ -2746,6 +3662,19 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
; GFX11-SDAG-NEXT: s_endpgm
;
+; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8384513 th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-SDAG-NEXT: s_endpgm
+;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2771,6 +3700,19 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: flat_store_b8 v[0:1], v0
; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT: s_waitcnt
vmcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 +; GFX12-GISEL-NEXT: s_endpgm %gep = getelementptr i8, ptr %p, i64 -9223372036854771713 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr undef @@ -2819,6 +3761,19 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr ; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0 ; GFX11-SDAG-NEXT: s_endpgm ; +; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8384512 th:TH_LOAD_RT_NT +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0 +; GFX12-SDAG-NEXT: s_endpgm +; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 @@ -2844,6 +3799,19 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1000 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 +; GFX12-GISEL-NEXT: s_endpgm %gep = getelementptr i8, ptr %p, i64 -9223372036854771712 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr undef @@ -2892,6 +3860,19 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr ; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0 ; GFX11-SDAG-NEXT: s_endpgm ; +; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8380417 th:TH_LOAD_RT_NT +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0 +; GFX12-SDAG-NEXT: s_endpgm +; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 @@ -2917,6 +3898,19 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 
0x1fff +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 +; GFX12-GISEL-NEXT: s_endpgm %gep = getelementptr i8, ptr %p, i64 -9223372036854767617 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr undef @@ -2965,6 +3959,19 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr ; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0 ; GFX11-SDAG-NEXT: s_endpgm ; +; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8380416 th:TH_LOAD_RT_NT +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0 +; GFX12-SDAG-NEXT: s_endpgm +; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 @@ -2990,6 +3997,19 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x2000 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 +; GFX12-GISEL-NEXT: s_endpgm %gep = getelementptr i8, ptr %p, i64 -9223372036854767616 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr undef diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll index 137c83a0fd80c..88be7e54ced10 100644 --- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll +++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll @@ -2,9 +2,11 @@ ; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s ; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s ; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s ; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s ; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s ; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | 
FileCheck -check-prefixes=GFX11,GFX11-SDAG %s +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s ; Test splitting flat instruction offsets into the low and high bits ; when the offset doesn't fit in the offset field. @@ -30,6 +32,13 @@ define i8 @global_inst_valu_offset_1(ptr addrspace(1) %p) { ; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: global_inst_valu_offset_1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:1 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 1 %load = load i8, ptr addrspace(1) %gep, align 4 ret i8 %load @@ -56,6 +65,13 @@ define i8 @global_inst_valu_offset_11bit_max(ptr addrspace(1) %p) { ; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:2047 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: global_inst_valu_offset_11bit_max: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:2047 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 2047 %load = load i8, ptr addrspace(1) %gep, align 4 ret i8 %load @@ -85,6 +101,13 @@ define i8 @global_inst_valu_offset_12bit_max(ptr addrspace(1) %p) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: global_inst_valu_offset_12bit_max: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:4095 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SDAG-LABEL: global_inst_valu_offset_12bit_max: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -126,6 +149,13 @@ define i8 @global_inst_valu_offset_13bit_max(ptr addrspace(1) %p) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: global_inst_valu_offset_13bit_max: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:8191 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-SDAG-LABEL: global_inst_valu_offset_13bit_max: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -157,6 +187,72 @@ define i8 @global_inst_valu_offset_13bit_max(ptr addrspace(1) %p) { ret i8 %load } +define i8 @global_inst_valu_offset_24bit_max(ptr addrspace(1) %p) { +; GFX9-GISEL-LABEL: global_inst_valu_offset_24bit_max: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fffff, v0 +; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: global_inst_valu_offset_24bit_max: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fffff, v0 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: 
global_inst_valu_offset_24bit_max: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fffff, v0 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: global_inst_valu_offset_24bit_max: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:8388607 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: global_inst_valu_offset_24bit_max: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x7ff000, v0 +; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-SDAG-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: global_inst_valu_offset_24bit_max: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff800, v0 +; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: global_inst_valu_offset_24bit_max: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff000, v0 +; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095 +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i8, ptr addrspace(1) %p, i64 8388607 + %load = load i8, ptr addrspace(1) %gep, align 4 + ret i8 %load +} + define i8 @global_inst_valu_offset_neg_11bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_valu_offset_neg_11bit_max: ; GFX9: ; %bb.0: @@ -178,6 +274,13 @@ define i8 @global_inst_valu_offset_neg_11bit_max(ptr addrspace(1) %p) { ; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-2048 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: global_inst_valu_offset_neg_11bit_max: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:-2048 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -2048 %load = load i8, ptr addrspace(1) %gep, align 4 ret i8 %load @@ -206,6 +309,13 @@ define i8 @global_inst_valu_offset_neg_12bit_max(ptr addrspace(1) %p) { ; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-4096 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: global_inst_valu_offset_neg_12bit_max: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:-4096 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -4096 %load = load i8, ptr addrspace(1) %gep, align 4 ret i8 %load @@ -238,11 +348,57 @@ define i8 @global_inst_valu_offset_neg_13bit_max(ptr addrspace(1) %p) { ; GFX11-NEXT: global_load_u8 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: 
global_inst_valu_offset_neg_13bit_max: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:-8192 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -8192 %load = load i8, ptr addrspace(1) %gep, align 4 ret i8 %load } +define i8 @global_inst_valu_offset_neg_24bit_max(ptr addrspace(1) %p) { +; GFX9-LABEL: global_inst_valu_offset_neg_24bit_max: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xff800000, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_inst_valu_offset_neg_24bit_max: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xff800000, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_inst_valu_offset_neg_24bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xff800000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: global_load_u8 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: global_inst_valu_offset_neg_24bit_max: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:-8388608 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i8, ptr addrspace(1) %p, i64 -8388608 + %load = load i8, ptr addrspace(1) %gep, align 4 + ret i8 %load +} + define i8 @global_inst_valu_offset_2x_11bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_valu_offset_2x_11bit_max: ; GFX9: ; %bb.0: @@ -267,6 +423,13 @@ define i8 @global_inst_valu_offset_2x_11bit_max(ptr addrspace(1) %p) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: global_inst_valu_offset_2x_11bit_max: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:4095 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SDAG-LABEL: global_inst_valu_offset_2x_11bit_max: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -308,6 +471,13 @@ define i8 @global_inst_valu_offset_2x_12bit_max(ptr addrspace(1) %p) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: global_inst_valu_offset_2x_12bit_max: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:8191 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-SDAG-LABEL: global_inst_valu_offset_2x_12bit_max: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -367,6 +537,13 @@ define i8 @global_inst_valu_offset_2x_13bit_max(ptr addrspace(1) %p) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: global_inst_valu_offset_2x_13bit_max: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:16383 +; 
GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-SDAG-LABEL: global_inst_valu_offset_2x_13bit_max: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -398,6 +575,83 @@ define i8 @global_inst_valu_offset_2x_13bit_max(ptr addrspace(1) %p) { ret i8 %load } +define i8 @global_inst_valu_offset_2x_24bit_max(ptr addrspace(1) %p) { +; GFX9-GISEL-LABEL: global_inst_valu_offset_2x_24bit_max: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffe, v0 +; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: global_inst_valu_offset_2x_24bit_max: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffffe, v0 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: global_inst_valu_offset_2x_24bit_max: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffffe, v0 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: global_inst_valu_offset_2x_24bit_max: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffffe, v0 +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: global_inst_valu_offset_2x_24bit_max: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfff000, v0 +; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-SDAG-NEXT: global_load_ubyte v0, v[0:1], off offset:4094 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: global_inst_valu_offset_2x_24bit_max: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff800, v0 +; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off offset:2046 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: global_inst_valu_offset_2x_24bit_max: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff000, v0 +; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4094 +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: global_inst_valu_offset_2x_24bit_max: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-SDAG-NEXT: global_load_u8 v0, 
v[0:1], off offset:8388606 +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i8, ptr addrspace(1) %p, i64 16777214 + %load = load i8, ptr addrspace(1) %gep, align 4 + ret i8 %load +} + define i8 @global_inst_valu_offset_2x_neg_11bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_valu_offset_2x_neg_11bit_max: ; GFX9: ; %bb.0: @@ -421,6 +675,13 @@ define i8 @global_inst_valu_offset_2x_neg_11bit_max(ptr addrspace(1) %p) { ; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-4096 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: global_inst_valu_offset_2x_neg_11bit_max: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:-4096 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -4096 %load = load i8, ptr addrspace(1) %gep, align 4 ret i8 %load @@ -453,6 +714,13 @@ define i8 @global_inst_valu_offset_2x_neg_12bit_max(ptr addrspace(1) %p) { ; GFX11-NEXT: global_load_u8 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: global_inst_valu_offset_2x_neg_12bit_max: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:-8192 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -8192 %load = load i8, ptr addrspace(1) %gep, align 4 ret i8 %load @@ -485,11 +753,96 @@ define i8 @global_inst_valu_offset_2x_neg_13bit_max(ptr addrspace(1) %p) { ; GFX11-NEXT: global_load_u8 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: global_inst_valu_offset_2x_neg_13bit_max: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:-16384 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -16384 %load = load i8, ptr addrspace(1) %gep, align 4 ret i8 %load } +define i8 @global_inst_valu_offset_2x_neg_24bit_max(ptr addrspace(1) %p) { +; GFX9-GISEL-LABEL: global_inst_valu_offset_2x_neg_24bit_max: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xff000001, v0 +; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: global_inst_valu_offset_2x_neg_24bit_max: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xff000001, v0 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: global_inst_valu_offset_2x_neg_24bit_max: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xff000001, v0 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: global_inst_valu_offset_2x_neg_24bit_max: 
+; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xff000001, v0 +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: global_inst_valu_offset_2x_neg_24bit_max: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xff001000, v0 +; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-SDAG-NEXT: global_load_ubyte v0, v[0:1], off offset:-4095 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: global_inst_valu_offset_2x_neg_24bit_max: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xff000800, v0 +; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off offset:-2047 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: global_inst_valu_offset_2x_neg_24bit_max: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xff001000, v0 +; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-4095 +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: global_inst_valu_offset_2x_neg_24bit_max: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xff800000, v0 +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8388607 +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i8, ptr addrspace(1) %p, i64 -16777215 + %load = load i8, ptr addrspace(1) %gep, align 4 + ret i8 %load +} + + ; Fill 11-bit low-bits (1ull << 33) | 2047 define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) { ; GFX9-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_split0: @@ -532,6 +885,20 @@ define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_split0: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x7ff +; GFX12-GISEL-NEXT: s_mov_b32 s1, 2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split0: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -558,6 +925,15 @@ define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) { ; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2047 ; GFX11-SDAG-NEXT: 
s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split0: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo +; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2047 +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936639 %load = load i8, ptr addrspace(1) %gep, align 4 ret i8 %load @@ -605,6 +981,20 @@ define i8 @global_inst_valu_offset_64bit_11bit_split1(ptr addrspace(1) %p) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_split1: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x800 +; GFX12-GISEL-NEXT: s_mov_b32 s1, 2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split1: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -631,6 +1021,15 @@ define i8 @global_inst_valu_offset_64bit_11bit_split1(ptr addrspace(1) %p) { ; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2048 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split1: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo +; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2048 +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936640 %load = load i8, ptr addrspace(1) %gep, align 4 ret i8 %load @@ -678,6 +1077,20 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_split0: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_movk_i32 s0, 0xfff +; GFX12-GISEL-NEXT: s_mov_b32 s1, 2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split0: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -704,6 +1117,15 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) { ; GFX11-SDAG-NEXT: global_load_u8 
v0, v[0:1], off offset:4095 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split0: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo +; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095 +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938687 %load = load i8, ptr addrspace(1) %gep, align 4 ret i8 %load @@ -751,6 +1173,20 @@ define i8 @global_inst_valu_offset_64bit_12bit_split1(ptr addrspace(1) %p) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_split1: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x1000 +; GFX12-GISEL-NEXT: s_mov_b32 s1, 2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split1: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -777,6 +1213,15 @@ define i8 @global_inst_valu_offset_64bit_12bit_split1(ptr addrspace(1) %p) { ; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split1: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo +; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4096 +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938688 %load = load i8, ptr addrspace(1) %gep, align 4 ret i8 %load @@ -824,6 +1269,20 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_split0: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x1fff +; GFX12-GISEL-NEXT: s_mov_b32 s1, 2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split0: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -850,6 +1309,15 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) 
%p) { ; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split0: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo +; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8191 +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942783 %load = load i8, ptr addrspace(1) %gep, align 4 ret i8 %load @@ -897,6 +1365,20 @@ define i8 @global_inst_valu_offset_64bit_13bit_split1(ptr addrspace(1) %p) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_split1: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x2000 +; GFX12-GISEL-NEXT: s_mov_b32 s1, 2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split1: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -923,6 +1405,15 @@ define i8 @global_inst_valu_offset_64bit_13bit_split1(ptr addrspace(1) %p) { ; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split1: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo +; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8192 +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942784 %load = load i8, ptr addrspace(1) %gep, align 4 ret i8 %load @@ -970,6 +1461,20 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1) ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x7ff +; GFX12-GISEL-NEXT: s_brev_b32 s1, 1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -997,6 +1502,15 @@ define i8 
@global_inst_valu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1) ; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-2049 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8386561 +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773761 %load = load i8, ptr addrspace(1) %gep, align 4 ret i8 %load @@ -1044,6 +1558,20 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1) ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x800 +; GFX12-GISEL-NEXT: s_brev_b32 s1, 1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1071,6 +1599,15 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1) ; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-2048 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8386560 +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773760 %load = load i8, ptr addrspace(1) %gep, align 4 ret i8 %load @@ -1118,6 +1655,20 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1) ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_movk_i32 s0, 0xfff +; GFX12-GISEL-NEXT: s_brev_b32 s1, 1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; 
GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1145,6 +1696,15 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1) ; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-1 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8384513 +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771713 %load = load i8, ptr addrspace(1) %gep, align 4 ret i8 %load @@ -1192,6 +1752,20 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1) ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x1000 +; GFX12-GISEL-NEXT: s_brev_b32 s1, 1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1219,6 +1793,15 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1) ; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8384512 +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771712 %load = load i8, ptr addrspace(1) %gep, align 4 ret i8 %load @@ -1266,6 +1849,20 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1) ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x1fff +; GFX12-GISEL-NEXT: s_brev_b32 s1, 1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: 
v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1293,6 +1890,15 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1) ; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-1 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8380417 +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767617 %load = load i8, ptr addrspace(1) %gep, align 4 ret i8 %load @@ -1340,6 +1946,20 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1) ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x2000 +; GFX12-GISEL-NEXT: s_brev_b32 s1, 1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1367,6 +1987,15 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1) ; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8380416 +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767616 %load = load i8, ptr addrspace(1) %gep, align 4 ret i8 %load @@ -1404,6 +2033,18 @@ define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) { ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_inst_salu_offset_1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:1 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_store_b8 v[0:1], v0, off +; GFX12-NEXT: 
s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
 %gep = getelementptr i8, ptr addrspace(1) %p, i64 1
 %load = load volatile i8, ptr addrspace(1) %gep, align 1
 store i8 %load, ptr addrspace(1) undef
@@ -1442,6 +2083,18 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p
 ; GFX11-NEXT: s_nop 0
 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: global_inst_salu_offset_11bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:2047 th:TH_LOAD_RT_NT
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
 %gep = getelementptr i8, ptr addrspace(1) %p, i64 2047
 %load = load volatile i8, ptr addrspace(1) %gep, align 1
 store i8 %load, ptr addrspace(1) undef
@@ -1480,6 +2133,18 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p
 ; GFX11-NEXT: s_nop 0
 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: global_inst_salu_offset_12bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 th:TH_LOAD_RT_NT
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
 %gep = getelementptr i8, ptr addrspace(1) %p, i64 4095
 %load = load volatile i8, ptr addrspace(1) %gep, align 1
 store i8 %load, ptr addrspace(1) undef
@@ -1518,6 +2183,18 @@ define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p
 ; GFX11-NEXT: s_nop 0
 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: global_inst_salu_offset_13bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:8191 th:TH_LOAD_RT_NT
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
 %gep = getelementptr i8, ptr addrspace(1) %p, i64 8191
 %load = load volatile i8, ptr addrspace(1) %gep, align 1
 store i8 %load, ptr addrspace(1) undef
@@ -1556,6 +2233,18 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1
 ; GFX11-NEXT: s_nop 0
 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: global_inst_salu_offset_neg_11bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-2048 th:TH_LOAD_RT_NT
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
 %gep = getelementptr i8, ptr addrspace(1) %p, i64 -2048
 %load = load volatile i8, ptr addrspace(1) %gep, align 1
 store i8 %load, ptr addrspace(1) undef
@@ -1598,6 +2287,18 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1
 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
+; GFX12-LABEL: global_inst_salu_offset_neg_12bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 th:TH_LOAD_RT_NT
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+;
 ; GFX10-SDAG-LABEL: global_inst_salu_offset_neg_12bit_max:
 ; GFX10-SDAG: ; %bb.0:
 ; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1655,6 +2356,18 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1
 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT: s_endpgm
 ;
+; GFX12-LABEL: global_inst_salu_offset_neg_13bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-8192 th:TH_LOAD_RT_NT
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+;
 ; GFX10-SDAG-LABEL: global_inst_salu_offset_neg_13bit_max:
 ; GFX10-SDAG: ; %bb.0:
 ; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1717,6 +2430,18 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1)
 ; GFX11-NEXT: s_nop 0
 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: global_inst_salu_offset_2x_11bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 th:TH_LOAD_RT_NT
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
 %gep = getelementptr i8, ptr addrspace(1) %p, i64 4095
 %load = load volatile i8, ptr addrspace(1) %gep, align 1
 store i8 %load, ptr addrspace(1) undef
@@ -1755,6 +2480,18 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1)
 ; GFX11-NEXT: s_nop 0
 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: global_inst_salu_offset_2x_12bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:8191 th:TH_LOAD_RT_NT
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
 %gep = getelementptr i8, ptr addrspace(1) %p, i64 8191
 %load = load volatile i8, ptr addrspace(1) %gep, align 1
 store i8 %load, ptr addrspace(1) undef
@@ -1793,6 +2530,18 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1)
 ; GFX11-NEXT: s_nop 0
 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: global_inst_salu_offset_2x_13bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:16383 th:TH_LOAD_RT_NT
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
 %gep = getelementptr i8, ptr addrspace(1) %p, i64 16383
 %load = load volatile i8, ptr addrspace(1) %gep, align 1
 store i8 %load, ptr addrspace(1) undef
@@ -1835,6 +2584,18 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac
 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
+; GFX12-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 th:TH_LOAD_RT_NT
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+;
 ; GFX10-SDAG-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
 ; GFX10-SDAG: ; %bb.0:
 ; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1892,6 +2653,18 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac
 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT: s_endpgm
 ;
+; GFX12-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-8192 th:TH_LOAD_RT_NT
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+;
 ; GFX10-SDAG-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
 ; GFX10-SDAG: ; %bb.0:
 ; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1963,6 +2736,18 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac
 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT: s_endpgm
 ;
+; GFX12-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-16384 th:TH_LOAD_RT_NT
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+;
 ; GFX10-SDAG-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
 ; GFX10-SDAG: ; %bb.0:
 ; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2035,6 +2820,21 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp
 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT: s_endpgm
 ;
+; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split0:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x7ff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+;
 ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split0:
 ; GFX10-SDAG: ; %bb.0:
 ; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2059,6 +2859,20 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp
 ; GFX11-SDAG-NEXT: s_nop 0
 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2047 th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
 %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936639
 %load = load volatile i8, ptr addrspace(1) %gep, align 1
 store i8 %load, ptr addrspace(1) undef
@@ -2107,6 +2921,21 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp
 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT: s_endpgm
 ;
+; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split1:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+;
 ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split1:
 ; GFX10-SDAG: ; %bb.0:
 ; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2131,6 +2960,20 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp
 ; GFX11-SDAG-NEXT: s_nop 0
 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2048 th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
 %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936640
 %load = load volatile i8, ptr addrspace(1) %gep, align 1
 store i8 %load, ptr addrspace(1) undef
@@ -2179,6 +3022,21 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp
 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT: s_endpgm
 ;
+; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split0:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+;
 ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split0:
 ; GFX10-SDAG: ; %bb.0:
 ; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2203,6 +3061,20 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp
 ; GFX11-SDAG-NEXT: s_nop 0
 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095 th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
 %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938687
 %load = load volatile i8, ptr addrspace(1) %gep, align 1
 store i8 %load, ptr addrspace(1) undef
@@ -2251,6 +3123,21 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp
 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT: s_endpgm
 ;
+; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split1:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1000
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+;
 ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split1:
 ; GFX10-SDAG: ; %bb.0:
 ; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2275,6 +3162,20 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp
 ; GFX11-SDAG-NEXT: s_nop 0
 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4096 th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
 %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938688
 %load = load volatile i8, ptr addrspace(1) %gep, align 1
 store i8 %load, ptr addrspace(1) undef
@@ -2323,6 +3224,21 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp
 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT: s_endpgm
 ;
+; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split0:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1fff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+;
 ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split0:
 ; GFX10-SDAG: ; %bb.0:
 ; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2347,6 +3263,20 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp
 ; GFX11-SDAG-NEXT: s_nop 0
 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8191 th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
 %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942783
 %load = load volatile i8, ptr addrspace(1) %gep, align 1
 store i8 %load, ptr addrspace(1) undef
@@ -2395,6 +3325,21 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp
 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT: s_endpgm
 ;
+; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split1:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x2000
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+;
 ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split1:
 ; GFX10-SDAG: ; %bb.0:
 ; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2419,6 +3364,20 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp
 ; GFX11-SDAG-NEXT: s_nop 0
 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8192 th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
 %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942784
 %load = load volatile i8, ptr addrspace(1) %gep, align 1
 store i8 %load, ptr addrspace(1) undef
@@ -2464,6 +3423,35 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p
 ; GFX11-NEXT: s_nop 0
 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x7ff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x7ff
+; GFX12-SDAG-NEXT: s_brev_b32 s3, 1
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
 %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773761
 %load = load volatile i8, ptr addrspace(1) %gep, align 1
 store i8 %load, ptr addrspace(1) undef
@@ -2509,6 +3497,35 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p
 ; GFX11-NEXT: s_nop 0
 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x800
+; GFX12-SDAG-NEXT: s_brev_b32 s3, 1
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
 %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773760
 %load = load volatile i8, ptr addrspace(1) %gep, align 1
 store i8 %load, ptr addrspace(1) undef
@@ -2554,6 +3571,35 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p
 ; GFX11-NEXT: s_nop 0
 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT: s_movk_i32 s2, 0xfff
+; GFX12-SDAG-NEXT: s_brev_b32 s3, 1
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
 %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771713
 %load = load volatile i8, ptr addrspace(1) %gep, align 1
 store i8 %load, ptr addrspace(1) undef
@@ -2599,6 +3645,35 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p
 ; GFX11-NEXT: s_nop 0
 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1000
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1000
+; GFX12-SDAG-NEXT: s_brev_b32 s3, 1
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
 %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771712
 %load = load volatile i8, ptr addrspace(1) %gep, align 1
 store i8 %load, ptr addrspace(1) undef
@@ -2644,6 +3719,35 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p
 ; GFX11-NEXT: s_nop 0
 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1fff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1fff
+; GFX12-SDAG-NEXT: s_brev_b32 s3, 1
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
 %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767617
 %load = load volatile i8, ptr addrspace(1) %gep, align 1
 store i8 %load, ptr addrspace(1) undef
@@ -2689,6 +3793,35 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p
 ; GFX11-NEXT: s_nop 0
 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x2000
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x2000
+; GFX12-SDAG-NEXT: s_brev_b32 s3, 1
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
 %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767616
 %load = load volatile i8, ptr addrspace(1) %gep, align 1
 store i8 %load, ptr addrspace(1) undef
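All of the "neg_high" cases above decompose the same way; as a minimal IR sketch (hypothetical kernel name, not part of the patch): the final constant, -9223372036854767616, is 0x8000000000002000, i.e. a low word of 0x2000 and a high word of 0x80000000, which is exactly the pair the GFX12-GISEL checks feed to s_add_co_u32/s_add_co_ci_u32 and the GFX12-SDAG checks materialize via s_movk_i32/s_brev_b32 before the 64-bit s_add_nc_u64.

define amdgpu_kernel void @offset_decomposition_sketch(ptr addrspace(1) %p) {
  ; -9223372036854767616 == 0x8000000000002000 == (0x80000000 << 32) | 0x2000
  %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767616
  %load = load volatile i8, ptr addrspace(1) %gep, align 1
  store i8 %load, ptr addrspace(1) undef
  ret void
}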