diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 9c74c654d8e35..b1436cb5d6073 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -7635,6 +7635,8 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); unsigned Opcode = Inst.getOpcode(); unsigned NewOpcode = getVALUOp(Inst); + const DebugLoc &DL = Inst.getDebugLoc(); + // Handle some special cases switch (Opcode) { default: @@ -7872,7 +7874,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, return; case AMDGPU::S_UADDO_PSEUDO: case AMDGPU::S_USUBO_PSEUDO: { - const DebugLoc &DL = Inst.getDebugLoc(); MachineOperand &Dest0 = Inst.getOperand(0); MachineOperand &Dest1 = Inst.getOperand(1); MachineOperand &Src0 = Inst.getOperand(2); @@ -7892,12 +7893,37 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, legalizeOperands(*NewInstr, MDT); MRI.replaceRegWith(Dest0.getReg(), DestReg); - addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI, - Worklist); + addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist); Inst.eraseFromParent(); } return; + case AMDGPU::S_LSHL1_ADD_U32: + case AMDGPU::S_LSHL2_ADD_U32: + case AMDGPU::S_LSHL3_ADD_U32: + case AMDGPU::S_LSHL4_ADD_U32: { + MachineOperand &Dest = Inst.getOperand(0); + MachineOperand &Src0 = Inst.getOperand(1); + MachineOperand &Src1 = Inst.getOperand(2); + unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32 ? 1 + : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2 + : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3 + : 4); + + const TargetRegisterClass *NewRC = + RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())); + Register DestReg = MRI.createVirtualRegister(NewRC); + MachineInstr *NewInstr = + BuildMI(*MBB, &Inst, DL, get(AMDGPU::V_LSHL_ADD_U32_e64), DestReg) + .add(Src0) + .addImm(ShiftAmt) + .add(Src1); + legalizeOperands(*NewInstr, MDT); + MRI.replaceRegWith(Dest.getReg(), DestReg); + addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist); + Inst.eraseFromParent(); + } + return; case AMDGPU::S_CSELECT_B32: case AMDGPU::S_CSELECT_B64: lowerSelect(Worklist, Inst, MDT); @@ -7994,7 +8020,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, return; } case AMDGPU::S_CVT_HI_F32_F16: { - const DebugLoc &DL = Inst.getDebugLoc(); Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); if (ST.useRealTrue16Insts()) { @@ -8024,7 +8049,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, } case AMDGPU::S_MINIMUM_F32: case AMDGPU::S_MAXIMUM_F32: { - const DebugLoc &DL = Inst.getDebugLoc(); Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst) .addImm(0) // src0_modifiers @@ -8042,7 +8066,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, } case AMDGPU::S_MINIMUM_F16: case AMDGPU::S_MAXIMUM_F16: { - const DebugLoc &DL = Inst.getDebugLoc(); Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass : &AMDGPU::VGPR_32RegClass); @@ -8066,7 +8089,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, case AMDGPU::V_S_RCP_F16_e64: case AMDGPU::V_S_RSQ_F16_e64: case AMDGPU::V_S_SQRT_F16_e64: { - const DebugLoc &DL = Inst.getDebugLoc(); Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts() ? 
&AMDGPU::VGPR_16RegClass : &AMDGPU::VGPR_32RegClass); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index b7f63eceb5d5c..0bde5d3fd2f26 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -776,11 +776,7 @@ def xnor : PatFrag < foreach I = 1-4 in { def shl#I#_add : PatFrag < (ops node:$src0, node:$src1), - (add (shl_oneuse $src0, (i32 I)), $src1)> { - // FIXME: Poor substitute for disabling pattern in SelectionDAG - let PredicateCode = [{return false;}]; - let GISelPredicateCode = [{return true;}]; -} + (add (shl_oneuse $src0, (i32 I)), $src1)>; } multiclass SIAtomicM0Glue2 This Inner Loop Header: Depth=1 -; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX9-SDAG-NEXT: v_readlane_b32 s8, v0, s7 -; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX9-SDAG-NEXT: s_max_u32 s6, s6, s8 +; GFX9-SDAG-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX9-SDAG-NEXT: v_readlane_b32 s9, v0, s8 +; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s8 +; GFX9-SDAG-NEXT: s_max_u32 s7, s7, s9 ; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX9-SDAG-NEXT: ; %bb.2: -; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0x1fff -; GFX9-SDAG-NEXT: s_and_b32 s4, s4, 0xffffe000 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s6, 6, v0 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s7, 6, v0 ; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x1bc ; GFX9-SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -609,8 +603,7 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 % ; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x2000 ; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB6_4 ; GFX9-SDAG-NEXT: ; %bb.1: ; %bb.0 -; GFX9-SDAG-NEXT: s_lshl_b32 s5, s5, 2 -; GFX9-SDAG-NEXT: s_add_i32 s5, s5, 15 +; GFX9-SDAG-NEXT: s_lshl2_add_u32 s5, s5, 15 ; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff ; GFX9-SDAG-NEXT: s_and_b32 s5, s5, -16 ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 @@ -639,8 +632,7 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 % ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: .LBB6_4: ; %bb.1 -; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 2 -; GFX9-SDAG-NEXT: s_add_i32 s4, s4, 15 +; GFX9-SDAG-NEXT: s_lshl2_add_u32 s4, s4, 15 ; GFX9-SDAG-NEXT: s_and_b32 s4, s4, -16 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 6 @@ -719,20 +711,17 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 % ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB6_4 ; GFX11-SDAG-NEXT: ; %bb.1: ; %bb.0 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_lshl_b32 s1, s1, 2 +; GFX11-SDAG-NEXT: s_lshl2_add_u32 s1, s1, 15 ; GFX11-SDAG-NEXT: s_add_i32 s3, s32, 0x7ff -; GFX11-SDAG-NEXT: s_add_i32 s1, s1, 15 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_and_b32 s4, s1, -16 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-SDAG-NEXT: s_and_b32 s1, s3, 0xfffff800 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-SDAG-NEXT: s_lshl_b32 s3, s4, 5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s3 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 ; GFX11-SDAG-NEXT: s_mov_b32 s3, exec_lo +; GFX11-SDAG-NEXT: 
v_and_b32_e32 v0, 0x1ff0, v0 ; GFX11-SDAG-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_ctz_i32_b32 s4, s3 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_readlane_b32 s5, v0, s4 @@ -750,18 +739,16 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 % ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 ; GFX11-SDAG-NEXT: .LBB6_4: ; %bb.1 -; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 15 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 -; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15 +; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 -; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16 +; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s33 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0 ; GFX11-SDAG-NEXT: s_endpgm ; @@ -866,9 +853,8 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_cbranch_execnz .LBB7_5 ; GFX9-SDAG-NEXT: .LBB7_4: ; %bb.0 -; GFX9-SDAG-NEXT: s_lshl_b32 s5, s5, 2 ; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0xfff -; GFX9-SDAG-NEXT: s_add_i32 s5, s5, 15 +; GFX9-SDAG-NEXT: s_lshl2_add_u32 s5, s5, 15 ; GFX9-SDAG-NEXT: s_and_b32 s4, s4, 0xfffff000 ; GFX9-SDAG-NEXT: s_and_b32 s5, s5, -16 ; GFX9-SDAG-NEXT: s_lshl_b32 s5, s5, 6 @@ -964,16 +950,15 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 ; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB7_5 ; GFX11-SDAG-NEXT: .LBB7_4: ; %bb.0 -; GFX11-SDAG-NEXT: s_lshl_b32 s0, s1, 2 +; GFX11-SDAG-NEXT: s_lshl2_add_u32 s1, s1, 15 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15 -; GFX11-SDAG-NEXT: s_add_i32 s1, s32, 0x7ff -; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16 -; GFX11-SDAG-NEXT: s_and_b32 s1, s1, 0xfffff800 -; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc +; GFX11-SDAG-NEXT: s_add_i32 s0, s32, 0x7ff +; GFX11-SDAG-NEXT: s_and_b32 s1, s1, -16 +; GFX11-SDAG-NEXT: s_and_b32 s0, s0, 0xfffff800 +; GFX11-SDAG-NEXT: s_lshl_b32 s1, s1, 5 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0 +; GFX11-SDAG-NEXT: s_add_i32 s32, s0, s1 ; GFX11-SDAG-NEXT: .LBB7_5: ; %bb.2 ; GFX11-SDAG-NEXT: s_endpgm ; GFX11-SDAG-NEXT: .LBB7_6: @@ -1171,35 +1156,35 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) { ; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX9-SDAG-NEXT: s_mov_b32 s9, s33 +; GFX9-SDAG-NEXT: s_mov_b32 s10, s33 ; GFX9-SDAG-NEXT: s_add_i32 s33, s32, 0x1fc0 -; GFX9-SDAG-NEXT: s_mov_b32 s10, s34 +; GFX9-SDAG-NEXT: s_mov_b32 s11, s34 +; GFX9-SDAG-NEXT: s_mov_b32 s34, s32 +; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x4000 +; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0x1fff +; GFX9-SDAG-NEXT: v_lshl_add_u32 
v0, v0, 2, 15 +; GFX9-SDAG-NEXT: s_and_b32 s6, s4, 0xffffe000 ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec -; GFX9-SDAG-NEXT: s_mov_b32 s6, 0 +; GFX9-SDAG-NEXT: s_mov_b32 s7, 0 ; GFX9-SDAG-NEXT: s_and_b32 s33, s33, 0xffffe000 -; GFX9-SDAG-NEXT: s_mov_b32 s34, s32 -; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x4000 ; GFX9-SDAG-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 -; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX9-SDAG-NEXT: v_readlane_b32 s8, v0, s7 -; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX9-SDAG-NEXT: s_max_u32 s6, s6, s8 +; GFX9-SDAG-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX9-SDAG-NEXT: v_readlane_b32 s9, v0, s8 +; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s8 +; GFX9-SDAG-NEXT: s_max_u32 s7, s7, s9 ; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX9-SDAG-NEXT: ; %bb.2: -; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0x1fff -; GFX9-SDAG-NEXT: s_and_b32 s4, s4, 0xffffe000 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s6, 6, v0 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s7, 6, v0 ; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 10 ; GFX9-SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_mov_b32 s32, s34 -; GFX9-SDAG-NEXT: s_mov_b32 s34, s10 -; GFX9-SDAG-NEXT: s_mov_b32 s33, s9 +; GFX9-SDAG-NEXT: s_mov_b32 s34, s11 +; GFX9-SDAG-NEXT: s_mov_b32 s33, s10 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned: @@ -1240,34 +1225,35 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) { ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX11-SDAG-NEXT: s_mov_b32 s4, s33 +; GFX11-SDAG-NEXT: s_mov_b32 s5, s33 ; GFX11-SDAG-NEXT: s_add_i32 s33, s32, 0x7f -; GFX11-SDAG-NEXT: s_mov_b32 s5, s34 -; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX11-SDAG-NEXT: s_mov_b32 s0, 0 -; GFX11-SDAG-NEXT: s_and_b32 s33, s33, 0xffffff80 +; GFX11-SDAG-NEXT: s_mov_b32 s6, s34 ; GFX11-SDAG-NEXT: s_mov_b32 s34, s32 ; GFX11-SDAG-NEXT: s_addk_i32 s32, 0x100 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-SDAG-NEXT: s_add_i32 s0, s32, 0xfff +; GFX11-SDAG-NEXT: s_mov_b32 s2, exec_lo +; GFX11-SDAG-NEXT: s_and_b32 s0, s0, 0xfffff000 +; GFX11-SDAG-NEXT: s_mov_b32 s1, 0 +; GFX11-SDAG-NEXT: s_and_b32 s33, s33, 0xffffff80 ; GFX11-SDAG-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-SDAG-NEXT: s_ctz_i32_b32 s3, s2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 -; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 -; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 -; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-SDAG-NEXT: v_readlane_b32 s4, v0, s3 +; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3 +; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s4 +; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX11-SDAG-NEXT: ; %bb.2: -; GFX11-SDAG-NEXT: s_add_i32 s1, s32, 0xfff +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s1, 5, s0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 10 -; GFX11-SDAG-NEXT: s_and_b32 s1, s1, 0xfffff000 -; GFX11-SDAG-NEXT: s_mov_b32 s33, s4 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc -; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; 
GFX11-SDAG-NEXT: s_mov_b32 s33, s5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s0 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_mov_b32 s32, s34 -; GFX11-SDAG-NEXT: s_mov_b32 s34, s5 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s6 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned: @@ -1850,20 +1836,20 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX9-SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-SDAG-NEXT: s_cbranch_execz .LBB14_6 ; GFX9-SDAG-NEXT: ; %bb.1: ; %bb.0 +; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff ; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15 +; GFX9-SDAG-NEXT: s_and_b32 s9, s6, 0xfffff000 ; GFX9-SDAG-NEXT: v_and_b32_e32 v1, -16, v1 ; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], exec ; GFX9-SDAG-NEXT: s_mov_b32 s10, 0 ; GFX9-SDAG-NEXT: .LBB14_2: ; =>This Inner Loop Header: Depth=1 -; GFX9-SDAG-NEXT: s_ff1_i32_b64 s9, s[6:7] -; GFX9-SDAG-NEXT: v_readlane_b32 s11, v1, s9 -; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s9 -; GFX9-SDAG-NEXT: s_max_u32 s10, s10, s11 +; GFX9-SDAG-NEXT: s_ff1_i32_b64 s11, s[6:7] +; GFX9-SDAG-NEXT: v_readlane_b32 s12, v1, s11 +; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s11 +; GFX9-SDAG-NEXT: s_max_u32 s10, s10, s12 ; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0 ; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB14_2 ; GFX9-SDAG-NEXT: ; %bb.3: -; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff -; GFX9-SDAG-NEXT: s_and_b32 s9, s6, 0xfffff000 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s10, 6, v1 ; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1 @@ -1894,7 +1880,6 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX9-SDAG-NEXT: .LBB14_6: ; %bb.1 ; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 2 ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec ; GFX9-SDAG-NEXT: .LBB14_7: ; =>This Inner Loop Header: Depth=1 @@ -1912,7 +1897,8 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: buffer_store_dword v1, off, s[0:3], s4 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 2 +; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_mov_b32 s32, s34 ; GFX9-SDAG-NEXT: s_mov_b32 s34, s14 @@ -2016,27 +2002,26 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX11-SDAG-NEXT: s_cbranch_execz .LBB14_6 ; GFX11-SDAG-NEXT: ; %bb.1: ; %bb.0 ; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15 -; GFX11-SDAG-NEXT: s_mov_b32 s2, exec_lo +; GFX11-SDAG-NEXT: s_add_i32 s2, s32, 0x7ff +; GFX11-SDAG-NEXT: s_mov_b32 s4, exec_lo +; GFX11-SDAG-NEXT: s_and_b32 s2, s2, 0xfffff800 ; GFX11-SDAG-NEXT: s_mov_b32 s3, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_and_b32_e32 v1, -16, v1 ; GFX11-SDAG-NEXT: .LBB14_2: ; =>This Inner Loop Header: Depth=1 -; GFX11-SDAG-NEXT: s_ctz_i32_b32 s4, s2 +; GFX11-SDAG-NEXT: s_ctz_i32_b32 s5, s4 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_readlane_b32 s5, v1, s4 -; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s4 -; GFX11-SDAG-NEXT: s_max_u32 s3, s3, s5 -; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 
0 +; GFX11-SDAG-NEXT: v_readlane_b32 s6, v1, s5 +; GFX11-SDAG-NEXT: s_bitset0_b32 s4, s5 +; GFX11-SDAG-NEXT: s_max_u32 s3, s3, s6 +; GFX11-SDAG-NEXT: s_cmp_lg_u32 s4, 0 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB14_2 ; GFX11-SDAG-NEXT: ; %bb.3: ; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v31 -; GFX11-SDAG-NEXT: s_add_i32 s2, s32, 0x7ff -; GFX11-SDAG-NEXT: s_mov_b32 s4, exec_lo -; GFX11-SDAG-NEXT: s_and_b32 s2, s2, 0xfffff800 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_lshl_add_u32 v2, s3, 5, s2 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15 +; GFX11-SDAG-NEXT: s_mov_b32 s4, exec_lo ; GFX11-SDAG-NEXT: s_mov_b32 s3, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x1ff0, v1 @@ -2059,31 +2044,30 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v1 ; GFX11-SDAG-NEXT: .LBB14_6: ; %bb.1 ; GFX11-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v0, 2, 15 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 2 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_and_b32_e32 v1, -16, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX11-SDAG-NEXT: .LBB14_7: ; =>This Inner Loop Header: Depth=1 ; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_readlane_b32 s3, v1, s2 +; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 ; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 ; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 ; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB14_7 ; GFX11-SDAG-NEXT: ; %bb.8: ; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 1 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, s0, 5, s1 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v2, s33 dlc +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s33 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc +; GFX11-SDAG-NEXT: scratch_store_b32 off, v2, s1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v1 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s7 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 ; GFX11-SDAG-NEXT: s_mov_b32 s32, s34 ; GFX11-SDAG-NEXT: s_mov_b32 s34, s8 -; GFX11-SDAG-NEXT: s_mov_b32 s33, s7 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_multiple_allocas: @@ -2189,9 +2173,9 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_control_flow: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: s_mov_b32 s11, s33 +; GFX9-SDAG-NEXT: s_mov_b32 s12, s33 ; GFX9-SDAG-NEXT: s_add_i32 s33, s32, 0xfc0 -; GFX9-SDAG-NEXT: s_mov_b32 s12, s34 +; GFX9-SDAG-NEXT: s_mov_b32 s13, s34 ; GFX9-SDAG-NEXT: s_mov_b32 s8, 0 ; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-SDAG-NEXT: s_and_b32 s33, s33, 0xfffff000 @@ -2201,24 +2185,24 
@@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX9-SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-SDAG-NEXT: s_cbranch_execz .LBB15_4 ; GFX9-SDAG-NEXT: ; %bb.1: ; %bb.1 -; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX9-SDAG-NEXT: v_and_b32_e32 v1, -16, v1 +; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v1, 2, 15 +; GFX9-SDAG-NEXT: s_and_b32 s9, s6, 0xfffff000 +; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], exec ; GFX9-SDAG-NEXT: .LBB15_2: ; =>This Inner Loop Header: Depth=1 -; GFX9-SDAG-NEXT: s_ff1_i32_b64 s9, s[6:7] -; GFX9-SDAG-NEXT: v_readlane_b32 s10, v1, s9 -; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s9 -; GFX9-SDAG-NEXT: s_max_u32 s8, s8, s10 +; GFX9-SDAG-NEXT: s_ff1_i32_b64 s10, s[6:7] +; GFX9-SDAG-NEXT: v_readlane_b32 s11, v0, s10 +; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s10 +; GFX9-SDAG-NEXT: s_max_u32 s8, s8, s11 ; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0 ; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB15_2 ; GFX9-SDAG-NEXT: ; %bb.3: -; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff -; GFX9-SDAG-NEXT: s_and_b32 s6, s6, 0xfffff000 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-SDAG-NEXT: v_lshl_add_u32 v2, s8, 6, v1 -; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v2 -; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s8, 6, v0 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX9-SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: ; implicit-def: $vgpr31 ; GFX9-SDAG-NEXT: .LBB15_4: ; %Flow @@ -2248,8 +2232,8 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX9-SDAG-NEXT: .LBB15_8: ; %bb.2 ; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-SDAG-NEXT: s_mov_b32 s32, s34 -; GFX9-SDAG-NEXT: s_mov_b32 s34, s12 -; GFX9-SDAG-NEXT: s_mov_b32 s33, s11 +; GFX9-SDAG-NEXT: s_mov_b32 s34, s13 +; GFX9-SDAG-NEXT: s_mov_b32 s33, s12 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_control_flow: @@ -2321,9 +2305,9 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_control_flow: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: s_mov_b32 s5, s33 +; GFX11-SDAG-NEXT: s_mov_b32 s6, s33 ; GFX11-SDAG-NEXT: s_add_i32 s33, s32, 63 -; GFX11-SDAG-NEXT: s_mov_b32 s6, s34 +; GFX11-SDAG-NEXT: s_mov_b32 s7, s34 ; GFX11-SDAG-NEXT: s_mov_b32 s1, 0 ; GFX11-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX11-SDAG-NEXT: s_and_not1_b32 s33, s33, 63 @@ -2333,28 +2317,28 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX11-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-SDAG-NEXT: s_cbranch_execz .LBB15_4 ; GFX11-SDAG-NEXT: ; %bb.1: ; %bb.1 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX11-SDAG-NEXT: s_mov_b32 s2, exec_lo -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_and_b32_e32 v1, -16, v1 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v1, 2, 15 +; GFX11-SDAG-NEXT: s_add_i32 s2, s32, 0x7ff +; GFX11-SDAG-NEXT: s_mov_b32 s3, exec_lo +; GFX11-SDAG-NEXT: s_and_b32 s2, s2, 0xfffff800 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX11-SDAG-NEXT: .LBB15_2: ; =>This Inner 
Loop Header: Depth=1 -; GFX11-SDAG-NEXT: s_ctz_i32_b32 s3, s2 +; GFX11-SDAG-NEXT: s_ctz_i32_b32 s4, s3 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_readlane_b32 s4, v1, s3 -; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3 -; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s4 -; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-SDAG-NEXT: v_readlane_b32 s5, v0, s4 +; GFX11-SDAG-NEXT: s_bitset0_b32 s3, s4 +; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s5 +; GFX11-SDAG-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB15_2 ; GFX11-SDAG-NEXT: ; %bb.3: -; GFX11-SDAG-NEXT: s_add_i32 s2, s32, 0x7ff +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s1, 5, s2 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 2 ; GFX11-SDAG-NEXT: ; implicit-def: $vgpr31 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: s_and_b32 s2, s2, 0xfffff800 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, s1, 5, s2 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s2 dlc +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s2 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v1 ; GFX11-SDAG-NEXT: .LBB15_4: ; %Flow ; GFX11-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-SDAG-NEXT: s_cbranch_execz .LBB15_8 @@ -2383,8 +2367,8 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX11-SDAG-NEXT: .LBB15_8: ; %bb.2 ; GFX11-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-SDAG-NEXT: s_mov_b32 s32, s34 -; GFX11-SDAG-NEXT: s_mov_b32 s34, s6 -; GFX11-SDAG-NEXT: s_mov_b32 s33, s5 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s7 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s6 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_control_flow: diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll index b0e6752386285..e01cb79382c05 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll @@ -524,7 +524,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; GFX942-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 1 +; GFX942-SDAG-NEXT: s_lshl1_add_u32 s0, s0, 0 ; GFX942-SDAG-NEXT: v_add_u32_e32 v0, s0, v0 ; GFX942-SDAG-NEXT: v_add_u32_e32 v2, 1, v0 ; GFX942-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 @@ -695,7 +695,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 1 +; GFX942-SDAG-NEXT: s_lshl1_add_u32 s0, s0, 0 ; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 2, s0 ; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1 ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -875,7 +875,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 1 +; GFX942-SDAG-NEXT: s_lshl1_add_u32 s0, s0, 0 ; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 4, s0 ; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1 ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -1054,7 +1054,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; GFX942-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX942-SDAG-NEXT: 
v_mov_b32_e32 v1, 1 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX942-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 0 ; GFX942-SDAG-NEXT: v_add_u32_e32 v0, s0, v0 ; GFX942-SDAG-NEXT: v_add_u32_e32 v2, 1, v0 ; GFX942-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 @@ -1225,7 +1225,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX942-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 0 ; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 2, s0 ; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1 ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -1405,7 +1405,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX942-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 0 ; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 4, s0 ; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1 ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index fc8883924dfbc..870b679a84d11 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -857,13 +857,13 @@ define void @store_load_vindex_foo(i32 %idx) { ; GFX9-LABEL: store_load_vindex_foo: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s32 -; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: s_mov_b32 s0, s32 +; GFX9-NEXT: v_lshl_add_u32 v1, v0, 2, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX9-NEXT: scratch_store_dword v2, v3, off +; GFX9-NEXT: scratch_store_dword v1, v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -915,13 +915,13 @@ define void @store_load_vindex_foo(i32 %idx) { ; GFX9-PAL-LABEL: store_load_vindex_foo: ; GFX9-PAL: ; %bb.0: ; %bb ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s32 -; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1 -; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-PAL-NEXT: s_mov_b32 s0, s32 +; GFX9-PAL-NEXT: v_lshl_add_u32 v1, v0, 2, s0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-PAL-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] @@ -929,8 +929,8 @@ define void @store_load_vindex_foo(i32 %idx) { ; GFX942-LABEL: store_load_vindex_foo: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, s32 -; GFX942-NEXT: v_lshl_add_u32 v1, v0, 2, v1 +; GFX942-NEXT: s_mov_b32 s0, s32 +; GFX942-NEXT: v_lshl_add_u32 v1, v0, 2, s0 ; GFX942-NEXT: v_mov_b32_e32 v2, 15 ; GFX942-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX942-NEXT: scratch_store_dword v1, v2, off sc0 sc1 @@ -2146,16 +2146,16 @@ define void 
@store_load_vindex_small_offset_foo(i32 %idx) { ; GFX9-LABEL: store_load_vindex_small_offset_foo: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_add_i32 s1, s32, 0x100 ; GFX9-NEXT: scratch_load_dword v1, off, s32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_add_i32 s0, s32, 0x100 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: s_mov_b32 s0, s1 +; GFX9-NEXT: v_lshl_add_u32 v1, v0, 2, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX9-NEXT: scratch_store_dword v2, v3, off +; GFX9-NEXT: scratch_store_dword v1, v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2214,16 +2214,16 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo: ; GFX9-PAL: ; %bb.0: ; %bb ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-PAL-NEXT: s_add_i32 s1, s32, 0x100 ; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: s_add_i32 s0, s32, 0x100 -; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1 -; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-PAL-NEXT: s_mov_b32 s0, s1 +; GFX9-PAL-NEXT: v_lshl_add_u32 v1, v0, 2, s0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-PAL-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] @@ -2231,11 +2231,11 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; GFX942-LABEL: store_load_vindex_small_offset_foo: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_add_i32 s1, s32, 0x100 ; GFX942-NEXT: scratch_load_dword v1, off, s32 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: s_add_i32 s0, s32, 0x100 -; GFX942-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-NEXT: v_lshl_add_u32 v1, v0, 2, v1 +; GFX942-NEXT: s_mov_b32 s0, s1 +; GFX942-NEXT: v_lshl_add_u32 v1, v0, 2, s0 ; GFX942-NEXT: v_mov_b32_e32 v2, 15 ; GFX942-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX942-NEXT: scratch_store_dword v1, v2, off sc0 sc1 @@ -3447,16 +3447,16 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; GFX9-LABEL: store_load_vindex_large_offset_foo: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_add_i32 s1, s32, 0x4004 ; GFX9-NEXT: scratch_load_dword v1, off, s32 offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_add_i32 s0, s32, 0x4004 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: s_mov_b32 s0, s1 +; GFX9-NEXT: v_lshl_add_u32 v1, v0, 2, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX9-NEXT: scratch_store_dword v2, v3, off +; GFX9-NEXT: scratch_store_dword v1, v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-NEXT: 
s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3516,16 +3516,16 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo: ; GFX9-PAL: ; %bb.0: ; %bb ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-PAL-NEXT: s_add_i32 s1, s32, 0x4004 ; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32 offset:4 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: s_add_i32 s0, s32, 0x4004 -; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1 -; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-PAL-NEXT: s_mov_b32 s0, s1 +; GFX9-PAL-NEXT: v_lshl_add_u32 v1, v0, 2, s0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-PAL-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] @@ -3533,11 +3533,11 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; GFX942-LABEL: store_load_vindex_large_offset_foo: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_add_i32 s1, s32, 0x4004 ; GFX942-NEXT: scratch_load_dword v1, off, s32 offset:4 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: s_add_i32 s0, s32, 0x4004 -; GFX942-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-NEXT: v_lshl_add_u32 v1, v0, 2, v1 +; GFX942-NEXT: s_mov_b32 s0, s1 +; GFX942-NEXT: v_lshl_add_u32 v1, v0, 2, s0 ; GFX942-NEXT: v_mov_b32_e32 v2, 15 ; GFX942-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX942-NEXT: scratch_store_dword v1, v2, off sc0 sc1 @@ -3940,12 +3940,12 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 15 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, 15 +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX9-NEXT: scratch_store_dword v0, v1, off offset:1024 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:1024 glc @@ -4001,15 +4001,15 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX9-PAL-NEXT: s_getpc_b64 s[12:13] ; GFX9-PAL-NEXT: s_mov_b32 s12, s0 ; GFX9-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 -; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-PAL-NEXT: s_mov_b32 s1, 0 ; GFX9-PAL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-PAL-NEXT: s_and_b32 s13, s13, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s12, s11 ; GFX9-PAL-NEXT: v_add_u32_e32 v0, s0, v0 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 +; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024 glc @@ -4020,11 +4020,11 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dword s0, s[4:5], 0x24 ; 
GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: s_mov_b32 s1, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 15 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX942-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, 15 +; GFX942-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX942-NEXT: scratch_store_dword v0, v1, off offset:1024 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: scratch_load_dword v0, v0, off offset:1024 sc0 sc1 diff --git a/llvm/test/CodeGen/AMDGPU/gep-flags-stack-offsets.ll b/llvm/test/CodeGen/AMDGPU/gep-flags-stack-offsets.ll index b5f0b2ff9ef4c..61902b5fd4661 100644 --- a/llvm/test/CodeGen/AMDGPU/gep-flags-stack-offsets.ll +++ b/llvm/test/CodeGen/AMDGPU/gep-flags-stack-offsets.ll @@ -18,8 +18,8 @@ define void @gep_noflags_alloca(i32 %idx, i32 %val) #0 { ; GFX9-LABEL: gep_noflags_alloca: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e64 v2, 6, s32 -; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX9-NEXT: s_lshr_b32 s4, s32, 6 +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s4 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -45,8 +45,8 @@ define void @gep_inbounds_alloca(i32 %idx, i32 %val) #0 { ; GFX9-LABEL: gep_inbounds_alloca: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e64 v2, 6, s32 -; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX9-NEXT: s_lshr_b32 s4, s32, 6 +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s4 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -72,8 +72,8 @@ define void @gep_nuw_alloca(i32 %idx, i32 %val) #0 { ; GFX9-LABEL: gep_nuw_alloca: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e64 v2, 6, s32 -; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX9-NEXT: s_lshr_b32 s4, s32, 6 +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s4 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -99,8 +99,8 @@ define void @gep_nusw_alloca(i32 %idx, i32 %val) #0 { ; GFX9-LABEL: gep_nusw_alloca: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e64 v2, 6, s32 -; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX9-NEXT: s_lshr_b32 s4, s32, 6 +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s4 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -126,8 +126,8 @@ define void @gep_inbounds_nuw_alloca(i32 %idx, i32 %val) #0 { ; GFX9-LABEL: gep_inbounds_nuw_alloca: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e64 v2, 6, s32 -; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX9-NEXT: s_lshr_b32 s4, s32, 6 +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s4 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -153,8 +153,8 @@ define void @gep_nusw_nuw_alloca(i32 %idx, i32 %val) #0 { ; GFX9-LABEL: gep_nusw_nuw_alloca: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e64 v2, 6, s32 -; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX9-NEXT: s_lshr_b32 s4, s32, 6 +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s4 ; GFX9-NEXT: buffer_store_dword 
v1, v0, s[0:3], 0 offen offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll b/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll index 8bd6c0f2652cf..d24b3a23cb9cd 100644 --- a/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll +++ b/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll @@ -22,7 +22,7 @@ define amdgpu_kernel void @dynamic_shared_array_0(ptr addrspace(1) %out) { } ; CHECK-LABEL: {{^}}dynamic_shared_array_1: -; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0xc00 +; CHECK: s_movk_i32 [[DYNLDS:s[0-9]+]], 0xc00 ; CHECK: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]] define amdgpu_kernel void @dynamic_shared_array_1(ptr addrspace(1) %out, i32 %cond) { entry: @@ -49,7 +49,7 @@ endif: ; preds = %else, %if } ; CHECK-LABEL: {{^}}dynamic_shared_array_2: -; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x4000 +; CHECK: s_movk_i32 [[DYNLDS:s[0-9]+]], 0x4000 ; CHECK: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]] define amdgpu_kernel void @dynamic_shared_array_2(i32 %idx) { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -64,7 +64,7 @@ define amdgpu_kernel void @dynamic_shared_array_2(i32 %idx) { ; The offset to the dynamic shared memory array should be aligned on the type ; specified. ; CHECK-LABEL: {{^}}dynamic_shared_array_3: -; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x44 +; CHECK: s_movk_i32 [[DYNLDS:s[0-9]+]], 0x44 ; CHECK: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]] define amdgpu_kernel void @dynamic_shared_array_3(i32 %idx) { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -80,7 +80,7 @@ define amdgpu_kernel void @dynamic_shared_array_3(i32 %idx) { ; The offset to the dynamic shared memory array should be aligned on the ; maximal one. ; CHECK-LABEL: {{^}}dynamic_shared_array_4: -; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x48 +; CHECK: s_movk_i32 [[DYNLDS:s[0-9]+]], 0x48 ; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]] ; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 3, [[DYNLDS]] define amdgpu_kernel void @dynamic_shared_array_4(i32 %idx) { @@ -99,7 +99,7 @@ define amdgpu_kernel void @dynamic_shared_array_4(i32 %idx) { ; Honor the explicit alignment from the specified variable. ; CHECK-LABEL: {{^}}dynamic_shared_array_5: -; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x44 +; CHECK: s_movk_i32 [[DYNLDS:s[0-9]+]], 0x44 ; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]] ; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 3, [[DYNLDS]] define amdgpu_kernel void @dynamic_shared_array_5(i32 %idx) { @@ -118,7 +118,7 @@ define amdgpu_kernel void @dynamic_shared_array_5(i32 %idx) { ; Honor the explicit alignment from the specified variable. 
; CHECK-LABEL: {{^}}dynamic_shared_array_6: -; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x50 +; CHECK: s_movk_i32 [[DYNLDS:s[0-9]+]], 0x50 ; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]] ; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 3, [[DYNLDS]] define amdgpu_kernel void @dynamic_shared_array_6(i32 %idx) { diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll index 3eef616ba267d..ad894ce36c55b 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll @@ -97,8 +97,7 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_normal(i32 %idx) { ; CHECK-NEXT: v_mov_b32_e32 v0, 2 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_lshl_b32 s0, s0, 2 -; CHECK-NEXT: s_add_i32 s0, s0, 4 +; CHECK-NEXT: s_lshl2_add_u32 s0, s0, 4 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: ds_write_b16 v1, v0 ; CHECK-NEXT: ds_write_b32 v2, v1 @@ -136,10 +135,9 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) { ; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[20:21] -; CHECK-NEXT: s_lshl_b32 s4, s17, 2 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: s_add_i32 s4, s4, 4 +; CHECK-NEXT: s_lshl2_add_u32 s4, s17, 4 ; CHECK-NEXT: v_mov_b32_e32 v2, 2 ; CHECK-NEXT: v_mov_b32_e32 v3, s4 ; CHECK-NEXT: ds_write_b16 v1, v0 @@ -163,8 +161,7 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_normal(i32 %idx) { ; CHECK-NEXT: v_mov_b32_e32 v0, 2 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_lshl_b32 s0, s0, 2 -; CHECK-NEXT: s_add_i32 s0, s0, 4 +; CHECK-NEXT: s_lshl2_add_u32 s0, s0, 4 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: ds_write_b16 v1, v0 ; CHECK-NEXT: ds_write_b32 v2, v1 @@ -202,10 +199,9 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) { ; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[20:21] -; CHECK-NEXT: s_lshl_b32 s4, s17, 2 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: s_add_i32 s4, s4, 8 +; CHECK-NEXT: s_lshl2_add_u32 s4, s17, 8 ; CHECK-NEXT: v_mov_b32_e32 v2, 2 ; CHECK-NEXT: v_mov_b32_e32 v3, s4 ; CHECK-NEXT: ds_write_b16 v1, v0 @@ -229,8 +225,7 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_overalign(i32 %idx) { ; CHECK-NEXT: v_mov_b32_e32 v0, 2 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_lshl_b32 s0, s0, 2 -; CHECK-NEXT: s_add_i32 s0, s0, 8 +; CHECK-NEXT: s_lshl2_add_u32 s0, s0, 8 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: ds_write_b16 v1, v0 ; CHECK-NEXT: ds_write_b32 v2, v1 @@ -268,10 +263,9 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) { ; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[20:21] -; CHECK-NEXT: s_lshl_b32 s4, s17, 2 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: s_add_i32 s4, s4, 8 +; CHECK-NEXT: s_lshl2_add_u32 s4, s17, 8 ; CHECK-NEXT: v_mov_b32_e32 v2, 2 ; CHECK-NEXT: v_mov_b32_e32 v3, s4 ; CHECK-NEXT: ds_write_b16 v1, v0 @@ -295,8 +289,7 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_overalign(i32 %idx) ; CHECK-NEXT: v_mov_b32_e32 v0, 2 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; 
CHECK-NEXT: s_lshl_b32 s0, s0, 2 -; CHECK-NEXT: s_add_i32 s0, s0, 8 +; CHECK-NEXT: s_lshl2_add_u32 s0, s0, 8 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: ds_write_b16 v1, v0 ; CHECK-NEXT: ds_write_b32 v2, v1 @@ -334,10 +327,9 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx) ; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[20:21] -; CHECK-NEXT: s_lshl_b32 s4, s17, 2 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: s_add_i32 s4, s4, 8 +; CHECK-NEXT: s_lshl2_add_u32 s4, s17, 8 ; CHECK-NEXT: v_mov_b32_e32 v2, 2 ; CHECK-NEXT: v_mov_b32_e32 v3, s4 ; CHECK-NEXT: ds_write_b16 v1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/lds-relocs.ll b/llvm/test/CodeGen/AMDGPU/lds-relocs.ll index 3c55dcb486675..447cb62643384 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-relocs.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-relocs.ll @@ -6,8 +6,8 @@ ; ELF: Relocations [ ; ELF-NEXT: Section (3) .rel.text { -; ELF-NEXT: 0x{{[0-9a-f]*}} R_AMDGPU_ABS32_LO lds.external -; ELF-NEXT: 0x{{[0-9a-f]*}} R_AMDGPU_ABS32_LO lds.defined +; ELF-NEXT: 0x{{[0-9A-F]*}} R_AMDGPU_ABS32_LO lds.external +; ELF-NEXT: 0x{{[0-9A-F]*}} R_AMDGPU_ABS32_LO lds.defined ; ELF-NEXT: } ; ELF-NEXT: ] @@ -32,10 +32,10 @@ ; ELF-NEXT: } ; GCN-LABEL: {{^}}test_basic: -; GCN: v_mov_b32_e32 v1, lds.external@abs32@lo ; encoding: [0xff,0x02,0x02,0x7e,A,A,A,A] +; GCN: s_mov_b32 s0, lds.external@abs32@lo ; encoding: [0xff,0x00,0x80,0xbe,A,A,A,A] ; GCN-NEXT: ; fixup A - offset: 4, value: lds.external@abs32@lo, kind: FK_Data_4{{$}} ; -; GCN: s_add_i32 s0, s0, lds.defined@abs32@lo ; encoding: [0x00,0xff,0x00,0x81,A,A,A,A] +; GCN: s_lshl2_add_u32 s0, s2, lds.defined@abs32@lo ; encoding: [0x02,0xff,0x80,0x97,A,A,A,A] ; GCN-NEXT: ; fixup A - offset: 4, value: lds.defined@abs32@lo, kind: FK_Data_4{{$}} ; ; GCN: .globl lds.external diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-opt.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-opt.ll index 69439d49e588f..de82dcdecda48 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-opt.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-opt.ll @@ -102,10 +102,9 @@ define void @test_workgroup_id_x_non_kernel_optimized_fixed(ptr addrspace(1) %ou ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: s_lshl_b32 s0, ttmp9, 1 -; GFX1250-SDAG-NEXT: s_and_b32 s1, ttmp6, 15 +; GFX1250-SDAG-NEXT: s_and_b32 s0, ttmp6, 15 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: s_add_co_i32 s0, s1, s0 +; GFX1250-SDAG-NEXT: s_lshl1_add_u32 s0, ttmp9, s0 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-SDAG-NEXT: global_store_b32 v[0:1], v2, off ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll index 497241cff392d..6b6658bd672de 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll @@ -234,19 +234,18 @@ define amdgpu_cs void @workgroup_id_optimized() "amdgpu-cluster-dims"="2,3,4" { ; ; GFX1250-SDAG-LABEL: workgroup_id_optimized: ; GFX1250-SDAG: ; %bb.0: ; %.entry -; GFX1250-SDAG-NEXT: s_lshl_b32 s0, ttmp9, 1 -; GFX1250-SDAG-NEXT: s_and_b32 s1, ttmp6, 15 -; GFX1250-SDAG-NEXT: s_lshr_b32 s2, ttmp7, 14 -; 
GFX1250-SDAG-NEXT: s_add_co_i32 s1, s1, s0 -; GFX1250-SDAG-NEXT: s_and_b32 s0, s2, 0x3fffc +; GFX1250-SDAG-NEXT: s_lshr_b32 s1, ttmp7, 14 ; GFX1250-SDAG-NEXT: s_and_b32 s2, ttmp7, 0xffff +; GFX1250-SDAG-NEXT: s_and_b32 s0, ttmp6, 15 +; GFX1250-SDAG-NEXT: s_and_b32 s1, s1, 0x3fffc ; GFX1250-SDAG-NEXT: s_bfe_u32 s3, ttmp6, 0x40008 ; GFX1250-SDAG-NEXT: s_mul_i32 s2, s2, 3 ; GFX1250-SDAG-NEXT: s_bfe_u32 s4, ttmp6, 0x40004 -; GFX1250-SDAG-NEXT: s_add_co_i32 s3, s3, s0 +; GFX1250-SDAG-NEXT: s_lshl1_add_u32 s0, ttmp9, s0 +; GFX1250-SDAG-NEXT: s_add_co_i32 s3, s3, s1 ; GFX1250-SDAG-NEXT: s_add_co_i32 s4, s4, s2 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s4 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s4 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s3 ; GFX1250-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null ; GFX1250-SDAG-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll index 4b5a7c207055a..8dea9e87e140f 100644 --- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll +++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll @@ -1620,15 +1620,14 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 ; GFX10_1-NEXT: v_writelane_b32 v1, s55, 0 ; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32 -; GFX10_1-NEXT: s_lshl_b32 s4, s16, 2 -; GFX10_1-NEXT: s_lshr_b32 s55, s32, 5 -; GFX10_1-NEXT: s_add_i32 s55, s55, s4 +; GFX10_1-NEXT: s_lshr_b32 s4, s32, 5 +; GFX10_1-NEXT: s_addk_i32 s4, 0x4040 +; GFX10_1-NEXT: s_lshl2_add_u32 s55, s16, s4 ; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0 -; GFX10_1-NEXT: s_addk_i32 s55, 0x4040 +; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use alloca0 v0 ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use s55, scc ; GFX10_1-NEXT: ;;#ASMEND @@ -1650,15 +1649,14 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 ; GFX10_3-NEXT: v_writelane_b32 v1, s55, 0 ; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32 -; GFX10_3-NEXT: s_lshl_b32 s4, s16, 2 -; GFX10_3-NEXT: s_lshr_b32 s55, s32, 5 -; GFX10_3-NEXT: s_add_i32 s55, s55, s4 +; GFX10_3-NEXT: s_lshr_b32 s4, s32, 5 +; GFX10_3-NEXT: s_addk_i32 s4, 0x4040 +; GFX10_3-NEXT: s_lshl2_add_u32 s55, s16, s4 ; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0 -; GFX10_3-NEXT: s_addk_i32 s55, 0x4040 +; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use alloca0 v0 ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use s55, scc ; GFX10_3-NEXT: ;;#ASMEND @@ -1677,15 +1675,15 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX11-NEXT: s_add_i32 s2, s32, 0x8040 ; GFX11-NEXT: scratch_store_b32 off, v1, s2 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s1, s32, 64 ; GFX11-NEXT: v_writelane_b32 v1, s55, 0 -; GFX11-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-NEXT: s_add_i32 s1, s32, 64 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-NEXT: s_add_i32 s55, s32, s0 +; GFX11-NEXT: s_add_i32 s1, s32, 0x4040 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use alloca0 v0 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: 
s_addk_i32 s55, 0x4040 +; GFX11-NEXT: s_lshl2_add_u32 s55, s0, s1 ; GFX11-NEXT: s_and_b32 s0, 0, exec_lo ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s55, scc @@ -1710,16 +1708,14 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_writelane_b32 v1, s55, 0 -; GFX12-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-NEXT: s_add_co_i32 s1, s32, 0x4000 ; GFX12-NEXT: v_mov_b32_e32 v0, s32 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_add_co_i32 s55, s32, s0 +; GFX12-NEXT: s_lshl2_add_u32 s55, s0, s1 +; GFX12-NEXT: s_and_b32 s0, 0, exec_lo ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use alloca0 v0 ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_addk_co_i32 s55, 0x4000 -; GFX12-NEXT: s_and_b32 s0, 0, exec_lo ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s55, scc ; GFX12-NEXT: ;;#ASMEND @@ -1767,11 +1763,10 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX900-NEXT: s_add_i32 s6, s32, 0x201000 ; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill ; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_lshr_b32 s4, s32, 6 +; GFX900-NEXT: s_addk_i32 s4, 0x4040 ; GFX900-NEXT: v_writelane_b32 v1, s55, 0 -; GFX900-NEXT: s_lshl_b32 s4, s16, 2 -; GFX900-NEXT: s_lshr_b32 s55, s32, 6 -; GFX900-NEXT: s_add_i32 s55, s55, s4 -; GFX900-NEXT: s_addk_i32 s55, 0x4040 +; GFX900-NEXT: s_lshl2_add_u32 s55, s16, s4 ; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32 ; GFX900-NEXT: v_add_u32_e32 v0, 64, v0 ; GFX900-NEXT: ;;#ASMSTART @@ -1796,10 +1791,9 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX942-NEXT: s_add_i32 s1, s32, 0x8040 ; GFX942-NEXT: scratch_store_dword off, v1, s1 ; 4-byte Folded Spill ; GFX942-NEXT: s_mov_b64 exec, s[2:3] -; GFX942-NEXT: s_lshl_b32 s0, s0, 2 +; GFX942-NEXT: s_add_i32 s1, s32, 0x4040 ; GFX942-NEXT: v_writelane_b32 v1, s55, 0 -; GFX942-NEXT: s_add_i32 s55, s32, s0 -; GFX942-NEXT: s_addk_i32 s55, 0x4040 +; GFX942-NEXT: s_lshl2_add_u32 s55, s0, s1 ; GFX942-NEXT: s_add_i32 s0, s32, 64 ; GFX942-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll index c6f7ce51f5ea2..9888204b997a9 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll @@ -260,12 +260,11 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX10-WGP-LABEL: local_nontemporal_load_1: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_mov_b32 s6, 2 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, s6, s7 +; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX10-WGP-NEXT: ds_read_b32 v1, v1 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -274,12 +273,11 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX10-CU-LABEL: local_nontemporal_load_1: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: 
v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_mov_b32 s6, 2 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, s6, s7 +; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX10-CU-NEXT: ds_read_b32 v1, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -311,15 +309,13 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_load_1: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 0x3ff -; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s6 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 2 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s7, 0x3ff +; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s7 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 -; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s6, v2 +; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -328,15 +324,13 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX90A-TGSPLIT-LABEL: local_nontemporal_load_1: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 0x3ff -; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s6 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 2 +; GFX90A-TGSPLIT-NEXT: s_mov_b32 s7, 0x3ff +; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s7 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 -; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s6, v2 +; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v1 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -345,15 +339,13 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX942-NOTTGSPLIT-LABEL: local_nontemporal_load_1: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 0x3ff -; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 2 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 0x3ff +; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 -; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s2, v2 +; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -362,15 +354,13 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX942-TGSPLIT-LABEL: local_nontemporal_load_1: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 
v1, v0 -; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 0x3ff -; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 2 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 0x3ff +; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 -; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s2, v2 +; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v1 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -379,14 +369,13 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX11-WGP-LABEL: local_nontemporal_load_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff -; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX11-WGP-NEXT: s_mov_b32 s2, 2 +; GFX11-WGP-NEXT: s_mov_b32 s3, 0x3ff +; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-WGP-NEXT: ds_load_b32 v1, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -395,14 +384,13 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX11-CU-LABEL: local_nontemporal_load_1: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX11-CU-NEXT: s_mov_b32 s2, 2 +; GFX11-CU-NEXT: s_mov_b32 s3, 0x3ff +; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-CU-NEXT: ds_load_b32 v1, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -411,15 +399,13 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX12-WGP-LABEL: local_nontemporal_load_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX12-WGP-NEXT: s_mov_b32 s2, 2 +; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff +; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX12-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX12-WGP-NEXT: ds_load_b32 v1, v1 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -428,15 +414,13 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX12-CU-LABEL: local_nontemporal_load_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, 
s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX12-CU-NEXT: s_mov_b32 s2, 2 +; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff +; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX12-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX12-CU-NEXT: ds_load_b32 v1, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -445,14 +429,13 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX1250-LABEL: local_nontemporal_load_1: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: v_mov_b32_e32 v1, v0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_mov_b32 s2, 0x3ff -; GFX1250-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 2 +; GFX1250-NEXT: s_mov_b32 s3, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX1250-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX1250-NEXT: ds_load_b32 v1, v1 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -679,12 +662,11 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX10-WGP-LABEL: local_nontemporal_store_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_mov_b32 s5, 2 -; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, s5, s6 +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s5 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -692,12 +674,11 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX10-CU-LABEL: local_nontemporal_store_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-CU-NEXT: s_mov_b32 s5, 2 -; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, s5, s6 +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s5 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -720,15 +701,13 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_store_1: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s5, 0x3ff -; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s5, 2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; 
GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, s5, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 0x3ff +; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s5 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -736,15 +715,13 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX90A-TGSPLIT-LABEL: local_nontemporal_store_1: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s5, 0x3ff -; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s5 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s5, 2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, s5, v1 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 0x3ff +; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s6 +; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -752,15 +729,13 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX942-NOTTGSPLIT-LABEL: local_nontemporal_store_1: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 0x3ff -; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 2 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, s1, v1 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 0x3ff +; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -768,15 +743,13 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX942-TGSPLIT-LABEL: local_nontemporal_store_1: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 0x3ff -; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 2 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, s1, v1 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 0x3ff +; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: 
v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -784,14 +757,13 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX11-WGP-LABEL: local_nontemporal_store_1: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_mov_b32 s1, 0x3ff -; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX11-WGP-NEXT: s_mov_b32 s1, 2 -; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff +; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -799,14 +771,13 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX11-CU-LABEL: local_nontemporal_store_1: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-CU-NEXT: s_mov_b32 s1, 0x3ff -; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX11-CU-NEXT: s_mov_b32 s1, 2 -; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -814,15 +785,13 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX12-WGP-LABEL: local_nontemporal_store_1: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_mov_b32 s1, 0x3ff -; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX12-WGP-NEXT: s_mov_b32 s1, 2 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -830,15 +799,13 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX12-CU-LABEL: local_nontemporal_store_1: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX12-CU-NEXT: s_mov_b32 s1, 0x3ff -; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX12-CU-NEXT: s_mov_b32 s1, 2 -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; 
GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -846,15 +813,14 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX1250-LABEL: local_nontemporal_store_1: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s1, 0x3ff -; GFX1250-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX1250-NEXT: s_mov_b32 s1, 2 -; GFX1250-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX1250-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_store_b32 v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll index d686e7a2d5b4c..33c516c61e42c 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -208,12 +208,11 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX10-WGP-LABEL: local_volatile_load_1: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_mov_b32 s6, 2 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, s6, s7 +; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX10-WGP-NEXT: ds_read_b32 v1, v1 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -222,12 +221,11 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX10-CU-LABEL: local_volatile_load_1: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_mov_b32 s6, 2 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, s6, s7 +; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX10-CU-NEXT: ds_read_b32 v1, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -259,14 +257,13 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX11-WGP-LABEL: local_volatile_load_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff -; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX11-WGP-NEXT: s_mov_b32 s2, 2 +; GFX11-WGP-NEXT: s_mov_b32 s3, 0x3ff +; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-WGP-NEXT: ds_load_b32 v1, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -275,14 +272,13 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX11-CU-LABEL: 
local_volatile_load_1: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX11-CU-NEXT: s_mov_b32 s2, 2 +; GFX11-CU-NEXT: s_mov_b32 s3, 0x3ff +; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-CU-NEXT: ds_load_b32 v1, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -291,15 +287,13 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX12-WGP-LABEL: local_volatile_load_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX12-WGP-NEXT: s_mov_b32 s2, 2 +; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff +; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX12-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX12-WGP-NEXT: ds_load_b32 v1, v1 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -308,15 +302,13 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX12-CU-LABEL: local_volatile_load_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX12-CU-NEXT: s_mov_b32 s2, 2 +; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff +; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX12-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX12-CU-NEXT: ds_load_b32 v1, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -325,14 +317,13 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX1250-LABEL: local_volatile_load_1: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: v_mov_b32_e32 v1, v0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_mov_b32 s2, 0x3ff -; GFX1250-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 2 +; GFX1250-NEXT: s_mov_b32 s3, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX1250-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX1250-NEXT: ds_load_b32 v1, v1 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -511,12 +502,11 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX10-WGP-LABEL: local_volatile_store_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 ; 
GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_mov_b32 s5, 2 -; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, s5, s6 +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s5 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -524,12 +514,11 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX10-CU-LABEL: local_volatile_store_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-CU-NEXT: s_mov_b32 s5, 2 -; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, s5, s6 +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s5 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -552,14 +541,13 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX11-WGP-LABEL: local_volatile_store_1: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_mov_b32 s1, 0x3ff -; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX11-WGP-NEXT: s_mov_b32 s1, 2 -; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff +; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -567,14 +555,13 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX11-CU-LABEL: local_volatile_store_1: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-CU-NEXT: s_mov_b32 s1, 0x3ff -; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX11-CU-NEXT: s_mov_b32 s1, 2 -; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -582,15 +569,13 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX12-WGP-LABEL: local_volatile_store_1: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_mov_b32 s1, 0x3ff -; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX12-WGP-NEXT: s_mov_b32 s1, 2 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX12-WGP-NEXT: 
s_mov_b32 s2, 0x3ff +; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -598,15 +583,13 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX12-CU-LABEL: local_volatile_store_1: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX12-CU-NEXT: s_mov_b32 s1, 0x3ff -; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX12-CU-NEXT: s_mov_b32 s1, 2 -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -614,15 +597,14 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX1250-LABEL: local_volatile_store_1: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s1, 0x3ff -; GFX1250-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX1250-NEXT: s_mov_b32 s1, 2 -; GFX1250-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX1250-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_store_b32 v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll index 89de17ecbd1e8..6c19722ad6e33 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll @@ -270,12 +270,11 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_mov_b32 s6, 2 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, s6, s7 +; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX10-WGP-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen slc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -286,12 +285,11 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_mov_b32 s6, 2 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, s6, s7 +; GFX10-CU-NEXT: v_lshl_add_u32 
v1, v1, 2, s6 ; GFX10-CU-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen slc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -330,15 +328,13 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 0x3ff -; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s6 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 2 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s7, 0x3ff +; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s7 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 -; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s6, v2 +; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc slc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -349,15 +345,13 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 0x3ff -; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s6 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 2 +; GFX90A-TGSPLIT-NEXT: s_mov_b32 s7, 0x3ff +; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s7 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 -; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s6, v2 +; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc slc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -366,15 +360,13 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX942-NOTTGSPLIT-LABEL: private_nontemporal_load_1: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 0x3ff -; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 2 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 0x3ff +; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 -; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s2, v2 +; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v1, v1, off nt ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -383,15 +375,13 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX942-TGSPLIT-LABEL: private_nontemporal_load_1: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 
0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 0x3ff -; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 2 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 0x3ff +; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 -; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s2, v2 +; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v1, v1, off nt ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -400,14 +390,13 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX11-WGP-LABEL: private_nontemporal_load_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff -; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX11-WGP-NEXT: s_mov_b32 s2, 2 +; GFX11-WGP-NEXT: s_mov_b32 s3, 0x3ff +; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-WGP-NEXT: scratch_load_b32 v1, v1, off slc dlc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -416,14 +405,13 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX11-CU-LABEL: private_nontemporal_load_1: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX11-CU-NEXT: s_mov_b32 s2, 2 +; GFX11-CU-NEXT: s_mov_b32 s3, 0x3ff +; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-CU-NEXT: scratch_load_b32 v1, v1, off slc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -708,12 +696,11 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_mov_b32 s5, 2 -; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v0, s5, s6 +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v0, 2, s5 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen glc slc @@ -723,12 +710,11 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], 
s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-CU-NEXT: s_mov_b32 s5, 2 -; GFX10-CU-NEXT: v_lshl_add_u32 v1, v0, s5, s6 +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-CU-NEXT: v_lshl_add_u32 v1, v0, 2, s5 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen glc slc @@ -758,15 +744,13 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s5, 0x3ff -; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s5, 2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v0, s5, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 0x3ff +; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v0, 2, s5 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen glc slc @@ -776,15 +760,13 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s5, 0x3ff -; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s5 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s5, 2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v1, v0, s5, v1 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 0x3ff +; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s6 +; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v1, v0, 2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen glc slc @@ -792,15 +774,13 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX942-NOTTGSPLIT-LABEL: private_nontemporal_store_1: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 0x3ff -; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 2 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v0, s1, v1 +; GFX942-NOTTGSPLIT-NEXT: 
s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 0x3ff +; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v0, 2, s1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword v1, v0, off nt @@ -808,15 +788,13 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX942-TGSPLIT-LABEL: private_nontemporal_store_1: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 0x3ff -; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 2 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v1, v0, s1, v1 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 0x3ff +; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v1, v0, 2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: scratch_store_dword v1, v0, off nt @@ -824,14 +802,13 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX11-WGP-LABEL: private_nontemporal_store_1: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_mov_b32 s1, 0x3ff -; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX11-WGP-NEXT: s_mov_b32 s1, 2 -; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v0, s1, s2 +; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff +; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v0, 2, s1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: scratch_store_b32 v1, v0, off glc slc dlc @@ -839,14 +816,13 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX11-CU-LABEL: private_nontemporal_store_1: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-CU-NEXT: s_mov_b32 s1, 0x3ff -; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX11-CU-NEXT: s_mov_b32 s1, 2 -; GFX11-CU-NEXT: v_lshl_add_u32 v1, v0, s1, s2 +; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX11-CU-NEXT: v_lshl_add_u32 v1, v0, 2, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: scratch_store_b32 v1, v0, off glc slc dlc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll index 7faa0621aa6d0..7c23b76cec3e9 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll @@ -228,12 
+228,11 @@ define amdgpu_kernel void @private_volatile_load_1( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_mov_b32 s6, 2 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, s6, s7 +; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX10-WGP-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc dlc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -244,12 +243,11 @@ define amdgpu_kernel void @private_volatile_load_1( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_mov_b32 s6, 2 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, s6, s7 +; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX10-CU-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc dlc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -286,14 +284,13 @@ define amdgpu_kernel void @private_volatile_load_1( ; GFX11-WGP-LABEL: private_volatile_load_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff -; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX11-WGP-NEXT: s_mov_b32 s2, 2 +; GFX11-WGP-NEXT: s_mov_b32 s3, 0x3ff +; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-WGP-NEXT: scratch_load_b32 v1, v1, off glc dlc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -302,14 +299,13 @@ define amdgpu_kernel void @private_volatile_load_1( ; GFX11-CU-LABEL: private_volatile_load_1: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX11-CU-NEXT: s_mov_b32 s2, 2 +; GFX11-CU-NEXT: s_mov_b32 s3, 0x3ff +; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-CU-NEXT: scratch_load_b32 v1, v1, off glc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -578,12 +574,11 @@ define amdgpu_kernel void @private_volatile_store_1( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x0 -; 
GFX10-WGP-NEXT: s_mov_b32 s5, 2 -; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v0, s5, s6 +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v0, 2, s5 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -594,12 +589,11 @@ define amdgpu_kernel void @private_volatile_store_1( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-CU-NEXT: s_mov_b32 s5, 2 -; GFX10-CU-NEXT: v_lshl_add_u32 v1, v0, s5, s6 +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-CU-NEXT: v_lshl_add_u32 v1, v0, 2, s5 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -629,14 +623,13 @@ define amdgpu_kernel void @private_volatile_store_1( ; ; GFX11-WGP-LABEL: private_volatile_store_1: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_mov_b32 s1, 0x3ff -; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX11-WGP-NEXT: s_mov_b32 s1, 2 -; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v0, s1, s2 +; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff +; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v0, 2, s1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: scratch_store_b32 v1, v0, off dlc @@ -645,14 +638,13 @@ define amdgpu_kernel void @private_volatile_store_1( ; ; GFX11-CU-LABEL: private_volatile_store_1: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-CU-NEXT: s_mov_b32 s1, 0x3ff -; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX11-CU-NEXT: s_mov_b32 s1, 2 -; GFX11-CU-NEXT: v_lshl_add_u32 v1, v0, s1, s2 +; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX11-CU-NEXT: v_lshl_add_u32 v1, v0, 2, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: scratch_store_b32 v1, v0, off dlc diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-lshl_add.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-lshl_add.ll new file mode 100644 index 0000000000000..b7e6ed26876c4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-lshl_add.ll @@ -0,0 +1,127 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s + +define amdgpu_kernel void @lshl1_add(ptr addrspace(1) %in, ptr addrspace(7) %in2) { +; CHECK-LABEL: lshl1_add: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 +; CHECK-NEXT: s_load_b128 s[0:3], 
s[4:5], 0x44 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_load_b32 s5, s[4:5], 0x54 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_wait_kmcnt 0x0 +; CHECK-NEXT: global_load_b32 v1, v0, s[6:7] scope:SCOPE_SYS +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: s_mov_b32 s6, s3 +; CHECK-NEXT: s_mov_b32 s7, s4 +; CHECK-NEXT: s_mov_b32 s3, s4 +; CHECK-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] +; CHECK-NEXT: s_mov_b32 s5, s2 +; CHECK-NEXT: s_mov_b32 s2, s1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_or_b64 s[4:5], s[2:3], s[4:5] +; CHECK-NEXT: v_lshl_add_u32 v1, v1, 1, s0 +; CHECK-NEXT: buffer_store_b16 v0, v1, s[4:7], null offen +; CHECK-NEXT: s_endpgm + %vaddr = load volatile i32, ptr addrspace(1) %in, align 4 + %1 = sext i32 %vaddr to i64 + %gep = getelementptr i16, ptr addrspace(7) %in2, i64 %1 + store i16 0, ptr addrspace(7) %gep, align 2 + ret void +} + +define amdgpu_kernel void @lshl2_add(ptr addrspace(1) %in, ptr addrspace(7) %in2) { +; CHECK-LABEL: lshl2_add: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x44 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_load_b32 s5, s[4:5], 0x54 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_wait_kmcnt 0x0 +; CHECK-NEXT: global_load_b32 v1, v0, s[6:7] scope:SCOPE_SYS +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: s_mov_b32 s6, s3 +; CHECK-NEXT: s_mov_b32 s7, s4 +; CHECK-NEXT: s_mov_b32 s3, s4 +; CHECK-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] +; CHECK-NEXT: s_mov_b32 s5, s2 +; CHECK-NEXT: s_mov_b32 s2, s1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_or_b64 s[4:5], s[2:3], s[4:5] +; CHECK-NEXT: v_lshl_add_u32 v1, v1, 2, s0 +; CHECK-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen +; CHECK-NEXT: s_endpgm + %vaddr = load volatile i32, ptr addrspace(1) %in, align 4 + %1 = sext i32 %vaddr to i64 + %gep = getelementptr i32, ptr addrspace(7) %in2, i64 %1 + store i32 0, ptr addrspace(7) %gep, align 4 + ret void +} + +define amdgpu_kernel void @lshl3_add(ptr addrspace(1) %in, ptr addrspace(7) %in2) { +; CHECK-LABEL: lshl3_add: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x44 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_load_b32 s5, s[4:5], 0x54 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_wait_kmcnt 0x0 +; CHECK-NEXT: global_load_b32 v2, v0, s[6:7] scope:SCOPE_SYS +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: s_mov_b32 s6, s3 +; CHECK-NEXT: s_mov_b32 s7, s4 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] +; CHECK-NEXT: s_mov_b32 s5, s2 +; CHECK-NEXT: s_mov_b32 s2, s1 +; CHECK-NEXT: s_mov_b32 s3, s4 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_or_b64 s[4:5], s[2:3], s[4:5] +; CHECK-NEXT: v_lshl_add_u32 v2, v2, 3, s0 +; CHECK-NEXT: buffer_store_b64 v[0:1], v2, s[4:7], null offen +; CHECK-NEXT: s_endpgm + %vaddr = load volatile i32, ptr addrspace(1) %in, align 4 + %1 = sext i32 %vaddr to i64 + %gep = getelementptr i64, ptr addrspace(7) %in2, i64 %1 + store i64 0, ptr addrspace(7) %gep, align 8 + ret void +} + +define amdgpu_kernel void @lshl4_add(ptr addrspace(1) %in, ptr addrspace(7) %in2) { +; CHECK-LABEL: lshl4_add: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x44 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_load_b32 s5, s[4:5], 
0x54 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; CHECK-NEXT: s_mov_b32 s9, s4 +; CHECK-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0 +; CHECK-NEXT: s_wait_kmcnt 0x0 +; CHECK-NEXT: global_load_b32 v3, v0, s[6:7] scope:SCOPE_SYS +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: s_mov_b32 s7, s4 +; CHECK-NEXT: s_mov_b32 s6, s3 +; CHECK-NEXT: s_mov_b32 s8, s1 +; CHECK-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] +; CHECK-NEXT: s_mov_b32 s5, s2 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; CHECK-NEXT: v_lshl_add_u32 v4, v3, 4, s0 +; CHECK-NEXT: v_mov_b32_e32 v3, v0 +; CHECK-NEXT: buffer_store_b128 v[0:3], v4, s[4:7], null offen +; CHECK-NEXT: s_endpgm + %vaddr = load volatile i32, ptr addrspace(1) %in, align 4 + %1 = sext i32 %vaddr to i64 + %gep = getelementptr i128, ptr addrspace(7) %in2, i64 %1 + store i128 0, ptr addrspace(7) %gep, align 16 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll b/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll index 6e2d0f6503a20..7e2bfa666a19f 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll @@ -144,7 +144,7 @@ define amdgpu_kernel void @store_private_offset_i8_max_offset_plus2() #0 { ; SICIVI: buffer_store_dword v{{[0-9]+}}, [[ADDR1]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}} ; GFX9: global_load_dword [[VADDR:v[0-9]+]], -; GFX9: v_lshlrev_b32_e32 [[ADDR:v[0-9]+]], 2, [[VADDR]] +; GFX9: v_lshl_add_u32 [[ADDR:v[0-9]+]], [[VADDR]], 2, s{{[0-9]+}} ; GFX9-NOT [[ADDR]] ; GFX9: buffer_store_dword v{{[0-9]+}}, [[ADDR]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen offset:32 define amdgpu_kernel void @store_private_unknown_bits_vaddr() #0 { diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll index 627f4ada95dba..c1f52173c7451 100644 --- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -33,11 +33,10 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; MUBUF-NEXT: s_mov_b32 s6, s32 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0 ; MUBUF-NEXT: v_mov_b32_e32 v2, 1 -; MUBUF-NEXT: s_lshl_b32 s7, s10, 2 ; MUBUF-NEXT: s_add_i32 s32, s6, 0x1000 ; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6 offset:4 -; MUBUF-NEXT: s_add_i32 s6, s6, s7 +; MUBUF-NEXT: s_lshl2_add_u32 s6, s10, s6 ; MUBUF-NEXT: v_mov_b32_e32 v2, s6 ; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 @@ -68,10 +67,9 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; FLATSCR-NEXT: s_mov_b32 s2, s32 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 1 -; FLATSCR-NEXT: s_lshl_b32 s3, s6, 2 ; FLATSCR-NEXT: s_add_i32 s32, s2, 0x1000 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s2 -; FLATSCR-NEXT: s_add_i32 s2, s2, s3 +; FLATSCR-NEXT: s_lshl2_add_u32 s2, s6, s2 ; FLATSCR-NEXT: scratch_load_dword v2, off, s2 ; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -132,12 +130,11 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; MUBUF-NEXT: ; %bb.1: ; %bb.0 ; MUBUF-NEXT: s_add_i32 s4, s32, 0xfff ; MUBUF-NEXT: s_and_b32 s4, s4, 0xfffff000 -; MUBUF-NEXT: s_lshl_b32 s5, s5, 2 ; MUBUF-NEXT: s_add_i32 s32, s4, 0x1000 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0 ; MUBUF-NEXT: 
v_mov_b32_e32 v2, s4 ; MUBUF-NEXT: v_mov_b32_e32 v3, 1 -; MUBUF-NEXT: s_add_i32 s4, s4, s5 +; MUBUF-NEXT: s_lshl2_add_u32 s4, s5, s4 ; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; MUBUF-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 ; MUBUF-NEXT: v_mov_b32_e32 v2, s4 @@ -168,10 +165,9 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 ; FLATSCR-NEXT: s_and_b32 s0, s0, 0xfffff000 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 1 -; FLATSCR-NEXT: s_lshl_b32 s1, s1, 2 ; FLATSCR-NEXT: s_add_i32 s32, s0, 0x1000 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0 -; FLATSCR-NEXT: s_add_i32 s0, s0, s1 +; FLATSCR-NEXT: s_lshl2_add_u32 s0, s1, s0 ; FLATSCR-NEXT: scratch_load_dword v2, off, s0 ; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/shlN_add.ll b/llvm/test/CodeGen/AMDGPU/shlN_add.ll index 3e507a0c5889f..ba8ae9554d0e8 100644 --- a/llvm/test/CodeGen/AMDGPU/shlN_add.ll +++ b/llvm/test/CodeGen/AMDGPU/shlN_add.ll @@ -14,8 +14,7 @@ define amdgpu_ps i32 @s_shl1_add_u32(i32 inreg %src0, i32 inreg %src1) { ; GFX9-SDAG-LABEL: s_shl1_add_u32: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s1 +; GFX9-SDAG-NEXT: s_lshl1_add_u32 s0, s0, s1 ; GFX9-SDAG-NEXT: ; return to shader part epilog ; ; GFX8-SDAG-LABEL: s_shl1_add_u32: @@ -26,8 +25,7 @@ define amdgpu_ps i32 @s_shl1_add_u32(i32 inreg %src0, i32 inreg %src1) { ; ; GFX10-SDAG-LABEL: s_shl1_add_u32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s1 +; GFX10-SDAG-NEXT: s_lshl1_add_u32 s0, s0, s1 ; GFX10-SDAG-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_shl1_add_u32: @@ -53,8 +51,7 @@ define amdgpu_ps i32 @s_shl1_add_u32(i32 inreg %src0, i32 inreg %src1) { define amdgpu_ps i32 @s_shl2_add_u32(i32 inreg %src0, i32 inreg %src1) { ; GFX9-SDAG-LABEL: s_shl2_add_u32: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s1 +; GFX9-SDAG-NEXT: s_lshl2_add_u32 s0, s0, s1 ; GFX9-SDAG-NEXT: ; return to shader part epilog ; ; GFX8-SDAG-LABEL: s_shl2_add_u32: @@ -65,8 +62,7 @@ define amdgpu_ps i32 @s_shl2_add_u32(i32 inreg %src0, i32 inreg %src1) { ; ; GFX10-SDAG-LABEL: s_shl2_add_u32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s1 +; GFX10-SDAG-NEXT: s_lshl2_add_u32 s0, s0, s1 ; GFX10-SDAG-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_shl2_add_u32: @@ -92,8 +88,7 @@ define amdgpu_ps i32 @s_shl2_add_u32(i32 inreg %src0, i32 inreg %src1) { define amdgpu_ps i32 @s_shl3_add_u32(i32 inreg %src0, i32 inreg %src1) { ; GFX9-SDAG-LABEL: s_shl3_add_u32: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 3 -; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s1 +; GFX9-SDAG-NEXT: s_lshl3_add_u32 s0, s0, s1 ; GFX9-SDAG-NEXT: ; return to shader part epilog ; ; GFX8-SDAG-LABEL: s_shl3_add_u32: @@ -104,8 +99,7 @@ define amdgpu_ps i32 @s_shl3_add_u32(i32 inreg %src0, i32 inreg %src1) { ; ; GFX10-SDAG-LABEL: s_shl3_add_u32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 3 -; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s1 +; GFX10-SDAG-NEXT: s_lshl3_add_u32 s0, s0, s1 ; GFX10-SDAG-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_shl3_add_u32: @@ -131,8 +125,7 @@ define amdgpu_ps i32 @s_shl3_add_u32(i32 inreg %src0, i32 inreg %src1) { define amdgpu_ps i32 @s_shl4_add_u32(i32 inreg %src0, i32 inreg 
%src1) { ; GFX9-SDAG-LABEL: s_shl4_add_u32: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 4 -; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s1 +; GFX9-SDAG-NEXT: s_lshl4_add_u32 s0, s0, s1 ; GFX9-SDAG-NEXT: ; return to shader part epilog ; ; GFX8-SDAG-LABEL: s_shl4_add_u32: @@ -143,8 +136,7 @@ define amdgpu_ps i32 @s_shl4_add_u32(i32 inreg %src0, i32 inreg %src1) { ; ; GFX10-SDAG-LABEL: s_shl4_add_u32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 4 -; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s1 +; GFX10-SDAG-NEXT: s_lshl4_add_u32 s0, s0, s1 ; GFX10-SDAG-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_shl4_add_u32: @@ -598,10 +590,8 @@ define amdgpu_ps float @shl5_add_u32_vgpr1(i32 inreg %src0, i32 %src1) { define amdgpu_ps <2 x i32> @s_shl1_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) { ; GFX9-SDAG-LABEL: s_shl1_add_u32_v2: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX9-SDAG-NEXT: s_lshl_b32 s1, s1, 1 -; GFX9-SDAG-NEXT: s_add_i32 s1, s1, s3 -; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s2 +; GFX9-SDAG-NEXT: s_lshl1_add_u32 s1, s1, s3 +; GFX9-SDAG-NEXT: s_lshl1_add_u32 s0, s0, s2 ; GFX9-SDAG-NEXT: ; return to shader part epilog ; ; GFX8-SDAG-LABEL: s_shl1_add_u32_v2: @@ -614,10 +604,8 @@ define amdgpu_ps <2 x i32> @s_shl1_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i ; ; GFX10-SDAG-LABEL: s_shl1_add_u32_v2: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX10-SDAG-NEXT: s_lshl_b32 s1, s1, 1 -; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s2 -; GFX10-SDAG-NEXT: s_add_i32 s1, s1, s3 +; GFX10-SDAG-NEXT: s_lshl1_add_u32 s0, s0, s2 +; GFX10-SDAG-NEXT: s_lshl1_add_u32 s1, s1, s3 ; GFX10-SDAG-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_shl1_add_u32_v2: @@ -647,10 +635,8 @@ define amdgpu_ps <2 x i32> @s_shl1_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i define amdgpu_ps <2 x i32> @s_shl2_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) { ; GFX9-SDAG-LABEL: s_shl2_add_u32_v2: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX9-SDAG-NEXT: s_lshl_b32 s1, s1, 2 -; GFX9-SDAG-NEXT: s_add_i32 s1, s1, s3 -; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s2 +; GFX9-SDAG-NEXT: s_lshl2_add_u32 s1, s1, s3 +; GFX9-SDAG-NEXT: s_lshl2_add_u32 s0, s0, s2 ; GFX9-SDAG-NEXT: ; return to shader part epilog ; ; GFX8-SDAG-LABEL: s_shl2_add_u32_v2: @@ -663,10 +649,8 @@ define amdgpu_ps <2 x i32> @s_shl2_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i ; ; GFX10-SDAG-LABEL: s_shl2_add_u32_v2: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX10-SDAG-NEXT: s_lshl_b32 s1, s1, 2 -; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s2 -; GFX10-SDAG-NEXT: s_add_i32 s1, s1, s3 +; GFX10-SDAG-NEXT: s_lshl2_add_u32 s0, s0, s2 +; GFX10-SDAG-NEXT: s_lshl2_add_u32 s1, s1, s3 ; GFX10-SDAG-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_shl2_add_u32_v2: @@ -696,10 +680,8 @@ define amdgpu_ps <2 x i32> @s_shl2_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i define amdgpu_ps <2 x i32> @s_shl3_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) { ; GFX9-SDAG-LABEL: s_shl3_add_u32_v2: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 3 -; GFX9-SDAG-NEXT: s_lshl_b32 s1, s1, 3 -; GFX9-SDAG-NEXT: s_add_i32 s1, s1, s3 -; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s2 +; GFX9-SDAG-NEXT: s_lshl3_add_u32 s1, s1, s3 +; GFX9-SDAG-NEXT: s_lshl3_add_u32 s0, s0, s2 ; GFX9-SDAG-NEXT: ; return to shader part epilog ; ; GFX8-SDAG-LABEL: s_shl3_add_u32_v2: @@ -712,10 +694,8 @@ define amdgpu_ps <2 x i32> @s_shl3_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i ; ; 
GFX10-SDAG-LABEL: s_shl3_add_u32_v2: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 3 -; GFX10-SDAG-NEXT: s_lshl_b32 s1, s1, 3 -; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s2 -; GFX10-SDAG-NEXT: s_add_i32 s1, s1, s3 +; GFX10-SDAG-NEXT: s_lshl3_add_u32 s0, s0, s2 +; GFX10-SDAG-NEXT: s_lshl3_add_u32 s1, s1, s3 ; GFX10-SDAG-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_shl3_add_u32_v2: @@ -745,10 +725,8 @@ define amdgpu_ps <2 x i32> @s_shl3_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i define amdgpu_ps <2 x i32> @s_shl4_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) { ; GFX9-SDAG-LABEL: s_shl4_add_u32_v2: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 4 -; GFX9-SDAG-NEXT: s_lshl_b32 s1, s1, 4 -; GFX9-SDAG-NEXT: s_add_i32 s1, s1, s3 -; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s2 +; GFX9-SDAG-NEXT: s_lshl4_add_u32 s1, s1, s3 +; GFX9-SDAG-NEXT: s_lshl4_add_u32 s0, s0, s2 ; GFX9-SDAG-NEXT: ; return to shader part epilog ; ; GFX8-SDAG-LABEL: s_shl4_add_u32_v2: @@ -761,10 +739,8 @@ define amdgpu_ps <2 x i32> @s_shl4_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i ; ; GFX10-SDAG-LABEL: s_shl4_add_u32_v2: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 4 -; GFX10-SDAG-NEXT: s_lshl_b32 s1, s1, 4 -; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s2 -; GFX10-SDAG-NEXT: s_add_i32 s1, s1, s3 +; GFX10-SDAG-NEXT: s_lshl4_add_u32 s0, s0, s2 +; GFX10-SDAG-NEXT: s_lshl4_add_u32 s1, s1, s3 ; GFX10-SDAG-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_shl4_add_u32_v2: @@ -794,10 +770,8 @@ define amdgpu_ps <2 x i32> @s_shl4_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i define amdgpu_ps <2 x i32> @s_shl_2_4_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) { ; GFX9-SDAG-LABEL: s_shl_2_4_add_u32_v2: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX9-SDAG-NEXT: s_lshl_b32 s1, s1, 4 -; GFX9-SDAG-NEXT: s_add_i32 s1, s1, s3 -; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s2 +; GFX9-SDAG-NEXT: s_lshl4_add_u32 s1, s1, s3 +; GFX9-SDAG-NEXT: s_lshl2_add_u32 s0, s0, s2 ; GFX9-SDAG-NEXT: ; return to shader part epilog ; ; GFX8-SDAG-LABEL: s_shl_2_4_add_u32_v2: @@ -810,10 +784,8 @@ define amdgpu_ps <2 x i32> @s_shl_2_4_add_u32_v2(<2 x i32> inreg %src0, <2 x i32 ; ; GFX10-SDAG-LABEL: s_shl_2_4_add_u32_v2: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX10-SDAG-NEXT: s_lshl_b32 s1, s1, 4 -; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s2 -; GFX10-SDAG-NEXT: s_add_i32 s1, s1, s3 +; GFX10-SDAG-NEXT: s_lshl2_add_u32 s0, s0, s2 +; GFX10-SDAG-NEXT: s_lshl4_add_u32 s1, s1, s3 ; GFX10-SDAG-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_shl_2_4_add_u32_v2: diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll index 5aafb0f576fb4..90304b2c730cb 100644 --- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll +++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll @@ -69,6 +69,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_ADD_U32_3:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 undef %169:sreg_32, 31, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_4:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], undef %169:sreg_32, implicit-def $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 64, 0 :: (invariant load (s128) from %ir.99, addrspace 4) ; 
CHECK-NEXT: [[S_ADD_U32_4:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: undef [[S_ADD_U32_5:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_5:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc @@ -92,7 +93,6 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_MOV_B32_]], 16, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %357:sgpr_128, undef %358:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %368:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 64, 0 :: (invariant load (s128) from %ir.99, addrspace 4) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 64, 0 :: (invariant load (s128) from %ir.107, addrspace 4) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 0, 0 :: (invariant load (s128) from %ir.112, addrspace 4) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 0, 0 :: (invariant load (s128) from %ir.117, addrspace 4) @@ -101,6 +101,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %352:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %363:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM]], -98, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM1]], -114, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM2]], -130, implicit-def dead $scc @@ -113,10 +114,8 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_ADD_U32_14:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: undef [[S_ADD_U32_15:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_15:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY12]], 4, implicit-def dead $scc - ; CHECK-NEXT: 
[[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_3]], 16, implicit-def dead $scc - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %384:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_LSHL4_ADD_U32_:%[0-9]+]]:sreg_32 = S_LSHL4_ADD_U32 [[COPY12]], 16, implicit-def dead $scc + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %383:sgpr_128, [[S_LSHL4_ADD_U32_]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 224, 0 :: (invariant load (s128) from %ir.129, addrspace 4) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY7]], 224, 0 :: (invariant load (s128) from %ir.145, addrspace 4) @@ -127,25 +126,25 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 224, 0 :: (invariant load (s128) from %ir.140, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM7]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM4]], -217, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_8:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -233, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_9:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM5]], -249, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_10:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM3]], -297, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_11:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -313, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_12:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -329, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_13:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -345, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_14:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM6]], -441, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM4]], -217, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -233, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_8:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM5]], -249, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_9:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM3]], -297, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_10:%[0-9]+]]:sreg_32 = 
S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -313, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_11:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -329, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_12:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -345, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_13:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM6]], -441, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_16:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_16:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY13]], 4, implicit-def dead $scc + ; CHECK-NEXT: [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY13]], 4, implicit-def dead $scc ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_ASHR_I32_4:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_17:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_4]], implicit-def $scc + ; CHECK-NEXT: [[S_ASHR_I32_4:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_3]], 31, implicit-def dead $scc + ; CHECK-NEXT: undef [[S_ADD_U32_17:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_3]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_17:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_4]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 3, implicit-def dead $scc + ; CHECK-NEXT: [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 3, implicit-def dead $scc ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_5]], 31, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_18:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_5]], implicit-def $scc + ; CHECK-NEXT: [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc + ; CHECK-NEXT: undef [[S_ADD_U32_18:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_4]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_18:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s32) from %ir.273, align 8, addrspace 4) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 576, 0 :: (invariant load (s128) from %ir.157, addrspace 4) @@ -164,11 +163,11 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_10]], 0, 0 :: (invariant load (s128) from %ir.178, addrspace 4) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_11]], 0, 0 :: (invariant load (s128) from %ir.183, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN16:%[0-9]+]]:vgpr_32 = 
BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 3, implicit-def dead $scc + ; CHECK-NEXT: [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 3, implicit-def dead $scc ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_ASHR_I32_6:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_6]], 31, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_19:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_6]], implicit-def $scc + ; CHECK-NEXT: [[S_ASHR_I32_6:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_5]], 31, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_14:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc + ; CHECK-NEXT: undef [[S_ADD_U32_19:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_5]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_19:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_19]], 168, 0 :: (invariant load (s64) from %ir.282, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) @@ -185,11 +184,11 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN17:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM19]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN18:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM20]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN19:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM21]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LSHL_B32_7:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 3, implicit-def dead $scc + ; CHECK-NEXT: [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 3, implicit-def dead $scc ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN20:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM22]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_ASHR_I32_7:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_7]], 31, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_16:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM5]], -468, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_20:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_7]], implicit-def $scc + ; CHECK-NEXT: [[S_ASHR_I32_7:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_6]], 31, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM5]], -468, implicit-def dead $scc + ; CHECK-NEXT: undef 
[[S_ADD_U32_20:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_6]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_20:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_20]], 168, 0 :: (invariant load (s64) from %ir.293, addrspace 4) ; CHECK-NEXT: [[COPY17:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] @@ -198,32 +197,32 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[COPY17:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_1]] ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY17]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.256, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %470:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %469:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) ; CHECK-NEXT: KILL [[S_ADD_U32_16]].sub0, [[S_ADD_U32_16]].sub1 - ; CHECK-NEXT: KILL undef %470:sreg_64 + ; CHECK-NEXT: KILL undef %469:sreg_64 ; CHECK-NEXT: KILL [[COPY17]].sub0_sub1_sub2, [[COPY17]].sub3 - ; CHECK-NEXT: [[S_LSHL_B32_8:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY14]], 3, implicit-def dead $scc + ; CHECK-NEXT: [[S_LSHL_B32_7:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY14]], 3, implicit-def dead $scc ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_17]], 160, 0 :: (invariant load (s128) from %ir.265, addrspace 4) - ; CHECK-NEXT: [[S_ASHR_I32_8:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_8]], 31, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_17:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM6]], -469, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_21:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_8]], implicit-def $scc + ; CHECK-NEXT: [[S_ASHR_I32_8:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_7]], 31, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_16:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM6]], -469, implicit-def dead $scc + ; CHECK-NEXT: undef [[S_ADD_U32_21:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_7]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_21:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_21]], 168, 0 :: (invariant load (s32) from %ir.305, align 8, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM24]] ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM23]] + ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM24]] ; CHECK-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORD_IMM1]], 65535, 
implicit-def dead $scc ; CHECK-NEXT: [[COPY18:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] ; CHECK-NEXT: [[COPY18:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_2]] ; CHECK-NEXT: [[COPY18:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM2]] ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM7:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY18]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_ADD_I32_18:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM]], -474, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_19:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -475, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_20:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -491, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_21:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -507, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_22:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -539, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_23:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM7]], -473, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_17:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM]], -474, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_18:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -475, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_19:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -491, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_20:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -507, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_21:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -539, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_22:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM7]], -473, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_22:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_22:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 96, 0 :: (invariant load (s128) from %ir.323, addrspace 4) @@ -236,10 +235,10 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM26]] + ; CHECK-NEXT: KILL [[V_MOV_B32_e32_]] ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM27]] ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM25]] - ; CHECK-NEXT: KILL [[V_MOV_B32_e32_]] - ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM26]] ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -2, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -1, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec ; CHECK-NEXT: 
[[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -3, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec @@ -310,15 +309,15 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -216, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_36:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_35]], [[V_ADD_U32_e64_17]], implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_37:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_36]], [[V_ADD_U32_e64_18]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_38:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_7]], [[V_OR_B32_e64_37]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_39:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_8]], [[V_OR_B32_e64_38]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_40:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_9]], [[V_OR_B32_e64_39]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_41:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_10]], [[V_OR_B32_e64_40]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_42:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_11]], [[V_OR_B32_e64_41]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_43:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_12]], [[V_OR_B32_e64_42]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_44:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_13]], [[V_OR_B32_e64_43]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_38:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_6]], [[V_OR_B32_e64_37]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_39:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_7]], [[V_OR_B32_e64_38]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_40:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_8]], [[V_OR_B32_e64_39]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_41:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_9]], [[V_OR_B32_e64_40]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_42:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_10]], [[V_OR_B32_e64_41]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_43:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_11]], [[V_OR_B32_e64_42]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_44:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_12]], [[V_OR_B32_e64_43]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -457, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_45:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_14]], [[V_OR_B32_e64_44]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_45:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_13]], [[V_OR_B32_e64_44]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -458, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_46:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_45]], [[V_ADD_U32_e64_19]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -459, [[BUFFER_LOAD_FORMAT_X_IDXEN21]], 0, implicit $exec @@ -326,15 +325,15 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -466, [[BUFFER_LOAD_FORMAT_X_IDXEN22]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_48:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_47]], [[V_ADD_U32_e64_21]], implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_49:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_48]], [[V_ADD_U32_e64_22]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_50:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 
[[S_ADD_I32_15]], [[V_OR_B32_e64_49]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_51:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_16]], [[V_OR_B32_e64_50]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_52:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_17]], [[V_OR_B32_e64_51]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_53:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_23]], [[V_OR_B32_e64_52]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_54:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_18]], [[V_OR_B32_e64_53]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_55:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_19]], [[V_OR_B32_e64_54]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_56:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_20]], [[V_OR_B32_e64_55]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_57:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_21]], [[V_OR_B32_e64_56]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_58:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_22]], [[V_OR_B32_e64_57]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_50:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_14]], [[V_OR_B32_e64_49]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_51:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_15]], [[V_OR_B32_e64_50]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_52:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_16]], [[V_OR_B32_e64_51]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_53:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_22]], [[V_OR_B32_e64_52]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_54:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_17]], [[V_OR_B32_e64_53]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_55:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_18]], [[V_OR_B32_e64_54]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_56:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_19]], [[V_OR_B32_e64_55]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_57:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_20]], [[V_OR_B32_e64_56]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_58:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_21]], [[V_OR_B32_e64_57]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -555, [[BUFFER_LOAD_FORMAT_X_IDXEN23]], 0, implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -556, [[BUFFER_LOAD_FORMAT_X_IDXEN24]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_59:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_58]], [[V_ADD_U32_e64_23]], implicit $exec @@ -351,13 +350,13 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[V_OR_B32_e64_64:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_63]], [[V_ADD_U32_e64_28]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -593, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_65:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_64]], [[V_ADD_U32_e64_29]], implicit $exec - ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %543:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) poison`, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %542:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) poison`, addrspace 4) ; CHECK-NEXT: [[V_OR_B32_e64_66:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_65]], [[V_ADD_U32_e64_30]], implicit $exec - ; CHECK-NEXT: [[S_ADD_I32_24:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM8]], -594, implicit-def dead $scc - ; 
CHECK-NEXT: [[V_OR_B32_e64_67:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_24]], [[V_OR_B32_e64_66]], implicit $exec + ; CHECK-NEXT: [[S_ADD_I32_23:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM8]], -594, implicit-def dead $scc + ; CHECK-NEXT: [[V_OR_B32_e64_67:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_23]], [[V_OR_B32_e64_66]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 0, [[V_OR_B32_e64_67]], implicit $exec ; CHECK-NEXT: undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub3:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_EQ_U32_e64_]], implicit $exec - ; CHECK-NEXT: IMAGE_STORE_V4_V2_nsa_gfx10 [[V_CNDMASK_B32_e64_]], undef %557:vgpr_32, undef %559:vgpr_32, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8) + ; CHECK-NEXT: IMAGE_STORE_V4_V2_nsa_gfx10 [[V_CNDMASK_B32_e64_]], undef %556:vgpr_32, undef %558:vgpr_32, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 .expVert: %0 = extractelement <31 x i32> %userData, i64 2 diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll index 21f0c008366a9..0fdc1a83dddbd 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -2029,10 +2029,10 @@ define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind { ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX9-W64-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: s_mov_b32 s2, 0 ; GFX9-W64-NEXT: buffer_store_dword v1, off, s[8:11], 0 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-W64-NEXT: v_lshl_add_u32 v1, v2, 2, v1 +; GFX9-W64-NEXT: v_lshl_add_u32 v1, v2, 2, s2 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)