diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 8e90754103ff1..c990df622175f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -634,6 +634,61 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { case ISD::BUILD_VECTOR: { EVT VT = N->getValueType(0); unsigned NumVectorElts = VT.getVectorNumElements(); + + auto IsSplatAllZeros = [this](SDNode *N) -> bool { + if (ISD::isConstantSplatVectorAllZeros(N)) + return true; + + // Types may have legalized by stripping the 16 bit multi-element vector + // into multiple BUILD_VECTORs. Peek through and see if it is all zeros + // regardless of what the legalizer did. Assumes cases along the lines of: + // v8i16 build_vector 0, 0, 0, 0, 0, 0, 0, 0 + // -> legalizer -> + // t0 = v2i16 build_vector 0, 0 + // t1 = bitcast t0 to i32 + // v4i32 build_vector t1, t1, t1, t1 + if (CurDAG->isSplatValue(SDValue(N, 0))) { + SDValue Op = peekThroughBitcasts(N->getOperand(0)); + EVT InnerVT = Op.getValueType(); + if (InnerVT.isVector() && Op.getOpcode() == ISD::BUILD_VECTOR && + InnerVT.getVectorNumElements() == 2) + return ISD::isConstantSplatVectorAllZeros(Op.getNode()); + } + return false; + }; + if (IsSplatAllZeros(N)) { + unsigned FixedBitSize = VT.getFixedSizeInBits(); + SDLoc DL(N); + if (FixedBitSize == 64) { + SDValue Set0 = { + CurDAG->getMachineNode(AMDGPU::S_MOV_B64_IMM_PSEUDO, DL, MVT::i64, + CurDAG->getTargetConstant(0, DL, MVT::i64)), + 0}; + CurDAG->SelectNodeTo(N, AMDGPU::COPY, VT, Set0); + return; + } else if (NumVectorElts <= 32 && (FixedBitSize % 64 == 0)) { + SmallVector Ops((FixedBitSize / 64) * 2 + 1); + SDValue Set0 = { + CurDAG->getMachineNode(AMDGPU::S_MOV_B64_IMM_PSEUDO, DL, MVT::i64, + CurDAG->getTargetConstant(0, DL, MVT::i64)), + 0}; + unsigned RCID = + SIRegisterInfo::getSGPRClassForBitWidth(FixedBitSize)->getID(); + Ops[0] = CurDAG->getTargetConstant(RCID, DL, MVT::i32); + + for (unsigned i = 0, CurrentBitSize = FixedBitSize; CurrentBitSize != 0; + ++i, CurrentBitSize -= 64) { + unsigned SubRegs = + SIRegisterInfo::getSubRegFromChannel(i * 2, /*NumRegs=*/2); + Ops[i * 2 + 1] = Set0; + Ops[i * 2 + 2] = CurDAG->getTargetConstant(SubRegs, DL, MVT::i32); + } + + CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, VT, Ops); + return; + } + } + if (VT.getScalarSizeInBits() == 16) { if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) { if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) { diff --git a/llvm/test/CodeGen/AMDGPU/adjust-writemask-cse.ll b/llvm/test/CodeGen/AMDGPU/adjust-writemask-cse.ll index 23a7bb6ece488..1d0a9f9585123 100644 --- a/llvm/test/CodeGen/AMDGPU/adjust-writemask-cse.ll +++ b/llvm/test/CodeGen/AMDGPU/adjust-writemask-cse.ll @@ -4,15 +4,14 @@ define float @test() { ; GFX10-LABEL: name: test ; GFX10: bb.0.bb: - ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_256 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3, [[S_MOV_B32_]], %subreg.sub4, [[S_MOV_B32_]], %subreg.sub5, [[S_MOV_B32_]], %subreg.sub6, [[S_MOV_B32_]], %subreg.sub7 - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GFX10-NEXT: [[IMAGE_LOAD_V2_V2_nsa_gfx10_:%[0-9]+]]:vreg_64 = IMAGE_LOAD_V2_V2_nsa_gfx10 [[COPY]], [[COPY1]], killed [[REG_SEQUENCE]], 3, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 8) - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[IMAGE_LOAD_V2_V2_nsa_gfx10_]].sub1 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[IMAGE_LOAD_V2_V2_nsa_gfx10_]].sub0 - ; GFX10-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY2]], 0, killed [[COPY3]], 0, 0, implicit $mode, implicit $exec - ; GFX10-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec + ; GFX10-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0 + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_256 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[S_MOV_B64_]], %subreg.sub2_sub3, [[S_MOV_B64_]], %subreg.sub4_sub5, [[S_MOV_B64_]], %subreg.sub6_sub7 + ; GFX10-NEXT: [[V_MOV_B32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0 + ; GFX10-NEXT: [[IMAGE_LOAD_V2_V2_nsa_gfx10_:%[0-9]+]]:vreg_64 = IMAGE_LOAD_V2_V2_nsa_gfx10 [[V_MOV_B32_]], [[V_MOV_B32_]], killed [[REG_SEQUENCE]], 3, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 8) + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[IMAGE_LOAD_V2_V2_nsa_gfx10_]].sub1 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[IMAGE_LOAD_V2_V2_nsa_gfx10_]].sub0 + ; GFX10-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY]], 0, killed [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX10-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX10-NEXT: $vgpr0 = COPY [[V_ADD_F32_e64_1]] ; GFX10-NEXT: SI_RETURN implicit $vgpr0 bb: diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index 3116b5d59a097..1f69299f8ad77 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -515,48 +515,47 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: global_load_ushort v16, v[0:1], off glc ; GFX908-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX908-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 -; GFX908-NEXT: s_load_dword s7, s[8:9], 0x18 -; GFX908-NEXT: s_mov_b32 s6, 0 -; GFX908-NEXT: s_mov_b32 s9, s6 +; GFX908-NEXT: s_mov_b32 s7, 0 +; GFX908-NEXT: s_load_dword s8, s[8:9], 0x18 +; GFX908-NEXT: v_mov_b32_e32 v19, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX908-NEXT: s_sub_i32 s8, 0, s3 -; GFX908-NEXT: v_cvt_f32_f16_e32 v17, s7 -; GFX908-NEXT: v_mov_b32_e32 v19, 0 -; GFX908-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX908-NEXT: s_sub_i32 s6, 0, s3 +; GFX908-NEXT: v_cvt_f32_f16_e32 v17, s8 +; GFX908-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX908-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX908-NEXT: v_cvt_u32_f32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0 ; GFX908-NEXT: v_mov_b32_e32 v1, 0 -; GFX908-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; GFX908-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX908-NEXT: v_readfirstlane_b32 s10, v2 -; GFX908-NEXT: s_mul_i32 s8, s8, s10 -; GFX908-NEXT: s_mul_hi_u32 s8, s10, s8 -; GFX908-NEXT: s_add_i32 s10, s10, s8 -; GFX908-NEXT: s_mul_hi_u32 s8, s2, s10 -; GFX908-NEXT: s_mul_i32 s10, s8, s3 -; GFX908-NEXT: s_sub_i32 s2, s2, s10 -; GFX908-NEXT: s_add_i32 s11, s8, 1 -; GFX908-NEXT: s_sub_i32 s10, s2, s3 +; GFX908-NEXT: v_readfirstlane_b32 s9, v2 +; GFX908-NEXT: s_mul_i32 s6, s6, s9 +; GFX908-NEXT: s_mul_hi_u32 s6, s9, s6 +; GFX908-NEXT: s_add_i32 s9, s9, s6 +; GFX908-NEXT: s_mul_hi_u32 s6, s2, s9 +; GFX908-NEXT: s_mul_i32 s9, s6, s3 +; GFX908-NEXT: s_sub_i32 s2, s2, s9 +; GFX908-NEXT: s_add_i32 s10, s6, 1 +; GFX908-NEXT: s_sub_i32 s9, s2, s3 ; GFX908-NEXT: s_cmp_ge_u32 s2, s3 -; GFX908-NEXT: s_cselect_b32 s8, s11, s8 -; GFX908-NEXT: s_cselect_b32 s2, s10, s2 -; GFX908-NEXT: s_add_i32 s10, s8, 1 +; GFX908-NEXT: s_cselect_b32 s6, s10, s6 +; GFX908-NEXT: s_cselect_b32 s2, s9, s2 +; GFX908-NEXT: s_add_i32 s9, s6, 1 ; GFX908-NEXT: s_cmp_ge_u32 s2, s3 -; GFX908-NEXT: s_cselect_b32 s8, s10, s8 -; GFX908-NEXT: s_lshr_b32 s7, s7, 16 -; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s7 +; GFX908-NEXT: s_cselect_b32 s6, s9, s6 +; GFX908-NEXT: s_lshr_b32 s10, s8, 16 +; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s10 +; GFX908-NEXT: s_lshl_b64 s[10:11], s[6:7], 5 ; GFX908-NEXT: s_lshl_b64 s[2:3], s[0:1], 5 -; GFX908-NEXT: s_lshl_b64 s[12:13], s[8:9], 5 -; GFX908-NEXT: s_lshl_b64 s[10:11], s[4:5], 5 -; GFX908-NEXT: s_or_b32 s10, s10, 28 +; GFX908-NEXT: s_lshl_b64 s[8:9], s[4:5], 5 +; GFX908-NEXT: s_or_b32 s8, s8, 28 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_readfirstlane_b32 s7, v16 ; GFX908-NEXT: s_and_b32 s7, 0xffff, s7 ; GFX908-NEXT: s_mul_i32 s1, s1, s7 -; GFX908-NEXT: s_mul_hi_u32 s9, s0, s7 +; GFX908-NEXT: s_mul_hi_u32 s12, s0, s7 ; GFX908-NEXT: s_mul_i32 s0, s0, s7 -; GFX908-NEXT: s_add_i32 s1, s9, s1 -; GFX908-NEXT: s_lshl_b64 s[14:15], s[0:1], 5 +; GFX908-NEXT: s_add_i32 s1, s12, s1 +; GFX908-NEXT: s_lshl_b64 s[12:13], s[0:1], 5 ; GFX908-NEXT: s_branch .LBB3_2 ; GFX908-NEXT: .LBB3_1: ; %Flow20 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 @@ -565,59 +564,58 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: .LBB3_2: ; %bb9 ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB3_5 Depth 2 -; GFX908-NEXT: s_mov_b64 s[16:17], -1 +; GFX908-NEXT: s_mov_b64 s[14:15], -1 ; GFX908-NEXT: s_cbranch_scc0 .LBB3_10 ; GFX908-NEXT: ; %bb.3: ; %bb14 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off ; GFX908-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], -1 -; GFX908-NEXT: s_mov_b32 s7, s6 -; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] -; GFX908-NEXT: v_mov_b32_e32 v4, s6 -; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v6 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 -; GFX908-NEXT: v_mov_b32_e32 v9, s7 -; GFX908-NEXT: v_mov_b32_e32 v5, s7 -; GFX908-NEXT: v_mov_b32_e32 v7, s7 -; GFX908-NEXT: v_mov_b32_e32 v8, s6 -; GFX908-NEXT: v_cmp_lt_i64_e64 s[16:17], s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v11, v5 -; GFX908-NEXT: s_mov_b64 s[18:19], s[10:11] -; GFX908-NEXT: v_mov_b32_e32 v10, v4 +; GFX908-NEXT: v_cmp_lt_i64_e64 s[14:15], s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v4, 0 +; GFX908-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[0:1] +; GFX908-NEXT: v_mov_b32_e32 v6, 0 +; GFX908-NEXT: v_mov_b32_e32 v8, 0 +; GFX908-NEXT: v_mov_b32_e32 v10, 0 +; GFX908-NEXT: v_mov_b32_e32 v5, 0 +; GFX908-NEXT: v_mov_b32_e32 v7, 0 +; GFX908-NEXT: v_mov_b32_e32 v9, 0 +; GFX908-NEXT: v_mov_b32_e32 v11, 0 +; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v12 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_readfirstlane_b32 s7, v2 -; GFX908-NEXT: v_readfirstlane_b32 s9, v3 +; GFX908-NEXT: v_readfirstlane_b32 s16, v3 ; GFX908-NEXT: s_add_u32 s7, s7, 1 -; GFX908-NEXT: s_addc_u32 s9, s9, 0 -; GFX908-NEXT: s_mul_hi_u32 s20, s2, s7 -; GFX908-NEXT: s_mul_i32 s9, s2, s9 -; GFX908-NEXT: s_mul_i32 s21, s3, s7 -; GFX908-NEXT: s_add_i32 s9, s20, s9 +; GFX908-NEXT: s_addc_u32 s16, s16, 0 +; GFX908-NEXT: s_mul_hi_u32 s17, s2, s7 +; GFX908-NEXT: s_mul_i32 s16, s2, s16 +; GFX908-NEXT: s_mul_i32 s18, s3, s7 +; GFX908-NEXT: s_add_i32 s16, s17, s16 ; GFX908-NEXT: s_mul_i32 s7, s2, s7 -; GFX908-NEXT: s_add_i32 s9, s9, s21 +; GFX908-NEXT: s_add_i32 s22, s16, s18 +; GFX908-NEXT: s_mov_b64 s[16:17], s[8:9] ; GFX908-NEXT: s_branch .LBB3_5 ; GFX908-NEXT: .LBB3_4: ; %bb58 ; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2 ; GFX908-NEXT: v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX908-NEXT: s_add_u32 s18, s18, s14 -; GFX908-NEXT: v_cmp_lt_i64_e64 s[22:23], -1, v[2:3] -; GFX908-NEXT: s_addc_u32 s19, s19, s15 -; GFX908-NEXT: s_mov_b64 s[20:21], 0 -; GFX908-NEXT: s_andn2_b64 vcc, exec, s[22:23] +; GFX908-NEXT: s_add_u32 s16, s16, s12 +; GFX908-NEXT: v_cmp_lt_i64_e64 s[20:21], -1, v[2:3] +; GFX908-NEXT: s_addc_u32 s17, s17, s13 +; GFX908-NEXT: s_mov_b64 s[18:19], 0 +; GFX908-NEXT: s_andn2_b64 vcc, exec, s[20:21] ; GFX908-NEXT: s_cbranch_vccz .LBB3_9 ; GFX908-NEXT: .LBB3_5: ; %bb16 ; GFX908-NEXT: ; Parent Loop BB3_2 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX908-NEXT: s_add_u32 s20, s18, s7 -; GFX908-NEXT: s_addc_u32 s21, s19, s9 -; GFX908-NEXT: global_load_dword v21, v19, s[20:21] offset:-12 glc +; GFX908-NEXT: s_add_u32 s18, s16, s7 +; GFX908-NEXT: s_addc_u32 s19, s17, s22 +; GFX908-NEXT: global_load_dword v21, v19, s[18:19] offset:-12 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_load_dword v20, v19, s[20:21] offset:-8 glc +; GFX908-NEXT: global_load_dword v20, v19, s[18:19] offset:-8 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_load_dword v12, v19, s[20:21] offset:-4 glc +; GFX908-NEXT: global_load_dword v12, v19, s[18:19] offset:-4 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_load_dword v12, v19, s[20:21] glc +; GFX908-NEXT: global_load_dword v12, v19, s[18:19] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: ds_read_b64 v[12:13], v19 ; GFX908-NEXT: ds_read_b64 v[14:15], v0 @@ -648,27 +646,27 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: v_add_f32_e32 v11, v11, v13 ; GFX908-NEXT: s_branch .LBB3_4 ; GFX908-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2 -; GFX908-NEXT: s_mov_b64 s[20:21], s[16:17] -; GFX908-NEXT: s_andn2_b64 vcc, exec, s[20:21] +; GFX908-NEXT: s_mov_b64 s[18:19], s[14:15] +; GFX908-NEXT: s_andn2_b64 vcc, exec, s[18:19] ; GFX908-NEXT: s_cbranch_vccz .LBB3_4 ; GFX908-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_mov_b64 s[20:21], -1 +; GFX908-NEXT: s_mov_b64 s[18:19], -1 ; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX908-NEXT: ; implicit-def: $sgpr18_sgpr19 +; GFX908-NEXT: ; implicit-def: $sgpr16_sgpr17 ; GFX908-NEXT: .LBB3_9: ; %loop.exit.guard ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_xor_b64 s[16:17], s[20:21], -1 +; GFX908-NEXT: s_xor_b64 s[14:15], s[18:19], -1 ; GFX908-NEXT: .LBB3_10: ; %Flow19 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX908-NEXT: s_mov_b64 s[0:1], -1 -; GFX908-NEXT: s_and_b64 vcc, exec, s[16:17] +; GFX908-NEXT: s_and_b64 vcc, exec, s[14:15] ; GFX908-NEXT: s_cbranch_vccz .LBB3_1 ; GFX908-NEXT: ; %bb.11: ; %bb12 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_add_u32 s4, s4, s8 +; GFX908-NEXT: s_add_u32 s4, s4, s6 ; GFX908-NEXT: s_addc_u32 s5, s5, 0 -; GFX908-NEXT: s_add_u32 s10, s10, s12 -; GFX908-NEXT: s_addc_u32 s11, s11, s13 +; GFX908-NEXT: s_add_u32 s8, s8, s10 +; GFX908-NEXT: s_addc_u32 s9, s9, s11 ; GFX908-NEXT: s_mov_b64 s[0:1], 0 ; GFX908-NEXT: s_branch .LBB3_1 ; GFX908-NEXT: .LBB3_12: ; %DummyReturnBlock @@ -679,47 +677,46 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: global_load_ushort v18, v[0:1], off glc ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 -; GFX90A-NEXT: s_load_dword s7, s[8:9], 0x18 -; GFX90A-NEXT: s_mov_b32 s6, 0 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s7, 0 +; GFX90A-NEXT: s_load_dword s8, s[8:9], 0x18 +; GFX90A-NEXT: v_mov_b32_e32 v19, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX90A-NEXT: s_sub_i32 s8, 0, s3 -; GFX90A-NEXT: v_mov_b32_e32 v19, 0 -; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX90A-NEXT: s_sub_i32 s6, 0, s3 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v2, s8 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GFX90A-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v2 -; GFX90A-NEXT: v_cvt_f32_f16_e32 v2, s7 -; GFX90A-NEXT: v_readfirstlane_b32 s10, v3 -; GFX90A-NEXT: s_mul_i32 s8, s8, s10 -; GFX90A-NEXT: s_mul_hi_u32 s8, s10, s8 -; GFX90A-NEXT: s_add_i32 s10, s10, s8 -; GFX90A-NEXT: s_mul_hi_u32 s8, s2, s10 -; GFX90A-NEXT: s_mul_i32 s10, s8, s3 -; GFX90A-NEXT: s_sub_i32 s2, s2, s10 -; GFX90A-NEXT: s_add_i32 s11, s8, 1 -; GFX90A-NEXT: s_sub_i32 s10, s2, s3 +; GFX90A-NEXT: v_readfirstlane_b32 s9, v3 +; GFX90A-NEXT: s_mul_i32 s6, s6, s9 +; GFX90A-NEXT: s_mul_hi_u32 s6, s9, s6 +; GFX90A-NEXT: s_add_i32 s9, s9, s6 +; GFX90A-NEXT: s_mul_hi_u32 s6, s2, s9 +; GFX90A-NEXT: s_mul_i32 s9, s6, s3 +; GFX90A-NEXT: s_sub_i32 s2, s2, s9 +; GFX90A-NEXT: s_add_i32 s10, s6, 1 +; GFX90A-NEXT: s_sub_i32 s9, s2, s3 ; GFX90A-NEXT: s_cmp_ge_u32 s2, s3 -; GFX90A-NEXT: s_cselect_b32 s8, s11, s8 -; GFX90A-NEXT: s_cselect_b32 s2, s10, s2 -; GFX90A-NEXT: s_add_i32 s10, s8, 1 +; GFX90A-NEXT: s_cselect_b32 s6, s10, s6 +; GFX90A-NEXT: s_cselect_b32 s2, s9, s2 +; GFX90A-NEXT: s_add_i32 s9, s6, 1 ; GFX90A-NEXT: s_cmp_ge_u32 s2, s3 -; GFX90A-NEXT: s_cselect_b32 s8, s10, s8 -; GFX90A-NEXT: s_lshr_b32 s7, s7, 16 -; GFX90A-NEXT: v_cvt_f32_f16_e32 v3, s7 +; GFX90A-NEXT: s_cselect_b32 s6, s9, s6 +; GFX90A-NEXT: s_lshr_b32 s10, s8, 16 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v3, s10 +; GFX90A-NEXT: s_lshl_b64 s[10:11], s[6:7], 5 ; GFX90A-NEXT: s_lshl_b64 s[2:3], s[0:1], 5 -; GFX90A-NEXT: s_lshl_b64 s[12:13], s[8:9], 5 -; GFX90A-NEXT: s_lshl_b64 s[10:11], s[4:5], 5 -; GFX90A-NEXT: s_or_b32 s10, s10, 28 +; GFX90A-NEXT: s_lshl_b64 s[8:9], s[4:5], 5 +; GFX90A-NEXT: s_or_b32 s8, s8, 28 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_readfirstlane_b32 s7, v18 ; GFX90A-NEXT: s_and_b32 s7, 0xffff, s7 ; GFX90A-NEXT: s_mul_i32 s1, s1, s7 -; GFX90A-NEXT: s_mul_hi_u32 s9, s0, s7 +; GFX90A-NEXT: s_mul_hi_u32 s12, s0, s7 ; GFX90A-NEXT: s_mul_i32 s0, s0, s7 -; GFX90A-NEXT: s_add_i32 s1, s9, s1 -; GFX90A-NEXT: s_lshl_b64 s[14:15], s[0:1], 5 +; GFX90A-NEXT: s_add_i32 s1, s12, s1 +; GFX90A-NEXT: s_lshl_b64 s[12:13], s[0:1], 5 ; GFX90A-NEXT: s_branch .LBB3_2 ; GFX90A-NEXT: .LBB3_1: ; %Flow20 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 @@ -728,60 +725,59 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: .LBB3_2: ; %bb9 ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB3_5 Depth 2 -; GFX90A-NEXT: s_mov_b64 s[16:17], -1 +; GFX90A-NEXT: s_mov_b64 s[14:15], -1 ; GFX90A-NEXT: s_cbranch_scc0 .LBB3_10 ; GFX90A-NEXT: ; %bb.3: ; %bb14 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX90A-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], -1 -; GFX90A-NEXT: s_mov_b32 s7, s6 -; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v8 -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_cmp_lt_i64_e64 s[16:17], s[4:5], 0 -; GFX90A-NEXT: s_mov_b64 s[18:19], s[10:11] -; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[0:1] +; GFX90A-NEXT: v_cmp_lt_i64_e64 s[14:15], s[4:5], 0 +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], 0, 0 +; GFX90A-NEXT: s_mov_b64 s[16:17], s[8:9] +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], 0, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], 0, 0 +; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v12 +; GFX90A-NEXT: v_pk_mov_b32 v[12:13], 0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_readfirstlane_b32 s7, v4 -; GFX90A-NEXT: v_readfirstlane_b32 s9, v5 +; GFX90A-NEXT: v_readfirstlane_b32 s18, v5 ; GFX90A-NEXT: s_add_u32 s7, s7, 1 -; GFX90A-NEXT: s_addc_u32 s9, s9, 0 -; GFX90A-NEXT: s_mul_hi_u32 s20, s2, s7 -; GFX90A-NEXT: s_mul_i32 s9, s2, s9 -; GFX90A-NEXT: s_mul_i32 s21, s3, s7 -; GFX90A-NEXT: s_add_i32 s9, s20, s9 +; GFX90A-NEXT: s_addc_u32 s18, s18, 0 +; GFX90A-NEXT: s_mul_hi_u32 s19, s2, s7 +; GFX90A-NEXT: s_mul_i32 s18, s2, s18 +; GFX90A-NEXT: s_mul_i32 s20, s3, s7 +; GFX90A-NEXT: s_add_i32 s18, s19, s18 ; GFX90A-NEXT: s_mul_i32 s7, s2, s7 -; GFX90A-NEXT: s_add_i32 s9, s9, s21 +; GFX90A-NEXT: s_add_i32 s22, s18, s20 ; GFX90A-NEXT: s_branch .LBB3_5 ; GFX90A-NEXT: .LBB3_4: ; %bb58 ; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2 ; GFX90A-NEXT: v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX90A-NEXT: s_add_u32 s18, s18, s14 -; GFX90A-NEXT: s_addc_u32 s19, s19, s15 -; GFX90A-NEXT: v_cmp_lt_i64_e64 s[22:23], -1, v[4:5] -; GFX90A-NEXT: s_mov_b64 s[20:21], 0 -; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[22:23] +; GFX90A-NEXT: s_add_u32 s16, s16, s12 +; GFX90A-NEXT: s_addc_u32 s17, s17, s13 +; GFX90A-NEXT: v_cmp_lt_i64_e64 s[20:21], -1, v[4:5] +; GFX90A-NEXT: s_mov_b64 s[18:19], 0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[20:21] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_9 ; GFX90A-NEXT: .LBB3_5: ; %bb16 ; GFX90A-NEXT: ; Parent Loop BB3_2 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX90A-NEXT: s_add_u32 s20, s18, s7 -; GFX90A-NEXT: s_addc_u32 s21, s19, s9 -; GFX90A-NEXT: global_load_dword v21, v19, s[20:21] offset:-12 glc +; GFX90A-NEXT: s_add_u32 s18, s16, s7 +; GFX90A-NEXT: s_addc_u32 s19, s17, s22 +; GFX90A-NEXT: global_load_dword v21, v19, s[18:19] offset:-12 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_load_dword v20, v19, s[20:21] offset:-8 glc +; GFX90A-NEXT: global_load_dword v20, v19, s[18:19] offset:-8 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_load_dword v14, v19, s[20:21] offset:-4 glc +; GFX90A-NEXT: global_load_dword v14, v19, s[18:19] offset:-4 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_load_dword v14, v19, s[20:21] glc +; GFX90A-NEXT: global_load_dword v14, v19, s[18:19] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ds_read_b64 v[14:15], v19 ; GFX90A-NEXT: ds_read_b64 v[16:17], v0 ; GFX90A-NEXT: s_and_b64 vcc, exec, s[0:1] -; GFX90A-NEXT: ; kill: killed $sgpr20 killed $sgpr21 +; GFX90A-NEXT: ; kill: killed $sgpr18 killed $sgpr19 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cbranch_vccnz .LBB3_7 ; GFX90A-NEXT: ; %bb.6: ; %bb51 @@ -800,27 +796,27 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[14:15] ; GFX90A-NEXT: s_branch .LBB3_4 ; GFX90A-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2 -; GFX90A-NEXT: s_mov_b64 s[20:21], s[16:17] -; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[20:21] +; GFX90A-NEXT: s_mov_b64 s[18:19], s[14:15] +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[18:19] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_4 ; GFX90A-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_mov_b64 s[20:21], -1 +; GFX90A-NEXT: s_mov_b64 s[18:19], -1 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX90A-NEXT: ; implicit-def: $sgpr18_sgpr19 +; GFX90A-NEXT: ; implicit-def: $sgpr16_sgpr17 ; GFX90A-NEXT: .LBB3_9: ; %loop.exit.guard ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_xor_b64 s[16:17], s[20:21], -1 +; GFX90A-NEXT: s_xor_b64 s[14:15], s[18:19], -1 ; GFX90A-NEXT: .LBB3_10: ; %Flow19 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX90A-NEXT: s_mov_b64 s[0:1], -1 -; GFX90A-NEXT: s_and_b64 vcc, exec, s[16:17] +; GFX90A-NEXT: s_and_b64 vcc, exec, s[14:15] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_1 ; GFX90A-NEXT: ; %bb.11: ; %bb12 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_add_u32 s4, s4, s8 +; GFX90A-NEXT: s_add_u32 s4, s4, s6 ; GFX90A-NEXT: s_addc_u32 s5, s5, 0 -; GFX90A-NEXT: s_add_u32 s10, s10, s12 -; GFX90A-NEXT: s_addc_u32 s11, s11, s13 +; GFX90A-NEXT: s_add_u32 s8, s8, s10 +; GFX90A-NEXT: s_addc_u32 s9, s9, s11 ; GFX90A-NEXT: s_mov_b64 s[0:1], 0 ; GFX90A-NEXT: s_branch .LBB3_1 ; GFX90A-NEXT: .LBB3_12: ; %DummyReturnBlock diff --git a/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll b/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll index e03c9ca34b825..13f5b6598f6fa 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll @@ -11,7 +11,7 @@ define amdgpu_ps void @main(i32 %arg) { ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b32 s1, exec_lo -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_mov_b64 s[4:5], 0 ; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: s_branch .LBB0_2 ; GFX10-NEXT: .LBB0_1: ; in Loop: Header=BB0_2 Depth=1 @@ -31,9 +31,7 @@ define amdgpu_ps void @main(i32 %arg) { ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s0 ; GFX10-NEXT: s_cbranch_execz .LBB0_1 ; GFX10-NEXT: ; %bb.4: ; in Loop: Header=BB0_2 Depth=1 -; GFX10-NEXT: s_mov_b32 s5, s4 -; GFX10-NEXT: s_mov_b32 s6, s4 -; GFX10-NEXT: s_mov_b32 s7, s4 +; GFX10-NEXT: s_mov_b64 s[6:7], s[4:5] ; GFX10-NEXT: buffer_atomic_and v0, off, s[4:7], 0 ; GFX10-NEXT: s_branch .LBB0_1 ; GFX10-NEXT: .LBB0_5: ; %bb8 diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll b/llvm/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll index de2e25651271a..e0f9855ef2741 100644 --- a/llvm/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll +++ b/llvm/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll @@ -16,7 +16,7 @@ define amdgpu_kernel void @materialize_0_i32(ptr addrspace(1) %out) { ; GCN-LABEL: {{^}}materialize_0_i64: ; GCN: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}} -; GCN: v_mov_b32_e32 v[[HIK:[0-9]+]], v[[LOK]]{{$}} +; GCN: v_mov_b32_e32 v[[HIK:[0-9]+]], 0{{$}} ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v[[[LOK]]:[[HIK]]] define amdgpu_kernel void @materialize_0_i64(ptr addrspace(1) %out) { store i64 0, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll index a6af63b816573..ab2f424c99254 100644 --- a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll +++ b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll @@ -9,44 +9,36 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext ; CHECK-NEXT: s_addc_u32 s13, s13, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; CHECK-NEXT: s_load_dwordx8 s[48:55], s[8:9], 0x0 +; CHECK-NEXT: s_load_dwordx8 s[20:27], s[8:9], 0x0 ; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b32 s12, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_cmp_lg_u32 s52, 0 +; CHECK-NEXT: s_cmp_lg_u32 s24, 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB0_9 ; CHECK-NEXT: ; %bb.1: ; %if.end13.i.i -; CHECK-NEXT: s_cmp_eq_u32 s54, 0 -; CHECK-NEXT: s_cbranch_scc1 .LBB0_4 +; CHECK-NEXT: s_cmp_eq_u32 s26, 0 +; CHECK-NEXT: s_cbranch_scc1 .LBB0_7 ; CHECK-NEXT: ; %bb.2: ; %if.else251.i.i -; CHECK-NEXT: s_cmp_lg_u32 s55, 0 +; CHECK-NEXT: s_cmp_lg_u32 s27, 0 ; CHECK-NEXT: s_mov_b32 s17, 0 ; CHECK-NEXT: s_cselect_b32 s12, -1, 0 ; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s12 -; CHECK-NEXT: s_cbranch_vccz .LBB0_5 +; CHECK-NEXT: s_cbranch_vccz .LBB0_4 ; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: s_mov_b32 s18, 0 -; CHECK-NEXT: s_branch .LBB0_6 -; CHECK-NEXT: .LBB0_4: -; CHECK-NEXT: s_mov_b32 s14, s12 -; CHECK-NEXT: s_mov_b32 s15, s12 -; CHECK-NEXT: s_mov_b32 s13, s12 -; CHECK-NEXT: s_mov_b64 s[50:51], s[14:15] -; CHECK-NEXT: s_mov_b64 s[48:49], s[12:13] -; CHECK-NEXT: s_branch .LBB0_8 -; CHECK-NEXT: .LBB0_5: ; %if.then263.i.i -; CHECK-NEXT: v_cmp_lt_f32_e64 s12, s53, 0 +; CHECK-NEXT: s_branch .LBB0_5 +; CHECK-NEXT: .LBB0_4: ; %if.then263.i.i +; CHECK-NEXT: v_cmp_lt_f32_e64 s12, s25, 0 ; CHECK-NEXT: s_mov_b32 s18, 1.0 ; CHECK-NEXT: s_mov_b32 s17, 0x7fc00000 -; CHECK-NEXT: .LBB0_6: ; %Flow -; CHECK-NEXT: s_mov_b32 s48, 1.0 +; CHECK-NEXT: .LBB0_5: ; %Flow +; CHECK-NEXT: s_mov_b32 s20, 1.0 ; CHECK-NEXT: s_andn2_b32 vcc_lo, exec_lo, s12 -; CHECK-NEXT: s_mov_b32 s49, s48 -; CHECK-NEXT: s_mov_b32 s50, s48 -; CHECK-NEXT: s_mov_b32 s51, s48 +; CHECK-NEXT: s_mov_b32 s21, s20 +; CHECK-NEXT: s_mov_b32 s22, s20 +; CHECK-NEXT: s_mov_b32 s23, s20 ; CHECK-NEXT: s_cbranch_vccnz .LBB0_8 -; CHECK-NEXT: ; %bb.7: ; %if.end273.i.i +; CHECK-NEXT: ; %bb.6: ; %if.end273.i.i ; CHECK-NEXT: s_add_u32 s12, s8, 40 ; CHECK-NEXT: s_addc_u32 s13, s9, 0 ; CHECK-NEXT: s_getpc_b64 s[20:21] @@ -65,13 +57,12 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: s_mov_b32 s13, s15 ; CHECK-NEXT: s_mov_b32 s14, s16 -; CHECK-NEXT: s_mov_b32 s48, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[20:21] ; CHECK-NEXT: s_mov_b64 s[8:9], s[34:35] -; CHECK-NEXT: s_mov_b32 s49, s48 -; CHECK-NEXT: s_mov_b32 s50, s48 -; CHECK-NEXT: s_mov_b32 s51, s48 +; CHECK-NEXT: .LBB0_7: ; %if.end294.i.i +; CHECK-NEXT: s_mov_b64 s[20:21], 0 +; CHECK-NEXT: s_mov_b64 s[22:23], s[20:21] ; CHECK-NEXT: .LBB0_8: ; %if.end294.i.i ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12 @@ -80,11 +71,11 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CHECK-NEXT: .LBB0_9: ; %kernel_direct_lighting.exit ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x20 -; CHECK-NEXT: v_mov_b32_e32 v0, s48 +; CHECK-NEXT: v_mov_b32_e32 v0, s20 ; CHECK-NEXT: v_mov_b32_e32 v4, 0 -; CHECK-NEXT: v_mov_b32_e32 v1, s49 -; CHECK-NEXT: v_mov_b32_e32 v2, s50 -; CHECK-NEXT: v_mov_b32_e32 v3, s51 +; CHECK-NEXT: v_mov_b32_e32 v1, s21 +; CHECK-NEXT: v_mov_b32_e32 v2, s22 +; CHECK-NEXT: v_mov_b32_e32 v3, s23 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll b/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll index ea93e3ac1e595..ba58d82ef8e60 100644 --- a/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll +++ b/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll @@ -4,15 +4,14 @@ define amdgpu_cs <2 x i32> @f() { ; CHECK-LABEL: f: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: s_mov_b32 s0, 0 +; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5] ; CHECK-NEXT: s_mov_b32 s1, 0 -; CHECK-NEXT: s_mov_b32 s5, s4 -; CHECK-NEXT: s_mov_b32 s6, s4 -; CHECK-NEXT: s_mov_b32 s7, s4 ; CHECK-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: v_mov_b32_e32 v1, s0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 ; CHECK-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll index 986dd8a046424..ddb81ea7af764 100644 --- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll +++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll @@ -446,19 +446,20 @@ define amdgpu_ps void @cluster_image_sample(<8 x i32> inreg %src, <4 x i32> inre ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: v_cvt_f32_i32_e32 v4, v0 ; GFX11-NEXT: v_cvt_f32_i32_e32 v5, v1 +; GFX11-NEXT: v_mov_b32_e32 v6, 1.0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_mov_b32 v6, 1.0 :: v_dual_add_f32 v11, 2.0, v5 -; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v5 :: v_dual_add_f32 v8, 1.0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_add_f32 v10, 2.0, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_add_f32 v11, 2.0, v4 +; GFX11-NEXT: v_dual_add_f32 v8, 1.0, v4 :: v_dual_add_f32 v9, 1.0, v5 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_add_f32 v12, 2.0, v5 ; GFX11-NEXT: v_mov_b32_e32 v7, v6 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: image_sample_d v[2:5], [v8, v9, v2, v2, v[2:3]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX11-NEXT: image_sample_d v[6:9], [v10, v11, v6, v6, v[6:7]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: image_sample_d v[2:5], [v8, v9, v10, v10, v[2:3]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: image_sample_d v[6:9], [v11, v12, v6, v6, v[6:7]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v3, v3, v7 ; GFX11-NEXT: v_dual_add_f32 v5, v5, v9 :: v_dual_add_f32 v4, v4, v8 -; GFX11-NEXT: v_dual_add_f32 v3, v3, v7 :: v_dual_add_f32 v2, v2, v6 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v6 ; GFX11-NEXT: image_store v[2:5], v[0:1], s[12:19] dmask:0xf dim:SQ_RSRC_IMG_2D unorm ; GFX11-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll index a47ecb2c5d7f2..f2ee579e0d128 100644 --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -982,60 +982,51 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-NEXT: s_movk_i32 s4, 0x207 ; GCN-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b64 s[8:9], 0 -; GCN-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b64 s[6:7], 0 -; GCN-NEXT: s_branch .LBB5_3 -; GCN-NEXT: .LBB5_1: ; %Flow -; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1 -; GCN-NEXT: s_or_b64 exec, exec, s[12:13] -; GCN-NEXT: .LBB5_2: ; %bb10 -; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1 +; GCN-NEXT: s_branch .LBB5_2 +; GCN-NEXT: .LBB5_1: ; %bb10 +; GCN-NEXT: ; in Loop: Header=BB5_2 Depth=1 ; GCN-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN-NEXT: s_and_b64 s[6:7], exec, s[4:5] ; GCN-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] ; GCN-NEXT: s_mov_b64 s[6:7], 0 ; GCN-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GCN-NEXT: s_cbranch_execz .LBB5_7 -; GCN-NEXT: .LBB5_3: ; %bb1 +; GCN-NEXT: s_cbranch_execz .LBB5_5 +; GCN-NEXT: .LBB5_2: ; %bb1 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: s_and_b64 s[10:11], exec, vcc ; GCN-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] ; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GCN-NEXT: s_cbranch_execnz .LBB5_3 -; GCN-NEXT: ; %bb.4: ; %bb2 -; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1 +; GCN-NEXT: s_cbranch_execnz .LBB5_2 +; GCN-NEXT: ; %bb.3: ; %bb2 +; GCN-NEXT: ; in Loop: Header=BB5_2 Depth=1 ; GCN-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-NEXT: v_mov_b32_e32 v8, v7 -; GCN-NEXT: v_mov_b32_e32 v2, v7 -; GCN-NEXT: v_mov_b32_e32 v6, v7 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: s_and_saveexec_b64 s[10:11], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: ; %bb.5: ; %bb4 -; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1 -; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v8, v7 -; GCN-NEXT: v_mov_b32_e32 v2, v7 -; GCN-NEXT: v_mov_b32_e32 v6, v7 +; GCN-NEXT: s_cbranch_execz .LBB5_1 +; GCN-NEXT: ; %bb.4: ; %bb4 +; GCN-NEXT: ; in Loop: Header=BB5_2 Depth=1 +; GCN-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_gt_f32_e64 s[6:7], 0, v0 +; GCN-NEXT: v_cmp_gt_f32_e64 s[6:7], 0, v2 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: s_and_saveexec_b64 s[12:13], s[6:7] -; GCN-NEXT: s_cbranch_execz .LBB5_1 -; GCN-NEXT: ; %bb.6: ; %bb8 -; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1 -; GCN-NEXT: v_mov_b32_e32 v8, v7 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6 +; GCN-NEXT: s_or_b64 exec, exec, s[12:13] ; GCN-NEXT: s_branch .LBB5_1 -; GCN-NEXT: .LBB5_7: ; %bb12 +; GCN-NEXT: .LBB5_5: ; %bb12 ; GCN-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v0, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -1043,135 +1034,126 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0: ; %bb.0: ; %bb ; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] ; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 s[4:5], 0 ; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-O0-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane +; GCN-O0-NEXT: ; implicit-def: $vgpr6 : SGPR spill to VGPR lane ; GCN-O0-NEXT: s_waitcnt expcnt(1) -; GCN-O0-NEXT: v_writelane_b32 v7, s6, 0 -; GCN-O0-NEXT: v_writelane_b32 v7, s7, 1 -; GCN-O0-NEXT: v_writelane_b32 v7, s4, 2 -; GCN-O0-NEXT: v_writelane_b32 v7, s5, 3 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: v_writelane_b32 v6, s6, 0 +; GCN-O0-NEXT: v_writelane_b32 v6, s7, 1 +; GCN-O0-NEXT: v_writelane_b32 v6, s4, 2 +; GCN-O0-NEXT: v_writelane_b32 v6, s5, 3 +; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 +; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[12:13] ; GCN-O0-NEXT: .LBB5_1: ; %bb1 ; GCN-O0-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[12:13] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s8, v7, 2 -; GCN-O0-NEXT: v_readlane_b32 s9, v7, 3 -; GCN-O0-NEXT: v_readlane_b32 s6, v7, 0 -; GCN-O0-NEXT: v_readlane_b32 s7, v7, 1 -; GCN-O0-NEXT: v_writelane_b32 v7, s6, 4 -; GCN-O0-NEXT: v_writelane_b32 v7, s7, 5 +; GCN-O0-NEXT: v_readlane_b32 s8, v6, 2 +; GCN-O0-NEXT: v_readlane_b32 s9, v6, 3 +; GCN-O0-NEXT: v_readlane_b32 s6, v6, 0 +; GCN-O0-NEXT: v_readlane_b32 s7, v6, 1 +; GCN-O0-NEXT: v_writelane_b32 v6, s6, 4 +; GCN-O0-NEXT: v_writelane_b32 v6, s7, 5 ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b32 s4, 0x207 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_cmp_lt_i32_e64 s[4:5], v0, s4 ; GCN-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GCN-O0-NEXT: v_writelane_b32 v7, s4, 6 -; GCN-O0-NEXT: v_writelane_b32 v7, s5, 7 -; GCN-O0-NEXT: v_writelane_b32 v7, s6, 0 -; GCN-O0-NEXT: v_writelane_b32 v7, s7, 1 +; GCN-O0-NEXT: v_writelane_b32 v6, s4, 6 +; GCN-O0-NEXT: v_writelane_b32 v6, s5, 7 +; GCN-O0-NEXT: v_writelane_b32 v6, s6, 0 +; GCN-O0-NEXT: v_writelane_b32 v6, s7, 1 ; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-O0-NEXT: v_writelane_b32 v7, s6, 2 -; GCN-O0-NEXT: v_writelane_b32 v7, s7, 3 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: v_writelane_b32 v6, s6, 2 +; GCN-O0-NEXT: v_writelane_b32 v6, s7, 3 +; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 +; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[12:13] ; GCN-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN-O0-NEXT: s_cbranch_execnz .LBB5_1 ; GCN-O0-NEXT: ; %bb.2: ; %bb2 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[12:13] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v7, 6 -; GCN-O0-NEXT: v_readlane_b32 s5, v7, 7 +; GCN-O0-NEXT: v_readlane_b32 s4, v6, 6 +; GCN-O0-NEXT: v_readlane_b32 s5, v6, 7 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b32 s6, 0 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[4:5], v0, s6 ; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 -; GCN-O0-NEXT: v_writelane_b32 v7, s4, 8 -; GCN-O0-NEXT: v_writelane_b32 v7, s5, 9 -; GCN-O0-NEXT: s_mov_b32 s4, 0 +; GCN-O0-NEXT: v_writelane_b32 v6, s4, 8 +; GCN-O0-NEXT: v_writelane_b32 v6, s5, 9 +; GCN-O0-NEXT: s_mov_b64 s[4:5], 0 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 ; GCN-O0-NEXT: v_mov_b32_e32 v4, s4 -; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v1, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v5 -; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s5 +; GCN-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v2, v4 +; GCN-O0-NEXT: v_mov_b32_e32 v3, v5 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 s[4:5], exec -; GCN-O0-NEXT: v_writelane_b32 v7, s4, 10 -; GCN-O0-NEXT: v_writelane_b32 v7, s5, 11 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: v_writelane_b32 v6, s4, 10 +; GCN-O0-NEXT: v_writelane_b32 v6, s5, 11 +; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 +; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[12:13] ; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] ; GCN-O0-NEXT: s_cbranch_execz .LBB5_5 ; GCN-O0-NEXT: ; %bb.3: ; %bb4 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[12:13] ; GCN-O0-NEXT: ; implicit-def: $sgpr4 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 ; GCN-O0-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GCN-O0-NEXT: s_mov_b32 s4, 0 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_cmp_lt_f32_e64 s[6:7], v0, s4 +; GCN-O0-NEXT: s_mov_b64 s[4:5], 0 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 ; GCN-O0-NEXT: v_mov_b32_e32 v4, s4 -; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v1, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v5 -; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s5 +; GCN-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v2, v4 +; GCN-O0-NEXT: v_mov_b32_e32 v3, v5 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 s[4:5], exec -; GCN-O0-NEXT: v_writelane_b32 v7, s4, 12 -; GCN-O0-NEXT: v_writelane_b32 v7, s5, 13 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: v_writelane_b32 v6, s4, 12 +; GCN-O0-NEXT: v_writelane_b32 v6, s5, 13 +; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 +; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[12:13] ; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] ; GCN-O0-NEXT: s_cbranch_execz .LBB5_6 ; GCN-O0-NEXT: ; %bb.4: ; %bb8 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: s_mov_b32 s10, 0 -; GCN-O0-NEXT: ; implicit-def: $sgpr4 -; GCN-O0-NEXT: ; implicit-def: $sgpr5 -; GCN-O0-NEXT: ; implicit-def: $sgpr9 -; GCN-O0-NEXT: ; implicit-def: $sgpr5 -; GCN-O0-NEXT: ; implicit-def: $sgpr8 -; GCN-O0-NEXT: ; implicit-def: $sgpr5 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 -; GCN-O0-NEXT: s_mov_b32 s5, s10 -; GCN-O0-NEXT: s_mov_b32 s6, s9 -; GCN-O0-NEXT: s_mov_b32 s7, s8 +; GCN-O0-NEXT: s_mov_b64 s[8:9], 0 +; GCN-O0-NEXT: s_mov_b64 s[4:5], s[8:9] +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[8:9] ; GCN-O0-NEXT: s_waitcnt expcnt(1) ; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 ; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 @@ -1184,13 +1166,13 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: s_branch .LBB5_6 ; GCN-O0-NEXT: .LBB5_5: ; %Flow2 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[12:13] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v7, 10 -; GCN-O0-NEXT: v_readlane_b32 s5, v7, 11 +; GCN-O0-NEXT: v_readlane_b32 s4, v6, 10 +; GCN-O0-NEXT: v_readlane_b32 s5, v6, 11 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -1207,13 +1189,13 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: s_branch .LBB5_7 ; GCN-O0-NEXT: .LBB5_6: ; %Flow ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[12:13] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v7, 12 -; GCN-O0-NEXT: v_readlane_b32 s5, v7, 13 +; GCN-O0-NEXT: v_readlane_b32 s4, v6, 12 +; GCN-O0-NEXT: v_readlane_b32 s5, v6, 13 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload @@ -1230,52 +1212,52 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: s_branch .LBB5_5 ; GCN-O0-NEXT: .LBB5_7: ; %bb10 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[12:13] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s6, v7, 8 -; GCN-O0-NEXT: v_readlane_b32 s7, v7, 9 +; GCN-O0-NEXT: v_readlane_b32 s6, v6, 8 +; GCN-O0-NEXT: v_readlane_b32 s7, v6, 9 ; GCN-O0-NEXT: s_mov_b64 s[4:5], -1 -; GCN-O0-NEXT: v_writelane_b32 v7, s4, 14 -; GCN-O0-NEXT: v_writelane_b32 v7, s5, 15 +; GCN-O0-NEXT: v_writelane_b32 v6, s4, 14 +; GCN-O0-NEXT: v_writelane_b32 v6, s5, 15 ; GCN-O0-NEXT: s_mov_b64 s[4:5], exec -; GCN-O0-NEXT: v_writelane_b32 v7, s4, 16 -; GCN-O0-NEXT: v_writelane_b32 v7, s5, 17 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: v_writelane_b32 v6, s4, 16 +; GCN-O0-NEXT: v_writelane_b32 v6, s5, 17 +; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 +; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[12:13] ; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] ; GCN-O0-NEXT: s_cbranch_execz .LBB5_9 ; GCN-O0-NEXT: ; %bb.8: ; %Flow1 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[12:13] ; GCN-O0-NEXT: s_mov_b64 s[4:5], 0 ; GCN-O0-NEXT: s_xor_b64 s[4:5], exec, -1 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_writelane_b32 v7, s4, 14 -; GCN-O0-NEXT: v_writelane_b32 v7, s5, 15 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: v_writelane_b32 v6, s4, 14 +; GCN-O0-NEXT: v_writelane_b32 v6, s5, 15 +; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 +; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[12:13] ; GCN-O0-NEXT: .LBB5_9: ; %Flow3 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[12:13] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s8, v7, 16 -; GCN-O0-NEXT: v_readlane_b32 s9, v7, 17 +; GCN-O0-NEXT: v_readlane_b32 s8, v6, 16 +; GCN-O0-NEXT: v_readlane_b32 s9, v6, 17 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-O0-NEXT: v_readlane_b32 s6, v7, 4 -; GCN-O0-NEXT: v_readlane_b32 s7, v7, 5 -; GCN-O0-NEXT: v_readlane_b32 s4, v7, 14 -; GCN-O0-NEXT: v_readlane_b32 s5, v7, 15 +; GCN-O0-NEXT: v_readlane_b32 s6, v6, 4 +; GCN-O0-NEXT: v_readlane_b32 s7, v6, 5 +; GCN-O0-NEXT: v_readlane_b32 s4, v6, 14 +; GCN-O0-NEXT: v_readlane_b32 s5, v6, 15 ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload @@ -1284,16 +1266,16 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; GCN-O0-NEXT: s_mov_b64 s[6:7], 0 ; GCN-O0-NEXT: s_mov_b64 s[8:9], s[4:5] -; GCN-O0-NEXT: v_writelane_b32 v7, s8, 0 -; GCN-O0-NEXT: v_writelane_b32 v7, s9, 1 -; GCN-O0-NEXT: v_writelane_b32 v7, s6, 2 -; GCN-O0-NEXT: v_writelane_b32 v7, s7, 3 +; GCN-O0-NEXT: v_writelane_b32 v6, s8, 0 +; GCN-O0-NEXT: v_writelane_b32 v6, s9, 1 +; GCN-O0-NEXT: v_writelane_b32 v6, s6, 2 +; GCN-O0-NEXT: v_writelane_b32 v6, s7, 3 ; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-O0-NEXT: v_writelane_b32 v7, s6, 18 -; GCN-O0-NEXT: v_writelane_b32 v7, s7, 19 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: v_writelane_b32 v6, s6, 18 +; GCN-O0-NEXT: v_writelane_b32 v6, s7, 19 +; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 +; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[12:13] ; GCN-O0-NEXT: s_waitcnt vmcnt(4) ; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_waitcnt vmcnt(4) @@ -1305,13 +1287,13 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN-O0-NEXT: s_cbranch_execnz .LBB5_1 ; GCN-O0-NEXT: ; %bb.10: ; %bb12 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(4) -; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[12:13] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v7, 18 -; GCN-O0-NEXT: v_readlane_b32 s5, v7, 19 +; GCN-O0-NEXT: v_readlane_b32 s4, v6, 18 +; GCN-O0-NEXT: v_readlane_b32 s5, v6, 19 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-O0-NEXT: ; %bb.11: ; %bb12 ; GCN-O0-NEXT: s_waitcnt expcnt(3) @@ -1344,7 +1326,7 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-O0-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index e72f3d3ce993a..e0f7fe0d0bc31 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -2840,7 +2840,7 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(ptr addrspace(1) %out ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 -; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -2851,9 +2851,10 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(ptr addrspace(1) %out ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; CI-LABEL: s_test_canonicalize_undef_v4f16: @@ -2861,8 +2862,8 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(ptr addrspace(1) %out ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: v_mov_b32_e32 v1, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm @@ -2871,10 +2872,9 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(ptr addrspace(1) %out ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef) store <4 x half> %canonicalized, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll index 4e12a30c6f6f4..996c485fe9a8d 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll @@ -1363,7 +1363,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_mov_b32_e32 v1, v0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 @@ -1374,29 +1374,28 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: test_fold_canonicalize_p0_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_fold_canonicalize_p0_f64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v1, v0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double 0.0) store double %canonicalized, ptr addrspace(1) %out @@ -1574,7 +1573,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_mov_b32_e32 v1, v0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 @@ -1585,29 +1584,28 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v1, v0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double)) store double %canonicalized, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-nondeterminism.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-nondeterminism.ll index 1e469b1951009..e475a713e243a 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-nondeterminism.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-nondeterminism.ll @@ -5,7 +5,6 @@ define amdgpu_gs void @f(i32 inreg %arg, i32 %arg1, i32 %arg2) { ; CHECK-LABEL: f: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_cmp_eq_u32 s0, 0 -; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %bb3 ; CHECK-NEXT: v_mov_b32_e32 v5, v0 @@ -15,10 +14,9 @@ define amdgpu_gs void @f(i32 inreg %arg, i32 %arg1, i32 %arg2) { ; CHECK-NEXT: v_mov_b32_e32 v5, 1 ; CHECK-NEXT: .LBB0_3: ; %bb4 ; CHECK-NEXT: v_mov_b32_e32 v6, 0 -; CHECK-NEXT: s_mov_b32 s1, s0 -; CHECK-NEXT: s_mov_b32 s2, s0 -; CHECK-NEXT: s_mov_b32 s3, s0 -; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) +; CHECK-NEXT: s_mov_b64 s[0:1], 0 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; CHECK-NEXT: s_mov_b64 s[2:3], s[0:1] ; CHECK-NEXT: v_mov_b32_e32 v7, v6 ; CHECK-NEXT: v_mov_b32_e32 v8, v6 ; CHECK-NEXT: v_mov_b32_e32 v2, v6 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index e7c8604776ce0..750026fb23dae 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -17,16 +17,15 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX9-LABEL: zero_init_kernel: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 -; GFX9-NEXT: s_mov_b32 s1, s0 -; GFX9-NEXT: s_mov_b32 s2, s0 -; GFX9-NEXT: s_mov_b32 s3, s0 +; GFX9-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 @@ -39,10 +38,8 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX10-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 -; GFX10-NEXT: s_mov_b32 s0, 0 -; GFX10-NEXT: s_mov_b32 s1, s0 -; GFX10-NEXT: s_mov_b32 s2, s0 -; GFX10-NEXT: s_mov_b32 s3, s0 +; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 @@ -55,11 +52,9 @@ define amdgpu_kernel void @zero_init_kernel() { ; ; GFX11-LABEL: zero_init_kernel: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_mov_b32 s1, s0 -; GFX11-NEXT: s_mov_b32 s2, s0 -; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_clause 0x3 @@ -71,11 +66,9 @@ define amdgpu_kernel void @zero_init_kernel() { ; ; GFX12-LABEL: zero_init_kernel: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_mov_b64 s[0:1], 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s1, s0 -; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_clause 0x3 @@ -90,18 +83,17 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX9-PAL-NEXT: s_getpc_b64 s[12:13] ; GFX9-PAL-NEXT: s_mov_b32 s12, s0 ; GFX9-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 -; GFX9-PAL-NEXT: s_mov_b32 s0, 0 -; GFX9-PAL-NEXT: s_mov_b32 s1, s0 -; GFX9-PAL-NEXT: s_mov_b32 s2, s0 -; GFX9-PAL-NEXT: s_mov_b32 s3, s0 +; GFX9-PAL-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-PAL-NEXT: s_mov_b64 s[2:3], s[0:1] +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-PAL-NEXT: s_and_b32 s13, s13, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s12, s11 -; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-PAL-NEXT: s_mov_b32 s0, 0 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 @@ -110,10 +102,8 @@ define amdgpu_kernel void @zero_init_kernel() { ; ; GFX942-LABEL: zero_init_kernel: ; GFX942: ; %bb.0: -; GFX942-NEXT: s_mov_b32 s0, 0 -; GFX942-NEXT: s_mov_b32 s1, s0 -; GFX942-NEXT: s_mov_b32 s2, s0 -; GFX942-NEXT: s_mov_b32 s3, s0 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:48 @@ -133,14 +123,13 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX1010-PAL-NEXT: s_addc_u32 s13, s13, 0 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 -; GFX1010-PAL-NEXT: s_mov_b32 s1, s0 -; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 -; GFX1010-PAL-NEXT: s_mov_b32 s3, s0 +; GFX1010-PAL-NEXT: s_mov_b64 s[0:1], 0 +; GFX1010-PAL-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, s0 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 @@ -158,10 +147,8 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX1030-PAL-NEXT: s_addc_u32 s13, s13, 0 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 -; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 -; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 -; GFX1030-PAL-NEXT: s_mov_b32 s3, s0 +; GFX1030-PAL-NEXT: s_mov_b64 s[0:1], 0 +; GFX1030-PAL-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, s0 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2 @@ -174,11 +161,9 @@ define amdgpu_kernel void @zero_init_kernel() { ; ; GFX11-PAL-LABEL: zero_init_kernel: ; GFX11-PAL: ; %bb.0: -; GFX11-PAL-NEXT: s_mov_b32 s0, 0 +; GFX11-PAL-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-PAL-NEXT: s_mov_b32 s1, s0 -; GFX11-PAL-NEXT: s_mov_b32 s2, s0 -; GFX11-PAL-NEXT: s_mov_b32 s3, s0 +; GFX11-PAL-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-PAL-NEXT: s_clause 0x3 @@ -190,11 +175,9 @@ define amdgpu_kernel void @zero_init_kernel() { ; ; GFX12-PAL-LABEL: zero_init_kernel: ; GFX12-PAL: ; %bb.0: -; GFX12-PAL-NEXT: s_mov_b32 s0, 0 +; GFX12-PAL-NEXT: s_mov_b64 s[0:1], 0 ; GFX12-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-PAL-NEXT: s_mov_b32 s1, s0 -; GFX12-PAL-NEXT: s_mov_b32 s2, s0 -; GFX12-PAL-NEXT: s_mov_b32 s3, s0 +; GFX12-PAL-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX12-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-PAL-NEXT: s_clause 0x3 @@ -212,10 +195,8 @@ define void @zero_init_foo() { ; GFX9-LABEL: zero_init_foo: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: s_mov_b32 s1, s0 -; GFX9-NEXT: s_mov_b32 s2, s0 -; GFX9-NEXT: s_mov_b32 s3, s0 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -230,10 +211,8 @@ define void @zero_init_foo() { ; GFX10-LABEL: zero_init_foo: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s0, 0 -; GFX10-NEXT: s_mov_b32 s1, s0 -; GFX10-NEXT: s_mov_b32 s2, s0 -; GFX10-NEXT: s_mov_b32 s3, s0 +; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 @@ -247,11 +226,9 @@ define void @zero_init_foo() { ; GFX11-LABEL: zero_init_foo: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_mov_b32 s1, s0 -; GFX11-NEXT: s_mov_b32 s2, s0 -; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_clause 0x3 @@ -268,13 +245,11 @@ define void @zero_init_foo() { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 s1, s0 -; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b64 s[0:1], 0 ; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:48 @@ -286,10 +261,8 @@ define void @zero_init_foo() { ; GFX9-PAL-LABEL: zero_init_foo: ; GFX9-PAL: ; %bb.0: ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-PAL-NEXT: s_mov_b32 s0, 0 -; GFX9-PAL-NEXT: s_mov_b32 s1, s0 -; GFX9-PAL-NEXT: s_mov_b32 s2, s0 -; GFX9-PAL-NEXT: s_mov_b32 s3, s0 +; GFX9-PAL-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-PAL-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 @@ -304,10 +277,8 @@ define void @zero_init_foo() { ; GFX942-LABEL: zero_init_foo: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_mov_b32 s0, 0 -; GFX942-NEXT: s_mov_b32 s1, s0 -; GFX942-NEXT: s_mov_b32 s2, s0 -; GFX942-NEXT: s_mov_b32 s3, s0 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:48 @@ -320,10 +291,8 @@ define void @zero_init_foo() { ; GFX10-PAL-LABEL: zero_init_foo: ; GFX10-PAL: ; %bb.0: ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-PAL-NEXT: s_mov_b32 s0, 0 -; GFX10-PAL-NEXT: s_mov_b32 s1, s0 -; GFX10-PAL-NEXT: s_mov_b32 s2, s0 -; GFX10-PAL-NEXT: s_mov_b32 s3, s0 +; GFX10-PAL-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-PAL-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 @@ -337,11 +306,9 @@ define void @zero_init_foo() { ; GFX11-PAL-LABEL: zero_init_foo: ; GFX11-PAL: ; %bb.0: ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-PAL-NEXT: s_mov_b32 s0, 0 +; GFX11-PAL-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-PAL-NEXT: s_mov_b32 s1, s0 -; GFX11-PAL-NEXT: s_mov_b32 s2, s0 -; GFX11-PAL-NEXT: s_mov_b32 s3, s0 +; GFX11-PAL-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-PAL-NEXT: s_clause 0x3 @@ -358,13 +325,11 @@ define void @zero_init_foo() { ; GFX12-PAL-NEXT: s_wait_samplecnt 0x0 ; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 -; GFX12-PAL-NEXT: s_mov_b32 s0, 0 -; GFX12-PAL-NEXT: s_wait_alu 0xfffe -; GFX12-PAL-NEXT: s_mov_b32 s1, s0 -; GFX12-PAL-NEXT: s_mov_b32 s2, s0 -; GFX12-PAL-NEXT: s_mov_b32 s3, s0 +; GFX12-PAL-NEXT: s_mov_b64 s[0:1], 0 ; GFX12-PAL-NEXT: s_wait_alu 0xfffe +; GFX12-PAL-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX12-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-PAL-NEXT: s_wait_alu 0xfffe ; GFX12-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-PAL-NEXT: s_clause 0x3 ; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:48 @@ -1043,13 +1008,13 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s1, s0 -; GFX9-NEXT: s_mov_b32 s2, s0 -; GFX9-NEXT: s_mov_b32 s3, s0 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:256 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 @@ -1064,10 +1029,8 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 ; GFX10-NEXT: scratch_load_dword v0, off, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_mov_b32 s0, 0 -; GFX10-NEXT: s_mov_b32 s1, s0 -; GFX10-NEXT: s_mov_b32 s2, s0 -; GFX10-NEXT: s_mov_b32 s3, s0 +; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 @@ -1082,11 +1045,9 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX11: ; %bb.0: ; GFX11-NEXT: scratch_load_b32 v0, off, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_mov_b32 s1, s0 -; GFX11-NEXT: s_mov_b32 s2, s0 -; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_clause 0x3 @@ -1100,11 +1061,9 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX12: ; %bb.0: ; GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_mov_b64 s[0:1], 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s1, s0 -; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_clause 0x3 @@ -1120,19 +1079,19 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX9-PAL-NEXT: s_mov_b32 s12, s0 ; GFX9-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 ; GFX9-PAL-NEXT: s_mov_b32 s0, 0 -; GFX9-PAL-NEXT: s_mov_b32 s1, s0 -; GFX9-PAL-NEXT: s_mov_b32 s2, s0 -; GFX9-PAL-NEXT: s_mov_b32 s3, s0 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-PAL-NEXT: s_and_b32 s13, s13, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s12, s11 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-PAL-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-PAL-NEXT: s_mov_b32 s0, 0 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:256 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 @@ -1143,10 +1102,8 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX942: ; %bb.0: ; GFX942-NEXT: scratch_load_dword v0, off, off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: s_mov_b32 s0, 0 -; GFX942-NEXT: s_mov_b32 s1, s0 -; GFX942-NEXT: s_mov_b32 s2, s0 -; GFX942-NEXT: s_mov_b32 s3, s0 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256 @@ -1169,13 +1126,14 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s0 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1010-PAL-NEXT: s_mov_b32 s1, s0 -; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 -; GFX1010-PAL-NEXT: s_mov_b32 s3, s0 +; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1010-PAL-NEXT: s_mov_b64 s[0:1], 0 +; GFX1010-PAL-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, s0 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:256 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 @@ -1195,10 +1153,8 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 -; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 -; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 -; GFX1030-PAL-NEXT: s_mov_b32 s3, s0 +; GFX1030-PAL-NEXT: s_mov_b64 s[0:1], 0 +; GFX1030-PAL-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, s0 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2 @@ -1213,11 +1169,9 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX11-PAL: ; %bb.0: ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: s_mov_b32 s0, 0 +; GFX11-PAL-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-PAL-NEXT: s_mov_b32 s1, s0 -; GFX11-PAL-NEXT: s_mov_b32 s2, s0 -; GFX11-PAL-NEXT: s_mov_b32 s3, s0 +; GFX11-PAL-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-PAL-NEXT: s_clause 0x3 @@ -1231,11 +1185,9 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX12-PAL: ; %bb.0: ; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 -; GFX12-PAL-NEXT: s_mov_b32 s0, 0 +; GFX12-PAL-NEXT: s_mov_b64 s[0:1], 0 ; GFX12-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-PAL-NEXT: s_mov_b32 s1, s0 -; GFX12-PAL-NEXT: s_mov_b32 s2, s0 -; GFX12-PAL-NEXT: s_mov_b32 s3, s0 +; GFX12-PAL-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX12-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-PAL-NEXT: s_clause 0x3 @@ -1258,10 +1210,8 @@ define void @zero_init_small_offset_foo() { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: scratch_load_dword v0, off, s32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: s_mov_b32 s1, s0 -; GFX9-NEXT: s_mov_b32 s2, s0 -; GFX9-NEXT: s_mov_b32 s3, s0 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -1278,10 +1228,8 @@ define void @zero_init_small_offset_foo() { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: scratch_load_dword v0, off, s32 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_mov_b32 s0, 0 -; GFX10-NEXT: s_mov_b32 s1, s0 -; GFX10-NEXT: s_mov_b32 s2, s0 -; GFX10-NEXT: s_mov_b32 s3, s0 +; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 @@ -1297,11 +1245,9 @@ define void @zero_init_small_offset_foo() { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: scratch_load_b32 v0, off, s32 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_mov_b32 s1, s0 -; GFX11-NEXT: s_mov_b32 s2, s0 -; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_clause 0x3 @@ -1320,13 +1266,11 @@ define void @zero_init_small_offset_foo() { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: scratch_load_b32 v0, off, s32 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 s1, s0 -; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b64 s[0:1], 0 ; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:256 @@ -1340,10 +1284,8 @@ define void @zero_init_small_offset_foo() { ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s32 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: s_mov_b32 s0, 0 -; GFX9-PAL-NEXT: s_mov_b32 s1, s0 -; GFX9-PAL-NEXT: s_mov_b32 s2, s0 -; GFX9-PAL-NEXT: s_mov_b32 s3, s0 +; GFX9-PAL-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-PAL-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 @@ -1360,10 +1302,8 @@ define void @zero_init_small_offset_foo() { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: scratch_load_dword v0, off, s32 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: s_mov_b32 s0, 0 -; GFX942-NEXT: s_mov_b32 s1, s0 -; GFX942-NEXT: s_mov_b32 s2, s0 -; GFX942-NEXT: s_mov_b32 s3, s0 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256 @@ -1378,10 +1318,8 @@ define void @zero_init_small_offset_foo() { ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-PAL-NEXT: scratch_load_dword v0, off, s32 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-PAL-NEXT: s_mov_b32 s0, 0 -; GFX10-PAL-NEXT: s_mov_b32 s1, s0 -; GFX10-PAL-NEXT: s_mov_b32 s2, s0 -; GFX10-PAL-NEXT: s_mov_b32 s3, s0 +; GFX10-PAL-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-PAL-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 @@ -1397,11 +1335,9 @@ define void @zero_init_small_offset_foo() { ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s32 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: s_mov_b32 s0, 0 +; GFX11-PAL-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-PAL-NEXT: s_mov_b32 s1, s0 -; GFX11-PAL-NEXT: s_mov_b32 s2, s0 -; GFX11-PAL-NEXT: s_mov_b32 s3, s0 +; GFX11-PAL-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-PAL-NEXT: s_clause 0x3 @@ -1420,13 +1356,11 @@ define void @zero_init_small_offset_foo() { ; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s32 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 -; GFX12-PAL-NEXT: s_mov_b32 s0, 0 -; GFX12-PAL-NEXT: s_wait_alu 0xfffe -; GFX12-PAL-NEXT: s_mov_b32 s1, s0 -; GFX12-PAL-NEXT: s_mov_b32 s2, s0 -; GFX12-PAL-NEXT: s_mov_b32 s3, s0 +; GFX12-PAL-NEXT: s_mov_b64 s[0:1], 0 ; GFX12-PAL-NEXT: s_wait_alu 0xfffe +; GFX12-PAL-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX12-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-PAL-NEXT: s_wait_alu 0xfffe ; GFX12-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-PAL-NEXT: s_clause 0x3 ; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:256 @@ -2237,9 +2171,8 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s1, s0 -; GFX9-NEXT: s_mov_b32 s2, s0 -; GFX9-NEXT: s_mov_b32 s3, s0 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -2259,10 +2192,8 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 ; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_mov_b32 s0, 0 -; GFX10-NEXT: s_mov_b32 s1, s0 -; GFX10-NEXT: s_mov_b32 s2, s0 -; GFX10-NEXT: s_mov_b32 s3, s0 +; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 @@ -2278,11 +2209,9 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; GFX11: ; %bb.0: ; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_mov_b32 s1, s0 -; GFX11-NEXT: s_mov_b32 s2, s0 -; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_movk_i32 s0, 0x4004 @@ -2297,11 +2226,9 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; GFX12: ; %bb.0: ; GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_mov_b64 s[0:1], 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s1, s0 -; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_clause 0x3 @@ -2317,15 +2244,14 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; GFX9-PAL-NEXT: s_mov_b32 s12, s0 ; GFX9-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 ; GFX9-PAL-NEXT: s_mov_b32 s0, 0 -; GFX9-PAL-NEXT: s_mov_b32 s1, s0 -; GFX9-PAL-NEXT: s_mov_b32 s2, s0 -; GFX9-PAL-NEXT: s_mov_b32 s3, s0 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-PAL-NEXT: s_and_b32 s13, s13, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s12, s11 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:4 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-PAL-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 @@ -2341,10 +2267,8 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; GFX942: ; %bb.0: ; GFX942-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: s_mov_b32 s0, 0 -; GFX942-NEXT: s_mov_b32 s1, s0 -; GFX942-NEXT: s_mov_b32 s2, s0 -; GFX942-NEXT: s_mov_b32 s3, s0 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-NEXT: s_movk_i32 s0, 0x4004 @@ -2368,9 +2292,9 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s0 offset:4 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1010-PAL-NEXT: s_mov_b32 s1, s0 -; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 -; GFX1010-PAL-NEXT: s_mov_b32 s3, s0 +; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1010-PAL-NEXT: s_mov_b64 s[0:1], 0 +; GFX1010-PAL-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, s0 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2 @@ -2395,10 +2319,8 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 -; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 -; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 -; GFX1030-PAL-NEXT: s_mov_b32 s3, s0 +; GFX1030-PAL-NEXT: s_mov_b64 s[0:1], 0 +; GFX1030-PAL-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, s0 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2 @@ -2414,11 +2336,9 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; GFX11-PAL: ; %bb.0: ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: s_mov_b32 s0, 0 +; GFX11-PAL-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-PAL-NEXT: s_mov_b32 s1, s0 -; GFX11-PAL-NEXT: s_mov_b32 s2, s0 -; GFX11-PAL-NEXT: s_mov_b32 s3, s0 +; GFX11-PAL-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-PAL-NEXT: s_movk_i32 s0, 0x4004 @@ -2433,11 +2353,9 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; GFX12-PAL: ; %bb.0: ; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 -; GFX12-PAL-NEXT: s_mov_b32 s0, 0 +; GFX12-PAL-NEXT: s_mov_b64 s[0:1], 0 ; GFX12-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-PAL-NEXT: s_mov_b32 s1, s0 -; GFX12-PAL-NEXT: s_mov_b32 s2, s0 -; GFX12-PAL-NEXT: s_mov_b32 s3, s0 +; GFX12-PAL-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX12-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-PAL-NEXT: s_clause 0x3 @@ -2460,10 +2378,8 @@ define void @zero_init_large_offset_foo() { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: scratch_load_dword v0, off, s32 offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: s_mov_b32 s1, s0 -; GFX9-NEXT: s_mov_b32 s2, s0 -; GFX9-NEXT: s_mov_b32 s3, s0 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -2484,10 +2400,8 @@ define void @zero_init_large_offset_foo() { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: scratch_load_dword v0, off, s32 offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_mov_b32 s0, 0 -; GFX10-NEXT: s_mov_b32 s1, s0 -; GFX10-NEXT: s_mov_b32 s2, s0 -; GFX10-NEXT: s_mov_b32 s3, s0 +; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 @@ -2507,11 +2421,9 @@ define void @zero_init_large_offset_foo() { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: scratch_load_b32 v0, off, s32 offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_mov_b32 s1, s0 -; GFX11-NEXT: s_mov_b32 s2, s0 -; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_add_i32 s0, s32, 0x4004 @@ -2533,13 +2445,11 @@ define void @zero_init_large_offset_foo() { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: scratch_load_b32 v0, off, s32 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 s1, s0 -; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b64 s[0:1], 0 ; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16384 @@ -2553,10 +2463,8 @@ define void @zero_init_large_offset_foo() { ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s32 offset:4 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: s_mov_b32 s0, 0 -; GFX9-PAL-NEXT: s_mov_b32 s1, s0 -; GFX9-PAL-NEXT: s_mov_b32 s2, s0 -; GFX9-PAL-NEXT: s_mov_b32 s3, s0 +; GFX9-PAL-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-PAL-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 @@ -2577,10 +2485,8 @@ define void @zero_init_large_offset_foo() { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: scratch_load_dword v0, off, s32 offset:4 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: s_mov_b32 s0, 0 -; GFX942-NEXT: s_mov_b32 s1, s0 -; GFX942-NEXT: s_mov_b32 s2, s0 -; GFX942-NEXT: s_mov_b32 s3, s0 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-NEXT: s_add_i32 s0, s32, 0x4004 @@ -2599,10 +2505,8 @@ define void @zero_init_large_offset_foo() { ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s32 offset:4 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 -; GFX1010-PAL-NEXT: s_mov_b32 s1, s0 -; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 -; GFX1010-PAL-NEXT: s_mov_b32 s3, s0 +; GFX1010-PAL-NEXT: s_mov_b64 s[0:1], 0 +; GFX1010-PAL-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, s0 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2 @@ -2625,10 +2529,8 @@ define void @zero_init_large_offset_foo() { ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s32 offset:4 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 -; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 -; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 -; GFX1030-PAL-NEXT: s_mov_b32 s3, s0 +; GFX1030-PAL-NEXT: s_mov_b64 s[0:1], 0 +; GFX1030-PAL-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, s0 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2 @@ -2648,11 +2550,9 @@ define void @zero_init_large_offset_foo() { ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s32 offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: s_mov_b32 s0, 0 +; GFX11-PAL-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-PAL-NEXT: s_mov_b32 s1, s0 -; GFX11-PAL-NEXT: s_mov_b32 s2, s0 -; GFX11-PAL-NEXT: s_mov_b32 s3, s0 +; GFX11-PAL-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-PAL-NEXT: s_add_i32 s0, s32, 0x4004 @@ -2674,13 +2574,11 @@ define void @zero_init_large_offset_foo() { ; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s32 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 -; GFX12-PAL-NEXT: s_mov_b32 s0, 0 -; GFX12-PAL-NEXT: s_wait_alu 0xfffe -; GFX12-PAL-NEXT: s_mov_b32 s1, s0 -; GFX12-PAL-NEXT: s_mov_b32 s2, s0 -; GFX12-PAL-NEXT: s_mov_b32 s3, s0 +; GFX12-PAL-NEXT: s_mov_b64 s[0:1], 0 ; GFX12-PAL-NEXT: s_wait_alu 0xfffe +; GFX12-PAL-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX12-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-PAL-NEXT: s_wait_alu 0xfffe ; GFX12-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-PAL-NEXT: s_clause 0x3 ; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16384 @@ -4770,12 +4668,12 @@ bb: define amdgpu_ps void @large_offset() { ; GFX9-LABEL: large_offset: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:3024 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -4798,11 +4696,11 @@ define amdgpu_ps void @large_offset() { ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_movk_i32 s0, 0x810 ; GFX10-NEXT: s_add_i32 s1, s0, 0x3c0 -; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dwordx4 v[0:3], off, s1 glc dlc @@ -4819,12 +4717,11 @@ define amdgpu_ps void @large_offset() { ; GFX11-LABEL: large_offset: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_mov_b32 s0, 16 ; GFX11-NEXT: s_movk_i32 s1, 0x810 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:3024 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b128 v[0:3], off, off offset:3024 glc dlc @@ -4840,11 +4737,11 @@ define amdgpu_ps void @large_offset() { ; GFX12-LABEL: large_offset: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_movk_i32 s1, 0x800 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:3008 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_b128 v[0:3], off, off offset:3008 scope:SCOPE_SYS @@ -4863,9 +4760,9 @@ define amdgpu_ps void @large_offset() { ; GFX9-PAL-NEXT: s_mov_b32 s2, s0 ; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-PAL-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-PAL-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s0 @@ -4887,10 +4784,8 @@ define amdgpu_ps void @large_offset() { ; ; GFX942-LABEL: large_offset: ; GFX942: ; %bb.0: ; %bb -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX942-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:3024 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: scratch_load_dwordx4 v[0:3], off, off offset:3024 sc0 sc1 @@ -4917,11 +4812,11 @@ define amdgpu_ps void @large_offset() { ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-PAL-NEXT: s_movk_i32 s0, 0x810 ; GFX1010-PAL-NEXT: s_add_i32 s1, s0, 0x3c0 -; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, v0 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, v1 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, v0 -; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, v0 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-PAL-NEXT: scratch_load_dwordx4 v[0:3], off, s1 glc dlc @@ -4948,11 +4843,11 @@ define amdgpu_ps void @large_offset() { ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1030-PAL-NEXT: s_movk_i32 s0, 0x810 ; GFX1030-PAL-NEXT: s_add_i32 s1, s0, 0x3c0 -; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, v0 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, v1 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, v0 -; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, v0 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-PAL-NEXT: scratch_load_dwordx4 v[0:3], off, s1 glc dlc @@ -4969,12 +4864,11 @@ define amdgpu_ps void @large_offset() { ; GFX11-PAL-LABEL: large_offset: ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-PAL-NEXT: s_mov_b32 s0, 16 ; GFX11-PAL-NEXT: s_movk_i32 s1, 0x810 ; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-PAL-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:3024 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b128 v[0:3], off, off offset:3024 glc dlc @@ -4990,11 +4884,11 @@ define amdgpu_ps void @large_offset() { ; GFX12-PAL-LABEL: large_offset: ; GFX12-PAL: ; %bb.0: ; %bb ; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-PAL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-PAL-NEXT: s_mov_b32 s0, 0 ; GFX12-PAL-NEXT: s_movk_i32 s1, 0x800 ; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-PAL-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0 -; GFX12-PAL-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-PAL-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:3008 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 ; GFX12-PAL-NEXT: scratch_load_b128 v[0:3], off, off offset:3008 scope:SCOPE_SYS diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll index 6384fdba7a45a..232e394c5fc2d 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -1982,11 +1982,9 @@ define amdgpu_gfx <512 x i32> @return_512xi32() #0 { ; GFX11-LABEL: return_512xi32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mov_b32 s3, s0 -; GFX11-NEXT: s_mov_b32 s1, s0 -; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b64 s[0:1], 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 ; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-NEXT: s_clause 0x1f @@ -3186,11 +3184,9 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v60, s33 offset:1600 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: s_mov_b32 s1, s0 -; GFX11-NEXT: s_mov_b32 s2, s0 -; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_mov_b32 s36, s34 diff --git a/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll b/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll index 4aa49f2c9296d..11f931aacbc96 100644 --- a/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll +++ b/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll @@ -5,9 +5,9 @@ define amdgpu_kernel void @foo() { ; CHECK-LABEL: foo: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_mov_b64 s[0:1], src_shared_base -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s1 -; CHECK-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v0 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, 0 ; CHECK-NEXT: flat_store_b64 v[0:1], v[2:3] ; CHECK-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll index 8dbd6c5d133ea..8090a36c07b42 100644 --- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll +++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll @@ -6,100 +6,127 @@ define void @main(i1 %arg) #0 { ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-NEXT: v_writelane_b32 v5, s30, 0 -; CHECK-NEXT: v_writelane_b32 v5, s31, 1 -; CHECK-NEXT: v_writelane_b32 v5, s34, 2 -; CHECK-NEXT: v_writelane_b32 v5, s35, 3 -; CHECK-NEXT: v_writelane_b32 v5, s36, 4 -; CHECK-NEXT: v_writelane_b32 v5, s37, 5 -; CHECK-NEXT: v_writelane_b32 v5, s38, 6 -; CHECK-NEXT: s_getpc_b64 s[24:25] -; CHECK-NEXT: v_writelane_b32 v5, s39, 7 -; CHECK-NEXT: s_movk_i32 s20, 0xf0 -; CHECK-NEXT: s_mov_b32 s21, s24 -; CHECK-NEXT: v_writelane_b32 v5, s48, 8 -; CHECK-NEXT: s_load_dwordx16 s[4:19], s[20:21], 0x0 -; CHECK-NEXT: s_mov_b64 s[20:21], 0 -; CHECK-NEXT: v_writelane_b32 v5, s49, 9 -; CHECK-NEXT: s_load_dwordx4 s[20:23], s[20:21], 0x0 -; CHECK-NEXT: v_writelane_b32 v5, s50, 10 +; CHECK-NEXT: v_writelane_b32 v6, s30, 0 +; CHECK-NEXT: v_writelane_b32 v6, s31, 1 +; CHECK-NEXT: v_writelane_b32 v6, s34, 2 +; CHECK-NEXT: v_writelane_b32 v6, s35, 3 +; CHECK-NEXT: v_writelane_b32 v6, s36, 4 +; CHECK-NEXT: v_writelane_b32 v6, s37, 5 +; CHECK-NEXT: v_writelane_b32 v6, s38, 6 +; CHECK-NEXT: v_writelane_b32 v6, s39, 7 +; CHECK-NEXT: v_writelane_b32 v6, s48, 8 +; CHECK-NEXT: v_writelane_b32 v6, s49, 9 +; CHECK-NEXT: v_writelane_b32 v6, s50, 10 +; CHECK-NEXT: s_getpc_b64 s[8:9] +; CHECK-NEXT: s_mov_b64 s[12:13], 0 +; CHECK-NEXT: v_writelane_b32 v6, s51, 11 +; CHECK-NEXT: s_movk_i32 s10, 0xf0 +; CHECK-NEXT: s_mov_b32 s11, s8 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[12:13], 0x0 +; CHECK-NEXT: s_load_dwordx16 s[36:51], s[10:11], 0x0 +; CHECK-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane +; CHECK-NEXT: v_writelane_b32 v6, s52, 12 +; CHECK-NEXT: v_writelane_b32 v6, s53, 13 +; CHECK-NEXT: v_writelane_b32 v6, s54, 14 +; CHECK-NEXT: v_writelane_b32 v6, s55, 15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_movk_i32 s22, 0x130 -; CHECK-NEXT: s_mov_b32 s23, s24 -; CHECK-NEXT: v_writelane_b32 v5, s51, 11 -; CHECK-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0x0 -; CHECK-NEXT: s_mov_b32 s28, 0 -; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: v_mov_b32_e32 v2, s20 -; CHECK-NEXT: v_mov_b32_e32 v3, v1 -; CHECK-NEXT: s_mov_b32 s29, s28 -; CHECK-NEXT: s_mov_b32 s30, s28 -; CHECK-NEXT: s_mov_b32 s31, s28 -; CHECK-NEXT: image_sample_lz v3, v[2:3], s[12:19], s[28:31] dmask:0x1 -; CHECK-NEXT: v_mov_b32_e32 v2, v1 -; CHECK-NEXT: ; implicit-def: $vgpr6 : SGPR spill to VGPR lane -; CHECK-NEXT: v_writelane_b32 v5, s52, 12 +; CHECK-NEXT: v_writelane_b32 v7, s36, 0 +; CHECK-NEXT: v_writelane_b32 v7, s37, 1 +; CHECK-NEXT: v_writelane_b32 v7, s38, 2 +; CHECK-NEXT: v_writelane_b32 v7, s39, 3 +; CHECK-NEXT: v_writelane_b32 v7, s40, 4 +; CHECK-NEXT: v_writelane_b32 v7, s41, 5 +; CHECK-NEXT: v_writelane_b32 v7, s42, 6 +; CHECK-NEXT: v_writelane_b32 v7, s43, 7 +; CHECK-NEXT: v_writelane_b32 v6, s64, 16 +; CHECK-NEXT: v_writelane_b32 v7, s44, 8 +; CHECK-NEXT: v_writelane_b32 v6, s65, 17 +; CHECK-NEXT: v_writelane_b32 v7, s45, 9 +; CHECK-NEXT: v_writelane_b32 v6, s66, 18 +; CHECK-NEXT: s_movk_i32 s6, 0x130 +; CHECK-NEXT: s_mov_b32 s7, s8 +; CHECK-NEXT: v_writelane_b32 v7, s46, 10 +; CHECK-NEXT: v_writelane_b32 v6, s67, 19 +; CHECK-NEXT: s_load_dwordx16 s[52:67], s[6:7], 0x0 +; CHECK-NEXT: v_writelane_b32 v7, s47, 11 +; CHECK-NEXT: v_writelane_b32 v7, s48, 12 +; CHECK-NEXT: v_writelane_b32 v7, s49, 13 +; CHECK-NEXT: s_mov_b64 s[28:29], 0 +; CHECK-NEXT: v_mov_b32_e32 v3, 0 +; CHECK-NEXT: v_writelane_b32 v7, s50, 14 +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: s_mov_b64 s[30:31], s[28:29] +; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: v_writelane_b32 v7, s51, 15 +; CHECK-NEXT: image_sample_lz v1, v[1:2], s[44:51], s[28:31] dmask:0x1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_writelane_b32 v6, s36, 0 -; CHECK-NEXT: v_writelane_b32 v6, s37, 1 -; CHECK-NEXT: v_writelane_b32 v6, s38, 2 -; CHECK-NEXT: v_writelane_b32 v6, s39, 3 -; CHECK-NEXT: v_writelane_b32 v6, s40, 4 -; CHECK-NEXT: v_writelane_b32 v6, s41, 5 -; CHECK-NEXT: image_sample_lz v4, v[1:2], s[36:43], s[28:31] dmask:0x1 -; CHECK-NEXT: v_writelane_b32 v6, s42, 6 -; CHECK-NEXT: v_writelane_b32 v6, s43, 7 -; CHECK-NEXT: v_writelane_b32 v6, s44, 8 -; CHECK-NEXT: v_writelane_b32 v6, s45, 9 -; CHECK-NEXT: v_writelane_b32 v5, s53, 13 -; CHECK-NEXT: v_writelane_b32 v6, s46, 10 -; CHECK-NEXT: v_writelane_b32 v5, s54, 14 -; CHECK-NEXT: v_writelane_b32 v6, s47, 11 -; CHECK-NEXT: v_writelane_b32 v5, s55, 15 -; CHECK-NEXT: v_writelane_b32 v6, s48, 12 -; CHECK-NEXT: v_writelane_b32 v5, s64, 16 -; CHECK-NEXT: v_writelane_b32 v6, s49, 13 -; CHECK-NEXT: v_writelane_b32 v5, s65, 17 -; CHECK-NEXT: v_writelane_b32 v6, s50, 14 -; CHECK-NEXT: v_writelane_b32 v5, s66, 18 -; CHECK-NEXT: v_writelane_b32 v6, s51, 15 -; CHECK-NEXT: s_mov_b32 s40, 48 -; CHECK-NEXT: s_movk_i32 s56, 0x1f0 +; CHECK-NEXT: v_writelane_b32 v7, s52, 16 +; CHECK-NEXT: v_writelane_b32 v7, s53, 17 +; CHECK-NEXT: v_writelane_b32 v7, s54, 18 +; CHECK-NEXT: v_writelane_b32 v7, s55, 19 +; CHECK-NEXT: v_writelane_b32 v7, s56, 20 +; CHECK-NEXT: v_writelane_b32 v7, s57, 21 +; CHECK-NEXT: image_sample_lz v5, v[3:4], s[52:59], s[28:31] dmask:0x1 +; CHECK-NEXT: v_writelane_b32 v7, s58, 22 +; CHECK-NEXT: v_writelane_b32 v7, s59, 23 +; CHECK-NEXT: v_writelane_b32 v7, s60, 24 +; CHECK-NEXT: v_writelane_b32 v7, s61, 25 +; CHECK-NEXT: v_writelane_b32 v7, s62, 26 +; CHECK-NEXT: v_writelane_b32 v7, s63, 27 +; CHECK-NEXT: v_writelane_b32 v7, s64, 28 +; CHECK-NEXT: v_writelane_b32 v7, s65, 29 +; CHECK-NEXT: v_writelane_b32 v7, s66, 30 +; CHECK-NEXT: s_mov_b32 s4, 48 +; CHECK-NEXT: s_movk_i32 s40, 0x1f0 ; CHECK-NEXT: s_movk_i32 s34, 0x2f0 -; CHECK-NEXT: s_mov_b32 s41, s24 -; CHECK-NEXT: s_mov_b32 s57, s24 -; CHECK-NEXT: s_mov_b32 s35, s24 -; CHECK-NEXT: v_writelane_b32 v5, s67, 19 -; CHECK-NEXT: s_load_dwordx8 s[20:27], s[40:41], 0x0 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: s_load_dwordx16 s[36:51], s[56:57], 0x0 +; CHECK-NEXT: s_mov_b32 s5, s8 +; CHECK-NEXT: s_mov_b32 s41, s8 +; CHECK-NEXT: s_mov_b32 s35, s8 +; CHECK-NEXT: v_writelane_b32 v7, s67, 31 +; CHECK-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x0 +; CHECK-NEXT: s_load_dwordx16 s[52:67], s[40:41], 0x0 ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 -; CHECK-NEXT: s_load_dwordx16 s[52:67], s[34:35], 0x0 +; CHECK-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x0 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; CHECK-NEXT: v_writelane_b32 v5, s68, 20 +; CHECK-NEXT: v_writelane_b32 v6, s68, 20 ; CHECK-NEXT: s_xor_b64 s[72:73], vcc, -1 -; CHECK-NEXT: v_writelane_b32 v5, s69, 21 +; CHECK-NEXT: v_writelane_b32 v6, s69, 21 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mul_f32_e32 v0, v4, v3 +; CHECK-NEXT: v_mul_f32_e32 v0, v5, v1 ; CHECK-NEXT: s_and_saveexec_b64 vcc, s[72:73] ; CHECK-NEXT: s_xor_b64 s[34:35], exec, vcc ; CHECK-NEXT: s_cbranch_execz .LBB0_3 ; CHECK-NEXT: ; %bb.1: ; %bb48 -; CHECK-NEXT: image_sample_lz v3, v[1:2], s[12:19], s[28:31] dmask:0x1 -; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: v_readlane_b32 s4, v7, 0 +; CHECK-NEXT: v_readlane_b32 s12, v7, 8 +; CHECK-NEXT: v_readlane_b32 s13, v7, 9 +; CHECK-NEXT: v_readlane_b32 s14, v7, 10 +; CHECK-NEXT: v_readlane_b32 s15, v7, 11 +; CHECK-NEXT: v_readlane_b32 s16, v7, 12 +; CHECK-NEXT: v_readlane_b32 s17, v7, 13 +; CHECK-NEXT: v_readlane_b32 s18, v7, 14 +; CHECK-NEXT: v_readlane_b32 s19, v7, 15 +; CHECK-NEXT: v_mov_b32_e32 v1, v2 ; CHECK-NEXT: s_and_b64 vcc, exec, -1 +; CHECK-NEXT: v_readlane_b32 s5, v7, 1 +; CHECK-NEXT: v_readlane_b32 s6, v7, 2 +; CHECK-NEXT: v_readlane_b32 s7, v7, 3 +; CHECK-NEXT: image_sample_lz v3, v[3:4], s[12:19], s[28:31] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s8, v7, 4 +; CHECK-NEXT: v_readlane_b32 s9, v7, 5 +; CHECK-NEXT: v_readlane_b32 s10, v7, 6 +; CHECK-NEXT: v_readlane_b32 s11, v7, 7 ; CHECK-NEXT: .LBB0_2: ; %bb50 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_mov_b32 s29, s28 -; CHECK-NEXT: s_mov_b32 s30, s28 -; CHECK-NEXT: s_mov_b32 s31, s28 +; CHECK-NEXT: s_mov_b64 s[30:31], s[28:29] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: image_sample_lz v4, v[1:2], s[44:51], s[24:27] dmask:0x1 +; CHECK-NEXT: image_sample_lz v4, v[1:2], s[60:67], s[24:27] dmask:0x1 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: image_sample_lz v1, v[1:2], s[60:67], s[28:31] dmask:0x1 +; CHECK-NEXT: image_sample_lz v1, v[1:2], s[44:51], s[28:31] dmask:0x1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_sub_f32_e32 v1, v1, v4 ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v0 @@ -107,68 +134,69 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_mov_b64 vcc, vcc ; CHECK-NEXT: s_cbranch_vccnz .LBB0_2 ; CHECK-NEXT: .LBB0_3: ; %Flow14 -; CHECK-NEXT: s_andn2_saveexec_b64 s[12:13], s[34:35] +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_andn2_saveexec_b64 s[24:25], s[34:35] ; CHECK-NEXT: s_cbranch_execz .LBB0_10 ; CHECK-NEXT: ; %bb.4: ; %bb32 ; CHECK-NEXT: s_and_saveexec_b64 s[14:15], s[72:73] -; CHECK-NEXT: s_xor_b64 s[14:15], exec, s[14:15] +; CHECK-NEXT: s_xor_b64 s[26:27], exec, s[14:15] ; CHECK-NEXT: s_cbranch_execz .LBB0_6 ; CHECK-NEXT: ; %bb.5: ; %bb43 -; CHECK-NEXT: s_mov_b32 s16, 0 -; CHECK-NEXT: s_mov_b32 s17, s16 -; CHECK-NEXT: v_mov_b32_e32 v2, s16 -; CHECK-NEXT: v_mov_b32_e32 v3, s17 -; CHECK-NEXT: s_mov_b32 s18, s16 -; CHECK-NEXT: s_mov_b32 s19, s16 -; CHECK-NEXT: image_sample_lz v1, v[2:3], s[4:11], s[16:19] dmask:0x1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_mov_b64 s[4:5], s[36:37] -; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] -; CHECK-NEXT: s_mov_b64 s[8:9], s[40:41] -; CHECK-NEXT: s_mov_b64 s[10:11], s[42:43] -; CHECK-NEXT: v_readlane_b32 s36, v6, 0 -; CHECK-NEXT: v_readlane_b32 s44, v6, 8 -; CHECK-NEXT: v_readlane_b32 s45, v6, 9 -; CHECK-NEXT: v_readlane_b32 s46, v6, 10 -; CHECK-NEXT: v_readlane_b32 s47, v6, 11 -; CHECK-NEXT: v_readlane_b32 s48, v6, 12 -; CHECK-NEXT: v_readlane_b32 s49, v6, 13 -; CHECK-NEXT: v_readlane_b32 s50, v6, 14 -; CHECK-NEXT: v_readlane_b32 s51, v6, 15 -; CHECK-NEXT: v_readlane_b32 s37, v6, 1 -; CHECK-NEXT: v_readlane_b32 s38, v6, 2 -; CHECK-NEXT: v_readlane_b32 s39, v6, 3 -; CHECK-NEXT: v_readlane_b32 s40, v6, 4 -; CHECK-NEXT: v_readlane_b32 s41, v6, 5 -; CHECK-NEXT: image_sample_lz v0, v[2:3], s[44:51], s[20:23] dmask:0x1 -; CHECK-NEXT: v_readlane_b32 s42, v6, 6 -; CHECK-NEXT: v_readlane_b32 s43, v6, 7 -; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: s_mov_b64 s[42:43], s[10:11] -; CHECK-NEXT: v_mov_b32_e32 v3, v2 -; CHECK-NEXT: s_mov_b64 s[40:41], s[8:9] -; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] -; CHECK-NEXT: s_mov_b64 s[36:37], s[4:5] +; CHECK-NEXT: s_mov_b64 s[44:45], 0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_readlane_b32 s4, v7, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_mov_b64 s[46:47], s[44:45] +; CHECK-NEXT: v_readlane_b32 s5, v7, 1 +; CHECK-NEXT: v_readlane_b32 s6, v7, 2 +; CHECK-NEXT: v_readlane_b32 s7, v7, 3 +; CHECK-NEXT: v_readlane_b32 s8, v7, 4 +; CHECK-NEXT: v_readlane_b32 s9, v7, 5 +; CHECK-NEXT: v_readlane_b32 s10, v7, 6 +; CHECK-NEXT: v_readlane_b32 s11, v7, 7 +; CHECK-NEXT: v_readlane_b32 s12, v7, 8 +; CHECK-NEXT: v_readlane_b32 s13, v7, 9 +; CHECK-NEXT: v_readlane_b32 s14, v7, 10 +; CHECK-NEXT: v_readlane_b32 s15, v7, 11 +; CHECK-NEXT: v_readlane_b32 s16, v7, 12 +; CHECK-NEXT: v_readlane_b32 s17, v7, 13 +; CHECK-NEXT: v_readlane_b32 s18, v7, 14 +; CHECK-NEXT: v_readlane_b32 s19, v7, 15 +; CHECK-NEXT: image_sample_lz v2, v[0:1], s[4:11], s[44:47] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s4, v7, 16 +; CHECK-NEXT: v_readlane_b32 s12, v7, 24 +; CHECK-NEXT: v_readlane_b32 s13, v7, 25 +; CHECK-NEXT: v_readlane_b32 s14, v7, 26 +; CHECK-NEXT: v_readlane_b32 s15, v7, 27 +; CHECK-NEXT: v_readlane_b32 s16, v7, 28 +; CHECK-NEXT: v_readlane_b32 s17, v7, 29 +; CHECK-NEXT: v_readlane_b32 s18, v7, 30 +; CHECK-NEXT: v_readlane_b32 s19, v7, 31 +; CHECK-NEXT: v_mov_b32_e32 v3, 0 +; CHECK-NEXT: v_mov_b32_e32 v4, v3 +; CHECK-NEXT: v_readlane_b32 s5, v7, 17 +; CHECK-NEXT: v_readlane_b32 s6, v7, 18 +; CHECK-NEXT: v_readlane_b32 s7, v7, 19 +; CHECK-NEXT: image_sample_lz v0, v[0:1], s[12:19], s[20:23] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s8, v7, 20 +; CHECK-NEXT: v_readlane_b32 s9, v7, 21 +; CHECK-NEXT: v_readlane_b32 s10, v7, 22 +; CHECK-NEXT: v_readlane_b32 s11, v7, 23 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_dwordx3 v[1:3], off, s[16:19], 0 +; CHECK-NEXT: buffer_store_dwordx3 v[2:4], off, s[44:47], 0 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[44:47], 0 ; CHECK-NEXT: ; implicit-def: $vgpr0 ; CHECK-NEXT: .LBB0_6: ; %Flow12 -; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[14:15] +; CHECK-NEXT: s_andn2_saveexec_b64 s[14:15], s[26:27] ; CHECK-NEXT: s_cbranch_execz .LBB0_9 ; CHECK-NEXT: ; %bb.7: ; %bb33.preheader -; CHECK-NEXT: s_mov_b32 s8, 0 -; CHECK-NEXT: s_mov_b32 s6, s8 -; CHECK-NEXT: s_mov_b32 s7, s8 -; CHECK-NEXT: v_mov_b32_e32 v1, s6 -; CHECK-NEXT: s_mov_b32 s9, s8 -; CHECK-NEXT: s_mov_b32 s10, s8 -; CHECK-NEXT: s_mov_b32 s11, s8 -; CHECK-NEXT: v_mov_b32_e32 v2, s7 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: image_sample_lz v3, v[1:2], s[36:43], s[8:11] dmask:0x1 -; CHECK-NEXT: image_sample_lz v4, v[1:2], s[52:59], s[8:11] dmask:0x1 +; CHECK-NEXT: s_mov_b64 s[16:17], 0 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: s_mov_b64 s[18:19], s[16:17] +; CHECK-NEXT: image_sample_lz v3, v[1:2], s[52:59], s[16:19] dmask:0x1 +; CHECK-NEXT: image_sample_lz v4, v[1:2], s[36:43], s[16:19] dmask:0x1 ; CHECK-NEXT: s_and_b64 vcc, exec, 0 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_sub_f32_e32 v1, v4, v3 @@ -181,35 +209,34 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_mov_b64 vcc, vcc ; CHECK-NEXT: s_cbranch_vccz .LBB0_8 ; CHECK-NEXT: .LBB0_9: ; %Flow13 -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_or_b64 exec, exec, s[14:15] ; CHECK-NEXT: .LBB0_10: ; %UnifiedReturnBlock -; CHECK-NEXT: s_or_b64 exec, exec, s[12:13] -; CHECK-NEXT: v_readlane_b32 s69, v5, 21 -; CHECK-NEXT: v_readlane_b32 s68, v5, 20 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_readlane_b32 s67, v5, 19 -; CHECK-NEXT: v_readlane_b32 s66, v5, 18 -; CHECK-NEXT: v_readlane_b32 s65, v5, 17 -; CHECK-NEXT: v_readlane_b32 s64, v5, 16 -; CHECK-NEXT: v_readlane_b32 s55, v5, 15 -; CHECK-NEXT: v_readlane_b32 s54, v5, 14 -; CHECK-NEXT: v_readlane_b32 s53, v5, 13 -; CHECK-NEXT: v_readlane_b32 s52, v5, 12 -; CHECK-NEXT: v_readlane_b32 s51, v5, 11 -; CHECK-NEXT: v_readlane_b32 s50, v5, 10 -; CHECK-NEXT: v_readlane_b32 s49, v5, 9 -; CHECK-NEXT: v_readlane_b32 s48, v5, 8 -; CHECK-NEXT: v_readlane_b32 s39, v5, 7 -; CHECK-NEXT: v_readlane_b32 s38, v5, 6 -; CHECK-NEXT: v_readlane_b32 s37, v5, 5 -; CHECK-NEXT: v_readlane_b32 s36, v5, 4 -; CHECK-NEXT: v_readlane_b32 s35, v5, 3 -; CHECK-NEXT: v_readlane_b32 s34, v5, 2 -; CHECK-NEXT: v_readlane_b32 s31, v5, 1 -; CHECK-NEXT: v_readlane_b32 s30, v5, 0 +; CHECK-NEXT: s_or_b64 exec, exec, s[24:25] +; CHECK-NEXT: v_readlane_b32 s69, v6, 21 +; CHECK-NEXT: v_readlane_b32 s68, v6, 20 +; CHECK-NEXT: v_readlane_b32 s67, v6, 19 +; CHECK-NEXT: v_readlane_b32 s66, v6, 18 +; CHECK-NEXT: v_readlane_b32 s65, v6, 17 +; CHECK-NEXT: v_readlane_b32 s64, v6, 16 +; CHECK-NEXT: v_readlane_b32 s55, v6, 15 +; CHECK-NEXT: v_readlane_b32 s54, v6, 14 +; CHECK-NEXT: v_readlane_b32 s53, v6, 13 +; CHECK-NEXT: v_readlane_b32 s52, v6, 12 +; CHECK-NEXT: v_readlane_b32 s51, v6, 11 +; CHECK-NEXT: v_readlane_b32 s50, v6, 10 +; CHECK-NEXT: v_readlane_b32 s49, v6, 9 +; CHECK-NEXT: v_readlane_b32 s48, v6, 8 +; CHECK-NEXT: v_readlane_b32 s39, v6, 7 +; CHECK-NEXT: v_readlane_b32 s38, v6, 6 +; CHECK-NEXT: v_readlane_b32 s37, v6, 5 +; CHECK-NEXT: v_readlane_b32 s36, v6, 4 +; CHECK-NEXT: v_readlane_b32 s35, v6, 3 +; CHECK-NEXT: v_readlane_b32 s34, v6, 2 +; CHECK-NEXT: v_readlane_b32 s31, v6, 1 +; CHECK-NEXT: v_readlane_b32 s30, v6, 0 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/imm.ll b/llvm/test/CodeGen/AMDGPU/imm.ll index a328bbe8b4ddc..42168354897e5 100644 --- a/llvm/test/CodeGen/AMDGPU/imm.ll +++ b/llvm/test/CodeGen/AMDGPU/imm.ll @@ -1532,8 +1532,8 @@ define amdgpu_kernel void @store_inline_imm_0.0_f64(ptr addrspace(1) %out) { ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v1, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -1543,8 +1543,8 @@ define amdgpu_kernel void @store_inline_imm_0.0_f64(ptr addrspace(1) %out) { ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll index 56a3ce7281030..3a40688b0c45c 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -9544,78 +9544,77 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace ; NOOPT-NEXT: s_mov_b32 s19, 0xe8f000 ; NOOPT-NEXT: s_add_u32 s16, s16, s5 ; NOOPT-NEXT: s_addc_u32 s17, s17, 0 -; NOOPT-NEXT: ; implicit-def: $vgpr33 : SGPR spill to VGPR lane -; NOOPT-NEXT: v_writelane_b32 v33, s4, 0 -; NOOPT-NEXT: s_mov_b32 s4, s1 -; NOOPT-NEXT: v_readlane_b32 s1, v33, 0 -; NOOPT-NEXT: v_writelane_b32 v33, s4, 1 -; NOOPT-NEXT: s_mov_b32 s4, s0 -; NOOPT-NEXT: v_readlane_b32 s0, v33, 1 +; NOOPT-NEXT: s_mov_b32 s5, s4 +; NOOPT-NEXT: s_mov_b32 s4, s3 +; NOOPT-NEXT: s_mov_b32 s6, s2 +; NOOPT-NEXT: s_mov_b32 s7, s1 ; NOOPT-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:144 ; 4-byte Folded Spill ; NOOPT-NEXT: v_mov_b32_e32 v2, v1 -; NOOPT-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 -; NOOPT-NEXT: s_mov_b32 s5, s0 -; NOOPT-NEXT: s_mov_b32 s6, s2 -; NOOPT-NEXT: s_mov_b32 s7, s3 -; NOOPT-NEXT: ; implicit-def: $sgpr0 -; NOOPT-NEXT: ; implicit-def: $sgpr0 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s7 +; NOOPT-NEXT: s_mov_b32 s2, s6 +; NOOPT-NEXT: s_mov_b32 s3, s4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; NOOPT-NEXT: v_mov_b32_e32 v1, v2 ; NOOPT-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:136 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:140 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b32 s8, 0xf000 -; NOOPT-NEXT: s_mov_b32 s0, 0 -; NOOPT-NEXT: v_writelane_b32 v33, s0, 2 -; NOOPT-NEXT: s_mov_b32 s2, s0 -; NOOPT-NEXT: s_mov_b32 s3, s8 -; NOOPT-NEXT: s_mov_b32 s8, s0 -; NOOPT-NEXT: s_mov_b32 s9, s0 +; NOOPT-NEXT: s_mov_b32 s4, 0 +; NOOPT-NEXT: s_mov_b32 s6, s4 +; NOOPT-NEXT: s_mov_b32 s7, s8 +; NOOPT-NEXT: s_mov_b32 s8, s4 +; NOOPT-NEXT: s_mov_b32 s9, s4 ; NOOPT-NEXT: ; kill: def $sgpr8_sgpr9 killed $sgpr8_sgpr9 def $sgpr8_sgpr9_sgpr10_sgpr11 -; NOOPT-NEXT: s_mov_b64 s[10:11], s[2:3] -; NOOPT-NEXT: v_writelane_b32 v33, s8, 3 -; NOOPT-NEXT: v_writelane_b32 v33, s9, 4 -; NOOPT-NEXT: v_writelane_b32 v33, s10, 5 -; NOOPT-NEXT: v_writelane_b32 v33, s11, 6 -; NOOPT-NEXT: ; kill: def $sgpr8_sgpr9_sgpr10_sgpr11 killed $sgpr4_sgpr5_sgpr6_sgpr7 -; NOOPT-NEXT: ; implicit-def: $sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b64 s[10:11], s[6:7] +; NOOPT-NEXT: ; implicit-def: $vgpr33 : SGPR spill to VGPR lane +; NOOPT-NEXT: v_writelane_b32 v33, s8, 0 +; NOOPT-NEXT: v_writelane_b32 v33, s9, 1 +; NOOPT-NEXT: v_writelane_b32 v33, s10, 2 +; NOOPT-NEXT: v_writelane_b32 v33, s11, 3 +; NOOPT-NEXT: ; kill: def $sgpr8_sgpr9_sgpr10_sgpr11 killed $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: ; implicit-def: $sgpr6_sgpr7 ; NOOPT-NEXT: s_waitcnt expcnt(1) -; NOOPT-NEXT: v_mov_b32_e32 v0, s1 -; NOOPT-NEXT: buffer_load_dword v0, v0, s[4:7], s0 offen +; NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; NOOPT-NEXT: buffer_load_dword v0, v0, s[0:3], s4 offen ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:132 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 s[0:1], 0 +; NOOPT-NEXT: v_writelane_b32 v33, s0, 4 +; NOOPT-NEXT: v_writelane_b32 v33, s1, 5 ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: v_mov_b32_e32 v0, s0 -; NOOPT-NEXT: v_mov_b32_e32 v30, s0 -; NOOPT-NEXT: v_mov_b32_e32 v29, s0 +; NOOPT-NEXT: v_mov_b32_e32 v1, s1 +; NOOPT-NEXT: v_mov_b32_e32 v29, s1 ; NOOPT-NEXT: v_mov_b32_e32 v28, s0 -; NOOPT-NEXT: v_mov_b32_e32 v27, s0 +; NOOPT-NEXT: v_mov_b32_e32 v27, s1 ; NOOPT-NEXT: v_mov_b32_e32 v26, s0 -; NOOPT-NEXT: v_mov_b32_e32 v25, s0 +; NOOPT-NEXT: v_mov_b32_e32 v25, s1 ; NOOPT-NEXT: v_mov_b32_e32 v24, s0 -; NOOPT-NEXT: v_mov_b32_e32 v23, s0 +; NOOPT-NEXT: v_mov_b32_e32 v23, s1 ; NOOPT-NEXT: v_mov_b32_e32 v22, s0 -; NOOPT-NEXT: v_mov_b32_e32 v21, s0 +; NOOPT-NEXT: v_mov_b32_e32 v21, s1 ; NOOPT-NEXT: v_mov_b32_e32 v20, s0 -; NOOPT-NEXT: v_mov_b32_e32 v19, s0 +; NOOPT-NEXT: v_mov_b32_e32 v19, s1 ; NOOPT-NEXT: v_mov_b32_e32 v18, s0 -; NOOPT-NEXT: v_mov_b32_e32 v17, s0 +; NOOPT-NEXT: v_mov_b32_e32 v17, s1 ; NOOPT-NEXT: v_mov_b32_e32 v16, s0 -; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec -; NOOPT-NEXT: v_mov_b32_e32 v1, v30 -; NOOPT-NEXT: v_mov_b32_e32 v2, v29 -; NOOPT-NEXT: v_mov_b32_e32 v3, v28 -; NOOPT-NEXT: v_mov_b32_e32 v4, v27 -; NOOPT-NEXT: v_mov_b32_e32 v5, v26 -; NOOPT-NEXT: v_mov_b32_e32 v6, v25 -; NOOPT-NEXT: v_mov_b32_e32 v7, v24 -; NOOPT-NEXT: v_mov_b32_e32 v8, v23 -; NOOPT-NEXT: v_mov_b32_e32 v9, v22 -; NOOPT-NEXT: v_mov_b32_e32 v10, v21 -; NOOPT-NEXT: v_mov_b32_e32 v11, v20 -; NOOPT-NEXT: v_mov_b32_e32 v12, v19 -; NOOPT-NEXT: v_mov_b32_e32 v13, v18 -; NOOPT-NEXT: v_mov_b32_e32 v14, v17 -; NOOPT-NEXT: v_mov_b32_e32 v15, v16 +; NOOPT-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v2, v28 +; NOOPT-NEXT: v_mov_b32_e32 v3, v29 +; NOOPT-NEXT: v_mov_b32_e32 v4, v26 +; NOOPT-NEXT: v_mov_b32_e32 v5, v27 +; NOOPT-NEXT: v_mov_b32_e32 v6, v24 +; NOOPT-NEXT: v_mov_b32_e32 v7, v25 +; NOOPT-NEXT: v_mov_b32_e32 v8, v22 +; NOOPT-NEXT: v_mov_b32_e32 v9, v23 +; NOOPT-NEXT: v_mov_b32_e32 v10, v20 +; NOOPT-NEXT: v_mov_b32_e32 v11, v21 +; NOOPT-NEXT: v_mov_b32_e32 v12, v18 +; NOOPT-NEXT: v_mov_b32_e32 v13, v19 +; NOOPT-NEXT: v_mov_b32_e32 v14, v16 +; NOOPT-NEXT: v_mov_b32_e32 v15, v17 ; NOOPT-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:68 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:72 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:76 ; 4-byte Folded Spill @@ -9633,8 +9632,8 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace ; NOOPT-NEXT: buffer_store_dword v14, off, s[16:19], 0 offset:124 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v15, off, s[16:19], 0 offset:128 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 s[0:1], exec -; NOOPT-NEXT: v_writelane_b32 v33, s0, 7 -; NOOPT-NEXT: v_writelane_b32 v33, s1, 8 +; NOOPT-NEXT: v_writelane_b32 v33, s0, 6 +; NOOPT-NEXT: v_writelane_b32 v33, s1, 7 ; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1 ; NOOPT-NEXT: buffer_store_dword v33, off, s[16:19], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[12:13] @@ -9661,8 +9660,8 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace ; NOOPT-NEXT: buffer_load_dword v33, off, s[16:19], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[12:13] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v33, 9 -; NOOPT-NEXT: v_readlane_b32 s1, v33, 10 +; NOOPT-NEXT: v_readlane_b32 s0, v33, 8 +; NOOPT-NEXT: v_readlane_b32 s1, v33, 9 ; NOOPT-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload @@ -9727,8 +9726,8 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace ; NOOPT-NEXT: buffer_store_dword v14, off, s[16:19], 0 offset:60 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v15, off, s[16:19], 0 offset:64 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1] -; NOOPT-NEXT: v_writelane_b32 v33, s2, 9 -; NOOPT-NEXT: v_writelane_b32 v33, s3, 10 +; NOOPT-NEXT: v_writelane_b32 v33, s2, 8 +; NOOPT-NEXT: v_writelane_b32 v33, s3, 9 ; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1 ; NOOPT-NEXT: buffer_store_dword v33, off, s[16:19], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[12:13] @@ -9740,18 +9739,18 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace ; NOOPT-NEXT: buffer_load_dword v33, off, s[16:19], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[12:13] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v33, 7 -; NOOPT-NEXT: v_readlane_b32 s1, v33, 8 +; NOOPT-NEXT: v_readlane_b32 s0, v33, 6 +; NOOPT-NEXT: v_readlane_b32 s1, v33, 7 ; NOOPT-NEXT: s_mov_b64 exec, s[0:1] ; NOOPT-NEXT: ; %bb.3: ; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1 ; NOOPT-NEXT: buffer_load_dword v33, off, s[16:19], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[12:13] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v33, 3 -; NOOPT-NEXT: v_readlane_b32 s1, v33, 4 -; NOOPT-NEXT: v_readlane_b32 s2, v33, 5 -; NOOPT-NEXT: v_readlane_b32 s3, v33, 6 +; NOOPT-NEXT: v_readlane_b32 s0, v33, 0 +; NOOPT-NEXT: v_readlane_b32 s1, v33, 1 +; NOOPT-NEXT: v_readlane_b32 s2, v33, 2 +; NOOPT-NEXT: v_readlane_b32 s3, v33, 3 ; NOOPT-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:136 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v5, off, s[16:19], 0 offset:140 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v17, off, s[16:19], 0 offset:148 ; 4-byte Folded Reload @@ -9839,26 +9838,26 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace ; SI-MOVREL: ; %bb.0: ; %entry ; SI-MOVREL-NEXT: v_mov_b32_e32 v2, s4 ; SI-MOVREL-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; SI-MOVREL-NEXT: s_mov_b32 s2, 0 ; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v6, 0 +; SI-MOVREL-NEXT: s_mov_b32 s2, 0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, v6 +; SI-MOVREL-NEXT: v_mov_b32_e32 v10, v6 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, v6 +; SI-MOVREL-NEXT: v_mov_b32_e32 v14, v6 +; SI-MOVREL-NEXT: v_mov_b32_e32 v16, v6 +; SI-MOVREL-NEXT: v_mov_b32_e32 v18, v6 +; SI-MOVREL-NEXT: v_mov_b32_e32 v20, v6 ; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 ; SI-MOVREL-NEXT: s_mov_b32 s0, s2 ; SI-MOVREL-NEXT: s_mov_b32 s1, s2 -; SI-MOVREL-NEXT: v_mov_b32_e32 v6, v5 ; SI-MOVREL-NEXT: v_mov_b32_e32 v7, v5 -; SI-MOVREL-NEXT: v_mov_b32_e32 v8, v5 ; SI-MOVREL-NEXT: v_mov_b32_e32 v9, v5 -; SI-MOVREL-NEXT: v_mov_b32_e32 v10, v5 ; SI-MOVREL-NEXT: v_mov_b32_e32 v11, v5 -; SI-MOVREL-NEXT: v_mov_b32_e32 v12, v5 ; SI-MOVREL-NEXT: v_mov_b32_e32 v13, v5 -; SI-MOVREL-NEXT: v_mov_b32_e32 v14, v5 ; SI-MOVREL-NEXT: v_mov_b32_e32 v15, v5 -; SI-MOVREL-NEXT: v_mov_b32_e32 v16, v5 ; SI-MOVREL-NEXT: v_mov_b32_e32 v17, v5 -; SI-MOVREL-NEXT: v_mov_b32_e32 v18, v5 ; SI-MOVREL-NEXT: v_mov_b32_e32 v19, v5 -; SI-MOVREL-NEXT: v_mov_b32_e32 v20, v5 ; SI-MOVREL-NEXT: s_mov_b64 s[4:5], exec ; SI-MOVREL-NEXT: .LBB27_1: ; =>This Inner Loop Header: Depth=1 ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) @@ -9882,21 +9881,21 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace ; VI-MOVREL-NEXT: v_mov_b32_e32 v2, s4 ; VI-MOVREL-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 0 -; VI-MOVREL-NEXT: v_mov_b32_e32 v6, v5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v6, 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, v6 +; VI-MOVREL-NEXT: v_mov_b32_e32 v10, v6 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, v6 +; VI-MOVREL-NEXT: v_mov_b32_e32 v14, v6 +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, v6 +; VI-MOVREL-NEXT: v_mov_b32_e32 v18, v6 +; VI-MOVREL-NEXT: v_mov_b32_e32 v20, v6 ; VI-MOVREL-NEXT: v_mov_b32_e32 v7, v5 -; VI-MOVREL-NEXT: v_mov_b32_e32 v8, v5 ; VI-MOVREL-NEXT: v_mov_b32_e32 v9, v5 -; VI-MOVREL-NEXT: v_mov_b32_e32 v10, v5 ; VI-MOVREL-NEXT: v_mov_b32_e32 v11, v5 -; VI-MOVREL-NEXT: v_mov_b32_e32 v12, v5 ; VI-MOVREL-NEXT: v_mov_b32_e32 v13, v5 -; VI-MOVREL-NEXT: v_mov_b32_e32 v14, v5 ; VI-MOVREL-NEXT: v_mov_b32_e32 v15, v5 -; VI-MOVREL-NEXT: v_mov_b32_e32 v16, v5 ; VI-MOVREL-NEXT: v_mov_b32_e32 v17, v5 -; VI-MOVREL-NEXT: v_mov_b32_e32 v18, v5 ; VI-MOVREL-NEXT: v_mov_b32_e32 v19, v5 -; VI-MOVREL-NEXT: v_mov_b32_e32 v20, v5 ; VI-MOVREL-NEXT: s_mov_b64 s[0:1], exec ; VI-MOVREL-NEXT: .LBB27_1: ; =>This Inner Loop Header: Depth=1 ; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) @@ -9926,21 +9925,21 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace ; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, s4 ; VI-IDXMODE-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, v5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, v6 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, v6 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, v6 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, v6 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, v6 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v18, v6 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v20, v6 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, v5 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, v5 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, v5 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, v5 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, v5 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, v5 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, v5 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, v5 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, v5 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, v5 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, v5 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v18, v5 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v19, v5 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v20, v5 ; VI-IDXMODE-NEXT: s_mov_b64 s[0:1], exec ; VI-IDXMODE-NEXT: .LBB27_1: ; =>This Inner Loop Header: Depth=1 ; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) @@ -9971,21 +9970,21 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-IDXMODE-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, v6 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, v6 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, v6 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v18, v6 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v20, v6 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, v5 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, v5 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, v5 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, v5 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, v5 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, v5 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, v5 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, v5 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, v5 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, v5 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v18, v5 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v19, v5 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v20, v5 ; GFX9-IDXMODE-NEXT: s_mov_b64 s[0:1], exec ; GFX9-IDXMODE-NEXT: .LBB27_1: ; =>This Inner Loop Header: Depth=1 ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll b/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll index f961e857f39e5..37990085e6abf 100644 --- a/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll +++ b/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll @@ -15,11 +15,9 @@ define void @issue92561(ptr addrspace(1) %arg) { ; SDAG-NEXT: global_load_b128 v[4:7], v[0:1], off offset:16 ; SDAG-NEXT: global_load_b128 v[0:3], v[0:1], off ; SDAG-NEXT: v_mov_b32_e32 v8, 0 -; SDAG-NEXT: s_mov_b32 s12, 0 +; SDAG-NEXT: s_mov_b64 s[12:13], 0 ; SDAG-NEXT: s_mov_b32 s3, exec_lo -; SDAG-NEXT: s_mov_b32 s13, s12 -; SDAG-NEXT: s_mov_b32 s14, s12 -; SDAG-NEXT: s_mov_b32 s15, s12 +; SDAG-NEXT: s_mov_b64 s[14:15], s[12:13] ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 ; SDAG-NEXT: v_readfirstlane_b32 s4, v0 @@ -51,14 +49,10 @@ define void @issue92561(ptr addrspace(1) %arg) { ; SDAG-NEXT: s_mov_b32 exec_lo, s3 ; SDAG-NEXT: v_dual_mov_b32 v0, 0x7fc00000 :: v_dual_mov_b32 v1, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 1.0 -; SDAG-NEXT: s_mov_b32 s0, s12 -; SDAG-NEXT: s_mov_b32 s1, s12 -; SDAG-NEXT: s_mov_b32 s2, s12 -; SDAG-NEXT: s_mov_b32 s3, s12 -; SDAG-NEXT: s_mov_b32 s4, s12 -; SDAG-NEXT: s_mov_b32 s5, s12 -; SDAG-NEXT: s_mov_b32 s6, s12 -; SDAG-NEXT: s_mov_b32 s7, s12 +; SDAG-NEXT: s_mov_b64 s[0:1], s[12:13] +; SDAG-NEXT: s_mov_b64 s[2:3], s[12:13] +; SDAG-NEXT: s_mov_b64 s[4:5], s[12:13] +; SDAG-NEXT: s_mov_b64 s[6:7], s[12:13] ; SDAG-NEXT: s_clause 0x2 ; SDAG-NEXT: image_sample_c_lz v0, [v1, v1, v0, v1], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY ; SDAG-NEXT: image_sample_c_lz v3, [v1, v1, v1, v1], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY diff --git a/llvm/test/CodeGen/AMDGPU/issue98474-need-live-out-undef-subregister-def.ll b/llvm/test/CodeGen/AMDGPU/issue98474-need-live-out-undef-subregister-def.ll index 7caa563d8b298..6a5a7cc3e6219 100644 --- a/llvm/test/CodeGen/AMDGPU/issue98474-need-live-out-undef-subregister-def.ll +++ b/llvm/test/CodeGen/AMDGPU/issue98474-need-live-out-undef-subregister-def.ll @@ -8,12 +8,10 @@ define amdgpu_vs void @test(i32 inreg %cmp, i32 %e0) { ; CHECK-LABEL: test: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_cmp_eq_u32 s0, 0 -; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %load -; CHECK-NEXT: s_mov_b32 s1, s0 -; CHECK-NEXT: s_mov_b32 s2, s0 -; CHECK-NEXT: s_mov_b32 s3, s0 +; CHECK-NEXT: s_mov_b64 s[0:1], 0 +; CHECK-NEXT: s_mov_b64 s[2:3], s[0:1] ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: buffer_load_format_xy v[1:2], v1, s[0:3], 0 idxen ; CHECK-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll index 8732c77778b01..65bc36644fdad 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll @@ -464,7 +464,9 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_imm_splat: ; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 ; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 -; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; NOLIT-SRCC-DAG: v_mov_b32_e32 v0, 0 +; NOLIT-SRCC-DAG: v_mov_b32_e32 v1, 0 +; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} ; NOLIT-SRCC: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9:]+}}] ; LIT-SRCC: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 0 ; GFX90A: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 0 diff --git a/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll b/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll index 3b855a56a5abb..beff113dc0661 100644 --- a/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll +++ b/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll @@ -7,11 +7,11 @@ define <2 x i32> @uniform_masked_load_ptr1_mask_v2i32(ptr addrspace(1) inreg noc ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0 ; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX942-NEXT: s_cbranch_execz .LBB0_2 ; GFX942-NEXT: ; %bb.1: ; %cond.load +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] ; GFX942-NEXT: .LBB0_2: ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] @@ -30,13 +30,12 @@ define <4 x i32> @uniform_masked_load_ptr1_mask_v4i32(ptr addrspace(1) inreg noc ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX942-NEXT: s_cbranch_execz .LBB1_2 ; GFX942-NEXT: ; %bb.1: ; %cond.load +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1] ; GFX942-NEXT: .LBB1_2: ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] @@ -55,13 +54,12 @@ define <4 x float> @uniform_masked_load_ptr1_mask_v4f32(ptr addrspace(1) inreg n ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX942-NEXT: s_cbranch_execz .LBB2_2 ; GFX942-NEXT: ; %bb.1: ; %cond.load +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1] ; GFX942-NEXT: .LBB2_2: ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] @@ -80,20 +78,16 @@ define <8 x i32> @uniform_masked_load_ptr1_mask_v8i32(ptr addrspace(1) inreg noc ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX942-NEXT: s_cbranch_execz .LBB3_2 ; GFX942-NEXT: ; %bb.1: ; %cond.load -; GFX942-NEXT: global_load_dwordx4 v[4:7], v0, s[0:1] offset:16 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:16 +; GFX942-NEXT: global_load_dwordx4 v[0:3], v8, s[0:1] ; GFX942-NEXT: .LBB3_2: ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -111,20 +105,16 @@ define <8 x float> @uniform_masked_load_ptr1_mask_v8f32(ptr addrspace(1) inreg n ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX942-NEXT: s_cbranch_execz .LBB4_2 ; GFX942-NEXT: ; %bb.1: ; %cond.load -; GFX942-NEXT: global_load_dwordx4 v[4:7], v0, s[0:1] offset:16 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:16 +; GFX942-NEXT: global_load_dwordx4 v[0:3], v8, s[0:1] ; GFX942-NEXT: .LBB4_2: ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -142,13 +132,12 @@ define <8 x i16> @uniform_masked_load_ptr1_mask_v8i16(ptr addrspace(1) inreg noc ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX942-NEXT: s_cbranch_execz .LBB5_2 ; GFX942-NEXT: ; %bb.1: ; %cond.load +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1] ; GFX942-NEXT: .LBB5_2: ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] @@ -167,13 +156,12 @@ define <8 x half> @uniform_masked_load_ptr1_mask_v8f16(ptr addrspace(1) inreg no ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX942-NEXT: s_cbranch_execz .LBB6_2 ; GFX942-NEXT: ; %bb.1: ; %cond.load +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1] ; GFX942-NEXT: .LBB6_2: ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] @@ -192,13 +180,12 @@ define <8 x bfloat> @uniform_masked_load_ptr1_mask_v8bf16(ptr addrspace(1) inreg ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX942-NEXT: s_cbranch_execz .LBB7_2 ; GFX942-NEXT: ; %bb.1: ; %cond.load +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1] ; GFX942-NEXT: .LBB7_2: ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll b/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll index af713179a888d..4d5b532d3c90b 100644 --- a/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll +++ b/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll @@ -11,11 +11,9 @@ define amdgpu_kernel void @long_store_chain(ptr addrspace(1) %p) { ; GFX10-LABEL: long_store_chain: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 -; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: s_mov_b64 s[0:1], 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: s_mov_b32 s1, s0 -; GFX10-NEXT: s_mov_b32 s2, s0 -; GFX10-NEXT: s_mov_b32 s3, s0 +; GFX10-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 @@ -92,11 +90,9 @@ define amdgpu_kernel void @long_store_chain(ptr addrspace(1) %p) { ; GFX11-LABEL: long_store_chain: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mov_b32 s1, s0 -; GFX11-NEXT: s_mov_b32 s2, s0 -; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b64 s[0:1], 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_mov_b32_e32 v2, s2 @@ -175,11 +171,9 @@ define amdgpu_kernel void @long_store_chain(ptr addrspace(1) %p) { ; GFX12-LABEL: long_store_chain: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s1, s0 -; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll index 8157b1a7f7c80..8a8c423ba113a 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll @@ -175,9 +175,9 @@ define void @issue63986_reduced_expanded(i64 %idxprom) { ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: .LBB1_8: ; %post-loop-memcpy-expansion ; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: v_mov_b32_e32 v3, v2 +; CHECK-NEXT: v_mov_b32_e32 v3, 0 +; CHECK-NEXT: v_mov_b32_e32 v5, v3 ; CHECK-NEXT: v_mov_b32_e32 v4, v2 -; CHECK-NEXT: v_mov_b32_e32 v5, v2 ; CHECK-NEXT: s_and_b64 vcc, exec, 0 ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: .LBB1_9: ; %loop-memcpy-expansion2 diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll index 4a635d6e7f59f..e2b28ee5e92b0 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll @@ -458,7 +458,9 @@ exit: ; GCN-LABEL: {{^}}test_mfma_loop_mfma_forward_init: -; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GFX908: v_mov_b32_e32 v0, 0 +; GFX908: v_mov_b32_e32 v1, 0 +; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} ; GFX908: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}] ; GFX90A-NOT: v_accvgpr ; GFX90A: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}} @@ -499,7 +501,9 @@ exit: ; GCN-LABEL: {{^}}test_mfma_loop_agpr_init: -; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GFX908: v_mov_b32_e32 v0, 0 +; GFX908: v_mov_b32_e32 v1, 0 +; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} ; GFX908: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}] ; GFX90A-NOT: v_accvgpr ; GFX90A: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}} @@ -577,9 +581,8 @@ exit: ; GCN-LABEL: {{^}}test_mfma_nested_loop_zeroinit: -; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} -; GFX90A: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0 -; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]] +; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX90A-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} ; Check that we do not copy agprs to vgprs and back in an outer loop. diff --git a/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll b/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll index 43e3a1fa29483..890c7234b2e45 100644 --- a/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll +++ b/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll @@ -28,20 +28,22 @@ define void @nonkernel() { ; GFX9-LABEL: nonkernel: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: ds_write_b32 v0, v0 offset:8 -; GFX9-NEXT: ds_write_b64 v0, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: ds_write_b32 v2, v2 offset:8 +; GFX9-NEXT: ds_write_b64 v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: nonkernel: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: ds_write_b32 v0, v0 offset:8 -; GFX10-NEXT: ds_write_b64 v0, v[0:1] +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: ds_write_b32 v2, v2 offset:8 +; GFX10-NEXT: ds_write_b64 v2, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll index d62f045674ace..4b58fdab44bc6 100644 --- a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll +++ b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll @@ -8,7 +8,7 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) { ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_mov_b64 s[4:5], 0 ; GFX10-NEXT: s_mov_b32 s1, 0 ; GFX10-NEXT: ; implicit-def: $sgpr2 ; GFX10-NEXT: s_inst_prefetch 0x1 @@ -29,17 +29,11 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) { ; GFX10-NEXT: s_cbranch_execz .LBB0_1 ; GFX10-NEXT: ; %bb.3: ; %branch2_merge ; GFX10-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GFX10-NEXT: s_mov_b32 s5, s4 -; GFX10-NEXT: s_mov_b32 s6, s4 -; GFX10-NEXT: s_mov_b32 s7, s4 -; GFX10-NEXT: s_mov_b32 s8, s4 -; GFX10-NEXT: s_mov_b32 s9, s4 -; GFX10-NEXT: s_mov_b32 s10, s4 -; GFX10-NEXT: s_mov_b32 s11, s4 -; GFX10-NEXT: s_mov_b32 s12, s4 -; GFX10-NEXT: s_mov_b32 s13, s4 -; GFX10-NEXT: s_mov_b32 s14, s4 -; GFX10-NEXT: s_mov_b32 s15, s4 +; GFX10-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX10-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX10-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX10-NEXT: s_mov_b64 s[12:13], s[4:5] +; GFX10-NEXT: s_mov_b64 s[14:15], s[4:5] ; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo ; GFX10-NEXT: image_sample_lz v1, [v2, v2, v1], s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_3D ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -57,7 +51,7 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) { ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: s_mov_b64 s[4:5], 0 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: ; implicit-def: $sgpr2 ; GFX12-NEXT: s_branch .LBB0_2 @@ -77,17 +71,11 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) { ; GFX12-NEXT: s_cbranch_execz .LBB0_1 ; GFX12-NEXT: ; %bb.3: ; %branch2_merge ; GFX12-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GFX12-NEXT: s_mov_b32 s5, s4 -; GFX12-NEXT: s_mov_b32 s6, s4 -; GFX12-NEXT: s_mov_b32 s7, s4 -; GFX12-NEXT: s_mov_b32 s8, s4 -; GFX12-NEXT: s_mov_b32 s9, s4 -; GFX12-NEXT: s_mov_b32 s10, s4 -; GFX12-NEXT: s_mov_b32 s11, s4 -; GFX12-NEXT: s_mov_b32 s12, s4 -; GFX12-NEXT: s_mov_b32 s13, s4 -; GFX12-NEXT: s_mov_b32 s14, s4 -; GFX12-NEXT: s_mov_b32 s15, s4 +; GFX12-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX12-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX12-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX12-NEXT: s_mov_b64 s[12:13], s[4:5] +; GFX12-NEXT: s_mov_b64 s[14:15], s[4:5] ; GFX12-NEXT: s_and_not1_b32 s2, s2, exec_lo ; GFX12-NEXT: image_sample_lz v1, [v2, v2, v1], s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_3D ; GFX12-NEXT: s_wait_samplecnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll index 3844d6054e130..493139df6357c 100644 --- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll +++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll @@ -8,34 +8,32 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX942-NEXT: s_mov_b32 s2, 0 +; GFX942-NEXT: s_mov_b32 s4, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX942-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 -; GFX942-NEXT: s_mov_b32 s3, 0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0 ; GFX942-NEXT: s_branch .LBB0_2 ; GFX942-NEXT: .LBB0_1: ; %bb2 ; GFX942-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GFX942-NEXT: s_or_b32 s4, s3, 1 -; GFX942-NEXT: s_ashr_i32 s5, s3, 31 -; GFX942-NEXT: s_mov_b32 s3, s2 -; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-NEXT: s_nop 2 ; GFX942-NEXT: v_accvgpr_mov_b32 a0, a2 ; GFX942-NEXT: v_accvgpr_mov_b32 a2, a1 ; GFX942-NEXT: v_accvgpr_mov_b32 a3, a1 -; GFX942-NEXT: s_and_b32 s3, s5, s4 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[2:5], v[2:3], v[2:3], a[0:3] +; GFX942-NEXT: s_or_b32 s2, s4, 1 +; GFX942-NEXT: s_ashr_i32 s3, s4, 31 +; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[2:5], v[0:1], v[0:1], a[0:3] +; GFX942-NEXT: s_and_b32 s4, s3, s2 ; GFX942-NEXT: s_cbranch_execz .LBB0_4 ; GFX942-NEXT: .LBB0_2: ; %bb ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_and_b64 vcc, exec, s[0:1] ; GFX942-NEXT: s_cbranch_vccz .LBB0_1 ; GFX942-NEXT: ; %bb.3: -; GFX942-NEXT: ; implicit-def: $sgpr3 +; GFX942-NEXT: ; implicit-def: $sgpr4 ; GFX942-NEXT: ; implicit-def: $agpr2 ; GFX942-NEXT: .LBB0_4: ; %common.ret ; GFX942-NEXT: s_endpgm @@ -45,38 +43,36 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a1, 0 -; GFX908-NEXT: s_mov_b32 s2, 0 +; GFX908-NEXT: s_mov_b32 s4, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a2, v0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_cmp_lg_u32 s0, 0 ; GFX908-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 -; GFX908-NEXT: s_mov_b32 s3, 0 +; GFX908-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX908-NEXT: v_mov_b32_e32 v0, 0 +; GFX908-NEXT: v_mov_b32_e32 v1, 0 +; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v2 ; GFX908-NEXT: s_branch .LBB0_2 ; GFX908-NEXT: .LBB0_1: ; %bb2 ; GFX908-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GFX908-NEXT: s_or_b32 s4, s3, 1 -; GFX908-NEXT: s_ashr_i32 s5, s3, 31 -; GFX908-NEXT: s_mov_b32 s3, s2 -; GFX908-NEXT: v_mov_b32_e32 v1, s2 -; GFX908-NEXT: s_nop 2 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a2 -; GFX908-NEXT: v_mov_b32_e32 v2, s3 +; GFX908-NEXT: s_nop 4 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v4, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX908-NEXT: v_accvgpr_write_b32 a2, v4 ; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 -; GFX908-NEXT: s_and_b32 s3, s5, s4 -; GFX908-NEXT: v_mfma_f32_16x16x16f16 a[2:5], v[1:2], v[1:2], a[0:3] +; GFX908-NEXT: s_or_b32 s2, s4, 1 +; GFX908-NEXT: v_mfma_f32_16x16x16f16 a[2:5], v[0:1], v[0:1], a[0:3] +; GFX908-NEXT: s_ashr_i32 s3, s4, 31 +; GFX908-NEXT: s_and_b32 s4, s3, s2 ; GFX908-NEXT: s_cbranch_execz .LBB0_4 ; GFX908-NEXT: .LBB0_2: ; %bb ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_and_b64 vcc, exec, s[0:1] ; GFX908-NEXT: s_cbranch_vccz .LBB0_1 ; GFX908-NEXT: ; %bb.3: -; GFX908-NEXT: ; implicit-def: $sgpr3 +; GFX908-NEXT: ; implicit-def: $sgpr4 ; GFX908-NEXT: ; implicit-def: $agpr2 ; GFX908-NEXT: .LBB0_4: ; %common.ret ; GFX908-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll b/llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll index debbfce7dadcc..9769718481f18 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX1150 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX1150 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GIGFX1150 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX12 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX12 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GIGFX12 %s define amdgpu_vs float @fadd_f32(float inreg %a, float inreg %b) { ; CHECK-LABEL: fadd_f32: @@ -45,12 +45,26 @@ define amdgpu_vs float @fmin_f32(float inreg %a, float inreg %b) { ; GFX1150-NEXT: v_mov_b32_e32 v0, s0 ; GFX1150-NEXT: ; return to shader part epilog ; +; GIGFX1150-LABEL: fmin_f32: +; GIGFX1150: ; %bb.0: +; GIGFX1150-NEXT: s_min_f32 s0, s0, s1 +; GIGFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GIGFX1150-NEXT: v_mov_b32_e32 v0, s0 +; GIGFX1150-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: fmin_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_min_num_f32 s0, s0, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog +; +; GIGFX12-LABEL: fmin_f32: +; GIGFX12: ; %bb.0: +; GIGFX12-NEXT: s_min_num_f32 s0, s0, s1 +; GIGFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GIGFX12-NEXT: v_mov_b32_e32 v0, s0 +; GIGFX12-NEXT: ; return to shader part epilog %min = call float @llvm.minnum.f32(float %a, float %b) ret float %min } @@ -63,12 +77,26 @@ define amdgpu_vs float @fmax_f32(float inreg %a, float inreg %b) { ; GFX1150-NEXT: v_mov_b32_e32 v0, s0 ; GFX1150-NEXT: ; return to shader part epilog ; +; GIGFX1150-LABEL: fmax_f32: +; GIGFX1150: ; %bb.0: +; GIGFX1150-NEXT: s_max_f32 s0, s0, s1 +; GIGFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GIGFX1150-NEXT: v_mov_b32_e32 v0, s0 +; GIGFX1150-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: fmax_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_max_num_f32 s0, s0, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog +; +; GIGFX12-LABEL: fmax_f32: +; GIGFX12: ; %bb.0: +; GIGFX12-NEXT: s_max_num_f32 s0, s0, s1 +; GIGFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GIGFX12-NEXT: v_mov_b32_e32 v0, s0 +; GIGFX12-NEXT: ; return to shader part epilog %max = call float @llvm.maxnum.f32(float %a, float %b) ret float %max } @@ -114,12 +142,26 @@ define amdgpu_vs half @fmin_f16(half inreg %a, half inreg %b) { ; GFX1150-NEXT: v_mov_b32_e32 v0, s0 ; GFX1150-NEXT: ; return to shader part epilog ; +; GIGFX1150-LABEL: fmin_f16: +; GIGFX1150: ; %bb.0: +; GIGFX1150-NEXT: s_min_f16 s0, s0, s1 +; GIGFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GIGFX1150-NEXT: v_mov_b32_e32 v0, s0 +; GIGFX1150-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: fmin_f16: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_min_num_f16 s0, s0, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog +; +; GIGFX12-LABEL: fmin_f16: +; GIGFX12: ; %bb.0: +; GIGFX12-NEXT: s_min_num_f16 s0, s0, s1 +; GIGFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GIGFX12-NEXT: v_mov_b32_e32 v0, s0 +; GIGFX12-NEXT: ; return to shader part epilog %min = call half @llvm.minnum.f16(half %a, half %b) ret half %min } @@ -132,12 +174,26 @@ define amdgpu_vs half @fmax_f16(half inreg %a, half inreg %b) { ; GFX1150-NEXT: v_mov_b32_e32 v0, s0 ; GFX1150-NEXT: ; return to shader part epilog ; +; GIGFX1150-LABEL: fmax_f16: +; GIGFX1150: ; %bb.0: +; GIGFX1150-NEXT: s_max_f16 s0, s0, s1 +; GIGFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GIGFX1150-NEXT: v_mov_b32_e32 v0, s0 +; GIGFX1150-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: fmax_f16: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_max_num_f16 s0, s0, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog +; +; GIGFX12-LABEL: fmax_f16: +; GIGFX12: ; %bb.0: +; GIGFX12-NEXT: s_max_num_f16 s0, s0, s1 +; GIGFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GIGFX12-NEXT: v_mov_b32_e32 v0, s0 +; GIGFX12-NEXT: ; return to shader part epilog %max = call half @llvm.maxnum.f16(half %a, half %b) ret half %max } @@ -211,31 +267,53 @@ define amdgpu_vs half @fmac_f16_with_mov(half inreg %a, half inreg %b, half inre define amdgpu_ps float @_amdgpu_ps_main() { ; GFX1150-LABEL: _amdgpu_ps_main: ; GFX1150: ; %bb.0: ; %bb -; GFX1150-NEXT: s_mov_b32 s0, 0 -; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1150-NEXT: s_mov_b32 s1, s0 -; GFX1150-NEXT: s_mov_b32 s2, s0 -; GFX1150-NEXT: s_mov_b32 s3, s0 +; GFX1150-NEXT: s_mov_b64 s[0:1], 0 +; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_3) +; GFX1150-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX1150-NEXT: s_buffer_load_b64 s[0:1], s[0:3], 0x0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_fmac_f32 s0, s1, 4.0 -; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1150-NEXT: v_mov_b32_e32 v0, s0 ; GFX1150-NEXT: ; return to shader part epilog ; +; GIGFX1150-LABEL: _amdgpu_ps_main: +; GIGFX1150: ; %bb.0: ; %bb +; GIGFX1150-NEXT: s_mov_b32 s0, 0 +; GIGFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GIGFX1150-NEXT: s_mov_b32 s1, s0 +; GIGFX1150-NEXT: s_mov_b32 s2, s0 +; GIGFX1150-NEXT: s_mov_b32 s3, s0 +; GIGFX1150-NEXT: s_buffer_load_b64 s[0:1], s[0:3], 0x0 +; GIGFX1150-NEXT: s_waitcnt lgkmcnt(0) +; GIGFX1150-NEXT: s_fmac_f32 s0, s1, 4.0 +; GIGFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GIGFX1150-NEXT: v_mov_b32_e32 v0, s0 +; GIGFX1150-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: _amdgpu_ps_main: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s1, s0 -; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_3) +; GFX12-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX12-NEXT: s_buffer_load_b64 s[0:1], s[0:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_fmac_f32 s0, s1, 4.0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog +; +; GIGFX12-LABEL: _amdgpu_ps_main: +; GIGFX12: ; %bb.0: ; %bb +; GIGFX12-NEXT: s_mov_b32 s0, 0 +; GIGFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GIGFX12-NEXT: s_mov_b32 s1, s0 +; GIGFX12-NEXT: s_mov_b32 s2, s0 +; GIGFX12-NEXT: s_mov_b32 s3, s0 +; GIGFX12-NEXT: s_buffer_load_b64 s[0:1], s[0:3], 0x0 +; GIGFX12-NEXT: s_wait_kmcnt 0x0 +; GIGFX12-NEXT: s_fmac_f32 s0, s1, 4.0 +; GIGFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GIGFX12-NEXT: v_mov_b32_e32 v0, s0 +; GIGFX12-NEXT: ; return to shader part epilog bb: %i = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> zeroinitializer, i32 0, i32 0) %i1 = bitcast i32 %i to float diff --git a/llvm/test/CodeGen/AMDGPU/scheduler-rp-calc-one-successor-two-predecessors-bug.ll b/llvm/test/CodeGen/AMDGPU/scheduler-rp-calc-one-successor-two-predecessors-bug.ll index 118c47e680709..bc9a3ec97ae34 100644 --- a/llvm/test/CodeGen/AMDGPU/scheduler-rp-calc-one-successor-two-predecessors-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/scheduler-rp-calc-one-successor-two-predecessors-bug.ll @@ -9,38 +9,32 @@ declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 define amdgpu_ps void @_amdgpu_ps_main(float %arg) { ; GFX900-LABEL: _amdgpu_ps_main: ; GFX900: ; %bb.0: ; %bb -; GFX900-NEXT: s_mov_b64 s[4:5], exec +; GFX900-NEXT: s_mov_b64 s[0:1], exec ; GFX900-NEXT: s_wqm_b64 exec, exec ; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: s_mov_b32 s0, 0 ; GFX900-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 ; GFX900-NEXT: ; implicit-def: $vgpr0 ; GFX900-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX900-NEXT: s_xor_b64 s[6:7], exec, s[2:3] +; GFX900-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX900-NEXT: s_cbranch_execz .LBB0_2 ; GFX900-NEXT: ; %bb.1: ; %bb1 +; GFX900-NEXT: s_mov_b64 s[12:13], 0 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: s_mov_b32 s1, s0 -; GFX900-NEXT: s_mov_b32 s2, s0 -; GFX900-NEXT: s_mov_b32 s3, s0 -; GFX900-NEXT: s_mov_b32 s8, s0 -; GFX900-NEXT: s_mov_b32 s9, s0 -; GFX900-NEXT: s_mov_b32 s10, s0 -; GFX900-NEXT: s_mov_b32 s11, s0 -; GFX900-NEXT: s_mov_b32 s12, s0 -; GFX900-NEXT: s_mov_b32 s13, s0 -; GFX900-NEXT: s_mov_b32 s14, s0 -; GFX900-NEXT: s_mov_b32 s15, s0 -; GFX900-NEXT: image_sample v[0:1], v[0:1], s[8:15], s[0:3] dmask:0x3 +; GFX900-NEXT: s_mov_b64 s[14:15], s[12:13] +; GFX900-NEXT: s_mov_b64 s[4:5], s[12:13] +; GFX900-NEXT: s_mov_b64 s[6:7], s[12:13] +; GFX900-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX900-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX900-NEXT: image_sample v[0:1], v[0:1], s[4:11], s[12:15] dmask:0x3 ; GFX900-NEXT: .LBB0_2: ; %Flow -; GFX900-NEXT: s_or_saveexec_b64 s[0:1], s[6:7] -; GFX900-NEXT: s_and_b64 exec, exec, s[4:5] -; GFX900-NEXT: s_and_b64 s[0:1], exec, s[0:1] +; GFX900-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX900-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX900-NEXT: s_and_b64 s[2:3], exec, s[2:3] ; GFX900-NEXT: v_mov_b32_e32 v2, 1.0 -; GFX900-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX900-NEXT: s_xor_b64 exec, exec, s[2:3] ; GFX900-NEXT: s_cbranch_execz .LBB0_5 ; GFX900-NEXT: ; %bb.3: ; %bb5 -; GFX900-NEXT: s_andn2_b64 s[4:5], s[4:5], exec +; GFX900-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX900-NEXT: s_cbranch_scc0 .LBB0_6 ; GFX900-NEXT: ; %bb.4: ; %bb5 ; GFX900-NEXT: s_mov_b64 exec, 0 @@ -48,7 +42,7 @@ define amdgpu_ps void @_amdgpu_ps_main(float %arg) { ; GFX900-NEXT: v_mov_b32_e32 v1, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: .LBB0_5: ; %bb6 -; GFX900-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX900-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_cvt_pkrtz_f16_f32 v1, 0, v1 ; GFX900-NEXT: v_cvt_pkrtz_f16_f32 v0, v2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/set_kill_i1_for_floation_point_comparison.ll b/llvm/test/CodeGen/AMDGPU/set_kill_i1_for_floation_point_comparison.ll index 5f101c360f148..bb54184dd73c1 100644 --- a/llvm/test/CodeGen/AMDGPU/set_kill_i1_for_floation_point_comparison.ll +++ b/llvm/test/CodeGen/AMDGPU/set_kill_i1_for_floation_point_comparison.ll @@ -6,14 +6,14 @@ define amdgpu_ps void @_amdgpu_ps_main() { ; CHECK: bb.0.entry: ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3 + ; CHECK-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B]], %subreg.sub0_sub1, [[S_MOV_B]], %subreg.sub2_sub3 ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM killed [[REG_SEQUENCE]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 - ; CHECK-NEXT: nofpexcept S_CMP_NLT_F32 [[S_BUFFER_LOAD_DWORD_IMM]], [[S_MOV_B32_1]], implicit-def $scc, implicit $mode + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 + ; CHECK-NEXT: nofpexcept S_CMP_NLT_F32 [[S_BUFFER_LOAD_DWORD_IMM]], [[S_MOV_B32_]], implicit-def $scc, implicit $mode ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY $scc ; CHECK-NEXT: SI_KILL_I1_PSEUDO killed [[COPY]], 0, implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; CHECK-NEXT: nofpexcept S_CMP_LT_F32 [[S_BUFFER_LOAD_DWORD_IMM]], [[S_MOV_B32_1]], implicit-def $scc, implicit $mode + ; CHECK-NEXT: nofpexcept S_CMP_LT_F32 [[S_BUFFER_LOAD_DWORD_IMM]], [[S_MOV_B32_]], implicit-def $scc, implicit $mode ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc ; CHECK-NEXT: S_BRANCH %bb.1 ; CHECK-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll b/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll index d34769ad0fcf0..376c49e4ab769 100644 --- a/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll +++ b/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll @@ -10,9 +10,9 @@ define void @__omp_offloading_35_36570d3__ZN6openmc31process_advance_particle_ev ; GCN-NEXT: .cfi_startproc ; GCN-NEXT: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: global_load_dwordx2 v[1:2], v[1:2], off +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: global_load_dwordx2 v[1:2], v[3:4], off ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1 @@ -27,8 +27,6 @@ define void @__omp_offloading_35_36570d3__ZN6openmc31process_advance_particle_ev ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; GCN-NEXT: .LBB0_3: ; %bb2 -; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: v_mov_b32_e32 v4, v3 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: flat_store_dwordx2 v[1:2], v[3:4] ; GCN-NEXT: ; implicit-def: $vgpr1_vgpr2 @@ -36,7 +34,7 @@ define void @__omp_offloading_35_36570d3__ZN6openmc31process_advance_particle_ev ; GCN-NEXT: s_cbranch_execz .LBB0_2 ; GCN-NEXT: .LBB0_4: ; %bb1 ; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: v_mov_b32_e32 v4, v3 +; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: flat_store_dwordx2 v[1:2], v[3:4] ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/si-scheduler-exports.ll b/llvm/test/CodeGen/AMDGPU/si-scheduler-exports.ll index ac271ff6a258b..a2751cad5e902 100644 --- a/llvm/test/CodeGen/AMDGPU/si-scheduler-exports.ll +++ b/llvm/test/CodeGen/AMDGPU/si-scheduler-exports.ll @@ -5,15 +5,14 @@ define amdgpu_gs void @_amdgpu_gs_main() { ; CHECK-LABEL: _amdgpu_gs_main: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_mov_b32 s0, 0 -; CHECK-NEXT: s_mov_b32 s1, s0 -; CHECK-NEXT: s_mov_b32 s2, s0 -; CHECK-NEXT: v_mov_b32_e32 v1, v0 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: s_mov_b64 s[0:1], 0 +; CHECK-NEXT: s_mov_b64 s[2:3], s[0:1] +; CHECK-NEXT: v_mov_b32_e32 v3, v1 ; CHECK-NEXT: v_mov_b32_e32 v2, v0 -; CHECK-NEXT: v_mov_b32_e32 v3, v0 -; CHECK-NEXT: s_mov_b32 s3, s0 ; CHECK-NEXT: exp mrt0 off, off, off, off -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen ; CHECK-NEXT: s_endpgm entry: call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, i1 false, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll b/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll index 899ec36e9b2fe..a4e876604f50d 100644 --- a/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll +++ b/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll @@ -6,25 +6,22 @@ define protected amdgpu_kernel void @test(ptr addrspace(1) %in, ptr addrspace(1) ; GFX942-LABEL: test: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], 0 +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5] +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 -; GFX942-NEXT: v_mov_b64_e32 v[10:11], v[2:3] -; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[0:1] ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v12, s4 -; GFX942-NEXT: v_mov_b32_e32 v13, s5 -; GFX942-NEXT: v_mov_b32_e32 v4, s6 -; GFX942-NEXT: v_mov_b32_e32 v5, s7 -; GFX942-NEXT: v_mov_b32_e32 v6, s7 -; GFX942-NEXT: v_mov_b32_e32 v7, s7 +; GFX942-NEXT: v_mov_b32_e32 v8, s4 +; GFX942-NEXT: v_mov_b32_e32 v9, s5 +; GFX942-NEXT: v_mov_b32_e32 v0, s6 +; GFX942-NEXT: v_mov_b32_e32 v1, s7 +; GFX942-NEXT: v_mov_b32_e32 v2, s7 +; GFX942-NEXT: v_mov_b32_e32 v3, s7 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_smfmac_i32_16x16x64_i8 v[8:11], v[12:13], v[4:7], v13 +; GFX942-NEXT: v_smfmac_i32_16x16x64_i8 v[4:7], v[8:9], v[0:3], v9 ; GFX942-NEXT: s_nop 6 -; GFX942-NEXT: global_store_dword v0, v11, s[2:3] offset:12 +; GFX942-NEXT: global_store_dword v10, v7, s[2:3] offset:12 ; GFX942-NEXT: s_endpgm entry: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 0 diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll index c611c4b502817..681ceb9e87cbd 100644 --- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll +++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll @@ -50,12 +50,12 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: KILL [[S_ADD_U32_]].sub0, [[S_ADD_U32_]].sub1 ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_LOAD_DWORDX4_IMM]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = S_MOV_B32 0 + ; CHECK-NEXT: undef [[S_MOV_B:%[0-9]+]].sub0_sub1:sgpr_128 = S_MOV_B64_IMM_PSEUDO 0 ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %118:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[S_MOV_B:%[0-9]+]].sub2_sub3:sgpr_128 = COPY [[S_MOV_B]].sub0_sub1 ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %89:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: KILL undef %89:sgpr_128 - ; CHECK-NEXT: KILL undef %118:sgpr_128 ; CHECK-NEXT: [[S_SUB_I32_2:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM1]], 31, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_1:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_1:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc @@ -86,19 +86,19 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_ADD_U32_11:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_]], 16, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_2]], 16, implicit-def dead $scc - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], undef %301:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_MOV_B32_]], 16, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %356:sgpr_128, undef %357:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %367:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B]], [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B]], undef %296:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B]], [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_MOV_B]], 16, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %351:sgpr_128, undef %352:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %362:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 64, 0 :: (invariant load (s128) from %ir.99, addrspace 4) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 64, 0 :: (invariant load (s128) from %ir.107, addrspace 4) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 0, 0 :: (invariant load (s128) from %ir.117, addrspace 4) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 0, 0 :: (invariant load (s128) from %ir.124, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %351:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %362:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %346:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %357:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM]], -98, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM1]], -114, implicit-def dead $scc @@ -116,7 +116,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY12]], 4, implicit-def dead $scc ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_3]], 16, implicit-def dead $scc - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %383:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %378:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 224, 0 :: (invariant load (s128) from %ir.129, addrspace 4) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY7]], 224, 0 :: (invariant load (s128) from %ir.145, addrspace 4) @@ -154,9 +154,10 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub3:sgpr_128 = S_MOV_B32 553734060 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 -1 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_9]], 0, 0 :: (invariant load (s128) from %ir.170, addrspace 4) - ; CHECK-NEXT: [[COPY15:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub1 + ; CHECK-NEXT: [[COPY15:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]] ; CHECK-NEXT: [[COPY15:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM]] ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY15]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM15]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) @@ -198,9 +199,9 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[COPY17:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_1]] ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY17]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.256, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %469:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) undef`, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %464:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) undef`, addrspace 4) ; CHECK-NEXT: KILL [[S_ADD_U32_16]].sub0, [[S_ADD_U32_16]].sub1 - ; CHECK-NEXT: KILL undef %469:sreg_64 + ; CHECK-NEXT: KILL undef %464:sreg_64 ; CHECK-NEXT: KILL [[COPY17]].sub0_sub1_sub2, [[COPY17]].sub3 ; CHECK-NEXT: [[S_LSHL_B32_8:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY14]], 3, implicit-def dead $scc ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_17]], 160, 0 :: (invariant load (s128) from %ir.265, addrspace 4) @@ -211,8 +212,8 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_21]], 168, 0 :: (invariant load (s32) from %ir.305, align 8, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM23]] ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM24]] + ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM23]] ; CHECK-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORD_IMM1]], 65535, implicit-def dead $scc ; CHECK-NEXT: [[COPY18:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] ; CHECK-NEXT: [[COPY18:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_2]] @@ -351,13 +352,13 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[V_OR_B32_e64_64:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_63]], [[V_ADD_U32_e64_28]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -593, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_65:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_64]], [[V_ADD_U32_e64_29]], implicit $exec - ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %542:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) undef`, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %537:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) undef`, addrspace 4) ; CHECK-NEXT: [[V_OR_B32_e64_66:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_65]], [[V_ADD_U32_e64_30]], implicit $exec ; CHECK-NEXT: [[S_ADD_I32_24:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM8]], -594, implicit-def dead $scc ; CHECK-NEXT: [[V_OR_B32_e64_67:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_24]], [[V_OR_B32_e64_66]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 0, [[V_OR_B32_e64_67]], implicit $exec ; CHECK-NEXT: undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub3:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_EQ_U32_e64_]], implicit $exec - ; CHECK-NEXT: IMAGE_STORE_V4_V2_nsa_gfx10 [[V_CNDMASK_B32_e64_]], undef %556:vgpr_32, undef %558:vgpr_32, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8) + ; CHECK-NEXT: IMAGE_STORE_V4_V2_nsa_gfx10 [[V_CNDMASK_B32_e64_]], undef %551:vgpr_32, undef %553:vgpr_32, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 .expVert: %0 = extractelement <31 x i32> %userData, i64 2 diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll index 474482b2d89ff..d3dcab18a25a2 100644 --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -34,11 +34,11 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_mov_b64 s[36:37], s[6:7] ; GLOBALNESS1-NEXT: s_load_dwordx4 s[52:55], s[8:9], 0x0 ; GLOBALNESS1-NEXT: s_load_dword s6, s[8:9], 0x14 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v42, 0 -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[44:45], 0, 0 -; GLOBALNESS1-NEXT: global_store_dword v[44:45], v42, off +; GLOBALNESS1-NEXT: v_mov_b32_e32 v40, 0 +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[42:43], 0, 0 +; GLOBALNESS1-NEXT: global_store_dword v[42:43], v40, off ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: global_load_dword v2, v42, s[52:53] +; GLOBALNESS1-NEXT: global_load_dword v2, v40, s[52:53] ; GLOBALNESS1-NEXT: s_mov_b64 s[48:49], s[4:5] ; GLOBALNESS1-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 ; GLOBALNESS1-NEXT: s_load_dword s7, s[8:9], 0x20 @@ -70,21 +70,21 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GLOBALNESS1-NEXT: s_mov_b64 s[38:39], s[8:9] ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[8:9], 1, v1 -; GLOBALNESS1-NEXT: ; implicit-def: $vgpr59 : SGPR spill to VGPR lane +; GLOBALNESS1-NEXT: ; implicit-def: $vgpr58 : SGPR spill to VGPR lane ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GLOBALNESS1-NEXT: v_writelane_b32 v59, s8, 0 +; GLOBALNESS1-NEXT: v_writelane_b32 v58, s8, 0 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[68:69], 1, v0 -; GLOBALNESS1-NEXT: v_writelane_b32 v59, s9, 1 +; GLOBALNESS1-NEXT: v_writelane_b32 v58, s9, 1 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[70:71], 1, v3 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v46, 0x80 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v44, 0x80 ; GLOBALNESS1-NEXT: s_mov_b32 s82, s16 ; GLOBALNESS1-NEXT: s_mov_b32 s83, s15 ; GLOBALNESS1-NEXT: s_mov_b32 s84, s14 ; GLOBALNESS1-NEXT: s_mov_b64 s[34:35], s[10:11] -; GLOBALNESS1-NEXT: v_mov_b32_e32 v47, 0 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v45, 0 ; GLOBALNESS1-NEXT: s_mov_b32 s32, 0 -; GLOBALNESS1-NEXT: ; implicit-def: $vgpr56_vgpr57 +; GLOBALNESS1-NEXT: ; implicit-def: $vgpr46_vgpr47 ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) ; GLOBALNESS1-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -93,24 +93,24 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GLOBALNESS1-NEXT: v_writelane_b32 v59, s4, 2 +; GLOBALNESS1-NEXT: v_writelane_b32 v58, s4, 2 ; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GLOBALNESS1-NEXT: v_writelane_b32 v59, s5, 3 +; GLOBALNESS1-NEXT: v_writelane_b32 v58, s5, 3 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v3 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GLOBALNESS1-NEXT: v_writelane_b32 v59, s4, 4 -; GLOBALNESS1-NEXT: v_writelane_b32 v59, s5, 5 +; GLOBALNESS1-NEXT: v_writelane_b32 v58, s4, 4 +; GLOBALNESS1-NEXT: v_writelane_b32 v58, s5, 5 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v2 -; GLOBALNESS1-NEXT: v_writelane_b32 v59, s4, 6 -; GLOBALNESS1-NEXT: v_writelane_b32 v59, s5, 7 +; GLOBALNESS1-NEXT: v_writelane_b32 v58, s4, 6 +; GLOBALNESS1-NEXT: v_writelane_b32 v58, s5, 7 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[80:81], 1, v1 -; GLOBALNESS1-NEXT: v_writelane_b32 v59, s70, 8 -; GLOBALNESS1-NEXT: v_writelane_b32 v59, s71, 9 +; GLOBALNESS1-NEXT: v_writelane_b32 v58, s70, 8 +; GLOBALNESS1-NEXT: v_writelane_b32 v58, s71, 9 ; GLOBALNESS1-NEXT: s_branch .LBB1_4 ; GLOBALNESS1-NEXT: .LBB1_1: ; %bb70.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_readlane_b32 s6, v59, 6 -; GLOBALNESS1-NEXT: v_readlane_b32 s7, v59, 7 +; GLOBALNESS1-NEXT: v_readlane_b32 s6, v58, 6 +; GLOBALNESS1-NEXT: v_readlane_b32 s7, v58, 7 ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7] ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_28 ; GLOBALNESS1-NEXT: .LBB1_2: ; %Flow15 @@ -120,15 +120,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: .LBB1_3: ; %Flow28 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[56:57], v[0:1], v[0:1] op_sel:[0,1] +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[46:47], v[0:1], v[0:1] op_sel:[0,1] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_29 ; GLOBALNESS1-NEXT: .LBB1_4: ; %bb5 ; GLOBALNESS1-NEXT: ; =>This Loop Header: Depth=1 ; GLOBALNESS1-NEXT: ; Child Loop BB1_16 Depth 2 -; GLOBALNESS1-NEXT: flat_load_dword v40, v[46:47] +; GLOBALNESS1-NEXT: flat_load_dword v56, v[44:45] ; GLOBALNESS1-NEXT: s_add_u32 s8, s38, 40 -; GLOBALNESS1-NEXT: buffer_store_dword v42, off, s[0:3], 0 -; GLOBALNESS1-NEXT: flat_load_dword v58, v[46:47] +; GLOBALNESS1-NEXT: buffer_store_dword v40, off, s[0:3], 0 +; GLOBALNESS1-NEXT: flat_load_dword v57, v[44:45] ; GLOBALNESS1-NEXT: s_addc_u32 s9, s39, 0 ; GLOBALNESS1-NEXT: s_getpc_b64 s[4:5] ; GLOBALNESS1-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4 @@ -176,7 +176,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_3 ; GLOBALNESS1-NEXT: ; %bb.10: ; %baz.exit.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: flat_load_dword v0, v[44:45] +; GLOBALNESS1-NEXT: flat_load_dword v0, v[42:43] ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[86:87], 0, v0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0 @@ -185,21 +185,20 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_25 ; GLOBALNESS1-NEXT: ; %bb.11: ; %bb33.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[44:45], off -; GLOBALNESS1-NEXT: v_writelane_b32 v59, s8, 10 -; GLOBALNESS1-NEXT: v_writelane_b32 v59, s9, 11 -; GLOBALNESS1-NEXT: v_readlane_b32 s4, v59, 2 -; GLOBALNESS1-NEXT: v_readlane_b32 s5, v59, 3 +; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[42:43], off +; GLOBALNESS1-NEXT: v_writelane_b32 v58, s8, 10 +; GLOBALNESS1-NEXT: v_writelane_b32 v58, s9, 11 +; GLOBALNESS1-NEXT: v_readlane_b32 s4, v58, 2 +; GLOBALNESS1-NEXT: v_readlane_b32 s5, v58, 3 ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[4:5] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_13 ; GLOBALNESS1-NEXT: ; %bb.12: ; %bb39.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off +; GLOBALNESS1-NEXT: global_store_dwordx2 v[42:43], v[42:43], off ; GLOBALNESS1-NEXT: .LBB1_13: ; %bb44.lr.ph.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v58 -; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v57 +; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v56, vcc ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) ; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -228,8 +227,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS1-NEXT: ; %bb.19: ; %bb3.i.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: v_readlane_b32 s4, v59, 0 -; GLOBALNESS1-NEXT: v_readlane_b32 s5, v59, 1 +; GLOBALNESS1-NEXT: v_readlane_b32 s4, v58, 0 +; GLOBALNESS1-NEXT: v_readlane_b32 s5, v58, 1 ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[4:5] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS1-NEXT: ; %bb.20: ; %bb6.i.i @@ -265,25 +264,24 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_mov_b32 s13, s83 ; GLOBALNESS1-NEXT: s_mov_b32 s14, s82 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[56:57], off +; GLOBALNESS1-NEXT: global_store_dwordx2 v[42:43], v[46:47], off ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[54:55] ; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[96:97] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_14 ; GLOBALNESS1-NEXT: ; %bb.23: ; %bb62.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off +; GLOBALNESS1-NEXT: global_store_dwordx2 v[42:43], v[42:43], off ; GLOBALNESS1-NEXT: s_branch .LBB1_14 ; GLOBALNESS1-NEXT: .LBB1_24: ; %Flow23 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_load_dwordx4 s[4:7], s[38:39], 0x0 -; GLOBALNESS1-NEXT: v_readlane_b32 s70, v59, 8 -; GLOBALNESS1-NEXT: v_readlane_b32 s8, v59, 10 +; GLOBALNESS1-NEXT: v_readlane_b32 s70, v58, 8 +; GLOBALNESS1-NEXT: v_readlane_b32 s8, v58, 10 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GLOBALNESS1-NEXT: v_readlane_b32 s71, v59, 9 +; GLOBALNESS1-NEXT: v_readlane_b32 s71, v58, 9 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS1-NEXT: s_mov_b32 s55, s7 -; GLOBALNESS1-NEXT: v_readlane_b32 s9, v59, 11 +; GLOBALNESS1-NEXT: v_readlane_b32 s9, v58, 11 ; GLOBALNESS1-NEXT: .LBB1_25: ; %Flow24 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[52:53] @@ -291,19 +289,17 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_2 ; GLOBALNESS1-NEXT: ; %bb.26: ; %bb67.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_readlane_b32 s6, v59, 4 -; GLOBALNESS1-NEXT: v_readlane_b32 s7, v59, 5 +; GLOBALNESS1-NEXT: v_readlane_b32 s6, v58, 4 +; GLOBALNESS1-NEXT: v_readlane_b32 s7, v58, 5 ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_1 ; GLOBALNESS1-NEXT: ; %bb.27: ; %bb69.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off +; GLOBALNESS1-NEXT: global_store_dwordx2 v[42:43], v[42:43], off ; GLOBALNESS1-NEXT: s_branch .LBB1_1 ; GLOBALNESS1-NEXT: .LBB1_28: ; %bb73.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off +; GLOBALNESS1-NEXT: global_store_dwordx2 v[42:43], v[42:43], off ; GLOBALNESS1-NEXT: s_branch .LBB1_2 ; GLOBALNESS1-NEXT: .LBB1_29: ; %loop.exit.guard ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[8:9] @@ -348,11 +344,11 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_mov_b64 s[36:37], s[6:7] ; GLOBALNESS0-NEXT: s_load_dwordx4 s[52:55], s[8:9], 0x0 ; GLOBALNESS0-NEXT: s_load_dword s6, s[8:9], 0x14 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v42, 0 -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[44:45], 0, 0 -; GLOBALNESS0-NEXT: global_store_dword v[44:45], v42, off +; GLOBALNESS0-NEXT: v_mov_b32_e32 v40, 0 +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[42:43], 0, 0 +; GLOBALNESS0-NEXT: global_store_dword v[42:43], v40, off ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS0-NEXT: global_load_dword v2, v42, s[52:53] +; GLOBALNESS0-NEXT: global_load_dword v2, v40, s[52:53] ; GLOBALNESS0-NEXT: s_mov_b64 s[48:49], s[4:5] ; GLOBALNESS0-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 ; GLOBALNESS0-NEXT: s_load_dword s7, s[8:9], 0x20 @@ -384,21 +380,21 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GLOBALNESS0-NEXT: s_mov_b64 s[38:39], s[8:9] ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[8:9], 1, v1 -; GLOBALNESS0-NEXT: ; implicit-def: $vgpr59 : SGPR spill to VGPR lane +; GLOBALNESS0-NEXT: ; implicit-def: $vgpr58 : SGPR spill to VGPR lane ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GLOBALNESS0-NEXT: v_writelane_b32 v59, s8, 0 +; GLOBALNESS0-NEXT: v_writelane_b32 v58, s8, 0 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[68:69], 1, v0 -; GLOBALNESS0-NEXT: v_writelane_b32 v59, s9, 1 +; GLOBALNESS0-NEXT: v_writelane_b32 v58, s9, 1 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[84:85], 1, v3 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v46, 0x80 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v44, 0x80 ; GLOBALNESS0-NEXT: s_mov_b32 s70, s16 ; GLOBALNESS0-NEXT: s_mov_b32 s71, s15 ; GLOBALNESS0-NEXT: s_mov_b32 s82, s14 ; GLOBALNESS0-NEXT: s_mov_b64 s[34:35], s[10:11] -; GLOBALNESS0-NEXT: v_mov_b32_e32 v47, 0 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v45, 0 ; GLOBALNESS0-NEXT: s_mov_b32 s32, 0 -; GLOBALNESS0-NEXT: ; implicit-def: $vgpr56_vgpr57 +; GLOBALNESS0-NEXT: ; implicit-def: $vgpr46_vgpr47 ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) ; GLOBALNESS0-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -407,24 +403,24 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GLOBALNESS0-NEXT: v_writelane_b32 v59, s4, 2 +; GLOBALNESS0-NEXT: v_writelane_b32 v58, s4, 2 ; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GLOBALNESS0-NEXT: v_writelane_b32 v59, s5, 3 +; GLOBALNESS0-NEXT: v_writelane_b32 v58, s5, 3 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v3 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GLOBALNESS0-NEXT: v_writelane_b32 v59, s4, 4 -; GLOBALNESS0-NEXT: v_writelane_b32 v59, s5, 5 +; GLOBALNESS0-NEXT: v_writelane_b32 v58, s4, 4 +; GLOBALNESS0-NEXT: v_writelane_b32 v58, s5, 5 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v2 -; GLOBALNESS0-NEXT: v_writelane_b32 v59, s4, 6 -; GLOBALNESS0-NEXT: v_writelane_b32 v59, s5, 7 +; GLOBALNESS0-NEXT: v_writelane_b32 v58, s4, 6 +; GLOBALNESS0-NEXT: v_writelane_b32 v58, s5, 7 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[80:81], 1, v1 -; GLOBALNESS0-NEXT: v_writelane_b32 v59, s84, 8 -; GLOBALNESS0-NEXT: v_writelane_b32 v59, s85, 9 +; GLOBALNESS0-NEXT: v_writelane_b32 v58, s84, 8 +; GLOBALNESS0-NEXT: v_writelane_b32 v58, s85, 9 ; GLOBALNESS0-NEXT: s_branch .LBB1_4 ; GLOBALNESS0-NEXT: .LBB1_1: ; %bb70.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_readlane_b32 s6, v59, 6 -; GLOBALNESS0-NEXT: v_readlane_b32 s7, v59, 7 +; GLOBALNESS0-NEXT: v_readlane_b32 s6, v58, 6 +; GLOBALNESS0-NEXT: v_readlane_b32 s7, v58, 7 ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7] ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_28 ; GLOBALNESS0-NEXT: .LBB1_2: ; %Flow15 @@ -434,15 +430,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: .LBB1_3: ; %Flow28 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[56:57], v[0:1], v[0:1] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[46:47], v[0:1], v[0:1] op_sel:[0,1] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_29 ; GLOBALNESS0-NEXT: .LBB1_4: ; %bb5 ; GLOBALNESS0-NEXT: ; =>This Loop Header: Depth=1 ; GLOBALNESS0-NEXT: ; Child Loop BB1_16 Depth 2 -; GLOBALNESS0-NEXT: flat_load_dword v40, v[46:47] +; GLOBALNESS0-NEXT: flat_load_dword v56, v[44:45] ; GLOBALNESS0-NEXT: s_add_u32 s8, s38, 40 -; GLOBALNESS0-NEXT: buffer_store_dword v42, off, s[0:3], 0 -; GLOBALNESS0-NEXT: flat_load_dword v58, v[46:47] +; GLOBALNESS0-NEXT: buffer_store_dword v40, off, s[0:3], 0 +; GLOBALNESS0-NEXT: flat_load_dword v57, v[44:45] ; GLOBALNESS0-NEXT: s_addc_u32 s9, s39, 0 ; GLOBALNESS0-NEXT: s_getpc_b64 s[4:5] ; GLOBALNESS0-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4 @@ -490,7 +486,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_3 ; GLOBALNESS0-NEXT: ; %bb.10: ; %baz.exit.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: flat_load_dword v0, v[44:45] +; GLOBALNESS0-NEXT: flat_load_dword v0, v[42:43] ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[86:87], 0, v0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0 @@ -499,22 +495,21 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_25 ; GLOBALNESS0-NEXT: ; %bb.11: ; %bb33.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[44:45], off -; GLOBALNESS0-NEXT: v_writelane_b32 v59, s8, 10 -; GLOBALNESS0-NEXT: v_writelane_b32 v59, s9, 11 -; GLOBALNESS0-NEXT: v_readlane_b32 s4, v59, 2 -; GLOBALNESS0-NEXT: v_readlane_b32 s5, v59, 3 +; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[42:43], off +; GLOBALNESS0-NEXT: v_writelane_b32 v58, s8, 10 +; GLOBALNESS0-NEXT: v_writelane_b32 v58, s9, 11 +; GLOBALNESS0-NEXT: v_readlane_b32 s4, v58, 2 +; GLOBALNESS0-NEXT: v_readlane_b32 s5, v58, 3 ; GLOBALNESS0-NEXT: s_mov_b32 s83, s55 ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[4:5] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_13 ; GLOBALNESS0-NEXT: ; %bb.12: ; %bb39.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off +; GLOBALNESS0-NEXT: global_store_dwordx2 v[42:43], v[42:43], off ; GLOBALNESS0-NEXT: .LBB1_13: ; %bb44.lr.ph.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v58 -; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v57 +; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v56, vcc ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) ; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -543,8 +538,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS0-NEXT: ; %bb.19: ; %bb3.i.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: v_readlane_b32 s4, v59, 0 -; GLOBALNESS0-NEXT: v_readlane_b32 s5, v59, 1 +; GLOBALNESS0-NEXT: v_readlane_b32 s4, v58, 0 +; GLOBALNESS0-NEXT: v_readlane_b32 s5, v58, 1 ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[4:5] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS0-NEXT: ; %bb.20: ; %bb6.i.i @@ -580,23 +575,22 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_mov_b32 s13, s71 ; GLOBALNESS0-NEXT: s_mov_b32 s14, s70 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[56:57], off +; GLOBALNESS0-NEXT: global_store_dwordx2 v[42:43], v[46:47], off ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[54:55] ; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[96:97] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_14 ; GLOBALNESS0-NEXT: ; %bb.23: ; %bb62.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off +; GLOBALNESS0-NEXT: global_store_dwordx2 v[42:43], v[42:43], off ; GLOBALNESS0-NEXT: s_branch .LBB1_14 ; GLOBALNESS0-NEXT: .LBB1_24: ; %Flow23 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_readlane_b32 s84, v59, 8 -; GLOBALNESS0-NEXT: v_readlane_b32 s8, v59, 10 +; GLOBALNESS0-NEXT: v_readlane_b32 s84, v58, 8 +; GLOBALNESS0-NEXT: v_readlane_b32 s8, v58, 10 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; GLOBALNESS0-NEXT: s_mov_b32 s55, s83 -; GLOBALNESS0-NEXT: v_readlane_b32 s85, v59, 9 -; GLOBALNESS0-NEXT: v_readlane_b32 s9, v59, 11 +; GLOBALNESS0-NEXT: v_readlane_b32 s85, v58, 9 +; GLOBALNESS0-NEXT: v_readlane_b32 s9, v58, 11 ; GLOBALNESS0-NEXT: .LBB1_25: ; %Flow24 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[52:53] @@ -604,19 +598,17 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_2 ; GLOBALNESS0-NEXT: ; %bb.26: ; %bb67.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_readlane_b32 s6, v59, 4 -; GLOBALNESS0-NEXT: v_readlane_b32 s7, v59, 5 +; GLOBALNESS0-NEXT: v_readlane_b32 s6, v58, 4 +; GLOBALNESS0-NEXT: v_readlane_b32 s7, v58, 5 ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_1 ; GLOBALNESS0-NEXT: ; %bb.27: ; %bb69.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off +; GLOBALNESS0-NEXT: global_store_dwordx2 v[42:43], v[42:43], off ; GLOBALNESS0-NEXT: s_branch .LBB1_1 ; GLOBALNESS0-NEXT: .LBB1_28: ; %bb73.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off +; GLOBALNESS0-NEXT: global_store_dwordx2 v[42:43], v[42:43], off ; GLOBALNESS0-NEXT: s_branch .LBB1_2 ; GLOBALNESS0-NEXT: .LBB1_29: ; %loop.exit.guard ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[8:9] diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll index a401f989a2507..9867cd9495005 100644 --- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll @@ -459,11 +459,10 @@ define amdgpu_kernel void @v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspa ; GFX942-NEXT: ; %bb.1: ; %bb.1 ; GFX942-NEXT: global_load_dwordx2 v[2:3], v5, s[10:11] ; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v4 -; GFX942-NEXT: s_waitcnt vmcnt(1) -; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX942-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: s_waitcnt vmcnt(1) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0 ; GFX942-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX942-NEXT: .LBB9_2: ; %Flow ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/vopc_dpp.ll b/llvm/test/CodeGen/AMDGPU/vopc_dpp.ll index a6dcbb5bbd695..cec401c0a9713 100644 --- a/llvm/test/CodeGen/AMDGPU/vopc_dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/vopc_dpp.ll @@ -2,7 +2,7 @@ define amdgpu_cs void @_amdgpu_cs_main(i32 %0) { ; GFX11-LABEL: _amdgpu_cs_main: -; GFX11: v_cmp_eq_u32_e64_dpp s1, v1, v0 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11: v_cmp_eq_u32_e64_dpp s0, v1, v0 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 .entry: %1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 0, i32 0, i32 15, i32 15, i1 false) %2 = icmp ne i32 %1, %0 diff --git a/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll b/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll index 6133cb4690723..53172b7ce7ca7 100644 --- a/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll +++ b/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll @@ -30,7 +30,8 @@ define amdgpu_kernel void @foo(i1 %cmp1) { ; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:12 ; GFX906-NEXT: s_load_dword s2, s[4:5], 0x24 ; GFX906-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1c -; GFX906-NEXT: s_mov_b32 s4, 0 +; GFX906-NEXT: s_mov_b64 s[4:5], 0 +; GFX906-NEXT: s_mov_b64 s[6:7], exec ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: s_bitcmp1_b32 s2, 0 ; GFX906-NEXT: s_mul_i32 s0, s0, s1 @@ -38,9 +39,9 @@ define amdgpu_kernel void @foo(i1 %cmp1) { ; GFX906-NEXT: v_mad_u32_u24 v0, s0, v0, v1 ; GFX906-NEXT: v_add_lshl_u32 v2, v0, v2, 4 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 -; GFX906-NEXT: v_mov_b32_e32 v1, v0 -; GFX906-NEXT: s_cselect_b32 s5, 1, 0 -; GFX906-NEXT: s_mov_b64 s[2:3], exec +; GFX906-NEXT: v_mov_b32_e32 v1, 0 +; GFX906-NEXT: s_mov_b32 s2, 0 +; GFX906-NEXT: s_cselect_b32 s3, 1, 0 ; GFX906-NEXT: ds_write_b64 v2, v[0:1] ; GFX906-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 ; GFX906-NEXT: s_waitcnt vmcnt(3) @@ -59,13 +60,11 @@ define amdgpu_kernel void @foo(i1 %cmp1) { ; GFX906-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX906-NEXT: s_cbranch_execnz .LBB0_1 ; GFX906-NEXT: ; %bb.2: -; GFX906-NEXT: s_cmp_lg_u32 s5, 0 -; GFX906-NEXT: s_mov_b64 exec, s[2:3] -; GFX906-NEXT: s_cselect_b32 s5, 0x3ff00000, 0 -; GFX906-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] -; GFX906-NEXT: s_mov_b32 s5, s4 -; GFX906-NEXT: s_mov_b32 s6, s4 -; GFX906-NEXT: s_mov_b32 s7, s4 +; GFX906-NEXT: s_cmp_lg_u32 s3, 0 +; GFX906-NEXT: s_mov_b64 exec, s[6:7] +; GFX906-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; GFX906-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] +; GFX906-NEXT: s_mov_b64 s[6:7], s[4:5] ; GFX906-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX906-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll index deab407581880..360e987e2b43b 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -3398,38 +3398,38 @@ define amdgpu_gs void @wqm_init_exec() { ; GFX9-W64-LABEL: wqm_init_exec: ; GFX9-W64: ; %bb.0: ; %bb ; GFX9-W64-NEXT: s_mov_b64 exec, -1 -; GFX9-W64-NEXT: s_mov_b32 s0, 0 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-W64-NEXT: s_mov_b32 s1, s0 -; GFX9-W64-NEXT: s_mov_b32 s2, s0 -; GFX9-W64-NEXT: s_mov_b32 s3, s0 -; GFX9-W64-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-W64-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-W64-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-W64-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-W64-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-W64-NEXT: s_mov_b32 s4, 0 ; GFX9-W64-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec -; GFX9-W64-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $exec -; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-W64-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $exec +; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-W64-NEXT: ds_write_b32 v0, v1 ; GFX9-W64-NEXT: s_endpgm ; ; GFX10-W32-LABEL: wqm_init_exec: ; GFX10-W32: ; %bb.0: ; %bb ; GFX10-W32-NEXT: s_mov_b32 exec_lo, -1 -; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-W32-NEXT: s_mov_b32 s0, 0 +; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-W32-NEXT: s_mov_b32 s2, 0 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-W32-NEXT: s_mov_b32 s2, s0 -; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s1 -; GFX10-W32-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 +; GFX10-W32-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-W32-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-W32-NEXT: v_mov_b32_e32 v4, s0 -; GFX10-W32-NEXT: s_mov_b32 s1, s0 -; GFX10-W32-NEXT: s_mov_b32 s3, s0 +; GFX10-W32-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-W32-NEXT: v_mov_b32_e32 v5, s2 +; GFX10-W32-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-W32-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX10-W32-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; GFX10-W32-NEXT: ds_write_b32 v0, v4 +; GFX10-W32-NEXT: ds_write_b32 v4, v5 ; GFX10-W32-NEXT: s_endpgm bb: call void @llvm.amdgcn.init.exec(i64 -1)