diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index c0e88051dc427..7a897d2a2486a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -1784,6 +1784,40 @@ bool TargetLowering::SimplifyDemandedBits( } } + // Narrow shift to lower half - similar to ShrinkDemandedOp. + // (shl i64:x, K) -> (i64 zero_extend (shl (i32 (trunc i64:x)), K)) + unsigned HalfWidth = BitWidth / 2; + if ((BitWidth % 2) == 0 && !VT.isVector() && ShAmt < HalfWidth) { + EVT HalfVT = EVT::getIntegerVT(*TLO.DAG.getContext(), HalfWidth); + if (isNarrowingProfitable(VT, HalfVT) && + isTypeDesirableForOp(ISD::SHL, HalfVT) && + isTruncateFree(VT, HalfVT) && isZExtFree(HalfVT, VT) && + // The new SHL is created at HalfVT, so once operations have been + // legalized it is HalfVT (not VT) that must support SHL. + (!TLO.LegalOperations() || isOperationLegal(ISD::SHL, HalfVT))) { + // Unless we aren't demanding the upper bits at all, we must ensure + // that the upper bits of the shift result are known to be zero, + // which is equivalent to the narrow shift being NUW. 
+ KnownBits Known0 = TLO.DAG.computeKnownBits(Op0, Depth + 1); + bool IsNUW = Known0.countMinLeadingZeros() >= (ShAmt + HalfWidth); + if (IsNUW || DemandedBits.countLeadingZeros() >= HalfWidth) { + unsigned NumSignBits = TLO.DAG.ComputeNumSignBits(Op0, Depth + 1); + bool IsNSW = NumSignBits > (ShAmt + HalfWidth); + SDNodeFlags Flags; + Flags.setNoSignedWrap(IsNSW); + Flags.setNoUnsignedWrap(IsNUW); + SDValue NewOp = TLO.DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Op0); + SDValue NewShiftAmt = TLO.DAG.getShiftAmountConstant( + ShAmt, HalfVT, dl, TLO.LegalTypes()); + SDValue NewShift = TLO.DAG.getNode(ISD::SHL, dl, HalfVT, NewOp, + NewShiftAmt, Flags); + SDValue NewExt = + TLO.DAG.getNode(ISD::ZERO_EXTEND, dl, VT, NewShift); + return TLO.CombineTo(Op, NewExt); + } + } + } + APInt InDemandedMask = DemandedBits.lshr(ShAmt); if (SimplifyDemandedBits(Op0, InDemandedMask, DemandedElts, Known, TLO, Depth + 1)) diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll index 181b5b71bdd48..d7b9eebff77c0 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll @@ -32,7 +32,8 @@ define amdgpu_cs void @test_load_zext(i32 inreg %0, i32 inreg %1, i32 inreg %res ; the base may be the RHS operand of the load in SDAG. 
; GCN-LABEL: name: test_complex_reg_offset ; GCN-DAG: %[[BASE:.*]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @0 + 4, -; GCN-DAG: %[[OFFSET:.*]]:sreg_32 = S_LSHL_B32 +; SDAG-DAG: %[[OFFSET:.*]]:sreg_32 = nuw nsw S_LSHL_B32 +; GISEL-DAG: %[[OFFSET:.*]]:sreg_32 = S_LSHL_B32 ; SDAG: S_LOAD_DWORD_SGPR_IMM killed %[[BASE]], killed %[[OFFSET]], 0, 0 ; GISEL: S_LOAD_DWORD_SGPR_IMM %[[BASE]], %[[OFFSET]], 0, 0 define amdgpu_ps void @test_complex_reg_offset(ptr addrspace(1) %out) { diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index 1b8216f4aa2a6..81d2aa3e9510a 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -4669,49 +4669,49 @@ define amdgpu_kernel void @udiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: s_mov_b32 s0, s4 ; GFX6-NEXT: s_and_b32 s4, s6, 0x7fff ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s4 -; GFX6-NEXT: s_bfe_u32 s4, s8, 0xf000f ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 -; GFX6-NEXT: s_bfe_u32 s5, s6, 0xf000f ; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: s_bfe_u32 s4, s8, 0xf000f ; GFX6-NEXT: v_alignbit_b32 v2, s9, v2, 30 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s5 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 +; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_bfe_u32 s5, s6, 0xf000f ; GFX6-NEXT: v_alignbit_b32 v0, s7, v0, 30 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 -; GFX6-NEXT: v_mul_f32_e32 v1, v6, v7 +; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s5 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 ; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX6-NEXT: v_trunc_f32_e32 v1, v1 +; GFX6-NEXT: 
v_cmp_ge_f32_e64 vcc, |v3|, v1 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: v_mad_f32 v4, -v1, v5, v6 -; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v2 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v5 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX6-NEXT: v_mul_f32_e32 v1, v0, v6 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GFX6-NEXT: v_mul_f32_e32 v1, v6, v7 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v1 +; GFX6-NEXT: v_mad_f32 v6, -v1, v5, v6 +; GFX6-NEXT: v_cvt_u32_f32_e32 v7, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, v0, v4 +; GFX6-NEXT: v_trunc_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v1 ; GFX6-NEXT: v_mad_f32 v0, -v1, v2, v0 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v3 -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc -; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v4 +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v4, vcc ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 -; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v7, vcc +; GFX6-NEXT: v_and_b32_e32 v1, 0x1fff, v1 +; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; GFX6-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 +; GFX6-NEXT: v_and_b32_e32 v1, 0x7fff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 15, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_v3i15: @@ -4724,48 +4724,48 @@ define amdgpu_kernel void @udiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: s_and_b32 s1, s2, 0x7fff ; GFX9-NEXT: 
v_cvt_f32_u32_e32 v1, s1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: s_bfe_u32 s0, s2, 0xf000f -; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 -; GFX9-NEXT: s_bfe_u32 s1, s6, 0xf000f -; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_alignbit_b32 v3, s3, v3, 30 -; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4 ; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v3 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_bfe_u32 s1, s6, 0xf000f ; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, v7, v8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX9-NEXT: v_trunc_f32_e32 v1, v1 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_mad_f32 v5, -v1, v6, v7 -; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v3 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX9-NEXT: v_mul_f32_e32 v1, v0, v7 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 +; GFX9-NEXT: v_mul_f32_e32 v1, v7, v8 +; GFX9-NEXT: v_trunc_f32_e32 v1, v1 +; GFX9-NEXT: v_mad_f32 v7, -v1, v6, v7 +; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v1 +; GFX9-NEXT: v_mul_f32_e32 v1, v0, v5 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v1 +; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v1 ; GFX9-NEXT: v_mad_f32 v0, -v1, v3, v0 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v3 -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, 
vcc -; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v5, vcc ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 -; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v8, vcc +; GFX9-NEXT: v_and_b32_e32 v1, 0x1fff, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: global_store_short v2, v1, s[4:5] offset:4 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 15, v3 +; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: global_store_dword v2, v0, s[4:5] -; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX9-NEXT: global_store_short v2, v0, s[4:5] offset:4 ; GFX9-NEXT: s_endpgm %r = udiv <3 x i15> %x, %y store <3 x i15> %r, ptr addrspace(1) %out @@ -4850,63 +4850,63 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_alignbit_b32 v0, s7, v0, 30 -; GFX6-NEXT: s_and_b32 s7, s8, 0x7fff -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_and_b32 s10, s8, 0x7fff +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s10 ; GFX6-NEXT: s_and_b32 s5, s6, 0x7fff ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 -; GFX6-NEXT: s_bfe_u32 s5, s8, 0xf000f -; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s5 -; GFX6-NEXT: s_bfe_u32 s7, s6, 0xf000f +; GFX6-NEXT: v_alignbit_b32 v2, s9, v2, 30 +; GFX6-NEXT: s_bfe_u32 s9, s8, 0xf000f +; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s9 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 -; 
GFX6-NEXT: v_cvt_f32_u32_e32 v3, s7 -; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: v_alignbit_b32 v0, s7, v0, 30 +; GFX6-NEXT: s_bfe_u32 s7, s6, 0xf000f +; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s8 +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 -; GFX6-NEXT: v_alignbit_b32 v2, s9, v2, 30 -; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s6, v1 -; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v4, v2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v6, v2 ; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX6-NEXT: v_cvt_f32_u32_e32 v7, v0 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, s6, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 +; GFX6-NEXT: v_cvt_f32_u32_e32 v4, v0 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v6 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v4 ; GFX6-NEXT: v_mad_f32 v3, -v1, v5, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX6-NEXT: v_mul_f32_e32 v8, v4, v8 +; GFX6-NEXT: v_trunc_f32_e32 v8, v8 +; GFX6-NEXT: v_cvt_u32_f32_e32 v9, v8 +; GFX6-NEXT: v_mad_f32 v4, -v8, v6, v4 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v6 +; GFX6-NEXT: s_lshr_b32 s5, s8, 15 +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v9, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, v4, v2 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 -; GFX6-NEXT: v_mul_f32_e32 v3, v7, v8 -; GFX6-NEXT: v_trunc_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v3 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_mad_f32 v3, -v3, v4, v7 -; GFX6-NEXT: s_lshr_b32 s5, s8, 15 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s5 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 +; GFX6-NEXT: v_mul_lo_u32 v3, v1, s5 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: s_mov_b32 s0, s4 ; GFX6-NEXT: s_lshr_b32 s4, s6, 15 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v1 -; GFX6-NEXT: 
v_sub_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 -; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 -; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0x1fff, v1 +; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; GFX6-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 +; GFX6-NEXT: v_and_b32_e32 v1, 0x7fff, v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 15, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_v3i15: @@ -4915,60 +4915,60 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30 -; GFX9-NEXT: s_and_b32 s7, s0, 0x7fff -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s2, s6, 0x7fff -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2 -; GFX9-NEXT: s_bfe_u32 s2, s0, 0xf000f -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s2 +; GFX9-NEXT: s_and_b32 s3, s6, 0x7fff +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s3 +; GFX9-NEXT: s_and_b32 s8, s0, 0x7fff +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s8 +; GFX9-NEXT: s_bfe_u32 s3, s0, 0xf000f +; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s3 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_alignbit_b32 v3, s1, v3, 30 +; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, 
v5 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0xf000f +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4 ; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GFX9-NEXT: s_bfe_u32 s3, s6, 0xf000f -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v3 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v3 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX9-NEXT: v_mul_f32_e32 v4, v7, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc +; GFX9-NEXT: v_mul_f32_e32 v5, v7, v8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v9, v5 -; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: v_mad_f32 v7, -v4, v6, v7 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, v6 -; GFX9-NEXT: v_mul_f32_e32 v6, v8, v9 -; GFX9-NEXT: v_trunc_f32_e32 v6, v6 -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: v_mad_f32 v6, -v6, v5, v8 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v9, v4 +; GFX9-NEXT: v_trunc_f32_e32 v5, v5 +; GFX9-NEXT: v_mad_f32 v7, -v5, v6, v7 +; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX9-NEXT: v_mul_f32_e32 v9, v8, v9 +; GFX9-NEXT: v_trunc_f32_e32 v9, v9 +; GFX9-NEXT: v_cvt_u32_f32_e32 v10, v9 +; GFX9-NEXT: v_mad_f32 v8, -v9, v4, v8 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v8|, v4 ; GFX9-NEXT: s_lshr_b32 s1, s0, 15 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 -; GFX9-NEXT: v_mul_lo_u32 v4, v4, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v10, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v4, v3 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, v6 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s0 -; GFX9-NEXT: v_mul_lo_u32 v3, v5, v3 -; GFX9-NEXT: s_lshr_b32 s0, s6, 15 -; GFX9-NEXT: 
v_sub_u32_e32 v4, s0, v4 -; GFX9-NEXT: v_sub_u32_e32 v5, s6, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v4, v4, s1 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: s_lshr_b32 s2, s6, 15 +; GFX9-NEXT: v_sub_u32_e32 v5, s6, v1 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 -; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX9-NEXT: v_sub_u32_e32 v3, s2, v4 +; GFX9-NEXT: v_and_b32_e32 v1, 0x1fff, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: global_store_short v2, v1, s[4:5] offset:4 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 15, v3 +; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: global_store_dword v2, v0, s[4:5] -; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX9-NEXT: global_store_short v2, v0, s[4:5] offset:4 ; GFX9-NEXT: s_endpgm %r = urem <3 x i15> %x, %y store <3 x i15> %r, ptr addrspace(1) %out @@ -5093,35 +5093,35 @@ define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: v_mad_f32 v4, -v5, v2, v4 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 15 ; GFX6-NEXT: s_or_b32 s6, s4, 1 -; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, |v2| ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, v1 -; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX6-NEXT: s_cselect_b32 s4, s6, 0 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, s4, v5 -; GFX6-NEXT: v_cvt_f32_i32_e32 v5, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v2 +; GFX6-NEXT: v_cvt_f32_i32_e32 v4, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 30, v0 +; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 ; GFX6-NEXT: v_or_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, v5, v6 +; GFX6-NEXT: v_mul_f32_e32 v1, v4, v6 ; 
GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: v_mad_f32 v5, -v1, v2, v5 +; GFX6-NEXT: v_mad_f32 v4, -v1, v2, v4 ; GFX6-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v2| +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v3 +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v4 +; GFX6-NEXT: s_cselect_b32 s4, s6, 0 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 -; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v5 +; GFX6-NEXT: v_and_b32_e32 v1, 0x1fff, v1 +; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; GFX6-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 +; GFX6-NEXT: v_and_b32_e32 v1, 0x7fff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 15, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v3i15: @@ -5160,36 +5160,36 @@ define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 ; GFX9-NEXT: v_mad_f32 v5, -v6, v3, v5 ; GFX9-NEXT: s_or_b32 s2, s0, 1 -; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v3| ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30 -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s2, 0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 15 -; GFX9-NEXT: v_add_u32_e32 v5, s0, v6 -; GFX9-NEXT: v_cvt_f32_i32_e32 v6, v0 +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, 
v3 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v0 -; GFX9-NEXT: v_or_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, v6, v7 +; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 +; GFX9-NEXT: v_mul_f32_e32 v1, v5, v7 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_i32_f32_e32 v7, v1 -; GFX9-NEXT: v_mad_f32 v1, -v1, v3, v6 +; GFX9-NEXT: v_mad_f32 v1, -v1, v3, v5 +; GFX9-NEXT: v_or_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v3| ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: v_add_u32_e32 v0, v7, v0 -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v4 -; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v5 +; GFX9-NEXT: s_cselect_b32 s0, s2, 0 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 -; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX9-NEXT: v_add_u32_e32 v3, s0, v6 +; GFX9-NEXT: v_and_b32_e32 v1, 0x1fff, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: global_store_short v2, v1, s[4:5] offset:4 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 15, v3 +; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: global_store_dword v2, v0, s[4:5] -; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX9-NEXT: global_store_short v2, v0, s[4:5] offset:4 ; GFX9-NEXT: s_endpgm %r = sdiv <3 x i15> %x, %y store <3 x i15> %r, ptr addrspace(1) %out @@ -5325,40 +5325,40 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 15 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s6, v4 ; GFX6-NEXT: s_or_b32 s6, s4, 1 -; GFX6-NEXT: v_cvt_i32_f32_e32 v7, v7 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v6|, |v5| -; GFX6-NEXT: v_cvt_f32_i32_e32 v6, v2 -; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX6-NEXT: v_cvt_f32_i32_e32 v5, v2 ; GFX6-NEXT: v_and_b32_e32 v1, 0x7fff, v0 -; GFX6-NEXT: s_cselect_b32 s4, s6, 0 ; GFX6-NEXT: 
v_bfe_i32 v0, v0, 0, 15 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, s4, v7 -; GFX6-NEXT: v_cvt_f32_i32_e32 v7, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f32_i32_e32 v6, v0 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v5 ; GFX6-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 30, v0 -; GFX6-NEXT: v_or_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_mul_f32_e32 v2, v7, v8 +; GFX6-NEXT: v_cvt_i32_f32_e32 v7, v7 +; GFX6-NEXT: v_mul_f32_e32 v2, v6, v8 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 -; GFX6-NEXT: v_mad_f32 v7, -v2, v6, v7 +; GFX6-NEXT: v_mad_f32 v6, -v2, v5, v6 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6| +; GFX6-NEXT: v_or_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, |v5| ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, v5, s9 +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: s_cselect_b32 s4, s6, 0 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, v3 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s7, v5 -; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v7 +; GFX6-NEXT: v_mul_lo_u32 v2, v2, s9 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 -; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v4 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s7, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0x1fff, v1 +; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; GFX6-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v1, 0x7fff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 15, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_v3i15: @@ 
-5369,74 +5369,74 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s1, s6, 0xf0000 ; GFX9-NEXT: s_bfe_i32 s0, s2, 0xf0000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s0 -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s1 +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 +; GFX9-NEXT: v_cvt_f32_i32_e32 v6, s1 ; GFX9-NEXT: s_xor_b32 s0, s1, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: s_lshr_b32 s8, s6, 15 -; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 -; GFX9-NEXT: v_trunc_f32_e32 v6, v6 -; GFX9-NEXT: v_mad_f32 v5, -v6, v4, v5 -; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 +; GFX9-NEXT: v_mul_f32_e32 v7, v6, v7 +; GFX9-NEXT: v_trunc_f32_e32 v7, v7 +; GFX9-NEXT: v_mad_f32 v6, -v7, v5, v6 +; GFX9-NEXT: v_cvt_i32_f32_e32 v7, v7 ; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30 ; GFX9-NEXT: v_alignbit_b32 v1, s3, v1, 30 ; GFX9-NEXT: s_lshr_b32 s3, s2, 15 ; GFX9-NEXT: s_or_b32 s7, s0, 1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4| +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v6|, |v5| ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cselect_b32 s0, s7, 0 -; GFX9-NEXT: v_add_u32_e32 v4, s0, v6 +; GFX9-NEXT: v_add_u32_e32 v5, s0, v7 ; GFX9-NEXT: s_bfe_i32 s0, s2, 0xf000f -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 +; GFX9-NEXT: v_cvt_f32_i32_e32 v6, s0 ; GFX9-NEXT: s_bfe_i32 s1, s6, 0xf000f -; GFX9-NEXT: v_cvt_f32_i32_e32 v6, s1 +; GFX9-NEXT: v_cvt_f32_i32_e32 v7, s1 ; GFX9-NEXT: s_xor_b32 s0, s1, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v1 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 +; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v1 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 -; GFX9-NEXT: v_mul_f32_e32 v7, v6, v7 -; GFX9-NEXT: v_trunc_f32_e32 v7, v7 -; GFX9-NEXT: v_mad_f32 v6, -v7, v5, v6 -; GFX9-NEXT: 
v_cvt_i32_f32_e32 v7, v7 -; GFX9-NEXT: v_mul_lo_u32 v4, v4, s2 +; GFX9-NEXT: v_mul_f32_e32 v8, v7, v8 +; GFX9-NEXT: v_trunc_f32_e32 v8, v8 +; GFX9-NEXT: v_mad_f32 v7, -v8, v6, v7 +; GFX9-NEXT: v_mul_lo_u32 v5, v5, s2 ; GFX9-NEXT: s_or_b32 s2, s0, 1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v6|, |v5| +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v7|, |v6| ; GFX9-NEXT: v_cvt_f32_i32_e32 v6, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v0 +; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 15 +; GFX9-NEXT: v_cvt_f32_i32_e32 v7, v0 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v9, v6 +; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v0 +; GFX9-NEXT: v_cvt_i32_f32_e32 v8, v8 +; GFX9-NEXT: v_mul_f32_e32 v1, v7, v9 +; GFX9-NEXT: v_trunc_f32_e32 v1, v1 +; GFX9-NEXT: v_cvt_i32_f32_e32 v9, v1 +; GFX9-NEXT: v_mad_f32 v1, -v1, v6, v7 +; GFX9-NEXT: v_or_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v6| +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: v_add_u32_e32 v0, v9, v0 ; GFX9-NEXT: s_cselect_b32 s0, s2, 0 -; GFX9-NEXT: v_add_u32_e32 v5, s0, v7 -; GFX9-NEXT: v_bfe_i32 v7, v0, 0, 15 -; GFX9-NEXT: v_cvt_f32_i32_e32 v8, v7 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v9, v6 -; GFX9-NEXT: v_xor_b32_e32 v1, v7, v1 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 30, v1 -; GFX9-NEXT: v_or_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_mul_f32_e32 v7, v8, v9 -; GFX9-NEXT: v_trunc_f32_e32 v7, v7 -; GFX9-NEXT: v_cvt_i32_f32_e32 v9, v7 -; GFX9-NEXT: v_mad_f32 v7, -v7, v6, v8 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6| -; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, v5, s3 -; GFX9-NEXT: v_add_u32_e32 v1, v9, v1 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, v3 -; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX9-NEXT: v_sub_u32_e32 v3, s6, v4 -; GFX9-NEXT: v_sub_u32_e32 v4, s8, v5 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, v4 +; GFX9-NEXT: v_add_u32_e32 v1, s0, v8 +; 
GFX9-NEXT: v_mul_lo_u32 v4, v1, s3 +; GFX9-NEXT: v_sub_u32_e32 v5, s6, v5 +; GFX9-NEXT: v_sub_u32_e32 v0, v3, v0 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] +; GFX9-NEXT: v_sub_u32_e32 v3, s8, v4 +; GFX9-NEXT: v_and_b32_e32 v1, 0x1fff, v1 ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 -; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX9-NEXT: global_store_short v2, v1, s[4:5] offset:4 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 15, v3 +; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: global_store_dword v2, v0, s[4:5] -; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX9-NEXT: global_store_short v2, v0, s[4:5] offset:4 ; GFX9-NEXT: s_endpgm %r = srem <3 x i15> %x, %y store <3 x i15> %r, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index 5990736f664fb..b5a823355543c 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -37,44 +37,43 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr30_sgpr31, implicit-def dead $scc + ; GFX90A-NEXT: $vgpr22 = IMPLICIT_DEF + ; GFX90A-NEXT: $vgpr10 = IMPLICIT_DEF ; GFX90A-NEXT: $vgpr24 = IMPLICIT_DEF - ; GFX90A-NEXT: $agpr0 = IMPLICIT_DEF - ; GFX90A-NEXT: $vgpr26 = IMPLICIT_DEF + ; GFX90A-NEXT: $vgpr18 = IMPLICIT_DEF ; GFX90A-NEXT: $vgpr20 = IMPLICIT_DEF - ; GFX90A-NEXT: $vgpr22 = IMPLICIT_DEF ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.58, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr24, $sgpr33, $vgpr31, $agpr0, 
$vgpr26, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr56, $sgpr57, $sgpr20_sgpr21_sgpr22, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr2, $vgpr3, $vgpr20, $vgpr22 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr22, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr56, $sgpr57, $sgpr20_sgpr21_sgpr22, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr2, $vgpr3, $vgpr10, $vgpr24, $vgpr18, $vgpr20 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr23 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $agpr1 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr21 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr23 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr25 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr27 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.3.Flow17: ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.57(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr23, $sgpr33, $vgpr31, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr23, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, 
$sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr4 = V_AND_B32_e32 1023, $vgpr31, implicit $exec + ; GFX90A-NEXT: renamable $vgpr30 = V_AND_B32_e32 1023, $vgpr31, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr34_sgpr35, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCZ %bb.57, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4.bb15: ; GFX90A-NEXT: successors: %bb.35(0x40000000), %bb.5(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 2, $vgpr2_vgpr3, implicit $exec - ; GFX90A-NEXT: renamable $vgpr5 = COPY 
renamable $sgpr25, implicit $exec + ; GFX90A-NEXT: renamable $vgpr4 = COPY renamable $sgpr25, implicit $exec ; GFX90A-NEXT: renamable $vgpr46, renamable $vcc = V_ADD_CO_U32_e64 $sgpr24, $vgpr0, 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr47, dead renamable $vcc = V_ADDC_U32_e64 killed $vgpr5, killed $vgpr1, killed $vcc, 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr5 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr0 = V_LSHLREV_B32_e32 2, $vgpr4, implicit $exec + ; GFX90A-NEXT: renamable $vgpr47, dead renamable $vcc = V_ADDC_U32_e64 killed $vgpr4, killed $vgpr1, killed $vcc, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 2, $vgpr30, implicit $exec ; GFX90A-NEXT: renamable $vgpr40, renamable $vcc = V_ADD_CO_U32_e64 $vgpr46, killed $vgpr0, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr41, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr47, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr30_sgpr31, implicit-def dead $scc @@ -82,7 +81,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.5: ; GFX90A-NEXT: successors: %bb.6(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, 
$sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0 @@ -95,9 +94,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr62_vgpr63 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr60_vgpr61 = IMPLICIT_DEF @@ -105,32 +104,32 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr56_vgpr57 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr44_vgpr45 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr42_vgpr43 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr54 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $agpr1 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: {{ $}} 
; GFX90A-NEXT: bb.6.Flow20: ; GFX90A-NEXT: successors: %bb.7(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, 
$vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr21 = COPY renamable $sgpr17, implicit $exec + ; GFX90A-NEXT: renamable $vgpr19 = COPY renamable $sgpr17, implicit $exec + ; GFX90A-NEXT: renamable $vgpr18 = COPY $sgpr17, implicit $exec + ; GFX90A-NEXT: renamable $vgpr21 = COPY $sgpr17, implicit $exec ; GFX90A-NEXT: renamable $vgpr20 = COPY $sgpr17, implicit $exec ; GFX90A-NEXT: renamable $vgpr23 = COPY $sgpr17, implicit $exec ; GFX90A-NEXT: renamable $vgpr22 = COPY $sgpr17, implicit $exec ; GFX90A-NEXT: renamable $vgpr25 = COPY $sgpr17, implicit $exec ; GFX90A-NEXT: renamable $vgpr24 = COPY $sgpr17, implicit $exec - ; GFX90A-NEXT: renamable $vgpr27 = COPY $sgpr17, implicit $exec - ; GFX90A-NEXT: renamable $vgpr26 = COPY $sgpr17, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.7.Flow19: ; GFX90A-NEXT: successors: %bb.62(0x40000000), %bb.8(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, 
$vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable 
$sgpr56_sgpr57 = S_MOV_B64 0 ; GFX90A-NEXT: $sgpr24_sgpr25 = S_AND_SAVEEXEC_B64 $sgpr36_sgpr37, implicit-def $exec, implicit-def $scc, implicit $exec @@ -138,7 +137,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.8.Flow32: ; GFX90A-NEXT: successors: %bb.9(0x40000000), %bb.10(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; 
GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr24_sgpr25, implicit-def $scc ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr18_sgpr19, implicit-def $exec, implicit-def $scc, implicit $exec @@ -147,15 +146,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.9.bb89: ; GFX90A-NEXT: successors: %bb.10(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - 
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.10.Flow33: ; GFX90A-NEXT: successors: %bb.11(0x40000000), %bb.12(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, 
$sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr58_sgpr59, implicit-def $exec, implicit-def $scc, implicit $exec @@ -164,15 +163,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.11.bb84: ; GFX90A-NEXT: successors: %bb.12(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, 
$vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.12.Flow34: ; GFX90A-NEXT: successors: %bb.13(0x40000000), %bb.14(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, 
$vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr54_sgpr55, implicit-def $exec, implicit-def $scc, implicit $exec @@ -181,10 +180,10 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.13.bb79: ; GFX90A-NEXT: successors: %bb.14(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, 
$sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.14.Flow35: @@ -366,7 +365,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.35.bb20: ; GFX90A-NEXT: successors: %bb.37(0x40000000), %bb.36(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, 
$sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_SBYTE renamable $vgpr40_vgpr41, 1024, 0, implicit $exec :: (load (s8) from %ir.i21, addrspace 1) ; GFX90A-NEXT: renamable $vgpr42 = V_ADD_CO_U32_e32 1024, $vgpr40, implicit-def $vcc, implicit $exec @@ -383,37 +382,37 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr43, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_LT_I16_e64 0, killed $vgpr0, implicit $exec ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr62_vgpr63 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr60_vgpr61 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr58_vgpr59 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr56_vgpr57 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr44_vgpr45 = IMPLICIT_DEF - ; 
GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr54 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $agpr1 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr24_sgpr25 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.37, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.36.Flow21: ; GFX90A-NEXT: successors: %bb.6(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, 
$vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr24_sgpr25, implicit-def $scc ; GFX90A-NEXT: S_BRANCH %bb.6 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.37.bb27: ; GFX90A-NEXT: successors: %bb.39(0x40000000), %bb.38(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, 
$sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45, $sgpr42_sgpr43 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45, $sgpr42_sgpr43 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr40_vgpr41, 2048, 0, implicit $exec :: (load (s8) from %ir.i28, addrspace 1) ; GFX90A-NEXT: renamable $vgpr44 = V_ADD_CO_U32_e32 2048, $vgpr40, implicit-def $vcc, implicit $exec @@ -421,29 +420,29 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr45, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF ; 
GFX90A-NEXT: renamable $vgpr62_vgpr63 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr60_vgpr61 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr58_vgpr59 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr56_vgpr57 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr54 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $agpr1 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr38_sgpr39 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.39, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.38.Flow22: ; GFX90A-NEXT: successors: %bb.36(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, 
$vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr38_sgpr39, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_XOR_B64 $exec, -1, implicit-def dead $scc @@ -464,7 +463,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.39.bb34: ; GFX90A-NEXT: 
successors: %bb.41(0x40000000), %bb.40(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr40_vgpr41, 3072, 0, implicit $exec :: (load (s8) from %ir.i35, addrspace 1) ; GFX90A-NEXT: renamable $vgpr56 = V_ADD_CO_U32_e32 3072, $vgpr40, implicit-def $vcc, implicit $exec @@ -472,28 +471,28 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable 
$vgpr57, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr62_vgpr63 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr60_vgpr61 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr58_vgpr59 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr54 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $agpr1 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr40_sgpr41 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.41, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.40.Flow23: ; GFX90A-NEXT: successors: %bb.38(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, 
$sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, 
$sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr40_sgpr41, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_XOR_B64 $exec, -1, implicit-def dead $scc @@ -513,7 +512,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.41.bb41: ; GFX90A-NEXT: successors: %bb.46(0x40000000), %bb.42(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: 
renamable $vgpr58 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = COPY $vcc @@ -522,33 +521,33 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = COPY renamable $sgpr36_sgpr37 - ; GFX90A-NEXT: renamable $vgpr20, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr18, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr62_vgpr63 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr60_vgpr61 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr54 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $agpr1 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr42_sgpr43 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, 
implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.46, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.42.Flow24: ; GFX90A-NEXT: successors: %bb.40(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, 
$vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr42_sgpr43, implicit-def $scc - ; GFX90A-NEXT: renamable $vgpr59 = COPY killed renamable $vgpr20, implicit $exec + ; GFX90A-NEXT: renamable $vgpr59 = COPY killed renamable $vgpr18, implicit $exec ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_XOR_B64 $exec, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc @@ -565,7 +564,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.43.bb55: ; GFX90A-NEXT: successors: %bb.48(0x40000000), %bb.44(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, 
$vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr46_sgpr47 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr46_sgpr47 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: S_BITCMP1_B32 killed renamable $sgpr33, 16, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_CSELECT_B64 -1, 0, implicit killed $scc @@ -573,33 +572,33 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr62 = V_ADD_CO_U32_e32 6144, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr63, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr48_sgpr49, implicit-def dead $scc - ; GFX90A-NEXT: $agpr0 = IMPLICIT_DEF - ; GFX90A-NEXT: $vgpr14 = IMPLICIT_DEF + ; GFX90A-NEXT: $vgpr10 = IMPLICIT_DEF + ; GFX90A-NEXT: $vgpr12 = IMPLICIT_DEF ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.48, implicit $vcc ; GFX90A-NEXT: {{ $}} ; 
GFX90A-NEXT: bb.44: ; GFX90A-NEXT: successors: %bb.45(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr58, $vgpr57, $vgpr20, $vgpr61, $vgpr31, $vgpr63, $agpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $vgpr40, $vgpr62, $vgpr60, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr56, $vgpr47, $vgpr2, $vgpr3, $vgpr4, $vgpr46, $vgpr45, $vgpr44, $vgpr43, $vgpr42, $vgpr41, $vgpr14 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr58, $vgpr57, $vgpr18, $vgpr30, $vgpr31, $vgpr61, $vgpr63, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $vgpr40, $vgpr62, $vgpr60, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr56, $vgpr47, $vgpr2, $vgpr3, $vgpr46, $vgpr45, $vgpr44, $vgpr43, $vgpr42, $vgpr41, $vgpr10, $vgpr12 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37 - ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr54 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; 
GFX90A-NEXT: renamable $agpr1 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.45.Flow26: ; GFX90A-NEXT: successors: %bb.47(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, 
$sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_XOR_B64 $exec, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr70_sgpr71 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc @@ -615,7 +614,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.46.bb48: ; GFX90A-NEXT: successors: %bb.43(0x40000000), %bb.47(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, 
$vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr44_sgpr45, $sgpr52_sgpr53 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr44_sgpr45, $sgpr52_sgpr53 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr60 = V_ADD_CO_U32_e32 5120, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = COPY $vcc @@ -629,26 +628,26 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr61, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr18_sgpr19, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec ; GFX90A-NEXT: renamable $sgpr70_sgpr71 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr62_vgpr63 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr16 = 
IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr54 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $agpr1 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr18_sgpr19 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.43, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.47.Flow25: ; GFX90A-NEXT: successors: %bb.42(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, 
$vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr18_sgpr19, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_XOR_B64 $exec, -1, implicit-def dead $scc @@ -666,133 +665,135 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.48.bb63: ; GFX90A-NEXT: successors: %bb.50(0x40000000), %bb.49(0x40000000) - ; GFX90A-NEXT: liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, 
$sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr46_sgpr47 + ; GFX90A-NEXT: liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr46_sgpr47 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.50, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.49: ; GFX90A-NEXT: successors: %bb.44(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, 
$sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 -1 ; GFX90A-NEXT: S_BRANCH %bb.44 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.50.bb68: ; GFX90A-NEXT: successors: %bb.54(0x40000000), %bb.51(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, 
$sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 3, $vgpr4_vgpr5, implicit $exec + ; GFX90A-NEXT: renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 3, $vgpr30, implicit $exec + ; GFX90A-NEXT: renamable $vgpr1 = V_MOV_B32_e32 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr48_sgpr49, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.54, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.51: ; GFX90A-NEXT: successors: 
%bb.45(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37 - ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: 
renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr54 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $agpr1 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: S_BRANCH %bb.45 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.52.bb80: ; GFX90A-NEXT: successors: %bb.59(0x40000000), %bb.53(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; 
GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def dead $scc ; GFX90A-NEXT: S_CMP_EQ_U32 killed renamable $sgpr17, 0, implicit-def $scc - ; GFX90A-NEXT: renamable $vgpr8 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec - ; GFX90A-NEXT: renamable $vgpr9, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr1, killed $vcc, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr6 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec + ; GFX90A-NEXT: renamable $vgpr7, dead renamable $sgpr50_sgpr51 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.59, implicit killed $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.53: ; GFX90A-NEXT: successors: %bb.61(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, 
$sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37 - ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: 
renamable $vgpr16 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr54 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $agpr1 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: S_BRANCH %bb.61 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.54.bb73: ; GFX90A-NEXT: successors: %bb.52(0x40000000), %bb.55(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, 
$sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr5 = GLOBAL_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec :: (load (s8) from %ir.i74, addrspace 1) - ; GFX90A-NEXT: renamable $vgpr6 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec + ; GFX90A-NEXT: renamable $vgpr6 = GLOBAL_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec :: (load (s8) from %ir.i74, addrspace 1) + ; GFX90A-NEXT: renamable $vgpr4 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37 - ; GFX90A-NEXT: renamable $vgpr7, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr1, killed $vcc, 0, implicit $exec - ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr5, implicit $exec - ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr5, dead renamable $sgpr58_sgpr59 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec + ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr6, implicit $exec + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; 
GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr54 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $agpr1 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr60_sgpr61 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.52, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.55.Flow29: ; GFX90A-NEXT: successors: %bb.45(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, 
$vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr60_sgpr61, implicit-def $scc ; GFX90A-NEXT: S_BRANCH %bb.45 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.56.bb90: ; GFX90A-NEXT: successors: %bb.60(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr52_sgpr53, $sgpr56_sgpr57:0x000000000000000F, $sgpr60_sgpr61, 
$sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr54 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr64_sgpr65, implicit $exec - ; GFX90A-NEXT: renamable $vgpr5 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = DS_READ_B64_gfx9 killed renamable $vgpr5, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) - ; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr21, implicit $exec - ; GFX90A-NEXT: renamable $vgpr18_vgpr19 = DS_READ_B64_gfx9 killed renamable $vgpr5, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) - ; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr22, implicit $exec - ; GFX90A-NEXT: renamable $vgpr14_vgpr15 = DS_READ_B64_gfx9 killed renamable $vgpr5, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) - ; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr56, implicit $exec - ; GFX90A-NEXT: renamable $vgpr13 = V_ALIGNBIT_B32_e64 killed $sgpr57, killed $vgpr5, 1, implicit $exec - ; GFX90A-NEXT: renamable $vgpr30 = V_ALIGNBIT_B32_e64 $vgpr19, $vgpr18, 1, implicit $exec - ; GFX90A-NEXT: renamable $vgpr19 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec - ; GFX90A-NEXT: renamable $vgpr17 = V_ALIGNBIT_B32_e64 $vgpr17, $vgpr16, 1, implicit $exec + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, 
$sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr52_sgpr53, $sgpr56_sgpr57:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: renamable $vgpr53 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr64_sgpr65, implicit $exec + ; GFX90A-NEXT: renamable $vgpr10 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr14_vgpr15 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr21, implicit $exec + ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr22, implicit $exec + ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr56, implicit $exec + ; GFX90A-NEXT: renamable $vgpr11 = V_ALIGNBIT_B32_e64 killed $sgpr57, killed $vgpr10, 1, implicit $exec + ; GFX90A-NEXT: renamable $vgpr52 = V_ALIGNBIT_B32_e64 $vgpr17, $vgpr16, 1, 
implicit $exec + ; GFX90A-NEXT: renamable $vgpr17 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec + ; GFX90A-NEXT: renamable $vgpr15 = V_ALIGNBIT_B32_e64 $vgpr15, $vgpr14, 1, implicit $exec ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_XOR_B64 $exec, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.60 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.57: ; GFX90A-NEXT: successors: %bb.7(0x80000000) - ; GFX90A-NEXT: liveins: $exec:0x000000000000000F, $sgpr14, $sgpr15, $sgpr16, $sgpr17:0x0000000000000003, $sgpr23:0x0000000000000003, $vgpr31, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $exec:0x000000000000000F, $sgpr14, $sgpr15, $sgpr16, $sgpr17:0x0000000000000003, $sgpr23:0x0000000000000003, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr17 = COPY killed renamable $sgpr23, implicit $exec - ; GFX90A-NEXT: renamable $vgpr19 = COPY killed renamable $sgpr17, 
implicit $exec + ; GFX90A-NEXT: renamable $vgpr15 = COPY killed renamable $sgpr23, implicit $exec + ; GFX90A-NEXT: renamable $vgpr17 = COPY killed renamable $sgpr17, implicit $exec ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 @@ -803,9 +804,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr62_vgpr63 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr60_vgpr61 = IMPLICIT_DEF @@ -815,12 +816,12 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr42_vgpr43 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr40_vgpr41 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr46_vgpr47 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr16 = COPY renamable $vgpr17, implicit $exec - ; GFX90A-NEXT: renamable $vgpr30 = COPY renamable $vgpr17, implicit $exec - ; GFX90A-NEXT: renamable $vgpr18 = COPY renamable $vgpr17, implicit $exec - ; GFX90A-NEXT: renamable $vgpr54 = COPY renamable $vgpr19, implicit $exec - ; GFX90A-NEXT: renamable $vgpr15 = COPY renamable $vgpr17, implicit $exec - ; GFX90A-NEXT: renamable $vgpr14 = COPY renamable $vgpr17, implicit $exec + ; GFX90A-NEXT: renamable $vgpr14 = COPY renamable $vgpr15, implicit $exec + ; GFX90A-NEXT: renamable $vgpr52 = COPY renamable $vgpr15, implicit $exec + ; GFX90A-NEXT: renamable $vgpr16 = COPY renamable $vgpr15, implicit $exec + ; GFX90A-NEXT: renamable $vgpr53 = COPY renamable $vgpr17, 
implicit $exec + ; GFX90A-NEXT: renamable $vgpr13 = COPY renamable $vgpr15, implicit $exec + ; GFX90A-NEXT: renamable $vgpr12 = COPY renamable $vgpr15, implicit $exec ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 0 ; GFX90A-NEXT: S_BRANCH %bb.7 ; GFX90A-NEXT: {{ $}} @@ -829,15 +830,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr24_vgpr25 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr22_vgpr23 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr23, implicit $exec - ; GFX90A-NEXT: renamable $vgpr22_vgpr23 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.434, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr20_vgpr21 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.434, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr21, implicit $exec - ; GFX90A-NEXT: renamable $vgpr20_vgpr21 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr18_vgpr19 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY 
killed renamable $sgpr17, implicit $exec - ; GFX90A-NEXT: renamable $agpr0_agpr1 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.435, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.435, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr22, implicit $exec - ; GFX90A-NEXT: renamable $vgpr26_vgpr27 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr24_vgpr25 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr23 = S_MOV_B32 0 ; GFX90A-NEXT: renamable $sgpr17 = S_MOV_B32 0 @@ -845,38 +846,37 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.59.bb85: ; GFX90A-NEXT: successors: %bb.56(0x40000000), %bb.60(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, 
$vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr10 = V_OR_B32_e32 1, $vgpr8, implicit $exec - ; GFX90A-NEXT: renamable $vgpr11 = COPY renamable $vgpr9, implicit $exec - ; GFX90A-NEXT: renamable $vgpr5 = FLAT_LOAD_UBYTE renamable $vgpr10_vgpr11, 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i86) + ; GFX90A-NEXT: renamable $vgpr8 = V_OR_B32_e32 1, $vgpr6, implicit $exec + ; GFX90A-NEXT: renamable $vgpr9 = COPY renamable $vgpr7, implicit $exec + ; GFX90A-NEXT: renamable $vgpr10 = FLAT_LOAD_UBYTE renamable $vgpr8_vgpr9, 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i86) ; GFX90A-NEXT: renamable $sgpr17 = S_MOV_B32 0 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr5, implicit $exec + ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr10, implicit $exec ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = COPY 
renamable $sgpr36_sgpr37 - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr54 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr52_sgpr53 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.56, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.60.Flow31: ; GFX90A-NEXT: successors: %bb.61(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, 
$vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr52_sgpr53, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $vgpr12 = COPY renamable $vgpr16, implicit $exec - ; GFX90A-NEXT: renamable $agpr0_agpr1 = COPY killed renamable $vgpr12_vgpr13, implicit $exec + ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $vgpr14, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.61.Flow30: ; GFX90A-NEXT: successors: %bb.55(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, 
$agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, 
$vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_XOR_B64 $exec, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc @@ -888,7 +888,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.62.bb140: ; GFX90A-NEXT: successors: %bb.68(0x40000000), %bb.63(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, 
$vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def dead $scc @@ -896,122 +896,120 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.63.Flow13: ; GFX90A-NEXT: successors: %bb.64(0x40000000), %bb.66(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, 
$sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr36_sgpr37, 
implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.66, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.64.bb159: ; GFX90A-NEXT: successors: %bb.67(0x40000000), %bb.65(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, 
$vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vcc = V_CMP_NE_U32_e64 0, killed $vgpr4, implicit $exec + ; GFX90A-NEXT: renamable $vcc = V_CMP_NE_U32_e64 0, killed $vgpr30, implicit $exec ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.67, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.65.Flow10: ; GFX90A-NEXT: successors: %bb.66(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, 
$sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $sgpr12_sgpr13 = S_ANDN2_SAVEEXEC_B64 $sgpr12_sgpr13, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.66.Flow14: ; GFX90A-NEXT: successors: %bb.8(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, 
$vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = COPY $exec ; GFX90A-NEXT: S_BRANCH %bb.8 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.67.bb161: ; GFX90A-NEXT: successors: %bb.65(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, 
$sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr23, killed $vgpr25, implicit $exec - ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr27, implicit $exec - ; GFX90A-NEXT: renamable $vgpr3 = COPY killed renamable $agpr1, implicit $exec - ; GFX90A-NEXT: renamable $vgpr3 = V_OR_B32_e32 killed $vgpr3, killed $vgpr21, implicit $exec + ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr21, killed $vgpr23, implicit $exec + ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr25, implicit $exec + ; GFX90A-NEXT: renamable $vgpr3 = V_OR_B32_e32 killed $vgpr11, killed $vgpr19, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr3, killed $vgpr2, implicit $exec ; GFX90A-NEXT: renamable $vgpr3 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_sdwa 0, killed $vgpr54, 0, $vgpr3, 0, 0, 6, implicit $exec + ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_sdwa 0, killed $vgpr53, 0, $vgpr3, 0, 0, 6, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec - ; GFX90A-NEXT: renamable $vgpr4 = V_OR_B32_e32 killed $vgpr30, killed $vgpr15, implicit $exec - ; GFX90A-NEXT: renamable $vgpr2 = 
V_OR_B32_e32 killed $vgpr4, killed $vgpr2, implicit $exec - ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_sdwa 0, killed $vgpr19, 0, $vgpr3, 0, 0, 6, implicit $exec + ; GFX90A-NEXT: renamable $vgpr10 = V_OR_B32_e32 killed $vgpr52, killed $vgpr13, implicit $exec + ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr10, killed $vgpr2, implicit $exec + ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_sdwa 0, killed $vgpr17, 0, $vgpr3, 0, 0, 6, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec - ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr17, implicit $exec + ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr15, implicit $exec ; GFX90A-NEXT: DS_WRITE2_B32_gfx9 killed renamable $vgpr3, killed renamable $vgpr2, renamable $vgpr3, 0, 1, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 4, addrspace 3) ; GFX90A-NEXT: S_BRANCH %bb.65 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.68.bb174: ; GFX90A-NEXT: successors: %bb.72(0x40000000), %bb.69(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, 
$vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr28 = V_OR_B32_e32 1, $vgpr26, implicit $exec - ; GFX90A-NEXT: renamable $vgpr38 = V_OR_B32_e32 $vgpr28, $vgpr24, implicit $exec - ; GFX90A-NEXT: renamable $vgpr36 = V_OR_B32_e32 $vgpr38, $vgpr22, implicit $exec - ; GFX90A-NEXT: renamable $vgpr32 = V_CNDMASK_B32_e64 0, $vgpr36, 0, 0, $sgpr12_sgpr13, implicit $exec - ; GFX90A-NEXT: renamable $vgpr50 = V_OR_B32_e32 $vgpr32, $vgpr20, implicit $exec - ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = COPY renamable $agpr0_agpr1, implicit $exec - ; GFX90A-NEXT: renamable $vgpr48 = V_OR_B32_e32 $vgpr50, killed $vgpr12, implicit $exec - ; GFX90A-NEXT: renamable $vgpr34 = V_OR_B32_e32 $vgpr48, $vgpr14, implicit $exec - ; GFX90A-NEXT: renamable $vgpr52 = V_CNDMASK_B32_e64 0, 0, 0, $vgpr34, killed $sgpr12_sgpr13, implicit $exec + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, 
$vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: renamable $vgpr26 = V_OR_B32_e32 1, $vgpr24, implicit $exec + ; GFX90A-NEXT: renamable $vgpr38 = V_OR_B32_e32 $vgpr26, $vgpr22, implicit $exec + ; GFX90A-NEXT: renamable $vgpr34 = V_OR_B32_e32 $vgpr38, $vgpr20, implicit $exec + ; GFX90A-NEXT: renamable $vgpr28 = V_CNDMASK_B32_e64 0, $vgpr34, 0, 0, $sgpr12_sgpr13, implicit $exec + ; GFX90A-NEXT: renamable $vgpr36 = V_OR_B32_e32 $vgpr28, $vgpr18, implicit $exec + ; GFX90A-NEXT: renamable $vgpr48 = V_OR_B32_e32 $vgpr36, $vgpr10, implicit $exec + ; GFX90A-NEXT: renamable $vgpr32 = V_OR_B32_e32 $vgpr48, $vgpr12, implicit $exec + ; GFX90A-NEXT: renamable $vgpr50 = V_CNDMASK_B32_e64 0, 0, 0, $vgpr32, killed $sgpr12_sgpr13, implicit $exec ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr28_sgpr29, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.72, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.69.Flow: ; GFX90A-NEXT: successors: %bb.70(0x40000000), %bb.71(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, 
$sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, 
$vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.71, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.70.bb186: ; GFX90A-NEXT: successors: %bb.71(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, 
$vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr2_vgpr3 = V_LSHLREV_B64_e64 3, killed $vgpr2_vgpr3, implicit $exec - ; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr27, implicit $exec + ; GFX90A-NEXT: renamable $vgpr10 = COPY 
renamable $sgpr27, implicit $exec ; GFX90A-NEXT: renamable $vgpr2, renamable $vcc = V_ADD_CO_U32_e64 killed $sgpr26, $vgpr2, 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr3, dead renamable $vcc = V_ADDC_U32_e64 killed $vgpr5, killed $vgpr3, killed $vcc, 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr29 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr39 = COPY renamable $vgpr29, implicit $exec - ; GFX90A-NEXT: renamable $vgpr37 = COPY renamable $vgpr29, implicit $exec - ; GFX90A-NEXT: renamable $vgpr51 = COPY renamable $vgpr29, implicit $exec - ; GFX90A-NEXT: renamable $vgpr49 = COPY renamable $vgpr29, implicit $exec - ; GFX90A-NEXT: renamable $vgpr33 = COPY renamable $vgpr29, implicit $exec - ; GFX90A-NEXT: renamable $vgpr53 = COPY renamable $vgpr29, implicit $exec - ; GFX90A-NEXT: renamable $vgpr35 = COPY renamable $vgpr29, implicit $exec - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr29, renamable $vgpr28_vgpr29, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) - ; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr21, implicit $exec - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr5, killed renamable $vgpr38_vgpr39, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr3, dead renamable $vcc = V_ADDC_U32_e64 killed $vgpr10, killed $vgpr3, killed $vcc, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr27 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr39 = COPY renamable $vgpr27, implicit $exec + ; GFX90A-NEXT: renamable $vgpr35 = COPY renamable $vgpr27, implicit $exec + ; GFX90A-NEXT: renamable $vgpr37 = COPY renamable $vgpr27, implicit $exec + ; GFX90A-NEXT: renamable $vgpr49 = COPY renamable $vgpr27, implicit $exec + ; GFX90A-NEXT: renamable $vgpr29 = COPY renamable $vgpr27, implicit $exec + ; GFX90A-NEXT: renamable $vgpr51 = COPY renamable $vgpr27, implicit $exec + ; GFX90A-NEXT: renamable $vgpr33 = COPY renamable $vgpr27, implicit $exec + ; 
GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr27, renamable $vgpr26_vgpr27, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr21, implicit $exec + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr10, killed renamable $vgpr38_vgpr39, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) ; GFX90A-NEXT: renamable $vgpr12 = COPY killed renamable $sgpr22, implicit $exec - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr12, killed renamable $vgpr36_vgpr37, 0, 0, implicit $exec :: (store (s64) into %ir.8, addrspace 3) - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr29, killed renamable $vgpr50_vgpr51, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr5, killed renamable $vgpr48_vgpr49, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr29, killed renamable $vgpr32_vgpr33, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr5, killed renamable $vgpr52_vgpr53, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr29, killed renamable $vgpr34_vgpr35, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr12, killed renamable $vgpr34_vgpr35, 0, 0, implicit $exec :: (store (s64) into %ir.8, addrspace 3) + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr27, killed renamable $vgpr36_vgpr37, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr10, killed renamable $vgpr48_vgpr49, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr27, killed renamable $vgpr28_vgpr29, 0, 0, 
implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr10, killed renamable $vgpr50_vgpr51, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr27, killed renamable $vgpr32_vgpr33, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.71.Flow9: ; GFX90A-NEXT: successors: %bb.63(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, 
$vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 0 ; GFX90A-NEXT: S_BRANCH %bb.63 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.72.bb196: ; GFX90A-NEXT: successors: %bb.69(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, 
$vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, 
$vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr5 = V_OR_B32_e32 $vgpr52, killed $vgpr18, implicit $exec - ; GFX90A-NEXT: renamable $vgpr12 = V_OR_B32_e32 killed $vgpr5, killed $vgpr16, implicit $exec - ; GFX90A-NEXT: renamable $vgpr13 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr13, renamable $vgpr12_vgpr13, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr10 = V_OR_B32_e32 $vgpr50, killed $vgpr16, implicit $exec + ; GFX90A-NEXT: renamable $vgpr54 = V_OR_B32_e32 killed $vgpr10, killed $vgpr14, implicit $exec + ; GFX90A-NEXT: renamable $vgpr55 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr55, renamable $vgpr54_vgpr55, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_MOV_B64 0 ; GFX90A-NEXT: S_BRANCH %bb.69 bb: diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll index 73d5088141cdb..3207f9e61e314 100644 --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -449,14 +449,14 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: s_mov_b32 s5, s2 ; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] +; GCN-O0-NEXT: s_mov_b32 s4, 2 +; GCN-O0-NEXT: v_lshlrev_b32_e64 v3, s4, v1 ; GCN-O0-NEXT: s_mov_b32 s4, 0 ; GCN-O0-NEXT: ; implicit-def: $sgpr4 -; GCN-O0-NEXT: v_mov_b32_e32 v4, 0 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: 
v_mov_b32_e32 v2, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 -; GCN-O0-NEXT: s_mov_b32 s4, 2 -; GCN-O0-NEXT: v_lshl_b64 v[3:4], v[2:3], s4 +; GCN-O0-NEXT: v_mov_b32_e32 v2, 0 +; GCN-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v4, v2 ; GCN-O0-NEXT: v_mov_b32_e32 v2, 0 ; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[0:3], 0 addr64 ; GCN-O0-NEXT: s_mov_b32 s0, 1 @@ -684,15 +684,14 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 ; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b32 s0, 0 -; GCN-O0-NEXT: ; implicit-def: $sgpr0 -; GCN-O0-NEXT: v_mov_b32_e32 v4, 0 -; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 ; GCN-O0-NEXT: s_mov_b32 s0, 2 -; GCN-O0-NEXT: s_mov_b32 s1, s0 -; GCN-O0-NEXT: v_lshl_b64 v[3:4], v[2:3], s1 +; GCN-O0-NEXT: v_lshlrev_b32_e64 v3, s0, v1 +; GCN-O0-NEXT: s_mov_b32 s1, 0 +; GCN-O0-NEXT: ; implicit-def: $sgpr1 +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v2, 0 +; GCN-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v4, v2 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: s_mov_b32 s2, s4 ; GCN-O0-NEXT: v_mov_b32_e32 v2, v3 diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll index 64060ebbb159e..8382624af5d74 100644 --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -646,10 +646,8 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-LABEL: udiv16_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_mov_b32 s5, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_movk_i32 s6, 0x400 -; GFX9-NEXT: s_mov_b32 s7, 0 +; GFX9-NEXT: 
s_movk_i32 s4, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xffff ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 @@ -657,22 +655,19 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: .LBB4_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_and_b32 s4, 0xffff, s7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX9-NEXT: v_add_u16_e64 v3, s7, 1 -; GFX9-NEXT: v_readfirstlane_b32 s7, v3 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s6, v3 -; GFX9-NEXT: v_mul_f32_e32 v3, v4, v1 -; GFX9-NEXT: v_trunc_f32_e32 v3, v3 -; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v3 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[4:5], 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v3 +; GFX9-NEXT: v_add_u16_e32 v2, 1, v2 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s4, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 1, v3 +; GFX9-NEXT: v_mul_f32_e32 v5, v4, v1 +; GFX9-NEXT: v_trunc_f32_e32 v5, v5 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v5 +; GFX9-NEXT: v_mad_f32 v4, -v5, v0, v4 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, v0 +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[0:1], 0, v6, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s8, s2, s0 -; GFX9-NEXT: v_mad_f32 v3, -v3, v0, v4 -; GFX9-NEXT: s_addc_u32 s9, s3, s1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, v0 -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[0:1], 0, v5, s[0:1] -; GFX9-NEXT: global_store_short v2, v3, s[8:9] +; GFX9-NEXT: global_store_short v3, v4, s[2:3] ; GFX9-NEXT: s_cbranch_vccz .LBB4_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -683,30 +678,25 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: s_mov_b32 s1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s0, s4, 0xffff -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: 
v_cvt_f32_u32_e32 v0, s0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX10-NEXT: .LBB4_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_and_b32 s0, 0xffff, s4 -; GFX10-NEXT: v_add_nc_u16 v3, s4, 1 -; GFX10-NEXT: v_cvt_f32_u32_e32 v4, s0 -; GFX10-NEXT: s_lshl_b64 s[4:5], s[0:1], 1 -; GFX10-NEXT: s_add_u32 s6, s2, s4 -; GFX10-NEXT: v_readfirstlane_b32 s4, v3 -; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v3 -; GFX10-NEXT: v_mul_f32_e32 v3, v4, v1 -; GFX10-NEXT: s_addc_u32 s7, s3, s5 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v2 +; GFX10-NEXT: v_add_nc_u16 v2, v2, 1 +; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 1, v3 +; GFX10-NEXT: v_mul_f32_e32 v5, v4, v1 ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo -; GFX10-NEXT: v_trunc_f32_e32 v3, v3 -; GFX10-NEXT: v_mad_f32 v4, -v3, v0, v4 -; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX10-NEXT: v_trunc_f32_e32 v5, v5 +; GFX10-NEXT: v_mad_f32 v4, -v5, v0, v4 +; GFX10-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GFX10-NEXT: v_cmp_ge_f32_e64 s0, |v4|, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, 0, v3, s0 -; GFX10-NEXT: global_store_short v2, v3, s[6:7] +; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, 0, v5, s0 +; GFX10-NEXT: global_store_short v3, v4, s[2:3] ; GFX10-NEXT: s_cbranch_vccz .LBB4_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -717,36 +707,31 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s0, s4, 0xffff -; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX11-NEXT: .p2align 6 ; 
GFX11-NEXT: .LBB4_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_and_b32 s0, 0xffff, s4 -; GFX11-NEXT: v_add_nc_u16 v3, s4, 1 -; GFX11-NEXT: v_cvt_f32_u32_e32 v4, s0 -; GFX11-NEXT: s_lshl_b64 s[4:5], s[0:1], 1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: s_add_u32 s6, s2, s4 -; GFX11-NEXT: v_readfirstlane_b32 s4, v3 -; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v2 +; GFX11-NEXT: v_add_nc_u16 v2, v2, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f32_u32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 1, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v3, v4, v1 -; GFX11-NEXT: s_addc_u32 s7, s3, s5 +; GFX11-NEXT: v_mul_f32_e32 v5, v4, v1 ; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_trunc_f32_e32 v3, v3 -; GFX11-NEXT: v_fma_f32 v4, -v3, v0, v4 -; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX11-NEXT: v_trunc_f32_e32 v5, v5 +; GFX11-NEXT: v_fma_f32 v4, -v5, v0, v4 +; GFX11-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v4|, v0 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, 0, v3, s0 -; GFX11-NEXT: global_store_b16 v2, v3, s[6:7] +; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s0, 0, v5, s0 +; GFX11-NEXT: global_store_b16 v3, v4, s[2:3] ; GFX11-NEXT: s_cbranch_vccz .LBB4_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_nop 0 @@ -773,33 +758,31 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-LABEL: urem16_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; 
GFX9-NEXT: s_movk_i32 s7, 0x400 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_movk_i32 s5, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s6, s2, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; GFX9-NEXT: s_and_b32 s4, s2, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: .LBB5_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v0 -; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] -; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 -; GFX9-NEXT: v_mov_b32_e32 v7, s5 -; GFX9-NEXT: v_mul_f32_e32 v9, v8, v3 -; GFX9-NEXT: v_trunc_f32_e32 v9, v9 -; GFX9-NEXT: v_cvt_u32_f32_e32 v10, v9 -; GFX9-NEXT: v_mad_f32 v8, -v9, v2, v8 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v8|, v2 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s7, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v8, s[2:3], 0, v10, s[2:3] -; GFX9-NEXT: v_mul_lo_u32 v8, v8, s6 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1] -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v8 -; GFX9-NEXT: global_store_short v[5:6], v0, off +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v3 +; GFX9-NEXT: v_add_u16_e32 v2, 1, v2 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s5, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 1, v3 +; GFX9-NEXT: v_mul_f32_e32 v6, v4, v1 +; GFX9-NEXT: v_trunc_f32_e32 v6, v6 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v6 +; GFX9-NEXT: v_mad_f32 v4, -v6, v0, v4 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, v0 +; GFX9-NEXT: s_and_b64 vcc, exec, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[0:1], 0, v7, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v4, v4, s4 +; GFX9-NEXT: v_sub_u32_e32 v3, v3, v4 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_short v5, v3, s[2:3] ; GFX9-NEXT: s_cbranch_vccz .LBB5_1 ; GFX9-NEXT: ; %bb.2: 
; %bb2 ; GFX9-NEXT: s_endpgm @@ -809,30 +792,27 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s1, s4, 0xffff -; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s1 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; GFX10-NEXT: s_and_b32 s0, s4, 0xffff +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX10-NEXT: .LBB5_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX10-NEXT: v_add_nc_u16 v4, v4, 1 -; GFX10-NEXT: v_cvt_f32_u32_e32 v7, v0 -; GFX10-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] -; GFX10-NEXT: v_mul_f32_e32 v8, v7, v3 -; GFX10-NEXT: v_add_co_u32 v5, s0, s2, v5 -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, v6, s0 -; GFX10-NEXT: v_trunc_f32_e32 v8, v8 -; GFX10-NEXT: v_mad_f32 v7, -v8, v2, v7 -; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v7|, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v8, vcc_lo -; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v4 -; GFX10-NEXT: v_mul_lo_u32 v7, v7, s1 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v7 -; GFX10-NEXT: global_store_short v[5:6], v0, off +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v2 +; GFX10-NEXT: v_add_nc_u16 v2, v2, 1 +; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v3 +; GFX10-NEXT: v_mul_f32_e32 v5, v4, v1 +; GFX10-NEXT: v_trunc_f32_e32 v5, v5 +; GFX10-NEXT: v_mad_f32 v4, -v5, v0, v4 +; GFX10-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v4|, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v5, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 1, v3 +; GFX10-NEXT: v_mul_lo_u32 v4, v4, s0 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, v3, v4 
+; GFX10-NEXT: global_store_short v5, v3, s[2:3] ; GFX10-NEXT: s_cbranch_vccz .LBB5_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -840,38 +820,36 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-LABEL: urem16_invariant_denom: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s1, s4, 0xffff +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f32_u32_e32 v2, s1 -; GFX11-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB5_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX11-NEXT: v_add_nc_u16 v4, v4, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f32_u32_e32 v7, v0 -; GFX11-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v2 +; GFX11-NEXT: v_add_nc_u16 v2, v2, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f32_u32_e32 v4, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v8, v7, v3 -; GFX11-NEXT: v_add_co_u32 v5, s0, s2, v5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, v6, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_trunc_f32_e32 v8, v8 -; GFX11-NEXT: v_fma_f32 v7, -v8, v2, v7 -; GFX11-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v7|, v2 -; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v8, vcc_lo -; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_lo_u32 v7, v7, s1 -; GFX11-NEXT: v_sub_nc_u32_e32 v0, v0, v7 -; GFX11-NEXT: global_store_b16 v[5:6], v0, off +; GFX11-NEXT: v_mul_f32_e32 v5, v4, v1 +; GFX11-NEXT: v_trunc_f32_e32 v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f32 v4, -v5, v0, v4 +; GFX11-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX11-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v4|, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v5, vcc_lo +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 1, v3 +; GFX11-NEXT: v_mul_lo_u32 v4, v4, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_nc_u32_e32 v3, v3, v4 +; GFX11-NEXT: global_store_b16 v5, v3, s[0:1] ; GFX11-NEXT: s_cbranch_vccz .LBB5_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_nop 0 @@ -898,38 +876,35 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-LABEL: sdiv16_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_movk_i32 s5, 0x400 +; GFX9-NEXT: s_movk_i32 s3, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s4, s2 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GFX9-NEXT: s_mov_b32 s6, 0 +; GFX9-NEXT: s_sext_i32_i16 s2, s2 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: .LBB6_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_sext_i32_i16 s2, s6 -; 
GFX9-NEXT: v_cvt_f32_i32_e32 v4, s2 -; GFX9-NEXT: s_xor_b32 s7, s2, s4 -; GFX9-NEXT: s_ashr_i32 s2, s7, 30 -; GFX9-NEXT: s_or_b32 s2, s2, 1 -; GFX9-NEXT: v_mul_f32_e32 v5, v4, v1 -; GFX9-NEXT: v_trunc_f32_e32 v5, v5 -; GFX9-NEXT: v_mad_f32 v4, -v5, v0, v4 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[8:9], |v4|, |v0| -; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX9-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX9-NEXT: s_cselect_b32 s7, s2, 0 -; GFX9-NEXT: s_and_b32 s2, s6, 0xffff -; GFX9-NEXT: v_add_u16_e64 v3, s6, 1 -; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], 1 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s5, v3 -; GFX9-NEXT: s_add_u32 s8, s0, s8 -; GFX9-NEXT: v_readfirstlane_b32 s6, v3 -; GFX9-NEXT: v_add_u32_e32 v3, s7, v5 -; GFX9-NEXT: s_addc_u32 s9, s1, s9 -; GFX9-NEXT: global_store_short v2, v3, s[8:9] +; GFX9-NEXT: s_sext_i32_i16 s5, s4 +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s5 +; GFX9-NEXT: s_xor_b32 s6, s5, s2 +; GFX9-NEXT: s_ashr_i32 s5, s6, 30 +; GFX9-NEXT: s_or_b32 s5, s5, 1 +; GFX9-NEXT: v_mul_f32_e32 v4, v3, v1 +; GFX9-NEXT: v_trunc_f32_e32 v4, v4 +; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 +; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v3|, |v0| +; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GFX9-NEXT: v_add_u16_e64 v2, s4, 1 +; GFX9-NEXT: s_cselect_b32 s5, s5, 0 +; GFX9-NEXT: s_and_b32 s6, 0xffff, s4 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v2 +; GFX9-NEXT: v_readfirstlane_b32 s4, v2 +; GFX9-NEXT: v_add_u32_e32 v2, s5, v4 +; GFX9-NEXT: s_lshl_b32 s5, s6, 1 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: global_store_short v3, v2, s[0:1] ; GFX9-NEXT: s_cbranch_vccz .LBB6_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -939,36 +914,33 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_mov_b32 s1, 0 -; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sext_i32_i16 s4, s4 -; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX10-NEXT: s_sext_i32_i16 s0, s4 +; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX10-NEXT: .LBB6_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_sext_i32_i16 s0, s5 -; GFX10-NEXT: v_add_nc_u16 v3, s5, 1 -; GFX10-NEXT: v_cvt_f32_i32_e32 v4, s0 -; GFX10-NEXT: s_xor_b32 s0, s0, s4 -; GFX10-NEXT: s_ashr_i32 s0, s0, 30 -; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v3 -; GFX10-NEXT: v_mul_f32_e32 v5, v4, v1 -; GFX10-NEXT: s_or_b32 s0, s0, 1 -; GFX10-NEXT: v_trunc_f32_e32 v5, v5 -; GFX10-NEXT: v_mad_f32 v4, -v5, v0, v4 -; GFX10-NEXT: v_cmp_ge_f32_e64 s6, |v4|, |v0| -; GFX10-NEXT: v_cvt_i32_f32_e32 v4, v5 -; GFX10-NEXT: s_and_b32 s6, s6, exec_lo -; GFX10-NEXT: s_cselect_b32 s6, s0, 0 -; GFX10-NEXT: s_and_b32 s0, s5, 0xffff -; GFX10-NEXT: v_readfirstlane_b32 s5, v3 -; GFX10-NEXT: v_add_nc_u32_e32 v3, s6, v4 -; GFX10-NEXT: s_lshl_b64 s[6:7], s[0:1], 1 -; GFX10-NEXT: s_add_u32 s6, s2, s6 -; GFX10-NEXT: s_addc_u32 s7, s3, s7 -; GFX10-NEXT: global_store_short v2, v3, s[6:7] +; GFX10-NEXT: s_sext_i32_i16 s4, s1 +; GFX10-NEXT: v_add_nc_u16 v2, s1, 1 +; GFX10-NEXT: v_cvt_f32_i32_e32 v3, s4 +; GFX10-NEXT: s_xor_b32 s5, s4, s0 +; GFX10-NEXT: s_ashr_i32 s4, s5, 30 +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2 +; GFX10-NEXT: v_mul_f32_e32 v4, v3, v1 +; GFX10-NEXT: s_or_b32 s4, s4, 1 +; GFX10-NEXT: v_trunc_f32_e32 v4, v4 +; GFX10-NEXT: v_mad_f32 v3, -v4, v0, v3 +; GFX10-NEXT: v_cvt_i32_f32_e32 v4, v4 +; GFX10-NEXT: v_cmp_ge_f32_e64 s5, |v3|, |v0| +; GFX10-NEXT: s_and_b32 s5, s5, exec_lo +; GFX10-NEXT: s_cselect_b32 s4, s4, 0 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s1 +; GFX10-NEXT: v_readfirstlane_b32 s1, v2 +; GFX10-NEXT: s_lshl_b32 s5, s5, 1 +; GFX10-NEXT: v_add_nc_u32_e32 v2, s4, v4 +; GFX10-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-NEXT: global_store_short v3, v2, s[2:3] ; GFX10-NEXT: s_cbranch_vccz .LBB6_1 ; GFX10-NEXT: 
; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -978,43 +950,39 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_mov_b32 s3, 0 -; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sext_i32_i16 s4, s2 +; GFX11-NEXT: s_sext_i32_i16 s2, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB6_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_sext_i32_i16 s2, s5 -; GFX11-NEXT: v_add_nc_u16 v3, s5, 1 -; GFX11-NEXT: v_cvt_f32_i32_e32 v4, s2 -; GFX11-NEXT: s_xor_b32 s2, s2, s4 +; GFX11-NEXT: s_sext_i32_i16 s4, s3 +; GFX11-NEXT: v_add_nc_u16 v2, s3, 1 +; GFX11-NEXT: v_cvt_f32_i32_e32 v3, s4 +; GFX11-NEXT: s_xor_b32 s5, s4, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: s_ashr_i32 s2, s2, 30 -; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v3 +; GFX11-NEXT: s_ashr_i32 s4, s5, 30 +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v5, v4, v1 -; GFX11-NEXT: s_or_b32 s2, s2, 1 +; GFX11-NEXT: v_mul_f32_e32 v4, v3, v1 +; GFX11-NEXT: s_or_b32 s4, s4, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_trunc_f32_e32 v5, v5 -; GFX11-NEXT: v_fma_f32 v4, -v5, v0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_ge_f32_e64 s6, |v4|, |v0| -; GFX11-NEXT: v_cvt_i32_f32_e32 v4, v5 -; GFX11-NEXT: s_and_b32 s6, s6, exec_lo -; GFX11-NEXT: s_cselect_b32 s6, s2, 0 -; GFX11-NEXT: s_and_b32 s2, s5, 0xffff -; GFX11-NEXT: 
v_readfirstlane_b32 s5, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v3, s6, v4 -; GFX11-NEXT: s_lshl_b64 s[6:7], s[2:3], 1 +; GFX11-NEXT: v_trunc_f32_e32 v4, v4 +; GFX11-NEXT: v_fma_f32 v3, -v4, v0, v3 +; GFX11-NEXT: v_cvt_i32_f32_e32 v4, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_ge_f32_e64 s5, |v3|, |v0| +; GFX11-NEXT: s_and_b32 s5, s5, exec_lo +; GFX11-NEXT: s_cselect_b32 s4, s4, 0 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s3 +; GFX11-NEXT: v_readfirstlane_b32 s3, v2 +; GFX11-NEXT: s_lshl_b32 s5, s5, 1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_add_u32 s6, s0, s6 -; GFX11-NEXT: s_addc_u32 s7, s1, s7 -; GFX11-NEXT: global_store_b16 v2, v3, s[6:7] +; GFX11-NEXT: v_dual_mov_b32 v3, s5 :: v_dual_add_nc_u32 v2, s4, v4 +; GFX11-NEXT: global_store_b16 v3, v2, s[0:1] ; GFX11-NEXT: s_cbranch_vccz .LBB6_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_nop 0 @@ -1041,40 +1009,37 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-LABEL: srem16_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_movk_i32 s5, 0x400 +; GFX9-NEXT: s_movk_i32 s3, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s4, s2 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GFX9-NEXT: s_mov_b32 s6, 0 +; GFX9-NEXT: s_sext_i32_i16 s2, s2 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: .LBB7_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_sext_i32_i16 s7, s6 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s7 -; GFX9-NEXT: s_xor_b32 s2, s7, s4 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s2, s2, 1 -; GFX9-NEXT: v_mul_f32_e32 v5, v4, v1 -; GFX9-NEXT: v_trunc_f32_e32 v5, v5 -; GFX9-NEXT: v_mad_f32 v4, -v5, v0, v4 -; GFX9-NEXT: 
v_cvt_i32_f32_e32 v5, v5 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[8:9], |v4|, |v0| -; GFX9-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX9-NEXT: v_add_u16_e64 v3, s6, 1 -; GFX9-NEXT: s_cselect_b32 s8, s2, 0 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s5, v3 -; GFX9-NEXT: s_and_b32 s2, s6, 0xffff -; GFX9-NEXT: v_readfirstlane_b32 s6, v3 -; GFX9-NEXT: v_add_u32_e32 v3, s8, v5 -; GFX9-NEXT: v_mul_lo_u32 v3, v3, s4 -; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], 1 -; GFX9-NEXT: s_add_u32 s8, s0, s8 -; GFX9-NEXT: s_addc_u32 s9, s1, s9 -; GFX9-NEXT: v_sub_u32_e32 v3, s7, v3 -; GFX9-NEXT: global_store_short v2, v3, s[8:9] +; GFX9-NEXT: s_sext_i32_i16 s5, s4 +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s5 +; GFX9-NEXT: s_xor_b32 s6, s5, s2 +; GFX9-NEXT: s_ashr_i32 s6, s6, 30 +; GFX9-NEXT: s_or_b32 s8, s6, 1 +; GFX9-NEXT: v_mul_f32_e32 v4, v3, v1 +; GFX9-NEXT: v_trunc_f32_e32 v4, v4 +; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 +; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v3|, |v0| +; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GFX9-NEXT: v_add_u16_e64 v2, s4, 1 +; GFX9-NEXT: s_cselect_b32 s6, s8, 0 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v2 +; GFX9-NEXT: s_and_b32 s7, 0xffff, s4 +; GFX9-NEXT: v_readfirstlane_b32 s4, v2 +; GFX9-NEXT: v_add_u32_e32 v2, s6, v4 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 +; GFX9-NEXT: s_lshl_b32 s6, s7, 1 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 +; GFX9-NEXT: global_store_short v3, v2, s[0:1] ; GFX9-NEXT: s_cbranch_vccz .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -1084,38 +1049,36 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_mov_b32 s1, 0 -; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sext_i32_i16 s4, s4 -; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX10-NEXT: 
s_sext_i32_i16 s0, s4 +; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX10-NEXT: .LBB7_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_sext_i32_i16 s8, s5 -; GFX10-NEXT: v_add_nc_u16 v3, s5, 1 -; GFX10-NEXT: v_cvt_f32_i32_e32 v4, s8 -; GFX10-NEXT: s_xor_b32 s0, s8, s4 -; GFX10-NEXT: s_ashr_i32 s0, s0, 30 -; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v3 -; GFX10-NEXT: v_mul_f32_e32 v5, v4, v1 -; GFX10-NEXT: s_or_b32 s0, s0, 1 -; GFX10-NEXT: v_trunc_f32_e32 v5, v5 -; GFX10-NEXT: v_mad_f32 v4, -v5, v0, v4 -; GFX10-NEXT: v_cmp_ge_f32_e64 s6, |v4|, |v0| -; GFX10-NEXT: v_cvt_i32_f32_e32 v4, v5 +; GFX10-NEXT: s_sext_i32_i16 s4, s1 +; GFX10-NEXT: v_add_nc_u16 v2, s1, 1 +; GFX10-NEXT: v_cvt_f32_i32_e32 v3, s4 +; GFX10-NEXT: s_xor_b32 s5, s4, s0 +; GFX10-NEXT: s_ashr_i32 s5, s5, 30 +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2 +; GFX10-NEXT: v_mul_f32_e32 v4, v3, v1 +; GFX10-NEXT: s_or_b32 s5, s5, 1 +; GFX10-NEXT: v_trunc_f32_e32 v4, v4 +; GFX10-NEXT: v_mad_f32 v3, -v4, v0, v3 +; GFX10-NEXT: v_cmp_ge_f32_e64 s6, |v3|, |v0| +; GFX10-NEXT: v_cvt_i32_f32_e32 v3, v4 ; GFX10-NEXT: s_and_b32 s6, s6, exec_lo -; GFX10-NEXT: s_cselect_b32 s6, s0, 0 -; GFX10-NEXT: s_and_b32 s0, s5, 0xffff -; GFX10-NEXT: v_add_nc_u32_e32 v4, s6, v4 -; GFX10-NEXT: v_readfirstlane_b32 s5, v3 -; GFX10-NEXT: s_lshl_b64 s[6:7], s[0:1], 1 -; GFX10-NEXT: s_add_u32 s6, s2, s6 -; GFX10-NEXT: v_mul_lo_u32 v3, v4, s4 -; GFX10-NEXT: s_addc_u32 s7, s3, s7 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, s8, v3 -; GFX10-NEXT: global_store_short v2, v3, s[6:7] +; GFX10-NEXT: s_cselect_b32 s5, s5, 0 +; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo +; GFX10-NEXT: v_add_nc_u32_e32 v3, s5, v3 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s1 +; GFX10-NEXT: v_readfirstlane_b32 s1, v2 +; GFX10-NEXT: s_lshl_b32 s5, s5, 1 +; GFX10-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-NEXT: v_mul_lo_u32 v3, v3, s0 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, s4, v3 +; GFX10-NEXT: global_store_short v2, v3, 
s[2:3] ; GFX10-NEXT: s_cbranch_vccz .LBB7_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -1125,46 +1088,44 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_mov_b32 s3, 0 -; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sext_i32_i16 s4, s2 +; GFX11-NEXT: s_sext_i32_i16 s2, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB7_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_sext_i32_i16 s8, s5 -; GFX11-NEXT: v_add_nc_u16 v3, s5, 1 -; GFX11-NEXT: v_cvt_f32_i32_e32 v4, s8 -; GFX11-NEXT: s_xor_b32 s2, s8, s4 +; GFX11-NEXT: s_sext_i32_i16 s4, s3 +; GFX11-NEXT: v_add_nc_u16 v2, s3, 1 +; GFX11-NEXT: v_cvt_f32_i32_e32 v3, s4 +; GFX11-NEXT: s_xor_b32 s5, s4, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: s_ashr_i32 s2, s2, 30 -; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v3 +; GFX11-NEXT: s_ashr_i32 s5, s5, 30 +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v5, v4, v1 -; GFX11-NEXT: s_or_b32 s2, s2, 1 +; GFX11-NEXT: v_mul_f32_e32 v4, v3, v1 +; GFX11-NEXT: s_or_b32 s5, s5, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_trunc_f32_e32 v5, v5 -; GFX11-NEXT: v_fma_f32 v4, -v5, v0, v4 +; GFX11-NEXT: v_trunc_f32_e32 v4, v4 +; GFX11-NEXT: v_fma_f32 v3, -v4, v0, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_ge_f32_e64 s6, |v4|, |v0| -; GFX11-NEXT: v_cvt_i32_f32_e32 v4, v5 
+; GFX11-NEXT: v_cmp_ge_f32_e64 s6, |v3|, |v0| +; GFX11-NEXT: v_cvt_i32_f32_e32 v3, v4 ; GFX11-NEXT: s_and_b32 s6, s6, exec_lo -; GFX11-NEXT: s_cselect_b32 s6, s2, 0 -; GFX11-NEXT: s_and_b32 s2, s5, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_add_nc_u32_e32 v4, s6, v4 -; GFX11-NEXT: v_readfirstlane_b32 s5, v3 -; GFX11-NEXT: s_lshl_b64 s[6:7], s[2:3], 1 -; GFX11-NEXT: s_add_u32 s6, s0, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_lo_u32 v3, v4, s4 -; GFX11-NEXT: s_addc_u32 s7, s1, s7 -; GFX11-NEXT: v_sub_nc_u32_e32 v3, s8, v3 -; GFX11-NEXT: global_store_b16 v2, v3, s[6:7] +; GFX11-NEXT: s_cselect_b32 s5, s5, 0 +; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_add_nc_u32_e32 v3, s5, v3 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s3 +; GFX11-NEXT: v_readfirstlane_b32 s3, v2 +; GFX11-NEXT: s_lshl_b32 s5, s5, 1 +; GFX11-NEXT: v_mov_b32_e32 v2, s5 +; GFX11-NEXT: v_mul_lo_u32 v3, v3, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_nc_u32_e32 v3, s4, v3 +; GFX11-NEXT: global_store_b16 v2, v3, s[0:1] ; GFX11-NEXT: s_cbranch_vccz .LBB7_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index a462c19ce645d..6f61af0418636 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -28,138 +28,136 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_mov_b32 s32, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_and_b32_e32 v0, 
0xffff8000, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s35 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, s34, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[1:2] -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff8000, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s35 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s34, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, 3 +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v1, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x800 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v4, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x1000 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x1800 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[11:12], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc +; GFX8-NEXT: flat_load_dwordx2 v[11:12], v[3:4] ; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[5:6] ; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[7:8] ; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10] ; GFX8-NEXT: s_movk_i32 s0, 0x2000 -; GFX8-NEXT: v_add_u32_e32 v13, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v14, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v13, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v14, vcc, 0, v4, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x2800 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v1, vcc 
+; GFX8-NEXT: v_add_u32_e32 v15, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v4, vcc ; GFX8-NEXT: flat_load_dwordx2 v[13:14], v[13:14] ; GFX8-NEXT: flat_load_dwordx2 v[15:16], v[15:16] ; GFX8-NEXT: s_movk_i32 s0, 0x3000 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v4, vcc ; GFX8-NEXT: flat_load_dwordx2 v[17:18], v[17:18] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x3800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x3800, v3 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GFX8-NEXT: flat_load_dwordx2 v[3:4], v[3:4] ; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v11 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v11 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v6, v12, vcc ; GFX8-NEXT: s_waitcnt vmcnt(5) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v8, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(4) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v9, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v10, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(3) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v13, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v13, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v14, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v15, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v15, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v16, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v17, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v17, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v18, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc -; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, 
v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc +; GFX8-NEXT: flat_store_dwordx2 v[1:2], v[3:4] ; GFX8-NEXT: s_endpgm ; -; GFX900-LABEL: clmem_read_simplified: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX900-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX900-NEXT: s_mov_b32 s38, -1 -; GFX900-NEXT: s_mov_b32 s39, 0xe00000 -; GFX900-NEXT: s_add_u32 s36, s36, s3 -; GFX900-NEXT: s_addc_u32 s37, s37, 0 -; GFX900-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 -; GFX900-NEXT: s_getpc_b64 s[0:1] -; GFX900-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX900-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX900-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX900-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX900-NEXT: v_mov_b32_e32 v31, v0 -; GFX900-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: s_mov_b32 s32, 0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX900-NEXT: v_and_b32_e32 v1, 0xff, v0 -; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX900-NEXT: v_and_b32_e32 v18, 0xffff8000, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, s35 -; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, s34, v18 -; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v0, vcc -; GFX900-NEXT: v_lshlrev_b64 v[0:1], 3, v[1:2] -; GFX900-NEXT: s_movk_i32 s1, 0x2000 -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc -; GFX900-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX900-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2048 -; GFX900-NEXT: v_add_co_u32_e32 v6, vcc, s1, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc -; GFX900-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:-4096 -; GFX900-NEXT: s_movk_i32 s0, 0x1000 -; GFX900-NEXT: v_add_co_u32_e32 v10, vcc, s0, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v1, vcc -; GFX900-NEXT: global_load_dwordx2 v[12:13], 
v[10:11], off offset:2048 -; GFX900-NEXT: global_load_dwordx2 v[14:15], v[6:7], off -; GFX900-NEXT: global_load_dwordx2 v[16:17], v[6:7], off offset:2048 -; GFX900-NEXT: s_movk_i32 s0, 0x3000 -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX900-NEXT: global_load_dwordx2 v[6:7], v[0:1], off -; GFX900-NEXT: global_load_dwordx2 v[10:11], v[0:1], off offset:2048 -; GFX900-NEXT: s_waitcnt vmcnt(6) -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v4, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v3, vcc -; GFX900-NEXT: s_waitcnt vmcnt(5) -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v9, v1, vcc -; GFX900-NEXT: s_waitcnt vmcnt(4) -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v12, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v13, v1, vcc -; GFX900-NEXT: s_waitcnt vmcnt(3) -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v14, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v15, v1, vcc -; GFX900-NEXT: s_waitcnt vmcnt(2) -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v16, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v17, v1, vcc -; GFX900-NEXT: s_waitcnt vmcnt(1) -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v1, vcc -; GFX900-NEXT: global_store_dwordx2 v18, v[0:1], s[34:35] -; GFX900-NEXT: s_endpgm +; GFX9-LABEL: clmem_read_simplified: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, 
_Z13get_global_idj@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff8000, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s34, v18 +; GFX9-NEXT: v_mov_b32_e32 v3, 3 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_movk_i32 s1, 0x2000 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2048 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, s1, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:-4096 +; GFX9-NEXT: s_movk_i32 s0, 0x1000 +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[12:13], v[10:11], off offset:2048 +; GFX9-NEXT: global_load_dwordx2 v[14:15], v[6:7], off +; GFX9-NEXT: global_load_dwordx2 v[16:17], v[6:7], off offset:2048 +; GFX9-NEXT: s_movk_i32 s0, 0x3000 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[10:11], v[0:1], off offset:2048 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v3, vcc +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v9, v1, vcc +; 
GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v12, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v13, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v14, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v15, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v16, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v17, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v1, vcc +; GFX9-NEXT: global_store_dwordx2 v18, v[0:1], s[34:35] +; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: clmem_read_simplified: ; GFX10: ; %bb.0: ; %entry @@ -181,15 +179,14 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_mov_b32 s32, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 7, v0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX10-NEXT: v_and_b32_e32 v20, 0xffff8000, v2 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] -; GFX10-NEXT: v_add_co_u32 v2, s0, s34, v20 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s35, 0, s0 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, 3 +; GFX10-NEXT: v_and_b32_e32 v20, 0xffff8000, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_add_co_u32 v1, s0, s34, v20 +; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, s35, 0, s0 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, 0x1000 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, 
vcc_lo ; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v0, 0x2000 @@ -234,77 +231,6 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) { ; GFX10-NEXT: global_store_dwordx2 v20, v[0:1], s[34:35] ; GFX10-NEXT: s_endpgm ; -; GFX90A-LABEL: clmem_read_simplified: -; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX90A-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX90A-NEXT: s_mov_b32 s38, -1 -; GFX90A-NEXT: s_mov_b32 s39, 0xe00000 -; GFX90A-NEXT: s_add_u32 s36, s36, s3 -; GFX90A-NEXT: s_addc_u32 s37, s37, 0 -; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 -; GFX90A-NEXT: s_getpc_b64 s[0:1] -; GFX90A-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX90A-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX90A-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX90A-NEXT: v_mov_b32_e32 v31, v0 -; GFX90A-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_mov_b32 s32, 0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX90A-NEXT: v_and_b32_e32 v2, 0xff, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX90A-NEXT: v_and_b32_e32 v18, 0xffff8000, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s35 -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s34, v18 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v0, vcc -; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 3, v[2:3] -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v1, vcc -; GFX90A-NEXT: s_movk_i32 s1, 0x2000 -; GFX90A-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2048 -; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, s1, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc -; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:-4096 -; GFX90A-NEXT: s_movk_i32 s0, 0x1000 -; GFX90A-NEXT: v_add_co_u32_e32 v10, vcc, 
s0, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v1, vcc -; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[10:11], off offset:2048 -; GFX90A-NEXT: global_load_dwordx2 v[14:15], v[6:7], off -; GFX90A-NEXT: global_load_dwordx2 v[16:17], v[6:7], off offset:2048 -; GFX90A-NEXT: s_movk_i32 s0, 0x3000 -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: global_load_dwordx2 v[6:7], v[0:1], off -; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[0:1], off offset:2048 -; GFX90A-NEXT: s_waitcnt vmcnt(6) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v2 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v3, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(5) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v9, v1, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(4) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v12, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v13, v1, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(3) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v14, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v15, v1, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(2) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v16, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v17, v1, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v1, vcc -; GFX90A-NEXT: global_store_dwordx2 v18, v[0:1], s[34:35] -; GFX90A-NEXT: s_endpgm -; ; GFX11-LABEL: clmem_read_simplified: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_getpc_b64 s[2:3] @@ -316,17 +242,17 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) { ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 7, v0 +; GFX11-NEXT: 
v_lshlrev_b32_e32 v1, 7, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v16, 0xffff8000, v2 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff8000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v2, s0, s34, v16 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s35, 0, s0 +; GFX11-NEXT: v_add_co_u32 v1, s0, s34, v16 +; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, s35, 0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b64 v[2:3], v[0:1], off ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2048 @@ -436,98 +362,97 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_mov_b32 s32, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 17, v0 -; GFX8-NEXT: v_lshlrev_b64 v[1:2], 3, v[1:2] -; GFX8-NEXT: v_and_b32_e32 v0, 0xfe000000, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, s35 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, s34, v1 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v2, v3, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 17, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xfe000000, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, s35 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 
s34, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v2, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x5000 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: s_movk_i32 s0, 0x7f ; GFX8-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB1_2 Depth 2 +; GFX8-NEXT: v_mov_b32_e32 v7, v3 ; GFX8-NEXT: v_mov_b32_e32 v6, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: s_mov_b32 s1, 0 ; GFX8-NEXT: .LBB1_2: ; %for.body ; GFX8-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0xffffb000, v5 -; GFX8-NEXT: v_addc_u32_e32 v8, vcc, -1, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0xffffb800, v5 -; GFX8-NEXT: v_addc_u32_e32 v10, vcc, -1, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0xffffc000, v5 -; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[7:8] -; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10] -; GFX8-NEXT: v_addc_u32_e32 v12, vcc, -1, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0xffffc800, v5 -; GFX8-NEXT: v_addc_u32_e32 v14, vcc, -1, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0xffffd000, v5 -; GFX8-NEXT: flat_load_dwordx2 v[11:12], v[11:12] -; GFX8-NEXT: flat_load_dwordx2 v[13:14], v[13:14] -; GFX8-NEXT: v_addc_u32_e32 v16, vcc, -1, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xffffd800, v5 -; GFX8-NEXT: v_addc_u32_e32 v18, vcc, -1, v6, vcc -; GFX8-NEXT: flat_load_dwordx2 v[15:16], v[15:16] -; GFX8-NEXT: flat_load_dwordx2 v[17:18], v[17:18] -; GFX8-NEXT: v_add_u32_e32 v19, vcc, 0xffffe000, v5 -; GFX8-NEXT: v_addc_u32_e32 v20, vcc, -1, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v21, vcc, 0xffffe800, v5 -; GFX8-NEXT: flat_load_dwordx2 v[19:20], v[19:20] -; GFX8-NEXT: v_addc_u32_e32 v22, vcc, -1, v6, vcc -; GFX8-NEXT: 
flat_load_dwordx2 v[21:22], v[21:22] -; GFX8-NEXT: v_add_u32_e32 v23, vcc, 0xfffff000, v5 -; GFX8-NEXT: v_addc_u32_e32 v24, vcc, -1, v6, vcc -; GFX8-NEXT: flat_load_dwordx2 v[23:24], v[23:24] -; GFX8-NEXT: v_add_u32_e32 v25, vcc, 0xfffff800, v5 -; GFX8-NEXT: v_addc_u32_e32 v26, vcc, -1, v6, vcc -; GFX8-NEXT: flat_load_dwordx2 v[25:26], v[25:26] -; GFX8-NEXT: flat_load_dwordx2 v[27:28], v[5:6] -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x10000, v5 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0xffffb000, v6 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, -1, v7, vcc +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0xffffb800, v6 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, -1, v7, vcc +; GFX8-NEXT: v_add_u32_e32 v12, vcc, 0xffffc000, v6 +; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[8:9] +; GFX8-NEXT: flat_load_dwordx2 v[10:11], v[10:11] +; GFX8-NEXT: v_addc_u32_e32 v13, vcc, -1, v7, vcc +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xffffc800, v6 +; GFX8-NEXT: v_addc_u32_e32 v15, vcc, -1, v7, vcc +; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xffffd000, v6 +; GFX8-NEXT: flat_load_dwordx2 v[12:13], v[12:13] +; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[14:15] +; GFX8-NEXT: v_addc_u32_e32 v17, vcc, -1, v7, vcc +; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0xffffd800, v6 +; GFX8-NEXT: v_addc_u32_e32 v19, vcc, -1, v7, vcc +; GFX8-NEXT: flat_load_dwordx2 v[16:17], v[16:17] +; GFX8-NEXT: flat_load_dwordx2 v[18:19], v[18:19] +; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xffffe000, v6 +; GFX8-NEXT: v_addc_u32_e32 v21, vcc, -1, v7, vcc +; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0xffffe800, v6 +; GFX8-NEXT: flat_load_dwordx2 v[20:21], v[20:21] +; GFX8-NEXT: v_addc_u32_e32 v23, vcc, -1, v7, vcc +; GFX8-NEXT: flat_load_dwordx2 v[22:23], v[22:23] +; GFX8-NEXT: v_add_u32_e32 v24, vcc, 0xfffff000, v6 +; GFX8-NEXT: v_addc_u32_e32 v25, vcc, -1, v7, vcc +; GFX8-NEXT: flat_load_dwordx2 v[24:25], v[24:25] +; GFX8-NEXT: v_add_u32_e32 v26, vcc, 0xfffff800, v6 +; GFX8-NEXT: v_addc_u32_e32 v27, vcc, -1, v7, vcc +; 
GFX8-NEXT: flat_load_dwordx2 v[26:27], v[26:27] +; GFX8-NEXT: flat_load_dwordx2 v[28:29], v[6:7] +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x10000, v6 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GFX8-NEXT: s_addk_i32 s1, 0x2000 ; GFX8-NEXT: s_cmp_gt_u32 s1, 0x3fffff ; GFX8-NEXT: s_waitcnt vmcnt(10) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v7, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v8, v4 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v9, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(9) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v9, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v10, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v10, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v11, v4, vcc ; GFX8-NEXT: s_waitcnt vmcnt(8) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v11, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v12, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v12, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v13, v4, vcc ; GFX8-NEXT: s_waitcnt vmcnt(7) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v13, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v14, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v15, v4, vcc ; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v15, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v16, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v16, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v17, v4, vcc ; GFX8-NEXT: s_waitcnt vmcnt(5) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v17, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v18, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v18, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v19, v4, vcc ; GFX8-NEXT: s_waitcnt vmcnt(4) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v19, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v20, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v20, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v21, v4, vcc ; GFX8-NEXT: s_waitcnt vmcnt(3) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v21, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v22, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 
v22, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v23, v4, vcc ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v23, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v24, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v24, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v25, v4, vcc ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v25, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v26, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v26, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v27, v4, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v27, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v28, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v28, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v29, v5, vcc ; GFX8-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX8-NEXT: ; %bb.3: ; %while.cond.loopexit ; GFX8-NEXT: ; in Loop: Header=BB1_1 Depth=1 @@ -538,10 +463,10 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_mov_b32 s0, s1 ; GFX8-NEXT: s_branch .LBB1_1 ; GFX8-NEXT: .LBB1_5: ; %while.end -; GFX8-NEXT: v_mov_b32_e32 v1, s35 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v2, s35 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v1 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; GFX8-NEXT: s_endpgm ; ; GFX900-LABEL: clmem_read: @@ -565,14 +490,12 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX900-NEXT: v_and_b32_e32 v1, 0xff, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 17, v0 -; GFX900-NEXT: v_lshlrev_b64 v[1:2], 3, v[1:2] ; GFX900-NEXT: v_and_b32_e32 v0, 0xfe000000, v0 -; GFX900-NEXT: v_or_b32_e32 v1, v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, s35 +; GFX900-NEXT: v_lshl_or_b32 v1, v1, 3, v0 +; GFX900-NEXT: 
v_mov_b32_e32 v2, s35 ; GFX900-NEXT: v_add_co_u32_e32 v1, vcc, s34, v1 -; GFX900-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v3, vcc +; GFX900-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX900-NEXT: s_movk_i32 s0, 0x5000 ; GFX900-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 @@ -682,17 +605,15 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_mov_b32 s32, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 17, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 17, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_movk_i32 s1, 0x7f -; GFX10-NEXT: v_lshlrev_b64 v[1:2], 3, v[1:2] -; GFX10-NEXT: v_and_b32_e32 v0, 0xfe000000, v0 -; GFX10-NEXT: v_or_b32_e32 v1, v1, v0 -; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v1, s34 -; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, s35, v2, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v0, 0xfe000000, v1 +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 3, v0 +; GFX10-NEXT: v_add_co_u32 v1, s0, v1, s34 +; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, 0, s35, s0 ; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, 0x5000, v1 ; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo ; GFX10-NEXT: .LBB1_1: ; %for.cond.preheader @@ -796,15 +717,13 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: s_mov_b32 s32, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX90A-NEXT: v_and_b32_e32 v2, 0xff, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_and_b32_e32 v1, 0xff, v0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 17, v0 ; GFX90A-NEXT: v_and_b32_e32 v0, 0xfe000000, v0 -; GFX90A-NEXT: v_lshlrev_b64 v[2:3], 3, v[2:3] -; GFX90A-NEXT: v_or_b32_e32 v1, v2, v0 +; GFX90A-NEXT: v_lshl_or_b32 v1, v1, 3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v2, s35 ; 
GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, s34, v1 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v2, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v2, vcc ; GFX90A-NEXT: s_movk_i32 s0, 0x5000 ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, s0, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc @@ -903,20 +822,18 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v1, 0xff, v0 -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 17, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 17, v0 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0xff, v0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_movk_i32 s1, 0x7f -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshlrev_b64 v[1:2], 3, v[1:2] -; GFX11-NEXT: v_and_b32_e32 v0, 0xfe000000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, 0xfe000000, v1 +; GFX11-NEXT: v_lshl_or_b32 v1, v2, 3, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v1, v1, v0 -; GFX11-NEXT: v_add_co_u32 v1, vcc_lo, v1, s34 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, s35, v2, vcc_lo +; GFX11-NEXT: v_add_co_u32 v1, s0, v1, s34 +; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, 0, s35, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v1, vcc_lo, 0x5000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo ; GFX11-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX11-NEXT: ; =>This Loop Header: Depth=1 @@ -1135,39 +1052,38 @@ define amdgpu_kernel void @Address32(ptr 
addrspace(1) %buffer) { ; GFX8-NEXT: s_mov_b32 s32, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s35 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, s34, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[1:2] -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff8000, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s35 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s34, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, 2 +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v1, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x400 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v4, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x800 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc ; GFX8-NEXT: s_movk_i32 s0, 0xc00 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x1000 -; GFX8-NEXT: v_add_u32_e32 v11, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v11, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v4, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x1400 -; GFX8-NEXT: v_add_u32_e32 v13, vcc, s0, v0 -; 
GFX8-NEXT: v_addc_u32_e32 v14, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v13, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v14, vcc, 0, v4, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x1800 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v15, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v4, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x1c00 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v4, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x2000 -; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: flat_load_dword v19, v[5:6] ; GFX8-NEXT: flat_load_dword v7, v[7:8] ; GFX8-NEXT: flat_load_dword v8, v[9:10] @@ -1175,90 +1091,89 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) { ; GFX8-NEXT: flat_load_dword v10, v[13:14] ; GFX8-NEXT: flat_load_dword v11, v[15:16] ; GFX8-NEXT: flat_load_dword v12, v[17:18] -; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x2400, v0 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x2400, v3 ; GFX8-NEXT: flat_load_dword v5, v[5:6] -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GFX8-NEXT: flat_load_dword v3, v[3:4] ; GFX8-NEXT: s_waitcnt vmcnt(8) -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v19, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v19, v0 ; GFX8-NEXT: s_waitcnt vmcnt(7) -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v7, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 ; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v8, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v8, v0 ; GFX8-NEXT: s_waitcnt vmcnt(5) -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v9, v1 +; 
GFX8-NEXT: v_add_u32_e32 v0, vcc, v9, v0 ; GFX8-NEXT: s_waitcnt vmcnt(4) -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v10, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v10, v0 ; GFX8-NEXT: s_waitcnt vmcnt(3) -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v11, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v11, v0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v12, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v12, v0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v5, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: flat_store_dword v[3:4], v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: flat_store_dword v[1:2], v0 ; GFX8-NEXT: s_endpgm ; -; GFX900-LABEL: Address32: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX900-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX900-NEXT: s_mov_b32 s38, -1 -; GFX900-NEXT: s_mov_b32 s39, 0xe00000 -; GFX900-NEXT: s_add_u32 s36, s36, s3 -; GFX900-NEXT: s_addc_u32 s37, s37, 0 -; GFX900-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 -; GFX900-NEXT: s_getpc_b64 s[0:1] -; GFX900-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX900-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX900-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX900-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX900-NEXT: v_mov_b32_e32 v31, v0 -; GFX900-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: s_mov_b32 s32, 0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX900-NEXT: v_and_b32_e32 v1, 0xff, v0 -; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX900-NEXT: v_and_b32_e32 v4, 0xffff8000, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, s35 -; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, s34, v4 -; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v0, vcc -; GFX900-NEXT: v_lshlrev_b64 v[0:1], 2, v[1:2] -; GFX900-NEXT: 
s_movk_i32 s0, 0x1000 -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v1, vcc -; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX900-NEXT: global_load_dword v5, v[0:1], off -; GFX900-NEXT: global_load_dword v6, v[0:1], off offset:1024 -; GFX900-NEXT: global_load_dword v7, v[0:1], off offset:2048 -; GFX900-NEXT: global_load_dword v8, v[0:1], off offset:3072 -; GFX900-NEXT: global_load_dword v9, v[2:3], off -; GFX900-NEXT: global_load_dword v10, v[2:3], off offset:1024 -; GFX900-NEXT: global_load_dword v11, v[2:3], off offset:2048 -; GFX900-NEXT: global_load_dword v12, v[2:3], off offset:3072 -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX900-NEXT: global_load_dword v2, v[0:1], off -; GFX900-NEXT: global_load_dword v3, v[0:1], off offset:1024 -; GFX900-NEXT: s_waitcnt vmcnt(8) -; GFX900-NEXT: v_add_u32_e32 v0, v6, v5 -; GFX900-NEXT: s_waitcnt vmcnt(6) -; GFX900-NEXT: v_add3_u32 v0, v7, v0, v8 -; GFX900-NEXT: s_waitcnt vmcnt(4) -; GFX900-NEXT: v_add3_u32 v0, v9, v0, v10 -; GFX900-NEXT: s_waitcnt vmcnt(2) -; GFX900-NEXT: v_add3_u32 v0, v11, v0, v12 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add3_u32 v0, v2, v0, v3 -; GFX900-NEXT: global_store_dword v4, v0, s[34:35] -; GFX900-NEXT: s_endpgm +; GFX9-LABEL: Address32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; 
GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff8000, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s34, v4 +; GFX9-NEXT: v_mov_b32_e32 v3, 2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_movk_i32 s0, 0x1000 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v5, v[0:1], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:1024 +; GFX9-NEXT: global_load_dword v7, v[0:1], off offset:2048 +; GFX9-NEXT: global_load_dword v8, v[0:1], off offset:3072 +; GFX9-NEXT: global_load_dword v9, v[2:3], off +; GFX9-NEXT: global_load_dword v10, v[2:3], off offset:1024 +; GFX9-NEXT: global_load_dword v11, v[2:3], off offset:2048 +; GFX9-NEXT: global_load_dword v12, v[2:3], off offset:3072 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v2, v[0:1], off +; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:1024 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_add_u32_e32 v0, v6, v5 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_add3_u32 v0, v7, v0, v8 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_add3_u32 v0, v9, v0, v10 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add3_u32 v0, v11, v0, v12 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add3_u32 v0, v2, v0, v3 +; GFX9-NEXT: global_store_dword v4, v0, s[34:35] +; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: Address32: ; GFX10: ; %bb.0: ; %entry @@ -1280,15 
+1195,14 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_mov_b32 s32, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 7, v0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff8000, v2 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] -; GFX10-NEXT: v_add_co_u32 v2, s0, s34, v8 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s35, 0, s0 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, 2 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff8000, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_add_co_u32 v1, s0, s34, v8 +; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, s35, 0, s0 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0x800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, 0x1000 @@ -1327,64 +1241,6 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) { ; GFX10-NEXT: global_store_dword v8, v0, s[34:35] ; GFX10-NEXT: s_endpgm ; -; GFX90A-LABEL: Address32: -; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX90A-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX90A-NEXT: s_mov_b32 s38, -1 -; GFX90A-NEXT: s_mov_b32 s39, 0xe00000 -; GFX90A-NEXT: s_add_u32 s36, s36, s3 -; GFX90A-NEXT: s_addc_u32 s37, s37, 0 -; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 -; GFX90A-NEXT: s_getpc_b64 s[0:1] -; GFX90A-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX90A-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX90A-NEXT: s_mov_b64 s[0:1], 
s[36:37] -; GFX90A-NEXT: v_mov_b32_e32 v31, v0 -; GFX90A-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_mov_b32 s32, 0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX90A-NEXT: v_and_b32_e32 v2, 0xff, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff8000, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s35 -; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, s34, v4 -; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v0, vcc -; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 2, v[2:3] -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v1, vcc -; GFX90A-NEXT: s_movk_i32 s0, 0x1000 -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: global_load_dword v6, v[0:1], off offset:1024 -; GFX90A-NEXT: global_load_dword v7, v[0:1], off offset:2048 -; GFX90A-NEXT: global_load_dword v8, v[0:1], off offset:3072 -; GFX90A-NEXT: global_load_dword v9, v[2:3], off -; GFX90A-NEXT: global_load_dword v10, v[2:3], off offset:1024 -; GFX90A-NEXT: global_load_dword v11, v[2:3], off offset:2048 -; GFX90A-NEXT: global_load_dword v12, v[2:3], off offset:3072 -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: global_load_dword v2, v[0:1], off -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:1024 -; GFX90A-NEXT: s_waitcnt vmcnt(8) -; GFX90A-NEXT: v_add_u32_e32 v0, v6, v5 -; GFX90A-NEXT: s_waitcnt vmcnt(6) -; GFX90A-NEXT: v_add3_u32 v0, v7, v0, v8 -; GFX90A-NEXT: s_waitcnt vmcnt(4) -; GFX90A-NEXT: v_add3_u32 v0, v9, v0, v10 -; GFX90A-NEXT: s_waitcnt vmcnt(2) -; GFX90A-NEXT: v_add3_u32 v0, v11, v0, v12 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add3_u32 v0, v2, v0, v3 -; GFX90A-NEXT: global_store_dword v4, v0, s[34:35] -; GFX90A-NEXT: 
s_endpgm -; ; GFX11-LABEL: Address32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_getpc_b64 s[2:3] @@ -1396,17 +1252,17 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) { ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 7, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff8000, v2 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff8000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v2, s0, s34, v6 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s35, 0, s0 +; GFX11-NEXT: v_add_co_u32 v1, s0, s34, v6 +; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, s35, 0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v7, v[0:1], off ; GFX11-NEXT: global_load_b32 v8, v[0:1], off offset:1024 @@ -1513,89 +1369,87 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_mov_b32 s32, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s35 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, s34, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[1:2] -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GFX8-NEXT: 
v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff8000, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s35 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s34, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, 3 +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v1, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc ; GFX8-NEXT: s_movk_i32 s0, 0xf000 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v4, vcc ; GFX8-NEXT: s_movk_i32 s0, 0xf800 -; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[0:1] +; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[3:4] ; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[5:6] -; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc ; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 1, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0, v3 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 1, v4, vcc +; GFX8-NEXT: flat_load_dwordx2 v[3:4], v[3:4] ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v7 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v7 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v6, v8, vcc ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v9, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v10, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc -; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 
v3, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc +; GFX8-NEXT: flat_store_dwordx2 v[1:2], v[3:4] ; GFX8-NEXT: s_endpgm ; -; GFX900-LABEL: Offset64: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX900-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX900-NEXT: s_mov_b32 s38, -1 -; GFX900-NEXT: s_mov_b32 s39, 0xe00000 -; GFX900-NEXT: s_add_u32 s36, s36, s3 -; GFX900-NEXT: s_addc_u32 s37, s37, 0 -; GFX900-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 -; GFX900-NEXT: s_getpc_b64 s[0:1] -; GFX900-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX900-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX900-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX900-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX900-NEXT: v_mov_b32_e32 v31, v0 -; GFX900-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: s_mov_b32 s32, 0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX900-NEXT: v_and_b32_e32 v1, 0xff, v0 -; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX900-NEXT: v_and_b32_e32 v12, 0xffff8000, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, s35 -; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, s34, v12 -; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v0, vcc -; GFX900-NEXT: v_lshlrev_b64 v[0:1], 3, v[1:2] -; GFX900-NEXT: s_movk_i32 s0, 0xf000 -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc -; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, 0, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, 1, v1, vcc -; GFX900-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX900-NEXT: global_load_dwordx2 v[6:7], v[4:5], off offset:-4096 -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX900-NEXT: global_load_dwordx2 v[8:9], v[4:5], off -; GFX900-NEXT: global_load_dwordx2 v[10:11], v[0:1], off offset:2048 -; GFX900-NEXT: s_waitcnt vmcnt(2) -; 
GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v6, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v3, vcc -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v1, vcc -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v9, v1, vcc -; GFX900-NEXT: global_store_dwordx2 v12, v[0:1], s[34:35] -; GFX900-NEXT: s_endpgm +; GFX9-LABEL: Offset64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff8000, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s34, v12 +; GFX9-NEXT: v_mov_b32_e32 v3, 3 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 1, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[4:5], off offset:-4096 +; GFX9-NEXT: s_movk_i32 s0, 0xf000 +; GFX9-NEXT: 
v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[8:9], v[4:5], off +; GFX9-NEXT: global_load_dwordx2 v[10:11], v[0:1], off offset:2048 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v3, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v9, v1, vcc +; GFX9-NEXT: global_store_dwordx2 v12, v[0:1], s[34:35] +; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: Offset64: ; GFX10: ; %bb.0: ; %entry @@ -1617,15 +1471,14 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_mov_b32 s32, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 7, v0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX10-NEXT: v_and_b32_e32 v12, 0xffff8000, v2 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] -; GFX10-NEXT: v_add_co_u32 v2, s0, s34, v12 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s35, 0, s0 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, 3 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff8000, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_add_co_u32 v1, s0, s34, v12 +; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, s35, 0, s0 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, 0xfffff800 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_clause 0x1 @@ -1648,56 +1501,6 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) 
%buffer) { ; GFX10-NEXT: global_store_dwordx2 v12, v[0:1], s[34:35] ; GFX10-NEXT: s_endpgm ; -; GFX90A-LABEL: Offset64: -; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX90A-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX90A-NEXT: s_mov_b32 s38, -1 -; GFX90A-NEXT: s_mov_b32 s39, 0xe00000 -; GFX90A-NEXT: s_add_u32 s36, s36, s3 -; GFX90A-NEXT: s_addc_u32 s37, s37, 0 -; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 -; GFX90A-NEXT: s_getpc_b64 s[0:1] -; GFX90A-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX90A-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX90A-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX90A-NEXT: v_mov_b32_e32 v31, v0 -; GFX90A-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_mov_b32 s32, 0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX90A-NEXT: v_and_b32_e32 v2, 0xff, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX90A-NEXT: v_and_b32_e32 v12, 0xffff8000, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s35 -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s34, v12 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v0, vcc -; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 3, v[2:3] -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v1, vcc -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 1, v1, vcc -; GFX90A-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX90A-NEXT: global_load_dwordx2 v[6:7], v[4:5], off offset:-4096 -; GFX90A-NEXT: s_movk_i32 s0, 0xf000 -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[4:5], off -; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[0:1], off offset:2048 -; GFX90A-NEXT: s_waitcnt vmcnt(2) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 
v6, v2 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v3, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v1, vcc -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v9, v1, vcc -; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[34:35] -; GFX90A-NEXT: s_endpgm -; ; GFX11-LABEL: Offset64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_getpc_b64 s[2:3] @@ -1709,17 +1512,17 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 7, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v8, 0xffff8000, v2 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff8000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v2, s0, s34, v8 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s35, 0, s0 +; GFX11-NEXT: v_add_co_u32 v1, s0, s34, v8 +; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, s35, 0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, 0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 1, v1, vcc_lo @@ -1794,81 +1597,80 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_mov_b32 s32, 0 ; 
GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s35 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, s34, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[1:2] -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff8000, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s35 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s34, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, 2 +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v1, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc ; GFX8-NEXT: s_mov_b32 s0, 0x7ffff800 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v4, vcc ; GFX8-NEXT: s_mov_b32 s0, 0x7ffffc00 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v7, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: flat_load_dword v5, v[5:6] ; GFX8-NEXT: flat_load_dword v6, v[7:8] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x80000000, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x80000000, v3 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GFX8-NEXT: flat_load_dword v3, v[3:4] ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v5, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; GFX8-NEXT: 
s_waitcnt vmcnt(1) -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v6, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: flat_store_dword v[3:4], v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: flat_store_dword v[1:2], v0 ; GFX8-NEXT: s_endpgm ; -; GFX900-LABEL: p32Offset64: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX900-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX900-NEXT: s_mov_b32 s38, -1 -; GFX900-NEXT: s_mov_b32 s39, 0xe00000 -; GFX900-NEXT: s_add_u32 s36, s36, s3 -; GFX900-NEXT: s_addc_u32 s37, s37, 0 -; GFX900-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 -; GFX900-NEXT: s_getpc_b64 s[0:1] -; GFX900-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX900-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX900-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX900-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX900-NEXT: v_mov_b32_e32 v31, v0 -; GFX900-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: s_mov_b32 s32, 0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX900-NEXT: v_and_b32_e32 v1, 0xff, v0 -; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX900-NEXT: v_and_b32_e32 v6, 0xffff8000, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, s35 -; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, s34, v6 -; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v0, vcc -; GFX900-NEXT: v_lshlrev_b64 v[0:1], 2, v[1:2] -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc -; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, 0x7ffff000, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, 0x80000000, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX900-NEXT: global_load_dword v7, v[0:1], off -; GFX900-NEXT: global_load_dword v8, v[2:3], off 
offset:2048 -; GFX900-NEXT: global_load_dword v9, v[2:3], off offset:3072 -; GFX900-NEXT: global_load_dword v10, v[4:5], off -; GFX900-NEXT: s_waitcnt vmcnt(2) -; GFX900-NEXT: v_add_u32_e32 v0, v8, v7 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add3_u32 v0, v9, v0, v10 -; GFX900-NEXT: global_store_dword v6, v0, s[34:35] -; GFX900-NEXT: s_endpgm +; GFX9-LABEL: p32Offset64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff8000, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s34, v6 +; GFX9-NEXT: v_mov_b32_e32 v3, 2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_mov_b32 s0, 0x7ffff000 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 0x80000000, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v7, v[0:1], off +; GFX9-NEXT: global_load_dword v8, v[2:3], off 
offset:2048 +; GFX9-NEXT: global_load_dword v9, v[2:3], off offset:3072 +; GFX9-NEXT: global_load_dword v10, v[4:5], off +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, v8, v7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add3_u32 v0, v9, v0, v10 +; GFX9-NEXT: global_store_dword v6, v0, s[34:35] +; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: p32Offset64: ; GFX10: ; %bb.0: ; %entry @@ -1890,15 +1692,14 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_mov_b32 s32, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 7, v0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff8000, v2 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] -; GFX10-NEXT: v_add_co_u32 v2, s0, s34, v4 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s35, 0, s0 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, 2 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff8000, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_add_co_u32 v1, s0, s34, v4 +; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, s35, 0, s0 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, 0x80000000 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: global_load_dword v5, v[0:1], off @@ -1915,51 +1716,6 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1) %buffer) { ; GFX10-NEXT: global_store_dword v4, v0, s[34:35] ; GFX10-NEXT: s_endpgm ; -; GFX90A-LABEL: p32Offset64: -; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX90A-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX90A-NEXT: s_mov_b32 s38, -1 -; 
GFX90A-NEXT: s_mov_b32 s39, 0xe00000 -; GFX90A-NEXT: s_add_u32 s36, s36, s3 -; GFX90A-NEXT: s_addc_u32 s37, s37, 0 -; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 -; GFX90A-NEXT: s_getpc_b64 s[0:1] -; GFX90A-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX90A-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX90A-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX90A-NEXT: v_mov_b32_e32 v31, v0 -; GFX90A-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_mov_b32 s32, 0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX90A-NEXT: v_and_b32_e32 v2, 0xff, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff8000, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s35 -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s34, v6 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v0, vcc -; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 2, v[2:3] -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v1, vcc -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x7ffff000, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x80000000, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX90A-NEXT: global_load_dword v7, v[0:1], off -; GFX90A-NEXT: global_load_dword v8, v[2:3], off offset:2048 -; GFX90A-NEXT: global_load_dword v9, v[2:3], off offset:3072 -; GFX90A-NEXT: global_load_dword v10, v[4:5], off -; GFX90A-NEXT: s_waitcnt vmcnt(2) -; GFX90A-NEXT: v_add_u32_e32 v0, v8, v7 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add3_u32 v0, v9, v0, v10 -; GFX90A-NEXT: global_store_dword v6, v0, s[34:35] -; GFX90A-NEXT: s_endpgm -; ; GFX11-LABEL: p32Offset64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_getpc_b64 s[2:3] @@ -1971,17 +1727,17 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1) %buffer) { ; 
GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 7, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff8000, v2 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff8000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v2, s0, s34, v6 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s35, 0, s0 +; GFX11-NEXT: v_add_co_u32 v1, s0, s34, v6 +; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, s35, 0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0x7ffff000, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo @@ -2322,137 +2078,135 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_mov_b32 s32, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s35 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, s34, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[1:2] -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; 
GFX8-NEXT: v_and_b32_e32 v1, 0xffff8000, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s35 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s34, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, 3 +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v1, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x3800 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v4, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x3000 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x2800 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[11:12], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc +; GFX8-NEXT: flat_load_dwordx2 v[11:12], v[3:4] ; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[5:6] ; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[7:8] ; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10] ; GFX8-NEXT: s_movk_i32 s0, 0x2000 -; GFX8-NEXT: v_add_u32_e32 v13, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v14, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v13, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v14, vcc, 0, v4, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x1800 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v15, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v4, vcc ; GFX8-NEXT: flat_load_dwordx2 v[13:14], v[13:14] ; GFX8-NEXT: flat_load_dwordx2 v[15:16], v[15:16] ; GFX8-NEXT: s_movk_i32 s0, 0x1000 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v1, vcc +; 
GFX8-NEXT: v_add_u32_e32 v17, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v4, vcc ; GFX8-NEXT: flat_load_dwordx2 v[17:18], v[17:18] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x800, v3 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GFX8-NEXT: flat_load_dwordx2 v[3:4], v[3:4] ; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v11 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v11 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v6, v12, vcc ; GFX8-NEXT: s_waitcnt vmcnt(5) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v8, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(4) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v9, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v10, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(3) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v13, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v13, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v14, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v15, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v15, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v16, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v17, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v17, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v18, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc -; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc +; GFX8-NEXT: flat_store_dwordx2 v[1:2], v[3:4] ; GFX8-NEXT: s_endpgm ; -; GFX900-LABEL: ReverseOrder: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX900-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX900-NEXT: s_mov_b32 s38, -1 -; 
GFX900-NEXT: s_mov_b32 s39, 0xe00000 -; GFX900-NEXT: s_add_u32 s36, s36, s3 -; GFX900-NEXT: s_addc_u32 s37, s37, 0 -; GFX900-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 -; GFX900-NEXT: s_getpc_b64 s[0:1] -; GFX900-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX900-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX900-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX900-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX900-NEXT: v_mov_b32_e32 v31, v0 -; GFX900-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: s_mov_b32 s32, 0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX900-NEXT: v_and_b32_e32 v1, 0xff, v0 -; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX900-NEXT: v_and_b32_e32 v22, 0xffff8000, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, s35 -; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, s34, v22 -; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v0, vcc -; GFX900-NEXT: v_lshlrev_b64 v[0:1], 3, v[1:2] -; GFX900-NEXT: s_movk_i32 s0, 0x3000 -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc -; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s0, v0 -; GFX900-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX900-NEXT: global_load_dwordx2 v[6:7], v[4:5], off offset:2048 -; GFX900-NEXT: global_load_dwordx2 v[8:9], v[4:5], off -; GFX900-NEXT: s_movk_i32 s0, 0x2000 -; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s0, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX900-NEXT: global_load_dwordx2 v[10:11], v[4:5], off offset:2048 -; GFX900-NEXT: s_movk_i32 s0, 0x1000 -; GFX900-NEXT: v_add_co_u32_e32 v12, vcc, s0, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc -; GFX900-NEXT: global_load_dwordx2 v[14:15], v[12:13], off -; GFX900-NEXT: global_load_dwordx2 v[16:17], v[4:5], off -; GFX900-NEXT: global_load_dwordx2 v[18:19], v[12:13], off 
offset:2048 -; GFX900-NEXT: global_load_dwordx2 v[20:21], v[0:1], off offset:2048 -; GFX900-NEXT: s_waitcnt vmcnt(6) -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v6, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v3, vcc -; GFX900-NEXT: s_waitcnt vmcnt(5) -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v9, v1, vcc -; GFX900-NEXT: s_waitcnt vmcnt(4) -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v1, vcc -; GFX900-NEXT: s_waitcnt vmcnt(2) -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v16, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v17, v1, vcc -; GFX900-NEXT: s_waitcnt vmcnt(1) -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v18, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v19, v1, vcc -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v14, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v15, v1, vcc -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v20, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v21, v1, vcc -; GFX900-NEXT: global_store_dwordx2 v22, v[0:1], s[34:35] -; GFX900-NEXT: s_endpgm +; GFX9-LABEL: ReverseOrder: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GFX9-NEXT: 
v_and_b32_e32 v22, 0xffff8000, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s34, v22 +; GFX9-NEXT: v_mov_b32_e32 v3, 3 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_movk_i32 s0, 0x3000 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[4:5], off offset:2048 +; GFX9-NEXT: global_load_dwordx2 v[8:9], v[4:5], off +; GFX9-NEXT: s_movk_i32 s0, 0x2000 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[10:11], v[4:5], off offset:2048 +; GFX9-NEXT: s_movk_i32 s0, 0x1000 +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[14:15], v[12:13], off +; GFX9-NEXT: global_load_dwordx2 v[16:17], v[4:5], off +; GFX9-NEXT: global_load_dwordx2 v[18:19], v[12:13], off offset:2048 +; GFX9-NEXT: global_load_dwordx2 v[20:21], v[0:1], off offset:2048 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v3, vcc +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v9, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v16, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v17, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v18, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v19, v1, vcc +; 
GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v14, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v15, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v20, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v21, v1, vcc +; GFX9-NEXT: global_store_dwordx2 v22, v[0:1], s[34:35] +; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: ReverseOrder: ; GFX10: ; %bb.0: ; %entry @@ -2474,15 +2228,14 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_mov_b32 s32, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 7, v0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX10-NEXT: v_and_b32_e32 v20, 0xffff8000, v2 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] -; GFX10-NEXT: v_add_co_u32 v2, s0, s34, v20 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s35, 0, s0 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, 3 +; GFX10-NEXT: v_and_b32_e32 v20, 0xffff8000, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_add_co_u32 v1, s0, s34, v20 +; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, s35, 0, s0 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0x3800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x3000, v0 @@ -2531,76 +2284,6 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) { ; GFX10-NEXT: global_store_dwordx2 v20, v[0:1], s[34:35] ; GFX10-NEXT: s_endpgm ; -; GFX90A-LABEL: ReverseOrder: -; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX90A-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX90A-NEXT: s_mov_b32 s38, -1 -; GFX90A-NEXT: s_mov_b32 
s39, 0xe00000 -; GFX90A-NEXT: s_add_u32 s36, s36, s3 -; GFX90A-NEXT: s_addc_u32 s37, s37, 0 -; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 -; GFX90A-NEXT: s_getpc_b64 s[0:1] -; GFX90A-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX90A-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX90A-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX90A-NEXT: v_mov_b32_e32 v31, v0 -; GFX90A-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_mov_b32 s32, 0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX90A-NEXT: v_and_b32_e32 v2, 0xff, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX90A-NEXT: v_and_b32_e32 v22, 0xffff8000, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s35 -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s34, v22 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v0, vcc -; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 3, v[2:3] -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v1, vcc -; GFX90A-NEXT: s_movk_i32 s0, 0x3000 -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s0, v0 -; GFX90A-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX90A-NEXT: global_load_dwordx2 v[6:7], v[4:5], off offset:2048 -; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[4:5], off -; GFX90A-NEXT: s_movk_i32 s0, 0x2000 -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s0, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[4:5], off offset:2048 -; GFX90A-NEXT: s_movk_i32 s0, 0x1000 -; GFX90A-NEXT: v_add_co_u32_e32 v12, vcc, s0, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc -; GFX90A-NEXT: global_load_dwordx2 v[14:15], v[12:13], off -; GFX90A-NEXT: global_load_dwordx2 v[16:17], v[4:5], off -; GFX90A-NEXT: global_load_dwordx2 v[18:19], v[12:13], off offset:2048 -; GFX90A-NEXT: 
global_load_dwordx2 v[20:21], v[0:1], off offset:2048 -; GFX90A-NEXT: s_waitcnt vmcnt(6) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v6, v2 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v3, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(5) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v9, v1, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(4) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v1, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(2) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v16, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v17, v1, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v18, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v19, v1, vcc -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v14, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v15, v1, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v20, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v21, v1, vcc -; GFX90A-NEXT: global_store_dwordx2 v22, v[0:1], s[34:35] -; GFX90A-NEXT: s_endpgm -; ; GFX11-LABEL: ReverseOrder: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_getpc_b64 s[2:3] @@ -2612,17 +2295,17 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) { ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 7, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v16, 0xffff8000, v2 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff8000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v2, s0, s34, v16 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s35, 0, s0 +; 
GFX11-NEXT: v_add_co_u32 v1, s0, s34, v16 +; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, s35, 0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0x3000, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo @@ -2733,71 +2416,69 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf ; GFX8-NEXT: s_mov_b32 s32, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s35 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, s34, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[1:2] -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v4, v1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff8000, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s35 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s34, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, 3 +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x800 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 
0, v0 ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, -1, v6, vcc -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: flat_load_dwordx2 v[3:4], v[3:4] ; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[5:6] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v6, v1, vcc -; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v6, v4, vcc +; GFX8-NEXT: flat_store_dwordx2 v[1:2], v[3:4] ; GFX8-NEXT: s_endpgm ; -; GFX900-LABEL: negativeoffset: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX900-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX900-NEXT: s_mov_b32 s38, -1 -; GFX900-NEXT: s_mov_b32 s39, 0xe00000 -; GFX900-NEXT: s_add_u32 s36, s36, s3 -; GFX900-NEXT: s_addc_u32 s37, s37, 0 -; GFX900-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 -; GFX900-NEXT: s_getpc_b64 s[0:1] -; GFX900-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX900-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX900-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX900-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX900-NEXT: v_mov_b32_e32 v31, v0 -; GFX900-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: s_mov_b32 s32, 0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX900-NEXT: v_and_b32_e32 v1, 0xff, v0 -; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX900-NEXT: v_and_b32_e32 v8, 0xffff8000, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, s35 -; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, s34, v8 -; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v0, vcc -; GFX900-NEXT: v_lshlrev_b64 v[0:1], 3, v[1:2] -; GFX900-NEXT: s_movk_i32 s0, 0x1000 -; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v3, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v1, vcc -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v1, 
vcc, -1, v3, vcc -; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, 0, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc -; GFX900-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 -; GFX900-NEXT: global_load_dwordx2 v[6:7], v[2:3], off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v6, v4 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v5, vcc -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[34:35] -; GFX900-NEXT: s_endpgm +; GFX9-LABEL: negativeoffset: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff8000, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s34, v8 +; GFX9-NEXT: v_mov_b32_e32 v3, 3 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX9-NEXT: s_movk_i32 s0, 0x1000 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, 
vcc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v5, vcc +; GFX9-NEXT: global_store_dwordx2 v8, v[0:1], s[34:35] +; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: negativeoffset: ; GFX10: ; %bb.0: ; %entry @@ -2819,19 +2500,18 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf ; GFX10-NEXT: s_mov_b32 s32, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 7, v0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff8000, v2 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] -; GFX10-NEXT: v_add_co_u32 v2, s0, s34, v8 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s35, 0, s0 -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v1, vcc_lo -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v3, vcc_lo -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, 3 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff8000, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_add_co_u32 v1, s0, s34, v8 +; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, s35, 0, s0 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v1, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v2, vcc_lo +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v3 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v4, vcc_lo +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0, v3 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v4, vcc_lo ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off 
; GFX10-NEXT: global_load_dwordx2 v[6:7], v[2:3], off @@ -2841,49 +2521,6 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf ; GFX10-NEXT: global_store_dwordx2 v8, v[0:1], s[34:35] ; GFX10-NEXT: s_endpgm ; -; GFX90A-LABEL: negativeoffset: -; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX90A-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX90A-NEXT: s_mov_b32 s38, -1 -; GFX90A-NEXT: s_mov_b32 s39, 0xe00000 -; GFX90A-NEXT: s_add_u32 s36, s36, s3 -; GFX90A-NEXT: s_addc_u32 s37, s37, 0 -; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 -; GFX90A-NEXT: s_getpc_b64 s[0:1] -; GFX90A-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX90A-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX90A-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX90A-NEXT: v_mov_b32_e32 v31, v0 -; GFX90A-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_mov_b32 s32, 0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX90A-NEXT: v_and_b32_e32 v2, 0xff, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX90A-NEXT: v_and_b32_e32 v8, 0xffff8000, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s35 -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s34, v8 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v0, vcc -; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 3, v[2:3] -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v4, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v1, vcc -; GFX90A-NEXT: s_movk_i32 s0, 0x1000 -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v3, vcc -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0, v2 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc -; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 -; GFX90A-NEXT: global_load_dwordx2 v[6:7], v[2:3], off -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: 
v_add_co_u32_e32 v0, vcc, v6, v4 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v5, vcc -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[34:35] -; GFX90A-NEXT: s_endpgm -; ; GFX11-LABEL: negativeoffset: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_getpc_b64 s[2:3] @@ -2895,22 +2532,22 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 7, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff8000, v2 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff8000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v2, s0, s34, v4 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s35, 0, s0 +; GFX11-NEXT: v_add_co_u32 v1, s0, s34, v4 +; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, s35, 0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, v1, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v2, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v2 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v3, vcc_lo -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0, v2 -; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v3 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0, v3 +; GFX11-NEXT: v_add_co_ci_u32_e32 v3, 
vcc_lo, -1, v5, vcc_lo ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:-2048 ; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll index 08db1e7fee259..90a06d8e2d0a2 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -10083,27 +10083,17 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: s_add_u32 s40, s40, s3 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v5, -1, v0 -; GFX6-NEXT: v_mov_b32_e32 v6, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, -1, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_mov_b32_e32 v6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 8, v5 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:240 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 8, v0 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:240 ; GFX6-NEXT: s_addc_u32 s41, s41, 0 -; GFX6-NEXT: s_mov_b32 s2, 0x83800 -; GFX6-NEXT: s_mov_b64 s[8:9], exec -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:224 ; GFX6-NEXT: s_mov_b32 s2, 0x83400 +; GFX6-NEXT: s_mov_b64 s[8:9], exec ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: 
buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10111,7 +10101,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:208 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:224 ; GFX6-NEXT: s_mov_b32 s2, 0x83000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10120,7 +10110,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:192 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:208 ; GFX6-NEXT: s_mov_b32 s2, 0x82c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10129,7 +10119,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:176 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:192 ; GFX6-NEXT: s_mov_b32 s2, 0x82800 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10138,7 +10128,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr 
addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:160 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:176 ; GFX6-NEXT: s_mov_b32 s2, 0x82400 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10147,7 +10137,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:144 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:160 ; GFX6-NEXT: s_mov_b32 s2, 0x82000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10156,7 +10146,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:128 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:144 ; GFX6-NEXT: s_mov_b32 s2, 0x81c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10165,7 +10155,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 
offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:112 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:128 ; GFX6-NEXT: s_mov_b32 s2, 0x81800 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10174,7 +10164,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:96 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:112 ; GFX6-NEXT: s_mov_b32 s2, 0x81400 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10183,7 +10173,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:80 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:96 ; GFX6-NEXT: s_mov_b32 s2, 0x81000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10191,18 +10181,27 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_load_dwordx4 v[17:20], v[7:8], s[4:7], 
0 addr64 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 -; GFX6-NEXT: buffer_load_dwordx4 v[9:12], v[7:8], s[4:7], 0 addr64 offset:16 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:80 +; GFX6-NEXT: s_mov_b32 s2, 0x80c00 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_load_dwordx4 v[16:19], v[5:6], s[4:7], 0 addr64 offset:64 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:16 ; GFX6-NEXT: s_mov_b32 s2, 0x80800 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v11, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v12, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_load_dwordx4 v[13:16], v[7:8], s[4:7], 0 addr64 offset:32 +; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_load_dwordx4 v[12:15], v[5:6], s[4:7], 0 addr64 offset:32 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_mov_b64 exec, 15 ; GFX6-NEXT: 
buffer_store_dword v4, off, s[40:43], 0 @@ -10217,17 +10216,9 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[8:9] -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[7:8], s[4:7], 0 addr64 offset:48 -; GFX6-NEXT: s_mov_b32 s2, 0x80c00 +; GFX6-NEXT: buffer_load_dwordx4 v[20:23], v[5:6], s[4:7], 0 addr64 offset:48 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 13, v0 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 16, v4 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(3) ; GFX6-NEXT: v_mov_b32_e32 v7, 1 ; GFX6-NEXT: s_mov_b64 s[2:3], exec ; GFX6-NEXT: buffer_store_dword v7, v4, s[40:43], 0 offen @@ -10245,7 +10236,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_writelane_b32 v4, s9, 5 ; GFX6-NEXT: v_writelane_b32 v4, s10, 6 ; GFX6-NEXT: v_writelane_b32 v4, s11, 7 -; GFX6-NEXT: s_mov_b32 s12, 0x83c00 +; GFX6-NEXT: s_mov_b32 s12, 0x83800 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 @@ -10285,7 +10276,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_writelane_b32 v4, s13, 5 ; GFX6-NEXT: v_writelane_b32 v4, s14, 6 ; GFX6-NEXT: v_writelane_b32 v4, s15, 7 -; GFX6-NEXT: s_mov_b32 s38, 0x84400 +; GFX6-NEXT: s_mov_b32 s38, 0x84000 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: 
buffer_load_dword v4, off, s[40:43], 0 @@ -10293,7 +10284,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: s_mov_b64 exec, s[36:37] ; GFX6-NEXT: s_mov_b64 s[36:37], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: s_mov_b32 s38, 0x83c00 +; GFX6-NEXT: s_mov_b32 s38, 0x83800 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload @@ -10321,7 +10312,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_writelane_b32 v4, s21, 5 ; GFX6-NEXT: v_writelane_b32 v4, s22, 6 ; GFX6-NEXT: v_writelane_b32 v4, s23, 7 -; GFX6-NEXT: s_mov_b32 s38, 0x84c00 +; GFX6-NEXT: s_mov_b32 s38, 0x84800 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 @@ -10329,7 +10320,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: s_mov_b64 exec, s[36:37] ; GFX6-NEXT: s_mov_b64 s[36:37], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: s_mov_b32 s38, 0x84400 +; GFX6-NEXT: s_mov_b32 s38, 0x84000 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload @@ -10357,7 +10348,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_writelane_b32 v4, s29, 5 ; GFX6-NEXT: v_writelane_b32 v4, s30, 6 ; GFX6-NEXT: v_writelane_b32 v4, s31, 7 -; GFX6-NEXT: s_mov_b32 s38, 0x85400 +; GFX6-NEXT: s_mov_b32 s38, 0x85000 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 @@ -10365,7 +10356,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: s_mov_b64 exec, 
s[36:37] ; GFX6-NEXT: s_mov_b64 s[36:37], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: s_mov_b32 s38, 0x84c00 +; GFX6-NEXT: s_mov_b32 s38, 0x84800 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload @@ -10389,7 +10380,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_writelane_b32 v4, s1, 1 ; GFX6-NEXT: v_writelane_b32 v4, s2, 2 ; GFX6-NEXT: v_writelane_b32 v4, s3, 3 -; GFX6-NEXT: s_mov_b32 s38, 0x85c00 +; GFX6-NEXT: s_mov_b32 s38, 0x85800 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 @@ -10403,7 +10394,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_writelane_b32 v4, s5, 1 ; GFX6-NEXT: v_writelane_b32 v4, s6, 2 ; GFX6-NEXT: v_writelane_b32 v4, s7, 3 -; GFX6-NEXT: s_mov_b32 s36, 0x86000 +; GFX6-NEXT: s_mov_b32 s36, 0x85c00 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s36 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 @@ -10415,7 +10406,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_writelane_b32 v4, s2, 0 ; GFX6-NEXT: v_writelane_b32 v4, s3, 1 -; GFX6-NEXT: s_mov_b32 s4, 0x86400 +; GFX6-NEXT: s_mov_b32 s4, 0x86000 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s4 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 @@ -10423,7 +10414,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: s_mov_b64 exec, s[0:1] ; GFX6-NEXT: s_mov_b64 s[36:37], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: s_mov_b32 s38, 0x85400 +; GFX6-NEXT: s_mov_b32 s38, 0x85000 ; GFX6-NEXT: 
buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload @@ -10441,7 +10432,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: s_mov_b64 exec, s[36:37] ; GFX6-NEXT: s_mov_b64 s[44:45], exec ; GFX6-NEXT: s_mov_b64 exec, 15 -; GFX6-NEXT: v_mov_b32_e32 v7, 0x2180 +; GFX6-NEXT: v_mov_b32_e32 v7, 0x2170 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload @@ -10456,7 +10447,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: s_mov_b64 vcc, s[34:35] ; GFX6-NEXT: s_mov_b64 s[44:45], exec ; GFX6-NEXT: s_mov_b64 exec, 3 -; GFX6-NEXT: v_mov_b32_e32 v7, 0x2190 +; GFX6-NEXT: v_mov_b32_e32 v7, 0x2180 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload @@ -10472,7 +10463,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: s_mov_b64 s[34:35], vcc ; GFX6-NEXT: s_mov_b64 s[4:5], exec ; GFX6-NEXT: s_mov_b64 exec, 15 -; GFX6-NEXT: s_mov_b32 s6, 0x85c00 +; GFX6-NEXT: s_mov_b32 s6, 0x85800 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s6 ; 4-byte Folded Reload @@ -10484,35 +10475,46 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[4:5] -; GFX6-NEXT: s_mov_b32 s2, 0x83c00 +; GFX6-NEXT: s_mov_b32 s2, 0x83800 ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill ; 
GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_mov_b32 s2, 0x84400 -; GFX6-NEXT: buffer_store_dword v13, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v14, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v15, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v16, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(4) -; GFX6-NEXT: v_mov_b32_e32 v0, v17 -; GFX6-NEXT: v_mov_b32_e32 v1, v18 -; GFX6-NEXT: v_mov_b32_e32 v2, v19 -; GFX6-NEXT: v_mov_b32_e32 v3, v20 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_mov_b32 s2, 0x84000 +; GFX6-NEXT: buffer_store_dword v12, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v13, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v14, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v15, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: s_mov_b32 s2, 0x84800 +; GFX6-NEXT: buffer_store_dword v16, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v17, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v18, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v19, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: v_mov_b32_e32 v0, v20 +; GFX6-NEXT: v_mov_b32_e32 v1, v21 +; GFX6-NEXT: v_mov_b32_e32 v2, v22 +; GFX6-NEXT: v_mov_b32_e32 v3, v23 +; GFX6-NEXT: s_waitcnt expcnt(3) ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ;;#ASMEND -; GFX6-NEXT: v_mov_b32_e32 v20, v3 -; GFX6-NEXT: buffer_load_dword v13, off, s[40:43], s2 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v14, off, s[40:43], 
s2 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v15, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v16, off, s[40:43], s2 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s2, 0x83c00 -; GFX6-NEXT: v_mov_b32_e32 v19, v2 -; GFX6-NEXT: v_mov_b32_e32 v18, v1 -; GFX6-NEXT: v_mov_b32_e32 v17, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dword v16, off, s[40:43], s2 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v17, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v18, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v19, off, s[40:43], s2 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s2, 0x84000 +; GFX6-NEXT: v_mov_b32_e32 v23, v3 +; GFX6-NEXT: buffer_load_dword v12, off, s[40:43], s2 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v13, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v14, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v15, off, s[40:43], s2 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s2, 0x83800 +; GFX6-NEXT: v_mov_b32_e32 v22, v2 +; GFX6-NEXT: v_mov_b32_e32 v21, v1 +; GFX6-NEXT: v_mov_b32_e32 v20, v0 ; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s2 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload @@ -10545,183 +10547,175 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[2:3] -; GFX6-NEXT: s_mov_b32 s4, 0x83800 -; GFX6-NEXT: v_lshl_b64 v[4:5], v[5:6], 8 -; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded 
Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_mov_b32 s4, 0x83400 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:240 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_mov_b32 s4, 0x83000 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:224 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:240 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, 
s[40:43], s4 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s4, 0x82c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:208 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:224 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s4, 0x82800 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:192 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:208 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s4, 0x82400 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: 
buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:176 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:192 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s4, 0x82000 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:160 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:176 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s4, 0x81c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:144 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 
offset:160 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s4, 0x81800 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:128 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:144 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s4, 0x81400 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:112 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:128 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload -; GFX6-NEXT: 
buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s4, 0x81000 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:96 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:112 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s4, 0x80c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:80 -; GFX6-NEXT: buffer_store_dwordx4 v[17:20], v[4:5], s[0:3], 0 addr64 offset:64 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v17, off, s[40:43], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v18, off, s[40:43], s4 offset:4 ; 
4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v19, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v20, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s4, 0x80800 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[17:20], v[4:5], s[0:3], 0 addr64 offset:48 -; GFX6-NEXT: buffer_store_dwordx4 v[13:16], v[4:5], s[0:3], 0 addr64 offset:32 -; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:80 +; GFX6-NEXT: buffer_store_dwordx4 v[16:19], v[5:6], s[0:3], 0 addr64 offset:64 +; GFX6-NEXT: buffer_store_dwordx4 v[20:23], v[5:6], s[0:3], 0 addr64 offset:48 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], v[5:6], s[0:3], 0 addr64 offset:32 +; GFX6-NEXT: s_waitcnt expcnt(3) +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:16 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 +; GFX6-NEXT: buffer_store_dwordx4 
v[7:10], v[5:6], s[0:3], 0 addr64 offset:16 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm ; ; GFX9-FLATSCR-LABEL: test_limited_sgpr: ; GFX9-FLATSCR: ; %bb.0: ; %entry ; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 ; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 -; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v5, -1, v0 -; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 +; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v5, 8, v0 ; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:240 +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:240 ; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20b0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 1 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[8:11], v0, s[38:39] offset:224 -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:208 -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20a0 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v0, s[38:39] offset:192 -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:176 -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2090 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[16:19], v0, s[38:39] offset:160 -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:144 -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2080 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, 
v[1:4], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:128 -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20c0 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:112 -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2060 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:96 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2050 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 16 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:80 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:224 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2040 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:64 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:208 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2030 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:48 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[19:22], v5, s[38:39] offset:192 +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[15:18], v5, s[38:39] offset:176 +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[11:14], v5, s[38:39] offset:160 +; GFX9-FLATSCR-NEXT: 
global_load_dwordx4 v[0:3], v5, s[38:39] offset:144 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2010 +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:128 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1) +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:112 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(2) +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20c0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 1 +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[7:10], v5, s[38:39] +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(2) +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:96 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20b0 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(2) +; GFX9-FLATSCR-NEXT: v_lshl_add_u32 v4, v7, 13, v4 +; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX9-FLATSCR-NEXT: scratch_store_dword v4, v6, off +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1) +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:80 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20a0 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:32 -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:64 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2090 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:16 
-; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2010 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:48 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2080 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v0, s[38:39] -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 16 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:32 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:16 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2060 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: v_lshl_add_u32 v4, v0, 13, v4 -; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dword v4, v7, off +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: ;;#ASMSTART ; GFX9-FLATSCR-NEXT: ; def s[0:7] ; GFX9-FLATSCR-NEXT: ;;#ASMEND @@ -10750,23 +10744,27 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[38:39] ; GFX9-FLATSCR-NEXT: ;;#ASMEND ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20d0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20e0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, v11 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: 
s_movk_i32 s0, 0x20f0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2100 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v12 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v13 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v14 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: s_nop 0 ; GFX9-FLATSCR-NEXT: ;;#ASMSTART ; GFX9-FLATSCR-NEXT: ;;#ASMEND -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[8:11], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20f0 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[20:23], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20e0 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[16:19], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[19:22], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20d0 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[15:18], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, v3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, v1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, v0 ; GFX9-FLATSCR-NEXT: ;;#ASMSTART ; GFX9-FLATSCR-NEXT: ;;#ASMEND ; GFX9-FLATSCR-NEXT: ;;#ASMSTART @@ -10781,62 +10779,58 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: ;;#ASMEND ; GFX9-FLATSCR-NEXT: .LBB1_2: ; %ret ; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20c0 +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20b0 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[12:15], off, s0 ; 
16-byte Folded Reload -; GFX9-FLATSCR-NEXT: v_lshlrev_b64 v[4:5], 8, v[5:6] -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, s37 -; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v4, vcc, s36, v4 -; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v5, vcc +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:112 +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20a0 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[12:15], off offset:240 -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[8:11], off offset:224 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:96 +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2090 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:208 -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[20:23], off offset:192 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[20:23], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:80 +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2080 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[20:23], off offset:176 -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[16:19], off offset:160 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[16:19], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20c0 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:64 +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload +; 
GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:48 +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2060 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[12:15], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:32 +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2050 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(2) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[16:19], off offset:144 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(2) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:128 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2040 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(3) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[12:15], off offset:112 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:96 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2030 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:80 +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:16 +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[7:10], s[36:37] ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2040 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:64 +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:240 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 
16-byte Folded Reload -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2030 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:48 +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:224 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2010 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:32 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:208 +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[19:22], s[36:37] offset:192 +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[15:18], s[36:37] offset:176 +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[11:14], s[36:37] offset:160 +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:16 -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:144 +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:128 ; GFX9-FLATSCR-NEXT: s_endpgm ; ; GFX10-FLATSCR-LABEL: test_limited_sgpr: @@ -10847,32 +10841,31 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX10-FLATSCR-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 ; GFX10-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 0 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 1 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 1 ; 
GFX10-FLATSCR-NEXT: s_mov_b32 s33, exec_lo -; GFX10-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v5, -1, v0 -; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; GFX10-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 +; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v5, 8, v0 ; GFX10-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLATSCR-NEXT: s_clause 0xf -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[64:67], v0, s[38:39] offset:240 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[60:63], v0, s[38:39] offset:224 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[56:59], v0, s[38:39] offset:208 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[52:55], v0, s[38:39] offset:192 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[48:51], v0, s[38:39] offset:176 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[44:47], v0, s[38:39] offset:160 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[40:43], v0, s[38:39] offset:144 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[36:39], v0, s[38:39] offset:128 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[32:35], v0, s[38:39] offset:112 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[28:31], v0, s[38:39] offset:96 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[24:27], v0, s[38:39] offset:80 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v0, s[38:39] offset:64 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[16:19], v0, s[38:39] offset:48 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[12:15], v0, s[38:39] offset:32 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[8:11], v0, s[38:39] offset:16 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v0, s[38:39] +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[35:38], v5, s[38:39] offset:240 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[31:34], v5, s[38:39] offset:224 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[27:30], v5, s[38:39] offset:208 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[23:26], v5, s[38:39] offset:192 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[19:22], v5, s[38:39] offset:176 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[15:18], v5, s[38:39] 
offset:160 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[11:14], v5, s[38:39] offset:144 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[7:10], v5, s[38:39] offset:128 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[63:66], v5, s[38:39] offset:112 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[59:62], v5, s[38:39] offset:96 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[55:58], v5, s[38:39] offset:80 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[51:54], v5, s[38:39] offset:64 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[47:50], v5, s[38:39] offset:48 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[43:46], v5, s[38:39] offset:32 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[39:42], v5, s[38:39] offset:16 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] ; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLATSCR-NEXT: v_lshl_add_u32 v4, v0, 13, 16 -; GFX10-FLATSCR-NEXT: scratch_store_dword v4, v7, off +; GFX10-FLATSCR-NEXT: scratch_store_dword v4, v6, off ; GFX10-FLATSCR-NEXT: ;;#ASMSTART ; GFX10-FLATSCR-NEXT: ; def s[0:7] ; GFX10-FLATSCR-NEXT: ;;#ASMEND @@ -10901,124 +10894,124 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX10-FLATSCR-NEXT: ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[34:35] ; GFX10-FLATSCR-NEXT: ;;#ASMEND ; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x2010 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v88, v59 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v92, v63 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v87, v58 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v86, v57 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v85, v56 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v91, v62 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v90, v61 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v89, v60 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v60, v35 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[64:67], s0 ; 16-byte Folded Spill -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v68, v39 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v59, v34 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v58, v33 -; GFX10-FLATSCR-NEXT: 
v_mov_b32_e32 v57, v32 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v67, v38 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v66, v37 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v65, v36 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v11 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v72, v43 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v76, v47 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v80, v51 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v84, v55 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v8 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v71, v42 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v70, v41 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v69, v40 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v40, v15 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v75, v46 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v74, v45 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v73, v44 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v44, v19 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v79, v50 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v78, v49 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v77, v48 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v48, v23 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v83, v54 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v82, v53 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v81, v52 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v52, v27 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v56, v31 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v10 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v9 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v12 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v41, v16 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v45, v20 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v49, v24 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v53, v28 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v39, v14 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v13 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v43, v18 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v42, v17 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v47, v22 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v46, v21 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v51, v26 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v50, v25 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v55, v30 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v54, v29 
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v88, v58 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v92, v62 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v87, v57 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v86, v56 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v85, v55 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v91, v61 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v90, v60 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v89, v59 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v60, v34 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[63:66], s0 ; 16-byte Folded Spill +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v68, v38 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v59, v33 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v58, v32 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v57, v31 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v67, v37 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v66, v36 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v65, v35 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v10 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v72, v42 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v76, v46 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v80, v50 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v84, v54 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v7 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v71, v41 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v70, v40 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v69, v39 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v40, v14 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v75, v45 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v74, v44 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v73, v43 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v44, v18 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v79, v49 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v78, v48 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v77, v47 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v48, v22 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v83, v53 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v82, v52 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v81, v51 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v52, v26 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v56, v30 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v9 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v8 +; GFX10-FLATSCR-NEXT: 
v_mov_b32_e32 v37, v11 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v41, v15 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v45, v19 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v49, v23 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v53, v27 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v39, v13 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v12 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v43, v17 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v42, v16 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v47, v21 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v46, v20 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v51, v25 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v50, v24 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v55, v29 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v54, v28 ; GFX10-FLATSCR-NEXT: ;;#ASMSTART ; GFX10-FLATSCR-NEXT: ;;#ASMEND -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, v33 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v53 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, v49 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, v45 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, v41 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, v37 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, v34 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v35 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, v36 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, v57 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v54 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v30, v55 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v31, v56 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v50 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v51 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, v52 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, v46 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v47 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, v48 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v17, v42 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v43 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, v44 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, v38 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, v39 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v15, v40 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v58 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v59 -; GFX10-FLATSCR-NEXT: 
v_mov_b32_e32 v35, v60 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, v33 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, v53 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, v49 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, v45 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v15, v41 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, v37 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, v34 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, v35 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v36 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v31, v57 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v54 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v55 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v30, v56 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, v50 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v51 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v52 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, v46 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, v47 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v48 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, v42 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v17, v43 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v44 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, v38 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, v39 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, v40 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, v58 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v59 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v60 ; GFX10-FLATSCR-NEXT: ;;#ASMSTART ; GFX10-FLATSCR-NEXT: ;;#ASMEND -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v65 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v66 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v67 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v39, v68 -; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[64:67], off, s0 ; 16-byte Folded Reload -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v60, v89 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v56, v85 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v52, v81 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v48, v77 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v44, v73 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v40, v69 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v61, v90 -; GFX10-FLATSCR-NEXT: 
v_mov_b32_e32 v62, v91 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v63, v92 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v57, v86 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v58, v87 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v59, v88 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v53, v82 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v54, v83 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v55, v84 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v49, v78 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v50, v79 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v51, v80 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v45, v74 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v46, v75 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v47, v76 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v41, v70 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v42, v71 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v43, v72 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v65 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v66 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v67 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v68 +; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[63:66], off, s0 ; 16-byte Folded Reload +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v59, v89 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v55, v85 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v51, v81 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v47, v77 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v43, v73 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v39, v69 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v60, v90 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v61, v91 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v62, v92 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v56, v86 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v57, v87 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v58, v88 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v52, v82 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v53, v83 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v54, v84 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v48, v78 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v49, v79 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v50, v80 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v44, v74 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v45, v75 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v46, v76 +; 
GFX10-FLATSCR-NEXT: v_mov_b32_e32 v40, v70 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v41, v71 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v42, v72 ; GFX10-FLATSCR-NEXT: ;;#ASMSTART ; GFX10-FLATSCR-NEXT: ;;#ASMEND ; GFX10-FLATSCR-NEXT: ;;#ASMSTART @@ -11031,26 +11024,23 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX10-FLATSCR-NEXT: ;;#ASMEND ; GFX10-FLATSCR-NEXT: .LBB1_2: ; %ret ; GFX10-FLATSCR-NEXT: s_or_b32 exec_lo, exec_lo, s33 -; GFX10-FLATSCR-NEXT: v_lshlrev_b64 v[4:5], 8, v[5:6] -; GFX10-FLATSCR-NEXT: v_add_co_u32 v4, vcc_lo, s36, v4 -; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s37, v5, vcc_lo -; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[64:67], off offset:240 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[60:63], off offset:224 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[56:59], off offset:208 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[52:55], off offset:192 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[48:51], off offset:176 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[44:47], off offset:160 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[40:43], off offset:144 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[36:39], off offset:128 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[32:35], off offset:112 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[28:31], off offset:96 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[24:27], off offset:80 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[20:23], off offset:64 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[16:19], off offset:48 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[12:15], off offset:32 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[8:11], off offset:16 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, 
v[63:66], s[36:37] offset:112 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[59:62], s[36:37] offset:96 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[55:58], s[36:37] offset:80 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[51:54], s[36:37] offset:64 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[47:50], s[36:37] offset:48 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[43:46], s[36:37] offset:32 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[39:42], s[36:37] offset:16 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[35:38], s[36:37] offset:240 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[31:34], s[36:37] offset:224 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[27:30], s[36:37] offset:208 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[23:26], s[36:37] offset:192 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[19:22], s[36:37] offset:176 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[15:18], s[36:37] offset:160 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[11:14], s[36:37] offset:144 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[7:10], s[36:37] offset:128 ; GFX10-FLATSCR-NEXT: s_endpgm entry: %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll index 930ba80ad6963..cc73302f85637 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll @@ -571,7 +571,7 @@ define protected amdgpu_kernel void @nested_waterfalls(ptr addrspace(1) %tex.coe ; SI-NEXT: successors: %bb.2(0x80000000) ; SI-NEXT: {{ $}} ; SI-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.tex.coerce.kernarg.offset, align 4, addrspace 4) - ; SI-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 3, killed [[COPY1]](s32), implicit $exec 
+ ; SI-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 3, killed [[COPY1]](s32), implicit $exec ; SI-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR killed [[S_LOAD_DWORDX2_IMM]], killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s64) from %ir.idx, addrspace 1) ; SI-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[GLOBAL_LOAD_DWORDX2_SADDR]], 16, 0, implicit $exec :: (invariant load (s128) from %ir.3 + 16, addrspace 4) ; SI-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub3 diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll index b8d18f56b7602..dfbc4790e63b0 100644 --- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll @@ -6,22 +6,18 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX906-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX906-NEXT: v_lshlrev_b32_e32 v5, 2, v0 +; GFX906-NEXT: v_mov_b32_e32 v1, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dword v2, v1, s[4:5] -; GFX906-NEXT: v_mov_b32_e32 v1, 0 +; GFX906-NEXT: global_load_dword v2, v5, s[4:5] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX906-NEXT: v_lshrrev_b32_e32 v4, 8, v2 ; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB0_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1] -; GFX906-NEXT: v_mov_b32_e32 v0, s7 -; GFX906-NEXT: v_add_co_u32_e32 v2, vcc, s6, v2 -; GFX906-NEXT: v_addc_co_u32_e32 v3, vcc, v0, v3, vcc -; GFX906-NEXT: global_load_dword v2, v[2:3], off +; GFX906-NEXT: global_load_dword v2, v5, s[6:7] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: 
v_lshrrev_b32_e32 v3, 16, v2 ; GFX906-NEXT: v_lshrrev_b32_e32 v4, 8, v2 @@ -54,11 +50,11 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX906-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX906-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; GFX906-NEXT: v_mov_b32_e32 v1, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dword v2, v1, s[4:5] -; GFX906-NEXT: v_mov_b32_e32 v1, 0 +; GFX906-NEXT: global_load_dword v2, v6, s[4:5] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v2 ; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v2 @@ -66,11 +62,7 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB1_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1] -; GFX906-NEXT: v_mov_b32_e32 v0, s7 -; GFX906-NEXT: v_add_co_u32_e32 v2, vcc, s6, v2 -; GFX906-NEXT: v_addc_co_u32_e32 v3, vcc, v0, v3, vcc -; GFX906-NEXT: global_load_dword v2, v[2:3], off +; GFX906-NEXT: global_load_dword v2, v6, s[6:7] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v2 ; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v2 @@ -106,36 +98,32 @@ define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX906-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX906-NEXT: v_lshlrev_b32_e32 v7, 3, v0 +; GFX906-NEXT: v_mov_b32_e32 v5, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx2 v[2:3], v1, s[4:5] -; GFX906-NEXT: v_mov_b32_e32 v1, 0 +; GFX906-NEXT: global_load_dwordx2 v[1:2], v7, s[4:5] ; GFX906-NEXT: s_waitcnt 
vmcnt(0) -; GFX906-NEXT: v_lshrrev_b64 v[4:5], 24, v[2:3] -; GFX906-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v6, 8, v2 +; GFX906-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2] +; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX906-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB2_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: v_lshlrev_b64 v[2:3], 3, v[0:1] -; GFX906-NEXT: v_mov_b32_e32 v0, s7 -; GFX906-NEXT: v_add_co_u32_e32 v2, vcc, s6, v2 -; GFX906-NEXT: v_addc_co_u32_e32 v3, vcc, v0, v3, vcc -; GFX906-NEXT: global_load_dwordx2 v[2:3], v[2:3], off +; GFX906-NEXT: global_load_dwordx2 v[1:2], v7, s[6:7] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b64 v[4:5], 24, v[2:3] -; GFX906-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v6, 8, v2 +; GFX906-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2] +; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX906-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX906-NEXT: .LBB2_2: ; %bb.2 ; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v6 -; GFX906-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v4 -; GFX906-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_byte v1, v3, s[2:3] offset:4 -; GFX906-NEXT: global_store_dword v1, v0, s[2:3] +; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v3 +; GFX906-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: 
global_store_byte v5, v2, s[2:3] offset:4 +; GFX906-NEXT: global_store_dword v5, v0, s[2:3] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -159,46 +147,42 @@ define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX906-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX906-NEXT: v_lshlrev_b32_e32 v10, 3, v0 +; GFX906-NEXT: v_mov_b32_e32 v3, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx2 v[2:3], v1, s[4:5] -; GFX906-NEXT: v_mov_b32_e32 v1, 0 +; GFX906-NEXT: global_load_dwordx2 v[1:2], v10, s[4:5] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_e32 v4, 24, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v6, 8, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v7, 24, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v4, 24, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v6, 8, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX906-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v1 ; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB3_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: v_lshlrev_b64 v[2:3], 3, v[0:1] -; GFX906-NEXT: v_mov_b32_e32 v0, s7 -; GFX906-NEXT: v_add_co_u32_e32 v2, vcc, s6, v2 -; GFX906-NEXT: v_addc_co_u32_e32 v3, vcc, v0, v3, vcc -; GFX906-NEXT: global_load_dwordx2 v[2:3], v[2:3], off +; GFX906-NEXT: global_load_dwordx2 v[1:2], v10, s[6:7] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_e32 v4, 24, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v6, 8, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v7, 24, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v8, 16, v2 
-; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v4, 24, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v6, 8, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX906-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v1 ; GFX906-NEXT: .LBB3_2: ; %bb.2 ; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v9 -; GFX906-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v7 -; GFX906-NEXT: v_or_b32_sdwa v2, v8, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v6 -; GFX906-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v4 -; GFX906-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx2 v1, v[2:3], s[2:3] +; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v7 +; GFX906-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v6 +; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v4 +; GFX906-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dwordx2 v3, v[0:1], s[2:3] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -222,68 +206,64 @@ define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1 ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX906-NEXT: v_lshlrev_b32_e32 v1, 4, v0 +; GFX906-NEXT: v_lshlrev_b32_e32 v18, 4, v0 +; GFX906-NEXT: v_mov_b32_e32 v5, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx4 v[2:5], v1, s[4:5] -; GFX906-NEXT: v_mov_b32_e32 v1, 0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_e32 v6, 24, v5 -; GFX906-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GFX906-NEXT: v_lshrrev_b32_e32 v8, 8, v5 -; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v4 -; GFX906-NEXT: v_lshrrev_b32_e32 v10, 16, v4 -; GFX906-NEXT: v_lshrrev_b32_e32 v11, 8, v4 -; GFX906-NEXT: v_lshrrev_b32_e32 v12, 24, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v14, 8, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v15, 24, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v16, 16, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v2 +; GFX906-NEXT: global_load_dwordx4 v[1:4], v18, s[4:5] +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: v_lshrrev_b32_e32 v6, 24, v4 +; GFX906-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX906-NEXT: v_lshrrev_b32_e32 v8, 8, v4 +; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v3 +; GFX906-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; GFX906-NEXT: v_lshrrev_b32_e32 v11, 8, v3 +; GFX906-NEXT: v_lshrrev_b32_e32 v12, 24, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v14, 8, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v15, 24, v1 +; GFX906-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v1 ; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: 
s_cbranch_execz .LBB4_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: v_lshlrev_b64 v[2:3], 4, v[0:1] -; GFX906-NEXT: v_mov_b32_e32 v0, s7 -; GFX906-NEXT: v_add_co_u32_e32 v2, vcc, s6, v2 -; GFX906-NEXT: v_addc_co_u32_e32 v3, vcc, v0, v3, vcc -; GFX906-NEXT: global_load_dwordx4 v[2:5], v[2:3], off -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_e32 v6, 24, v5 -; GFX906-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GFX906-NEXT: v_lshrrev_b32_e32 v8, 8, v5 -; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v4 -; GFX906-NEXT: v_lshrrev_b32_e32 v10, 16, v4 -; GFX906-NEXT: v_lshrrev_b32_e32 v11, 8, v4 -; GFX906-NEXT: v_lshrrev_b32_e32 v12, 24, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v14, 8, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v15, 24, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v16, 16, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v2 +; GFX906-NEXT: global_load_dwordx4 v[1:4], v18, s[6:7] +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: v_lshrrev_b32_e32 v6, 24, v4 +; GFX906-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX906-NEXT: v_lshrrev_b32_e32 v8, 8, v4 +; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v3 +; GFX906-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; GFX906-NEXT: v_lshrrev_b32_e32 v11, 8, v3 +; GFX906-NEXT: v_lshrrev_b32_e32 v12, 24, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v14, 8, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v15, 24, v1 +; GFX906-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v1 ; GFX906-NEXT: .LBB4_2: ; %bb.2 ; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v17 -; GFX906-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v15 -; GFX906-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v14 -; GFX906-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v12 -; GFX906-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v11 -; GFX906-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v9 -; GFX906-NEXT: v_or_b32_sdwa v4, v10, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v8 -; GFX906-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v6 -; GFX906-NEXT: v_or_b32_sdwa v5, v7, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v1, v[2:5], s[2:3] +; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v15 +; GFX906-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v14 +; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v12 +; GFX906-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v11 +; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v9 +; GFX906-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v8 +; GFX906-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v6 +; GFX906-NEXT: v_or_b32_sdwa v4, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dwordx4 v5, v[0:3], s[2:3] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -306,118 +286,114 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1 ; GFX906-LABEL: v32i8_liveout: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX906-NEXT: v_lshlrev_b32_e32 v1, 5, v0 +; GFX906-NEXT: v_lshlrev_b32_e32 v31, 5, v0 ; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX906-NEXT: v_mov_b32_e32 v9, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx4 v[2:5], v1, s[4:5] offset:16 -; GFX906-NEXT: global_load_dwordx4 v[6:9], v1, s[4:5] -; GFX906-NEXT: v_mov_b32_e32 v1, 0 +; GFX906-NEXT: global_load_dwordx4 v[1:4], v31, s[4:5] offset:16 +; GFX906-NEXT: global_load_dwordx4 v[5:8], v31, s[4:5] ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshrrev_b32_e32 v10, 24, v5 -; GFX906-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; GFX906-NEXT: v_lshrrev_b32_e32 v12, 8, v5 -; 
GFX906-NEXT: v_lshrrev_b32_e32 v13, 24, v4 -; GFX906-NEXT: v_lshrrev_b32_e32 v14, 16, v4 -; GFX906-NEXT: v_lshrrev_b32_e32 v15, 8, v4 -; GFX906-NEXT: v_lshrrev_b32_e32 v16, 24, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v17, 16, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v18, 8, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v19, 24, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v20, 16, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v21, 8, v2 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_e32 v22, 24, v9 -; GFX906-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; GFX906-NEXT: v_lshrrev_b32_e32 v24, 8, v9 -; GFX906-NEXT: v_lshrrev_b32_e32 v25, 24, v8 -; GFX906-NEXT: v_lshrrev_b32_e32 v26, 16, v8 -; GFX906-NEXT: v_lshrrev_b32_e32 v27, 8, v8 -; GFX906-NEXT: v_lshrrev_b32_e32 v28, 24, v7 -; GFX906-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GFX906-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; GFX906-NEXT: v_lshrrev_b32_e32 v32, 24, v6 -; GFX906-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX906-NEXT: v_lshrrev_b32_e32 v31, 8, v6 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v4 +; GFX906-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; GFX906-NEXT: v_lshrrev_b32_e32 v11, 8, v4 +; GFX906-NEXT: v_lshrrev_b32_e32 v12, 24, v3 +; GFX906-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; GFX906-NEXT: v_lshrrev_b32_e32 v14, 8, v3 +; GFX906-NEXT: v_lshrrev_b32_e32 v15, 24, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v18, 24, v1 +; GFX906-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX906-NEXT: v_lshrrev_b32_e32 v20, 8, v1 +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: v_lshrrev_b32_e32 v21, 24, v8 +; GFX906-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; GFX906-NEXT: v_lshrrev_b32_e32 v23, 8, v8 +; GFX906-NEXT: v_lshrrev_b32_e32 v24, 24, v7 +; GFX906-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX906-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; GFX906-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX906-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; GFX906-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; GFX906-NEXT: 
v_lshrrev_b32_e32 v30, 24, v5 +; GFX906-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX906-NEXT: v_lshrrev_b32_e32 v33, 8, v5 ; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX906-NEXT: s_cbranch_execz .LBB5_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: v_lshlrev_b64 v[2:3], 5, v[0:1] -; GFX906-NEXT: v_mov_b32_e32 v0, s7 -; GFX906-NEXT: v_add_co_u32_e32 v10, vcc, s6, v2 -; GFX906-NEXT: v_addc_co_u32_e32 v11, vcc, v0, v3, vcc -; GFX906-NEXT: global_load_dwordx4 v[2:5], v[10:11], off offset:16 -; GFX906-NEXT: global_load_dwordx4 v[6:9], v[10:11], off -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshrrev_b32_e32 v10, 24, v5 -; GFX906-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; GFX906-NEXT: v_lshrrev_b32_e32 v12, 8, v5 -; GFX906-NEXT: v_lshrrev_b32_e32 v13, 24, v4 -; GFX906-NEXT: v_lshrrev_b32_e32 v14, 16, v4 -; GFX906-NEXT: v_lshrrev_b32_e32 v15, 8, v4 -; GFX906-NEXT: v_lshrrev_b32_e32 v16, 24, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v17, 16, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v18, 8, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v19, 24, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v20, 16, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v21, 8, v2 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_e32 v22, 24, v9 -; GFX906-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; GFX906-NEXT: v_lshrrev_b32_e32 v24, 8, v9 -; GFX906-NEXT: v_lshrrev_b32_e32 v25, 24, v8 -; GFX906-NEXT: v_lshrrev_b32_e32 v26, 16, v8 -; GFX906-NEXT: v_lshrrev_b32_e32 v27, 8, v8 -; GFX906-NEXT: v_lshrrev_b32_e32 v28, 24, v7 -; GFX906-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GFX906-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; GFX906-NEXT: v_lshrrev_b32_e32 v32, 24, v6 -; GFX906-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX906-NEXT: v_lshrrev_b32_e32 v31, 8, v6 +; GFX906-NEXT: global_load_dwordx4 v[1:4], v31, s[6:7] offset:16 +; GFX906-NEXT: global_load_dwordx4 v[5:8], v31, s[6:7] +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v4 +; GFX906-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; GFX906-NEXT: 
v_lshrrev_b32_e32 v11, 8, v4 +; GFX906-NEXT: v_lshrrev_b32_e32 v12, 24, v3 +; GFX906-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; GFX906-NEXT: v_lshrrev_b32_e32 v14, 8, v3 +; GFX906-NEXT: v_lshrrev_b32_e32 v15, 24, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v18, 24, v1 +; GFX906-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX906-NEXT: v_lshrrev_b32_e32 v20, 8, v1 +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: v_lshrrev_b32_e32 v21, 24, v8 +; GFX906-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; GFX906-NEXT: v_lshrrev_b32_e32 v23, 8, v8 +; GFX906-NEXT: v_lshrrev_b32_e32 v24, 24, v7 +; GFX906-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX906-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; GFX906-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX906-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; GFX906-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; GFX906-NEXT: v_lshrrev_b32_e32 v30, 24, v5 +; GFX906-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX906-NEXT: v_lshrrev_b32_e32 v33, 8, v5 ; GFX906-NEXT: .LBB5_2: ; %bb.2 ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX906-NEXT: v_lshlrev_b16_e32 v28, 8, v28 -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v32 -; GFX906-NEXT: v_or_b32_sdwa v28, v30, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v30, 8, v31 -; GFX906-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v6, v6, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v6, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v27 -; GFX906-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v8, 8, v25 -; GFX906-NEXT: v_or_b32_sdwa v8, v26, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: 
v_or_b32_sdwa v8, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v24 -; GFX906-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v9, 8, v22 -; GFX906-NEXT: v_or_b32_sdwa v9, v23, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v9, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v21 -; GFX906-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v19 -; GFX906-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v18 -; GFX906-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v16 -; GFX906-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v15 -; GFX906-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v13 -; GFX906-NEXT: v_or_b32_sdwa v4, v14, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v12 +; GFX906-NEXT: v_lshlrev_b16_e32 v30, 8, v30 +; GFX906-NEXT: v_lshlrev_b16_e32 v31, 8, v33 ; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v29 -; GFX906-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v10 -; GFX906-NEXT: v_or_b32_sdwa v7, v7, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v11, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v7, v7, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v1, v[6:9], s[0:1] -; GFX906-NEXT: global_store_dwordx4 v1, v[2:5], s[0:1] offset:16 +; GFX906-NEXT: v_lshlrev_b16_e32 v27, 8, v27 +; GFX906-NEXT: v_lshlrev_b16_e32 v26, 8, v26 +; GFX906-NEXT: v_lshlrev_b16_e32 v24, 8, v24 +; GFX906-NEXT: v_lshlrev_b16_e32 v23, 8, v23 +; GFX906-NEXT: v_lshlrev_b16_e32 v21, 8, v21 +; GFX906-NEXT: v_or_b32_sdwa v30, v32, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v5, v5, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v6, v6, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v7, v7, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v8, v8, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v5, v5, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v6, v6, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; 
GFX906-NEXT: v_or_b32_sdwa v7, v7, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v8, v8, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dwordx4 v9, v[5:8], s[0:1] +; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v20 +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v18 +; GFX906-NEXT: v_or_b32_sdwa v5, v19, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v17 +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v15 +; GFX906-NEXT: v_or_b32_sdwa v5, v16, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v14 +; GFX906-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v12 +; GFX906-NEXT: v_or_b32_sdwa v5, v13, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v11 +; GFX906-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dwordx4 v9, v[1:4], s[0:1] offset:16 
; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -446,1540 +422,1548 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: s_add_u32 s8, s8, s3 ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX906-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX906-NEXT: v_lshlrev_b32_e32 v63, 3, v0 ; GFX906-NEXT: s_addc_u32 s9, s9, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx4 v[18:21], v2, s[4:5] offset:240 -; GFX906-NEXT: global_load_dwordx4 v[6:9], v2, s[4:5] offset:224 -; GFX906-NEXT: global_load_dwordx4 v[10:13], v2, s[4:5] offset:208 -; GFX906-NEXT: global_load_dwordx4 v[14:17], v2, s[4:5] offset:192 -; GFX906-NEXT: v_mov_b32_e32 v1, 0 +; GFX906-NEXT: global_load_dwordx4 v[17:20], v63, s[4:5] offset:240 +; GFX906-NEXT: global_load_dwordx4 v[5:8], v63, s[4:5] offset:224 +; GFX906-NEXT: global_load_dwordx4 v[9:12], v63, s[4:5] offset:208 +; GFX906-NEXT: global_load_dwordx4 v[13:16], v63, s[4:5] offset:192 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: v_mov_b32_e32 v4, 0 ; GFX906-NEXT: s_waitcnt vmcnt(3) -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v21 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v21 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v21 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v20 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v20 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v20 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill -; GFX906-NEXT: 
v_lshrrev_b32_e32 v3, 24, v19 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v19 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v19 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v18 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v18 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v20, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v21, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v18 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v9 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v9 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v9 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v8 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v8 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v7 
-; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v7 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v7 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v6 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v6 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v13 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v13 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v13 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v12 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v12 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v12 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v11 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v11 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v11 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v10 
-; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v10 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v10 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v17 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v17 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v17 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v16 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v16 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v16 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v15 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v15 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v15 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v14 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v14 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v14 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 
v[18:21], v2, s[4:5] offset:176 -; GFX906-NEXT: global_load_dwordx4 v[22:25], v2, s[4:5] offset:160 -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v21 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v21 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v21 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v20 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v20 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v20 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v19 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v19 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v19 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v18 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v18 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v18 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill -; GFX906-NEXT: s_waitcnt vmcnt(12) -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v25 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v25 -; GFX906-NEXT: buffer_store_dword v3, off, 
s[8:11], 0 offset:264 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v25 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v24 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v24 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v24 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v23 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v23 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v23 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v22 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v22 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v22 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[26:29], v2, s[4:5] offset:144 -; GFX906-NEXT: global_load_dwordx4 v[30:33], v2, s[4:5] offset:128 -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v29 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v29 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v29 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v28 
-; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v28 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v28 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v27 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v27 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v27 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v26 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v26 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v26 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill -; GFX906-NEXT: s_waitcnt vmcnt(12) -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v33 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v33 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v33 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v32 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v32 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v32 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill -; 
GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v31 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v31 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v31 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v30 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v30 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v30 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[34:37], v2, s[4:5] offset:112 -; GFX906-NEXT: global_load_dwordx4 v[38:41], v2, s[4:5] offset:96 -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v37 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v37 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v37 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v36 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v36 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v36 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v35 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v35 -; GFX906-NEXT: buffer_store_dword v3, off, 
s[8:11], 0 offset:428 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v35 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v34 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v34 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v34 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill -; GFX906-NEXT: s_waitcnt vmcnt(12) -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v41 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v41 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v41 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v40 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v40 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v40 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v39 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v39 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v39 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v38 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v38 -; 
GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v38 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[42:45], v2, s[4:5] offset:80 -; GFX906-NEXT: global_load_dwordx4 v[46:49], v2, s[4:5] offset:64 -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v45 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v45 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v45 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v44 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v44 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v44 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v43 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v43 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v43 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v42 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v42 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v42 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill -; 
GFX906-NEXT: s_waitcnt vmcnt(12) -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v49 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v49 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v49 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v48 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v48 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v48 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v47 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v47 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v47 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v46 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v46 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v46 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[50:53], v2, s[4:5] offset:48 -; GFX906-NEXT: global_load_dwordx4 v[54:57], v2, s[4:5] offset:32 -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v53 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v53 -; 
GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v53 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v52 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v52 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v52 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v51 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v51 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v51 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v50 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v50 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill -; GFX906-NEXT: s_waitcnt vmcnt(12) -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v57 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v57 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v57 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v56 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill -; 
GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v56 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v56 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v55 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v55 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v55 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v54 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v54 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v54 -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[58:61], v2, s[4:5] offset:16 -; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: global_load_dwordx4 v[2:5], v2, s[4:5] -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 24, v61 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 16, v61 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v61 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 24, v60 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 16, v60 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v60 -; GFX906-NEXT: 
buffer_store_dword v62, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 24, v59 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 16, v59 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v59 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 24, v58 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 16, v58 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v58 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill -; GFX906-NEXT: s_waitcnt vmcnt(12) -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 24, v5 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 16, v5 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v5 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 24, v4 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 16, v4 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v4 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 24, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v63, 24, v2 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 16, v3 -; GFX906-NEXT: buffer_store_dword v63, 
off, s[8:11], 0 offset:772 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v3 -; GFX906-NEXT: buffer_store_dword v63, off, s[8:11], 0 offset:776 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v63, 8, v2 -; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX906-NEXT: s_cbranch_execz .LBB6_2 -; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: v_lshlrev_b64 v[2:3], 3, v[0:1] -; GFX906-NEXT: v_mov_b32_e32 v0, s7 -; GFX906-NEXT: v_add_co_u32_e32 v2, vcc, s6, v2 -; GFX906-NEXT: v_addc_co_u32_e32 v3, vcc, v0, v3, vcc -; GFX906-NEXT: global_load_dwordx4 v[18:21], v[2:3], off offset:240 -; GFX906-NEXT: global_load_dwordx4 v[6:9], v[2:3], off offset:224 -; GFX906-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:208 -; GFX906-NEXT: global_load_dwordx4 v[14:17], v[2:3], off offset:192 -; GFX906-NEXT: s_waitcnt vmcnt(3) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v21 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v20 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v20 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v21 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v20 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v20 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v19 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v19 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v20 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v19 ; GFX906-NEXT: buffer_store_dword v0, 
off, s[8:11], 0 offset:40 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v19 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v18 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v19 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v18 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v19 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v18 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v18 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v17 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v17 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v20, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v21, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v18 +; GFX906-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v20, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v17 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v9 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v8 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:68 
; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v8 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v9 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v8 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v8 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v7 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v7 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v8 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v7 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v7 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v6 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v6 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v7 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v6 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v6 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v5 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v6 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v5 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v13 
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v12 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v12 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v13 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v12 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v12 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v11 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v11 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v12 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v11 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v11 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v10 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v10 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v11 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v10 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v10 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v9 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v9 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v10 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 
8, v9 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v17 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v17 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v16 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v16 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v15 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v15 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v14 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, 
s[8:11], 0 offset:192 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v14 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v14 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v13 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v13 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[18:21], v[2:3], off offset:176 -; GFX906-NEXT: global_load_dwordx4 v[22:25], v[2:3], off offset:160 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v13 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill +; GFX906-NEXT: global_load_dwordx4 v[17:20], v63, s[4:5] offset:176 +; GFX906-NEXT: global_load_dwordx4 v[21:24], v63, s[4:5] offset:160 ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v21 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v20 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v20 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v21 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v20 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v20 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v19 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v19 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill 
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v20 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v19 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v19 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v18 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v19 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v18 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v19 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v18 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v18 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v17 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v17 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v18 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v17 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(12) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v25 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v24 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v24 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v25 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v24 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v24 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v23 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill -; 
GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v23 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v24 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v23 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v23 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v22 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v22 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v23 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v22 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v22 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v21 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v21 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v22 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v21 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[26:29], v[2:3], off offset:144 -; GFX906-NEXT: global_load_dwordx4 v[30:33], v[2:3], off offset:128 +; GFX906-NEXT: global_load_dwordx4 v[25:28], v63, s[4:5] offset:144 +; GFX906-NEXT: global_load_dwordx4 v[29:32], v63, s[4:5] offset:128 ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v29 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v28 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, 
v28 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v29 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v28 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v28 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v27 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v27 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v28 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v27 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v27 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v26 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v26 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v27 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v26 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v26 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v25 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v25 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill +; GFX906-NEXT: s_waitcnt vmcnt(12) +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v32 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v32 
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v32 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v31 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v31 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v31 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v30 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v30 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v30 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v29 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v29 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill +; GFX906-NEXT: global_load_dwordx4 v[33:36], v63, s[4:5] offset:112 +; GFX906-NEXT: global_load_dwordx4 v[37:40], v63, s[4:5] offset:96 +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v36 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v36 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill +; 
GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v35 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v35 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v34 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v34 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v33 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v33 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill +; GFX906-NEXT: s_waitcnt vmcnt(12) +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v40 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v40 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v40 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v39 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v39 +; GFX906-NEXT: buffer_store_dword v0, off, 
s[8:11], 0 offset:480 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v38 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v38 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v37 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v37 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill +; GFX906-NEXT: global_load_dwordx4 v[41:44], v63, s[4:5] offset:80 +; GFX906-NEXT: global_load_dwordx4 v[45:48], v63, s[4:5] offset:64 +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v44 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v44 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v44 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v43 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v43 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v43 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v42 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v42 
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v42 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v41 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v41 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill +; GFX906-NEXT: s_waitcnt vmcnt(12) +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v48 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v48 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v47 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v47 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v47 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v46 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v46 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v46 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v45 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill +; 
GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v45 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v45 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill +; GFX906-NEXT: global_load_dwordx4 v[49:52], v63, s[4:5] offset:48 +; GFX906-NEXT: global_load_dwordx4 v[53:56], v63, s[4:5] offset:32 +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v52 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v52 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v51 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v51 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v50 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v50 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v49 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v49 +; GFX906-NEXT: buffer_store_dword v0, off, 
s[8:11], 0 offset:640 ; 4-byte Folded Spill +; GFX906-NEXT: s_waitcnt vmcnt(12) +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v56 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v56 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v56 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v55 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v55 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v54 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v54 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v53 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v53 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill +; GFX906-NEXT: global_load_dwordx4 v[57:60], v63, s[4:5] offset:16 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: global_load_dwordx4 v[0:3], v63, s[4:5] +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v60 +; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:692 ; 4-byte Folded 
Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v60 +; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v60 +; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v59 +; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v59 +; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v59 +; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v58 +; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v58 +; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v58 +; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v57 +; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v57 +; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v57 +; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill +; GFX906-NEXT: s_waitcnt vmcnt(12) +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v3 +; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v3 +; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v3 +; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v2 +; GFX906-NEXT: 
buffer_store_dword v61, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v2 +; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v2 +; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v1 +; GFX906-NEXT: v_lshrrev_b32_e32 v62, 24, v0 +; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v1 +; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v62, 16, v0 +; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v1 +; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:776 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v0 +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX906-NEXT: s_cbranch_execz .LBB6_2 +; GFX906-NEXT: ; %bb.1: ; %bb.1 +; GFX906-NEXT: global_load_dwordx4 v[0:3], v63, s[6:7] offset:240 +; GFX906-NEXT: global_load_dwordx4 v[5:8], v63, s[6:7] offset:224 +; GFX906-NEXT: global_load_dwordx4 v[9:12], v63, s[6:7] offset:208 +; GFX906-NEXT: global_load_dwordx4 v[13:16], v63, s[6:7] offset:192 +; GFX906-NEXT: s_waitcnt vmcnt(3) +; GFX906-NEXT: v_lshrrev_b32_e32 v17, 24, v3 +; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v3 +; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v17, 24, v2 +; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; GFX906-NEXT: 
buffer_store_dword v17, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v2 +; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v17, 24, v1 +; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v1 +; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v17, 24, v0 +; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v8 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v8 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v7 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; GFX906-NEXT: buffer_store_dword v0, 
off, s[8:11], 0 offset:84 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v7 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v6 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v6 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v5 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v5 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v12 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v12 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v11 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v11 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v10 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; GFX906-NEXT: buffer_store_dword v0, off, 
s[8:11], 0 offset:144 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v10 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v9 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v9 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v16 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v16 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v15 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v15 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v15 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v14 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v14 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v14 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v13 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; GFX906-NEXT: buffer_store_dword v0, 
off, s[8:11], 0 offset:208 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v13 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill +; GFX906-NEXT: global_load_dwordx4 v[17:20], v63, s[6:7] offset:176 +; GFX906-NEXT: global_load_dwordx4 v[21:24], v63, s[6:7] offset:160 +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v20 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v20 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v19 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v19 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v19 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v18 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v18 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v17 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v17 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill +; GFX906-NEXT: s_waitcnt vmcnt(12) +; 
GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v24 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v24 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v23 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v23 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v22 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v22 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v21 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v21 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill +; GFX906-NEXT: global_load_dwordx4 v[25:28], v63, s[6:7] offset:144 +; GFX906-NEXT: global_load_dwordx4 v[29:32], v63, s[6:7] offset:128 +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v28 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; GFX906-NEXT: buffer_store_dword v0, off, 
s[8:11], 0 offset:312 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v28 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v27 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v27 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v26 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v26 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v25 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v25 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(12) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v33 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v32 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v32 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v33 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v32 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 
0 offset:372 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v32 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v31 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v31 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v32 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v31 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v31 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v30 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v31 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v30 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v31 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v30 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v30 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v29 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v30 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v29 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v30 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v29 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[34:37], v[2:3], off offset:112 -; GFX906-NEXT: global_load_dwordx4 v[38:41], v[2:3], off offset:96 +; GFX906-NEXT: global_load_dwordx4 v[33:36], v63, s[6:7] offset:112 +; GFX906-NEXT: global_load_dwordx4 v[37:40], v63, s[6:7] offset:96 ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v37 +; 
GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v36 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v36 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v37 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v36 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v36 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v35 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v35 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v36 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v35 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v35 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v34 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v34 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v35 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v34 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v34 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v33 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v33 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v34 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, 
v33 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(12) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v41 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v40 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v40 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v41 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v40 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v40 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v39 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v40 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v39 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v40 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v39 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v39 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v38 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v38 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v39 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v38 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v38 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v37 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, 
v37 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v38 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v37 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[42:45], v[2:3], off offset:80 -; GFX906-NEXT: global_load_dwordx4 v[46:49], v[2:3], off offset:64 +; GFX906-NEXT: global_load_dwordx4 v[41:44], v63, s[6:7] offset:80 +; GFX906-NEXT: global_load_dwordx4 v[45:48], v63, s[6:7] offset:64 ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v45 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v44 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v45 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v44 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v45 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v44 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v44 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v43 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v44 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v43 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v44 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v43 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v43 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v42 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v43 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v42 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded 
Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v43 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v42 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v42 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v41 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v42 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v41 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v42 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v41 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(12) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v49 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v48 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v48 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v49 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v48 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v48 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v47 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v47 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v48 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v47 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v47 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v46 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:568 ; 4-byte Folded 
Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v47 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v46 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v47 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v46 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v46 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v45 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v46 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v45 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v46 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v45 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[50:53], v[2:3], off offset:48 -; GFX906-NEXT: global_load_dwordx4 v[54:57], v[2:3], off offset:32 +; GFX906-NEXT: global_load_dwordx4 v[49:52], v63, s[6:7] offset:48 +; GFX906-NEXT: global_load_dwordx4 v[53:56], v63, s[6:7] offset:32 ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v53 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v52 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v52 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v53 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v52 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v52 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v51 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; GFX906-NEXT: v_lshrrev_b32_e32 
v0, 16, v51 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v52 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v51 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v51 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v50 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v50 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v51 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v50 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v50 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v49 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v49 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v50 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v49 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(12) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v57 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v56 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v57 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v56 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v57 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v56 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v56 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 
24, v55 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v56 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v55 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v56 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v55 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v55 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v54 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v54 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v55 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v54 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v54 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v53 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v53 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v54 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v53 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[58:61], v[2:3], off offset:16 +; GFX906-NEXT: global_load_dwordx4 v[57:60], v63, s[6:7] offset:16 ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: global_load_dwordx4 v[2:5], v[2:3], off -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v61 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v61 -; GFX906-NEXT: buffer_store_dword v0, off, 
s[8:11], 0 offset:696 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v61 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v60 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v60 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v60 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v59 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v59 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v59 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v58 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v58 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v58 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill +; GFX906-NEXT: global_load_dwordx4 v[0:3], v63, s[6:7] +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v60 +; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v60 +; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v60 +; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v59 +; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:700 ; 
4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v59 +; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v59 +; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v58 +; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v58 +; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v58 +; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v57 +; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v57 +; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v57 +; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(12) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v5 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v5 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v4 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v4 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v3 -; GFX906-NEXT: 
buffer_store_dword v0, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v3 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v3 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v63, 8, v2 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:776 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v3 +; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v3 +; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v3 +; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v2 +; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v2 +; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v2 +; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v1 +; GFX906-NEXT: v_lshrrev_b32_e32 v62, 24, v0 +; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v1 +; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v62, 16, v0 +; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v1 +; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:776 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v0 
; GFX906-NEXT: .LBB6_2: ; %bb.2 ; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v63 -; GFX906-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v62 -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:768 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61 +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:768 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v62, 8, v62 +; GFX906-NEXT: v_or_b32_sdwa v0, v0, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:776 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v63, off, s[8:11], 0 offset:764 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v4, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:756 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61 +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:756 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v5, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:772 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61 +; GFX906-NEXT: v_or_b32_sdwa v3, v3, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:772 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61 +; GFX906-NEXT: v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:760 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:748 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v0, v0, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:748 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v62, 8, v62 ; GFX906-NEXT: v_or_b32_sdwa v62, v63, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:752 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:740 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
GFX906-NEXT: v_or_b32_sdwa v2, v2, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:740 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:744 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v1, v[2:5], s[2:3] -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:732 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX906-NEXT: v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v3, v3, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:736 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v3, v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:732 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:720 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v59, off, s[8:11], 0 offset:716 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:708 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: s_waitcnt vmcnt(1) -; 
GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v4, v60, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:708 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX906-NEXT: buffer_load_dword v57, off, s[8:11], 0 offset:724 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:728 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v5, v61, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:724 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v59, off, s[8:11], 0 offset:716 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v57, 8, v57 +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:712 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:700 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v0, v0, v57 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v57, off, s[8:11], 0 offset:700 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v58, 8, v58 ; GFX906-NEXT: v_or_b32_sdwa v58, v59, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:704 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: v_lshlrev_b16_e32 v57, 8, v57 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:692 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v57, off, s[8:11], 0 offset:692 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:696 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX906-NEXT: v_or_b32_sdwa v3, v60, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: v_lshlrev_b16_e32 v57, 8, v57 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v1, v[2:5], s[2:3] offset:16 -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:684 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX906-NEXT: v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v3, v3, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 ; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:688 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v3, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:684 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:672 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v55, off, s[8:11], 0 offset:668 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:660 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v4, v56, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:660 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX906-NEXT: buffer_load_dword v53, off, s[8:11], 0 offset:676 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; 
GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:680 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v5, v57, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:676 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v55, off, s[8:11], 0 offset:668 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v53, 8, v53 +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:664 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v0, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v53, off, s[8:11], 0 offset:652 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v54, 8, v54 ; GFX906-NEXT: v_or_b32_sdwa v54, v55, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:656 ; 4-byte Folded Reload ; 
GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: v_lshlrev_b16_e32 v53, 8, v53 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v53, off, s[8:11], 0 offset:644 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:648 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX906-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: v_lshlrev_b16_e32 v53, 8, v53 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v1, v[2:5], s[2:3] offset:32 -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:636 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX906-NEXT: v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v3, v3, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:32 ; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Reload -; 
GFX906-NEXT: v_or_b32_sdwa v3, v51, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:636 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:624 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v51, off, s[8:11], 0 offset:620 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:612 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v4, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:612 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX906-NEXT: buffer_load_dword v49, off, s[8:11], 0 offset:628 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:632 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v5, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:628 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v51, v2 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v51, off, s[8:11], 0 offset:620 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v49, 8, v49 +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:616 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:604 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v0, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v49, off, s[8:11], 0 offset:604 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v50, 8, v50 ; GFX906-NEXT: v_or_b32_sdwa v50, v51, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:608 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: v_lshlrev_b16_e32 v49, 8, v49 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v49 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v49, off, s[8:11], 0 offset:596 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:600 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX906-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: v_lshlrev_b16_e32 v49, 8, v49 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v1, v[2:5], s[2:3] offset:48 -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:588 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX906-NEXT: v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v3, v3, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:48 ; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v3, v47, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:588 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:576 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v47, off, s[8:11], 0 offset:572 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:564 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: v_or_b32_sdwa v0, 
v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v4, v48, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:564 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX906-NEXT: buffer_load_dword v45, off, s[8:11], 0 offset:580 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:584 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v5, v49, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:580 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v47, off, s[8:11], 0 offset:572 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v45, 8, v45 +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:568 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 
offset:556 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v0, v0, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v45, off, s[8:11], 0 offset:556 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v46, 8, v46 ; GFX906-NEXT: v_or_b32_sdwa v46, v47, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:560 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: v_lshlrev_b16_e32 v45, 8, v45 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v45, off, s[8:11], 0 offset:548 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:552 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX906-NEXT: v_or_b32_sdwa v3, v48, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: v_lshlrev_b16_e32 v45, 8, v45 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v1, v[2:5], s[2:3] offset:64 -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:540 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX906-NEXT: v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v3, v3, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:64 ; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v3, v43, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:540 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:528 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v43, off, s[8:11], 0 offset:524 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:516 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v4, v44, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:516 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX906-NEXT: buffer_load_dword v41, off, s[8:11], 0 offset:532 ; 4-byte Folded Reload +; 
GFX906-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:536 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v5, v45, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:532 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v43, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v43, off, s[8:11], 0 offset:524 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v41, 8, v41 +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:520 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:508 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v0, v0, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v41, off, s[8:11], 0 offset:508 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v42, 8, v42 ; GFX906-NEXT: v_or_b32_sdwa v42, v43, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD ; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:512 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: v_lshlrev_b16_e32 v41, 8, v41 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v41, off, s[8:11], 0 offset:500 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:504 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX906-NEXT: v_or_b32_sdwa v3, v44, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: v_lshlrev_b16_e32 v41, 8, v41 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v1, v[2:5], s[2:3] offset:80 -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:492 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX906-NEXT: v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], 
s[2:3] offset:80 ; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v3, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:492 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:480 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v39, off, s[8:11], 0 offset:476 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:468 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v4, v40, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:468 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX906-NEXT: buffer_load_dword v37, off, s[8:11], 0 offset:484 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:488 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v5, v41, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:484 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v38, v2 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v39, off, s[8:11], 0 offset:476 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v37, 8, v37 +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:472 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:460 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v0, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v37, off, s[8:11], 0 offset:460 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v38, 8, v38 ; GFX906-NEXT: v_or_b32_sdwa v38, v39, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:464 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: v_lshlrev_b16_e32 v37, 8, v37 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:452 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v37, off, s[8:11], 0 offset:452 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:456 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX906-NEXT: v_or_b32_sdwa v3, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: v_lshlrev_b16_e32 v37, 8, v37 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v1, v[2:5], s[2:3] offset:96 -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:444 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX906-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v3, v3, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:96 ; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v3, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:444 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:432 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v35, off, s[8:11], 0 offset:428 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:420 ; 4-byte Folded Reload +; 
GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v4, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:420 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX906-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:436 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:440 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v5, v37, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:436 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v35, off, s[8:11], 0 offset:428 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v33, 8, v33 +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:424 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:412 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v0, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:412 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v34, 8, v34 ; GFX906-NEXT: v_or_b32_sdwa v34, v35, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:416 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: v_lshlrev_b16_e32 v33, 8, v33 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:404 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:404 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:408 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX906-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: v_lshlrev_b16_e32 v33, 8, v33 ; 
GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v1, v[2:5], s[2:3] offset:112 -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:396 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX906-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v3, v3, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:112 ; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v3, v31, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:396 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:384 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v31, off, s[8:11], 0 offset:380 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:372 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v4, v32, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:372 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: s_waitcnt vmcnt(2) +; 
GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX906-NEXT: buffer_load_dword v29, off, s[8:11], 0 offset:388 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:392 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v5, v33, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:388 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v31, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v31, off, s[8:11], 0 offset:380 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v29 +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:376 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:364 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v0, v0, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v29, off, s[8:11], 0 offset:364 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v30, 8, v30 ; GFX906-NEXT: v_or_b32_sdwa v30, v31, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v30 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:368 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v29 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:356 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v29, off, s[8:11], 0 offset:356 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:360 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX906-NEXT: v_or_b32_sdwa v3, v32, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v1, v[2:5], s[2:3] offset:128 -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:348 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v29 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX906-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
GFX906-NEXT: v_or_b32_sdwa v3, v3, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:128 ; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v3, v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:348 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:336 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v27, off, s[8:11], 0 offset:332 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:324 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v4, v28, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:324 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX906-NEXT: buffer_load_dword v25, off, s[8:11], 0 offset:340 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:344 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v5, v29, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:340 ; 4-byte Folded Reload -; 
GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v26, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v27, off, s[8:11], 0 offset:332 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v25, 8, v25 +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:328 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:316 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v0, v0, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v25, off, s[8:11], 0 offset:316 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v26, 8, v26 ; GFX906-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:320 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: v_lshlrev_b16_e32 v25, 8, v25 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 
-; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:308 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v25, off, s[8:11], 0 offset:308 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:312 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX906-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v1, v[2:5], s[2:3] offset:144 -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:300 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v25, 8, v25 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX906-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v3, v3, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:144 ; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v3, v23, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:300 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:288 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v23, off, s[8:11], 0 offset:284 ; 
4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:276 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v4, v24, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:276 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX906-NEXT: buffer_load_dword v21, off, s[8:11], 0 offset:292 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:296 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v5, v25, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:292 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v22, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v23, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v23, off, s[8:11], 0 offset:284 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v21, 8, v21 +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD ; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:280 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v0, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v21, off, s[8:11], 0 offset:268 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v22, 8, v22 ; GFX906-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:272 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: v_lshlrev_b16_e32 v21, 8, v21 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:260 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v21, off, s[8:11], 0 offset:260 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:264 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX906-NEXT: v_or_b32_sdwa v3, v24, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v1, v[2:5], s[2:3] offset:160 -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:252 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v21, 8, v21 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX906-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v3, v3, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:160 ; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v3, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:252 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:240 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v19, off, s[8:11], 0 offset:236 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:228 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v4, v20, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:228 ; 4-byte Folded Reload +; GFX906-NEXT: 
v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX906-NEXT: buffer_load_dword v17, off, s[8:11], 0 offset:244 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:248 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v5, v21, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:244 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v19, off, s[8:11], 0 offset:236 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v17, 8, v17 +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:232 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:220 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v0, v0, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v17, off, s[8:11], 0 offset:220 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v18, 8, v18 ; GFX906-NEXT: v_or_b32_sdwa v18, v19, v18 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:224 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: v_lshlrev_b16_e32 v17, 8, v17 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:212 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v17, off, s[8:11], 0 offset:212 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:216 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX906-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v1, v[2:5], s[2:3] offset:176 -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:204 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v17, 8, v17 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 
-; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:208 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v3, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:192 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v15, off, s[8:11], 0 offset:188 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v3, v3, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:176 +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Reload +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:208 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:196 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:188 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: v_or_b32_sdwa v0, v14, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:192 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX906-NEXT: v_or_b32_sdwa v3, v14, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v14, off, s[8:11], 0 offset:168 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v2, off, 
s[8:11], 0 offset:200 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:184 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v4, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:180 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v14, off, s[8:11], 0 offset:200 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v13, off, s[8:11], 0 offset:164 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v5, v17, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:196 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v14, off, s[8:11], 0 offset:184 ; 4-byte Folded Reload -; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:172 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v14, 8, v14 -; GFX906-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v14, 
off, s[8:11], 0 offset:176 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:176 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: v_lshlrev_b16_e32 v13, 8, v13 +; GFX906-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v14, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:164 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v14, off, s[8:11], 0 offset:168 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX906-NEXT: v_or_b32_sdwa v3, v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:172 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_or_b32_sdwa v0, v14, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v1, v[2:5], s[2:3] offset:192 +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX906-NEXT: v_or_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v3, v3, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:192 ; 
GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:160 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:156 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:152 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:144 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:132 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(4) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v10, off, s[8:11], 0 offset:120 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:156 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:152 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:144 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) +; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v9, off, s[8:11], 0 offset:132 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:148 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:140 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX906-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:148 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:140 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:136 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:128 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v10, off, s[8:11], 0 offset:120 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX906-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:128 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 -; GFX906-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:124 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:116 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:124 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v9, off, s[8:11], 0 offset:116 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v5 -; GFX906-NEXT: v_or_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v10, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v1, v[2:5], s[2:3] offset:208 +; GFX906-NEXT: v_lshlrev_b16_e32 v9, 8, v9 +; GFX906-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:208 ; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:108 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:104 ; 4-byte 
Folded Reload -; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:96 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:84 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(4) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:72 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:108 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:104 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:96 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) +; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:84 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:100 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:92 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX906-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, 
s[8:11], 0 offset:100 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:92 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:88 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:80 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:72 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX906-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:88 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:80 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 -; GFX906-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:76 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:76 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:68 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v5 -; GFX906-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v1, v[2:5], s[2:3] offset:224 +; GFX906-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:224 ; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:60 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(5) +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v7, off, s[8:11], 0 offset:12 ; 
4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v8, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:56 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:60 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:48 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(7) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: s_waitcnt vmcnt(3) +; GFX906-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:56 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX906-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:52 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:48 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:44 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX906-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 
offset:40 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:36 ; 4-byte Folded Reload +; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:52 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:44 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:36 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(2) +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:32 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v4 -; GFX906-NEXT: v_or_b32_sdwa v4, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:28 ; 4-byte Folded Reload +; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX906-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:40 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:32 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:24 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(2) +; 
GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX906-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:20 ; 4-byte Folded Reload +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; GFX906-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v1, v[2:5], s[2:3] offset:240 +; GFX906-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:240 ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/xnor.ll b/llvm/test/CodeGen/AMDGPU/xnor.ll index 75a21bdc0ff3b..fe70e9e426ddf 100644 --- a/llvm/test/CodeGen/AMDGPU/xnor.ll +++ b/llvm/test/CodeGen/AMDGPU/xnor.ll @@ -113,8 +113,7 @@ define amdgpu_kernel void @xnor_v_s_i32_one_use(ptr addrspace(1) %out, i32 %s) { ; GCN-LABEL: {{^}}xnor_i64_s_v_one_use ; GCN-NOT: s_xnor_b64 ; GCN: s_not_b64 -; GCN: v_xor_b32 -; GCN: v_xor_b32 +; GCN: 
v_xor_b32_e32 ; GCN-DL: v_xnor_b32 ; GCN-DL: v_xnor_b32 define amdgpu_kernel void @xnor_i64_s_v_one_use( @@ -132,8 +131,7 @@ entry: ; GCN-LABEL: {{^}}xnor_i64_v_s_one_use ; GCN-NOT: s_xnor_b64 ; GCN: s_not_b64 -; GCN: v_xor_b32 -; GCN: v_xor_b32 +; GCN: v_xor_b32_e32 ; GCN-DL: v_xnor_b32 ; GCN-DL: v_xnor_b32 define amdgpu_kernel void @xnor_i64_v_s_one_use( diff --git a/llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll b/llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll index f2a53653ec2ff..26d8905b48d04 100644 --- a/llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll +++ b/llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll @@ -8,9 +8,8 @@ define void @BZ2_bzDecompress_bb5_2E_outer_bb35_2E_i_bb54_2E_i(ptr, i32 %c_nbloc ; CHECK-NEXT: movl %edx, %edx ; CHECK-NEXT: movl (%rdi,%rdx,4), %edx ; CHECK-NEXT: movzbl %dl, %r10d -; CHECK-NEXT: # kill: def $edx killed $edx def $rdx -; CHECK-NEXT: shrl $8, %edx ; CHECK-NEXT: addl $4, %r10d +; CHECK-NEXT: shrl $8, %edx ; CHECK-NEXT: movl (%rdi,%rdx,4), %edx ; CHECK-NEXT: movzbl %dl, %edi ; CHECK-NEXT: shrl $8, %edx diff --git a/llvm/test/CodeGen/X86/atomic-rm-bit-test-64.ll b/llvm/test/CodeGen/X86/atomic-rm-bit-test-64.ll index ad85a090010f8..8ff4f4067dabd 100644 --- a/llvm/test/CodeGen/X86/atomic-rm-bit-test-64.ll +++ b/llvm/test/CodeGen/X86/atomic-rm-bit-test-64.ll @@ -1392,11 +1392,8 @@ return: ; preds = %entry, %if.then define i64 @atomic_shl1_xor_64_const_br(ptr %v) nounwind { ; CHECK-LABEL: atomic_shl1_xor_64_const_br: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: lock btcq $4, (%rdi) -; CHECK-NEXT: setb %al -; CHECK-NEXT: shlq $4, %rax -; CHECK-NEXT: je .LBB48_1 +; CHECK-NEXT: jae .LBB48_1 ; CHECK-NEXT: # %bb.2: # %if.then ; CHECK-NEXT: movq 32(%rdi), %rax ; CHECK-NEXT: retq @@ -1458,12 +1455,9 @@ return: ; preds = %entry, %if.then define i64 @atomic_shl1_xor_64_const_brz(ptr %v) nounwind { ; CHECK-LABEL: atomic_shl1_xor_64_const_brz: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: lock btcq $4, 
(%rdi) -; CHECK-NEXT: setb %al -; CHECK-NEXT: shlq $4, %rax ; CHECK-NEXT: movl $123, %eax -; CHECK-NEXT: je .LBB50_1 +; CHECK-NEXT: jae .LBB50_1 ; CHECK-NEXT: # %bb.2: # %return ; CHECK-NEXT: retq ; CHECK-NEXT: .LBB50_1: # %if.then @@ -1524,11 +1518,8 @@ return: ; preds = %entry, %if.then define i64 @atomic_shl1_xor_64_const_brnz(ptr %v) nounwind { ; CHECK-LABEL: atomic_shl1_xor_64_const_brnz: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: lock btcq $4, (%rdi) -; CHECK-NEXT: setb %al -; CHECK-NEXT: shlq $4, %rax -; CHECK-NEXT: je .LBB52_1 +; CHECK-NEXT: jae .LBB52_1 ; CHECK-NEXT: # %bb.2: # %if.then ; CHECK-NEXT: movq 32(%rdi), %rax ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512vnni-combine.ll b/llvm/test/CodeGen/X86/avx512vnni-combine.ll index 7a0527be05419..f0c8a7e208326 100644 --- a/llvm/test/CodeGen/X86/avx512vnni-combine.ll +++ b/llvm/test/CodeGen/X86/avx512vnni-combine.ll @@ -73,7 +73,7 @@ define <8 x i64> @foo_512(i32 %0, <8 x i64> %1, <8 x i64> %2, ptr %3) { ; CHECK-NEXT: # %bb.4: # %.preheader ; CHECK-NEXT: shlq $6, %rcx ; CHECK-NEXT: addq %rcx, %rsi -; CHECK-NEXT: shlq $6, %rax +; CHECK-NEXT: shll $6, %eax ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB1_5: # =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/X86/avxvnni-combine.ll b/llvm/test/CodeGen/X86/avxvnni-combine.ll index d8e73a5cf37d8..75e29df9f34ac 100644 --- a/llvm/test/CodeGen/X86/avxvnni-combine.ll +++ b/llvm/test/CodeGen/X86/avxvnni-combine.ll @@ -78,7 +78,7 @@ define <2 x i64> @foo_128(i32 %0, <2 x i64> %1, <2 x i64> %2, ptr %3) { ; AVX-NEXT: # %bb.4: # %.preheader ; AVX-NEXT: shlq $4, %rcx ; AVX-NEXT: addq %rcx, %rsi -; AVX-NEXT: shlq $4, %rax +; AVX-NEXT: shll $4, %eax ; AVX-NEXT: xorl %ecx, %ecx ; AVX-NEXT: .p2align 4, 0x90 ; AVX-NEXT: .LBB1_5: # =>This Inner Loop Header: Depth=1 @@ -125,7 +125,7 @@ define <2 x i64> @foo_128(i32 %0, <2 x i64> %1, <2 x i64> %2, ptr %3) { ; AVX512-NEXT: # %bb.4: # 
%.preheader ; AVX512-NEXT: shlq $4, %rcx ; AVX512-NEXT: addq %rcx, %rsi -; AVX512-NEXT: shlq $4, %rax +; AVX512-NEXT: shll $4, %eax ; AVX512-NEXT: xorl %ecx, %ecx ; AVX512-NEXT: .p2align 4, 0x90 ; AVX512-NEXT: .LBB1_5: # =>This Inner Loop Header: Depth=1 @@ -425,7 +425,7 @@ define <4 x i64> @foo_256(i32 %0, <4 x i64> %1, <4 x i64> %2, ptr %3) { ; AVX-NEXT: # %bb.4: # %.preheader ; AVX-NEXT: shlq $5, %rcx ; AVX-NEXT: addq %rcx, %rsi -; AVX-NEXT: shlq $5, %rax +; AVX-NEXT: shll $5, %eax ; AVX-NEXT: xorl %ecx, %ecx ; AVX-NEXT: .p2align 4, 0x90 ; AVX-NEXT: .LBB4_5: # =>This Inner Loop Header: Depth=1 @@ -472,7 +472,7 @@ define <4 x i64> @foo_256(i32 %0, <4 x i64> %1, <4 x i64> %2, ptr %3) { ; AVX512-NEXT: # %bb.4: # %.preheader ; AVX512-NEXT: shlq $5, %rcx ; AVX512-NEXT: addq %rcx, %rsi -; AVX512-NEXT: shlq $5, %rax +; AVX512-NEXT: shll $5, %eax ; AVX512-NEXT: xorl %ecx, %ecx ; AVX512-NEXT: .p2align 4, 0x90 ; AVX512-NEXT: .LBB4_5: # =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/X86/bswap.ll b/llvm/test/CodeGen/X86/bswap.ll index 17fd612b812eb..81eac5676bb5c 100644 --- a/llvm/test/CodeGen/X86/bswap.ll +++ b/llvm/test/CodeGen/X86/bswap.ll @@ -168,8 +168,8 @@ define i64 @not_bswap() { ; CHECK64-NEXT: movzwl var16(%rip), %eax ; CHECK64-NEXT: movl %eax, %ecx ; CHECK64-NEXT: shrl $8, %ecx -; CHECK64-NEXT: shlq $8, %rax -; CHECK64-NEXT: orq %rcx, %rax +; CHECK64-NEXT: shll $8, %eax +; CHECK64-NEXT: orl %ecx, %eax ; CHECK64-NEXT: retq %init = load i16, ptr @var16 %big = zext i16 %init to i64 @@ -197,7 +197,7 @@ define i64 @not_useful_bswap() { ; CHECK64-LABEL: not_useful_bswap: ; CHECK64: # %bb.0: ; CHECK64-NEXT: movzbl var8(%rip), %eax -; CHECK64-NEXT: shlq $8, %rax +; CHECK64-NEXT: shll $8, %eax ; CHECK64-NEXT: retq %init = load i8, ptr @var8 %big = zext i8 %init to i64 @@ -224,12 +224,9 @@ define i64 @finally_useful_bswap() { ; ; CHECK64-LABEL: finally_useful_bswap: ; CHECK64: # %bb.0: -; CHECK64-NEXT: movzwl var16(%rip), %ecx -; CHECK64-NEXT: movzbl %cl, 
%eax -; CHECK64-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; CHECK64-NEXT: shrl $8, %ecx -; CHECK64-NEXT: shlq $8, %rax -; CHECK64-NEXT: orq %rcx, %rax +; CHECK64-NEXT: movzwl var16(%rip), %eax +; CHECK64-NEXT: bswapl %eax +; CHECK64-NEXT: shrl $16, %eax ; CHECK64-NEXT: retq %init = load i16, ptr @var16 %big = zext i16 %init to i64 diff --git a/llvm/test/CodeGen/X86/buildvec-insertvec.ll b/llvm/test/CodeGen/X86/buildvec-insertvec.ll index 5500ad3323043..a3568716edd9e 100644 --- a/llvm/test/CodeGen/X86/buildvec-insertvec.ll +++ b/llvm/test/CodeGen/X86/buildvec-insertvec.ll @@ -832,7 +832,7 @@ define void @pr59781(ptr %in, ptr %out) { ; CHECK: # %bb.0: ; CHECK-NEXT: movzwl (%rdi), %eax ; CHECK-NEXT: movzbl 2(%rdi), %ecx -; CHECK-NEXT: shlq $16, %rcx +; CHECK-NEXT: shll $16, %ecx ; CHECK-NEXT: orq %rax, %rcx ; CHECK-NEXT: movq %rcx, (%rsi) ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/cmp-concat.ll b/llvm/test/CodeGen/X86/cmp-concat.ll index 33834bc4470ae..5e030de1409f2 100644 --- a/llvm/test/CodeGen/X86/cmp-concat.ll +++ b/llvm/test/CodeGen/X86/cmp-concat.ll @@ -35,8 +35,8 @@ define i1 @cmp_anybits_concat_shl_shl_i16(i16 %x, i16 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: movzwl %di, %eax ; CHECK-NEXT: movzwl %si, %ecx -; CHECK-NEXT: shlq $8, %rcx -; CHECK-NEXT: orq %rax, %rcx +; CHECK-NEXT: shll $8, %ecx +; CHECK-NEXT: orl %eax, %ecx ; CHECK-NEXT: sete %al ; CHECK-NEXT: retq %zx = zext i16 %x to i64 @@ -53,8 +53,8 @@ define i1 @cmp_anybits_concat_shl_shl_i16_commute(i16 %x, i16 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: movzwl %di, %eax ; CHECK-NEXT: movzwl %si, %ecx -; CHECK-NEXT: shlq $8, %rcx -; CHECK-NEXT: orq %rax, %rcx +; CHECK-NEXT: shll $8, %ecx +; CHECK-NEXT: orl %eax, %ecx ; CHECK-NEXT: sete %al ; CHECK-NEXT: retq %zx = zext i16 %x to i64 diff --git a/llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness-reduced.ll b/llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness-reduced.ll index 63061b0d851b6..a5295d44b07c8 100644 --- 
a/llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness-reduced.ll +++ b/llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness-reduced.ll @@ -67,7 +67,7 @@ define void @foo(ptr %arg3, i1 %icmp16) #0 { ; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: callq *%rax -; CHECK-NEXT: shlq $4, %r14 +; CHECK-NEXT: shll $4, %r14d ; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload ; CHECK-NEXT: movl %r13d, 0 ; CHECK-NEXT: movb $0, 4 diff --git a/llvm/test/CodeGen/X86/combine-bitreverse.ll b/llvm/test/CodeGen/X86/combine-bitreverse.ll index 32579bd05605e..9f81fab54a49d 100644 --- a/llvm/test/CodeGen/X86/combine-bitreverse.ll +++ b/llvm/test/CodeGen/X86/combine-bitreverse.ll @@ -368,20 +368,20 @@ define i64 @test_bitreverse_shli_bitreverse_i64(i64 %a) nounwind { ; X64-NEXT: bswapq %rax ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: andl $235867919, %ecx # imm = 0xE0F0F0F -; X64-NEXT: shlq $4, %rcx +; X64-NEXT: shll $4, %ecx ; X64-NEXT: shrl $4, %eax ; X64-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F -; X64-NEXT: orq %rcx, %rax +; X64-NEXT: orl %ecx, %eax ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: andl $590558003, %ecx # imm = 0x23333333 ; X64-NEXT: shrl $2, %eax ; X64-NEXT: andl $858993459, %eax # imm = 0x33333333 -; X64-NEXT: leaq (%rax,%rcx,4), %rax +; X64-NEXT: leal (%rax,%rcx,4), %eax ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: andl $357913941, %ecx # imm = 0x15555555 ; X64-NEXT: shrl %eax ; X64-NEXT: andl $1431655765, %eax # imm = 0x55555555 -; X64-NEXT: leaq (%rax,%rcx,2), %rax +; X64-NEXT: leal (%rax,%rcx,2), %eax ; X64-NEXT: retq %1 = call i64 @llvm.bitreverse.i64(i64 %a) %2 = shl i64 %1, 33 diff --git a/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll b/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll index 5c23c155ed85f..142ac754c3f7e 100644 --- a/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll +++ b/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll @@ -1933,8 +1933,7 @@ define i64 
@test_i64_2147483647_mask_shl_1(i64 %a0) { ; ; X64-LABEL: test_i64_2147483647_mask_shl_1: ; X64: # %bb.0: -; X64-NEXT: andl $2147483647, %edi # imm = 0x7FFFFFFF -; X64-NEXT: leaq (%rdi,%rdi), %rax +; X64-NEXT: leal (%rdi,%rdi), %eax ; X64-NEXT: retq %t0 = and i64 %a0, 2147483647 %t1 = shl i64 %t0, 1 diff --git a/llvm/test/CodeGen/X86/dagcombine-shifts.ll b/llvm/test/CodeGen/X86/dagcombine-shifts.ll index 6bfda6827d520..42b325dd4c229 100644 --- a/llvm/test/CodeGen/X86/dagcombine-shifts.ll +++ b/llvm/test/CodeGen/X86/dagcombine-shifts.ll @@ -146,7 +146,7 @@ define i64 @fun7(i8 zeroext %v) { ; X64: # %bb.0: # %entry ; X64-NEXT: sarb $4, %dil ; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: shlq $4, %rax +; X64-NEXT: shll $4, %eax ; X64-NEXT: retq entry: %shr = ashr i8 %v, 4 @@ -166,9 +166,7 @@ define i64 @fun8(i16 zeroext %v) { ; X64-LABEL: fun8: ; X64: # %bb.0: # %entry ; X64-NEXT: movswl %di, %eax -; X64-NEXT: shrl $4, %eax -; X64-NEXT: movzwl %ax, %eax -; X64-NEXT: shlq $4, %rax +; X64-NEXT: andl $1048560, %eax # imm = 0xFFFF0 ; X64-NEXT: retq entry: %shr = ashr i16 %v, 4 @@ -217,11 +215,12 @@ define i64 @fun10(i8 zeroext %v) { ; ; X64-LABEL: fun10: ; X64: # %bb.0: # %entry -; X64-NEXT: shrb $4, %dil -; X64-NEXT: movzbl %dil, %ecx -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: shlq $4, %rax -; X64-NEXT: orq %rcx, %rax +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: movl %edi, %eax +; X64-NEXT: shrb $4, %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: andl $-16, %edi +; X64-NEXT: orq %rdi, %rax ; X64-NEXT: retq entry: %shr = lshr i8 %v, 4 @@ -245,9 +244,9 @@ define i64 @fun11(i16 zeroext %v) { ; X64-LABEL: fun11: ; X64: # %bb.0: # %entry ; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: shrl $4, %edi -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: shlq $4, %rax +; X64-NEXT: movl %edi, %eax +; X64-NEXT: shrl $4, %eax +; X64-NEXT: andl $-16, %edi ; X64-NEXT: addq %rdi, %rax ; X64-NEXT: retq entry: @@ -273,9 +272,9 @@ define i64 @fun12(i32 zeroext %v) { ; 
X64-LABEL: fun12: ; X64: # %bb.0: # %entry ; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: shrl $4, %edi -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: shlq $4, %rax +; X64-NEXT: movl %edi, %eax +; X64-NEXT: shrl $4, %eax +; X64-NEXT: andl $-16, %edi ; X64-NEXT: addq %rdi, %rax ; X64-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll index 2e13776715e5b..3796dd796eaf9 100644 --- a/llvm/test/CodeGen/X86/divmod128.ll +++ b/llvm/test/CodeGen/X86/divmod128.ll @@ -425,36 +425,35 @@ entry: define i128 @urem_i128_12(i128 %x) nounwind { ; X86-64-LABEL: urem_i128_12: ; X86-64: # %bb.0: # %entry -; X86-64-NEXT: movq %rsi, %rax -; X86-64-NEXT: shldq $62, %rdi, %rax +; X86-64-NEXT: movq %rsi, %rcx +; X86-64-NEXT: shldq $62, %rdi, %rcx ; X86-64-NEXT: shrq $2, %rsi -; X86-64-NEXT: addq %rax, %rsi -; X86-64-NEXT: adcq $0, %rsi -; X86-64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB -; X86-64-NEXT: movq %rsi, %rax -; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: addq %rsi, %rcx +; X86-64-NEXT: adcq $0, %rcx +; X86-64-NEXT: movabsq $-6148914691236517205, %rdx # imm = 0xAAAAAAAAAAAAAAAB +; X86-64-NEXT: movq %rcx, %rax +; X86-64-NEXT: mulq %rdx ; X86-64-NEXT: shrq %rdx -; X86-64-NEXT: leaq (%rdx,%rdx,2), %rax -; X86-64-NEXT: subq %rax, %rsi +; X86-64-NEXT: leal (%rdx,%rdx,2), %eax +; X86-64-NEXT: subl %eax, %ecx ; X86-64-NEXT: andl $3, %edi -; X86-64-NEXT: leaq (%rdi,%rsi,4), %rax +; X86-64-NEXT: leaq (%rdi,%rcx,4), %rax ; X86-64-NEXT: xorl %edx, %edx ; X86-64-NEXT: retq ; ; WIN64-LABEL: urem_i128_12: ; WIN64: # %bb.0: # %entry ; WIN64-NEXT: movq %rdx, %r8 -; WIN64-NEXT: movq %rdx, %rax -; WIN64-NEXT: shldq $62, %rcx, %rax -; WIN64-NEXT: shrq $2, %r8 -; WIN64-NEXT: addq %rax, %r8 +; WIN64-NEXT: shldq $62, %rcx, %r8 +; WIN64-NEXT: shrq $2, %rdx +; WIN64-NEXT: addq %rdx, %r8 ; WIN64-NEXT: adcq $0, %r8 ; WIN64-NEXT: movabsq $-6148914691236517205, %rdx # imm = 0xAAAAAAAAAAAAAAAB ; WIN64-NEXT: movq %r8, %rax ; 
WIN64-NEXT: mulq %rdx ; WIN64-NEXT: shrq %rdx -; WIN64-NEXT: leaq (%rdx,%rdx,2), %rax -; WIN64-NEXT: subq %rax, %r8 +; WIN64-NEXT: leal (%rdx,%rdx,2), %eax +; WIN64-NEXT: subl %eax, %r8d ; WIN64-NEXT: andl $3, %ecx ; WIN64-NEXT: leaq (%rcx,%r8,4), %rax ; WIN64-NEXT: xorl %edx, %edx diff --git a/llvm/test/CodeGen/X86/extract-bits.ll b/llvm/test/CodeGen/X86/extract-bits.ll index 38a1de251a3d9..90e075bfabf0a 100644 --- a/llvm/test/CodeGen/X86/extract-bits.ll +++ b/llvm/test/CodeGen/X86/extract-bits.ll @@ -8091,9 +8091,9 @@ define void @pr38938(ptr %a0, ptr %a1) nounwind { ; X64-NOBMI-LABEL: pr38938: ; X64-NOBMI: # %bb.0: ; X64-NOBMI-NEXT: movl (%rsi), %eax -; X64-NOBMI-NEXT: shrl $21, %eax -; X64-NOBMI-NEXT: andl $1023, %eax # imm = 0x3FF -; X64-NOBMI-NEXT: incl (%rdi,%rax,4) +; X64-NOBMI-NEXT: shrl $19, %eax +; X64-NOBMI-NEXT: andl $4092, %eax # imm = 0xFFC +; X64-NOBMI-NEXT: incl (%rdi,%rax) ; X64-NOBMI-NEXT: retq ; ; X64-BMINOTBM-LABEL: pr38938: diff --git a/llvm/test/CodeGen/X86/fold-and-shift.ll b/llvm/test/CodeGen/X86/fold-and-shift.ll index 41adab63a11a6..985d7c6c82f06 100644 --- a/llvm/test/CodeGen/X86/fold-and-shift.ll +++ b/llvm/test/CodeGen/X86/fold-and-shift.ll @@ -36,8 +36,7 @@ define i32 @t2(ptr %X, i32 %i) { ; X64-LABEL: t2: ; X64: # %bb.0: # %entry ; X64-NEXT: movzwl %si, %eax -; X64-NEXT: addl %eax, %eax -; X64-NEXT: movl (%rdi,%rax,2), %eax +; X64-NEXT: movl (%rdi,%rax,4), %eax ; X64-NEXT: retq entry: %tmp2 = shl i32 %i, 1 diff --git a/llvm/test/CodeGen/X86/fp128-i128.ll b/llvm/test/CodeGen/X86/fp128-i128.ll index b9e6803686621..f48790de86fe9 100644 --- a/llvm/test/CodeGen/X86/fp128-i128.ll +++ b/llvm/test/CodeGen/X86/fp128-i128.ll @@ -137,7 +137,7 @@ define fp128 @TestI128_1(fp128 %x) #0 { ; SSE-NEXT: xorl %ecx, %ecx ; SSE-NEXT: testl %eax, %eax ; SSE-NEXT: sets %cl -; SSE-NEXT: shlq $4, %rcx +; SSE-NEXT: shll $4, %ecx ; SSE-NEXT: movaps {{\.?LCPI[0-9]+_[0-9]+}}(%rcx), %xmm0 ; SSE-NEXT: popq %rax ; SSE-NEXT: retq @@ -151,7 +151,7 @@ define fp128 
@TestI128_1(fp128 %x) #0 { ; AVX-NEXT: xorl %ecx, %ecx ; AVX-NEXT: testl %eax, %eax ; AVX-NEXT: sets %cl -; AVX-NEXT: shlq $4, %rcx +; AVX-NEXT: shll $4, %ecx ; AVX-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}(%rcx), %xmm0 ; AVX-NEXT: popq %rax ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/lea-dagdag.ll b/llvm/test/CodeGen/X86/lea-dagdag.ll index 2705bd00f5d2c..f81851a92d8de 100644 --- a/llvm/test/CodeGen/X86/lea-dagdag.ll +++ b/llvm/test/CodeGen/X86/lea-dagdag.ll @@ -199,7 +199,7 @@ define i64 @and_i32_zext_shl_add_i64_overshift(i64 %t0, i32 %t1) { ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $esi killed $esi def $rsi ; CHECK-NEXT: andl $8, %esi -; CHECK-NEXT: shlq $4, %rsi +; CHECK-NEXT: shll $4, %esi ; CHECK-NEXT: leaq (%rsi,%rdi), %rax ; CHECK-NEXT: retq %t4 = and i32 %t1, 8 diff --git a/llvm/test/CodeGen/X86/lea-opt2.ll b/llvm/test/CodeGen/X86/lea-opt2.ll index cec19dcf49c8d..f7588577a3e9a 100644 --- a/llvm/test/CodeGen/X86/lea-opt2.ll +++ b/llvm/test/CodeGen/X86/lea-opt2.ll @@ -192,7 +192,7 @@ define void @test9(i64 %p, i64 %s) { ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: testl $4095, %eax # imm = 0xFFF ; CHECK-NEXT: setne %cl -; CHECK-NEXT: shlq $12, %rcx +; CHECK-NEXT: shll $12, %ecx ; CHECK-NEXT: addq %rax, %rcx ; CHECK-NEXT: andq $-4096, %rcx # imm = 0xF000 ; CHECK-NEXT: addq %rcx, %rdi diff --git a/llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll b/llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll index 9f0c1ea1dc3f6..a2e4e4784d361 100644 --- a/llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll +++ b/llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll @@ -28,10 +28,10 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r ; GENERIC-NEXT: movzbl %r8b, %r14d ; GENERIC-NEXT: ## kill: def $r8d killed $r8d def $r8 ; GENERIC-NEXT: shrl $24, %r8d -; GENERIC-NEXT: movl %ebx, %ebp -; GENERIC-NEXT: shrl $16, %ebp -; GENERIC-NEXT: movzbl %bpl, %r15d -; GENERIC-NEXT: movl (%rax,%r15,4), %ebp +; GENERIC-NEXT: movl %ebx, %r15d +; GENERIC-NEXT: shrl $14, %r15d +; 
GENERIC-NEXT: andl $1020, %r15d ## imm = 0x3FC +; GENERIC-NEXT: movl (%rax,%r15), %ebp ; GENERIC-NEXT: xorl (%rdi,%r8,4), %ebp ; GENERIC-NEXT: xorl -12(%r9), %ebp ; GENERIC-NEXT: shrl $24, %ebx @@ -46,9 +46,9 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r ; GENERIC-NEXT: ## %bb.2: ## %bb1 ; GENERIC-NEXT: ## in Loop: Header=BB0_1 Depth=1 ; GENERIC-NEXT: movl %r14d, %ebx -; GENERIC-NEXT: shrl $16, %ebx -; GENERIC-NEXT: movzbl %bl, %ebx -; GENERIC-NEXT: xorl (%rax,%rbx,4), %r8d +; GENERIC-NEXT: shrl $14, %ebx +; GENERIC-NEXT: andl $1020, %ebx ## imm = 0x3FC +; GENERIC-NEXT: xorl (%rax,%rbx), %r8d ; GENERIC-NEXT: xorl -4(%r9), %r8d ; GENERIC-NEXT: shrl $24, %r14d ; GENERIC-NEXT: movzbl %bpl, %ebx @@ -61,9 +61,9 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r ; GENERIC-NEXT: shlq $4, %rcx ; GENERIC-NEXT: andl $-16777216, %r8d ## imm = 0xFF000000 ; GENERIC-NEXT: movl %r14d, %r9d -; GENERIC-NEXT: shrl $16, %r9d -; GENERIC-NEXT: movzbl %r9b, %r9d -; GENERIC-NEXT: movzbl 2(%rax,%r9,4), %r9d +; GENERIC-NEXT: shrl $14, %r9d +; GENERIC-NEXT: andl $1020, %r9d ## imm = 0x3FC +; GENERIC-NEXT: movzbl 2(%rax,%r9), %r9d ; GENERIC-NEXT: shll $16, %r9d ; GENERIC-NEXT: orl %r8d, %r9d ; GENERIC-NEXT: xorl 16(%rcx,%rdx), %r9d @@ -93,7 +93,6 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r ; ; ATOM-LABEL: t: ; ATOM: ## %bb.0: ## %entry -; ATOM-NEXT: pushq %rbp ; ATOM-NEXT: pushq %r15 ; ATOM-NEXT: pushq %r14 ; ATOM-NEXT: pushq %rbx @@ -113,10 +112,10 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r ; ATOM-NEXT: movl %r8d, %r14d ; ATOM-NEXT: movzbl %r8b, %r8d ; ATOM-NEXT: shrl $24, %r15d -; ATOM-NEXT: shrl $16, %ebx +; ATOM-NEXT: shrl $14, %ebx ; ATOM-NEXT: shrl $24, %r14d -; ATOM-NEXT: movzbl %bl, %ebx -; ATOM-NEXT: movl (%rax,%rbx,4), %ebx +; ATOM-NEXT: andl $1020, %ebx ## imm = 0x3FC +; ATOM-NEXT: movl (%rax,%rbx), %ebx ; ATOM-NEXT: xorl 
(%rdi,%r14,4), %ebx ; ATOM-NEXT: movl (%r10,%r8,4), %r14d ; ATOM-NEXT: xorl -12(%r9), %ebx @@ -129,12 +128,12 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r ; ATOM-NEXT: jb LBB0_3 ; ATOM-NEXT: ## %bb.2: ## %bb1 ; ATOM-NEXT: ## in Loop: Header=BB0_1 Depth=1 -; ATOM-NEXT: movl %r14d, %ebp +; ATOM-NEXT: movl %r14d, %r15d ; ATOM-NEXT: movzbl %bl, %ebx ; ATOM-NEXT: shrl $24, %r14d -; ATOM-NEXT: shrl $16, %ebp -; ATOM-NEXT: movzbl %bpl, %r15d -; ATOM-NEXT: xorl (%rax,%r15,4), %r8d +; ATOM-NEXT: shrl $14, %r15d +; ATOM-NEXT: andl $1020, %r15d ## imm = 0x3FC +; ATOM-NEXT: xorl (%rax,%r15), %r8d ; ATOM-NEXT: movl (%r10,%rbx,4), %r15d ; ATOM-NEXT: xorl (%rdi,%r14,4), %r15d ; ATOM-NEXT: xorl -4(%r9), %r8d @@ -146,11 +145,11 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r ; ATOM-NEXT: andl $-16777216, %r8d ## imm = 0xFF000000 ; ATOM-NEXT: shrl $8, %r14d ; ATOM-NEXT: shlq $4, %rcx -; ATOM-NEXT: shrl $16, %r9d +; ATOM-NEXT: shrl $14, %r9d ; ATOM-NEXT: movzbl 3(%rdi,%r14,4), %edi -; ATOM-NEXT: movzbl %r9b, %r9d +; ATOM-NEXT: andl $1020, %r9d ## imm = 0x3FC ; ATOM-NEXT: shll $24, %edi -; ATOM-NEXT: movzbl 2(%rax,%r9,4), %r9d +; ATOM-NEXT: movzbl 2(%rax,%r9), %r9d ; ATOM-NEXT: shll $16, %r9d ; ATOM-NEXT: orl %r8d, %r9d ; ATOM-NEXT: movzbl %bl, %r8d @@ -172,7 +171,6 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r ; ATOM-NEXT: popq %rbx ; ATOM-NEXT: popq %r14 ; ATOM-NEXT: popq %r15 -; ATOM-NEXT: popq %rbp ; ATOM-NEXT: retq entry: %0 = load i32, i32* %rk, align 4 ; [#uses=1] diff --git a/llvm/test/CodeGen/X86/parity.ll b/llvm/test/CodeGen/X86/parity.ll index cada55919c8ce..420f5ba5ab433 100644 --- a/llvm/test/CodeGen/X86/parity.ll +++ b/llvm/test/CodeGen/X86/parity.ll @@ -637,7 +637,7 @@ define i64 @parity_64_shift(i64 %0) { ; X64-NOPOPCNT-NEXT: xorl %eax, %eax ; X64-NOPOPCNT-NEXT: xorb %ch, %cl ; X64-NOPOPCNT-NEXT: setnp %al -; X64-NOPOPCNT-NEXT: addq %rax, %rax +; 
X64-NOPOPCNT-NEXT: addl %eax, %eax ; X64-NOPOPCNT-NEXT: retq ; ; X86-POPCNT-LABEL: parity_64_shift: @@ -654,7 +654,7 @@ define i64 @parity_64_shift(i64 %0) { ; X64-POPCNT: # %bb.0: ; X64-POPCNT-NEXT: popcntq %rdi, %rax ; X64-POPCNT-NEXT: andl $1, %eax -; X64-POPCNT-NEXT: addq %rax, %rax +; X64-POPCNT-NEXT: addl %eax, %eax ; X64-POPCNT-NEXT: retq %2 = tail call i64 @llvm.ctpop.i64(i64 %0) %3 = shl nuw nsw i64 %2, 1 diff --git a/llvm/test/CodeGen/X86/pr62653.ll b/llvm/test/CodeGen/X86/pr62653.ll index 0a03c1831f657..b6a1bf47983dc 100644 --- a/llvm/test/CodeGen/X86/pr62653.ll +++ b/llvm/test/CodeGen/X86/pr62653.ll @@ -4,124 +4,117 @@ define <64 x i4> @pr62653(<64 x i4> %a0) nounwind { ; CHECK-LABEL: pr62653: ; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $r9d killed $r9d def $r9 -; CHECK-NEXT: # kill: def $r8d killed $r8d def $r8 -; CHECK-NEXT: # kill: def $ecx killed $ecx def $rcx -; CHECK-NEXT: # kill: def $edx killed $edx def $rdx -; CHECK-NEXT: # kill: def $esi killed $esi def $rsi ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edi ; CHECK-NEXT: andl $15, %edi +; CHECK-NEXT: shll $4, %edi ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d ; CHECK-NEXT: andl $15, %r10d -; CHECK-NEXT: shlq $4, %r10 ; CHECK-NEXT: orq %rdi, %r10 ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edi ; CHECK-NEXT: andl $15, %edi -; CHECK-NEXT: shlq $8, %rdi +; CHECK-NEXT: shll $8, %edi ; CHECK-NEXT: orq %r10, %rdi ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d ; CHECK-NEXT: andl $15, %r10d -; CHECK-NEXT: shlq $12, %r10 +; CHECK-NEXT: shll $12, %r10d ; CHECK-NEXT: orq %rdi, %r10 -; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d -; CHECK-NEXT: andl $15, %r11d -; CHECK-NEXT: shlq $16, %r11 -; CHECK-NEXT: orq %r10, %r11 ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edi ; CHECK-NEXT: andl $15, %edi -; CHECK-NEXT: shlq $20, %rdi -; CHECK-NEXT: orq %r11, %rdi +; CHECK-NEXT: shll $16, %edi +; CHECK-NEXT: orq %r10, %rdi ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d ; CHECK-NEXT: andl $15, %r10d -; 
CHECK-NEXT: shlq $24, %r10 +; CHECK-NEXT: shll $20, %r10d ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d ; CHECK-NEXT: andl $15, %r11d -; CHECK-NEXT: shlq $28, %r11 +; CHECK-NEXT: shll $24, %r11d ; CHECK-NEXT: orq %r10, %r11 ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d -; CHECK-NEXT: andl $15, %r10d -; CHECK-NEXT: shlq $32, %r10 +; CHECK-NEXT: shll $28, %r10d ; CHECK-NEXT: orq %r11, %r10 ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d ; CHECK-NEXT: andl $15, %r11d -; CHECK-NEXT: shlq $36, %r11 +; CHECK-NEXT: shlq $32, %r11 ; CHECK-NEXT: orq %r10, %r11 ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d ; CHECK-NEXT: andl $15, %r10d -; CHECK-NEXT: shlq $40, %r10 +; CHECK-NEXT: shlq $36, %r10 ; CHECK-NEXT: orq %r11, %r10 +; CHECK-NEXT: orq %rdi, %r10 +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edi +; CHECK-NEXT: andl $15, %edi +; CHECK-NEXT: shlq $40, %rdi ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d ; CHECK-NEXT: andl $15, %r11d ; CHECK-NEXT: shlq $44, %r11 -; CHECK-NEXT: orq %r10, %r11 ; CHECK-NEXT: orq %rdi, %r11 ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edi ; CHECK-NEXT: andl $15, %edi ; CHECK-NEXT: shlq $48, %rdi -; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d -; CHECK-NEXT: andl $15, %r10d -; CHECK-NEXT: shlq $52, %r10 -; CHECK-NEXT: orq %rdi, %r10 -; CHECK-NEXT: orq %r11, %r10 -; CHECK-NEXT: movq %r10, 8(%rax) +; CHECK-NEXT: orq %r11, %rdi +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d +; CHECK-NEXT: andl $15, %r11d +; CHECK-NEXT: shlq $52, %r11 +; CHECK-NEXT: orq %rdi, %r11 +; CHECK-NEXT: orq %r10, %r11 +; CHECK-NEXT: movq %r11, 8(%rax) +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edi +; CHECK-NEXT: andl $15, %edi +; CHECK-NEXT: shlq $32, %rdi ; CHECK-NEXT: andl $15, %esi ; CHECK-NEXT: andl $15, %edx -; CHECK-NEXT: shlq $4, %rdx -; CHECK-NEXT: orq %rsi, %rdx +; CHECK-NEXT: shll $4, %edx +; CHECK-NEXT: orl %esi, %edx ; CHECK-NEXT: andl $15, %ecx -; CHECK-NEXT: shlq $8, %rcx -; CHECK-NEXT: orq %rdx, %rcx +; CHECK-NEXT: shll $8, %ecx +; CHECK-NEXT: orl %edx, %ecx ; CHECK-NEXT: andl 
$15, %r8d -; CHECK-NEXT: shlq $12, %r8 -; CHECK-NEXT: orq %rcx, %r8 +; CHECK-NEXT: shll $12, %r8d +; CHECK-NEXT: orl %ecx, %r8d ; CHECK-NEXT: andl $15, %r9d -; CHECK-NEXT: shlq $16, %r9 -; CHECK-NEXT: orq %r8, %r9 +; CHECK-NEXT: shll $16, %r9d +; CHECK-NEXT: orl %r8d, %r9d ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx ; CHECK-NEXT: andl $15, %ecx -; CHECK-NEXT: shlq $20, %rcx -; CHECK-NEXT: orq %r9, %rcx -; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %esi -; CHECK-NEXT: andl $15, %esi -; CHECK-NEXT: shlq $24, %rsi +; CHECK-NEXT: shll $20, %ecx +; CHECK-NEXT: orl %r9d, %ecx ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edx ; CHECK-NEXT: andl $15, %edx -; CHECK-NEXT: shlq $28, %rdx -; CHECK-NEXT: orq %rsi, %rdx -; CHECK-NEXT: orq %rcx, %rdx -; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: andl $15, %ecx -; CHECK-NEXT: shlq $32, %rcx +; CHECK-NEXT: shll $24, %edx ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %esi -; CHECK-NEXT: andl $15, %esi -; CHECK-NEXT: shlq $36, %rsi -; CHECK-NEXT: orq %rcx, %rsi +; CHECK-NEXT: shll $28, %esi +; CHECK-NEXT: orl %edx, %esi +; CHECK-NEXT: orl %ecx, %esi +; CHECK-NEXT: orq %rdi, %rsi ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx ; CHECK-NEXT: andl $15, %ecx -; CHECK-NEXT: shlq $40, %rcx +; CHECK-NEXT: shlq $36, %rcx ; CHECK-NEXT: orq %rsi, %rcx +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edx +; CHECK-NEXT: andl $15, %edx +; CHECK-NEXT: shlq $40, %rdx +; CHECK-NEXT: orq %rcx, %rdx +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: shlq $44, %rcx ; CHECK-NEXT: orq %rdx, %rcx ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edx ; CHECK-NEXT: andl $15, %edx -; CHECK-NEXT: shlq $44, %rdx +; CHECK-NEXT: shlq $48, %rdx ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %esi ; CHECK-NEXT: andl $15, %esi -; CHECK-NEXT: shlq $48, %rsi +; CHECK-NEXT: shlq $52, %rsi ; CHECK-NEXT: orq %rdx, %rsi ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edx ; CHECK-NEXT: andl $15, %edx -; CHECK-NEXT: shlq $52, %rdx +; CHECK-NEXT: shlq $56, %rdx ; CHECK-NEXT: orq 
%rsi, %rdx ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %esi -; CHECK-NEXT: andl $15, %esi -; CHECK-NEXT: shlq $56, %rsi +; CHECK-NEXT: shlq $60, %rsi ; CHECK-NEXT: orq %rdx, %rsi ; CHECK-NEXT: orq %rcx, %rsi -; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: shlq $60, %rcx -; CHECK-NEXT: orq %rsi, %rcx -; CHECK-NEXT: movq %rcx, (%rax) +; CHECK-NEXT: movq %rsi, (%rax) ; CHECK-NEXT: retq %res = shufflevector <64 x i4> %a0, <64 x i4> zeroinitializer, <64 x i32> ret <64 x i4> %res diff --git a/llvm/test/CodeGen/X86/select.ll b/llvm/test/CodeGen/X86/select.ll index 213b2b018d0ad..6d2b73e2108ba 100644 --- a/llvm/test/CodeGen/X86/select.ll +++ b/llvm/test/CodeGen/X86/select.ll @@ -393,22 +393,22 @@ define void @test6(i32 %C, ptr %A, ptr %B) nounwind { define x86_fp80 @test7(i32 %tmp8) nounwind { ; GENERIC-LABEL: test7: ; GENERIC: ## %bb.0: -; GENERIC-NEXT: xorl %eax, %eax -; GENERIC-NEXT: testl %edi, %edi -; GENERIC-NEXT: setns %al -; GENERIC-NEXT: shlq $4, %rax -; GENERIC-NEXT: leaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx -; GENERIC-NEXT: fldt (%rax,%rcx) +; GENERIC-NEXT: ## kill: def $edi killed $edi def $rdi +; GENERIC-NEXT: notl %edi +; GENERIC-NEXT: shrl $27, %edi +; GENERIC-NEXT: andl $-16, %edi +; GENERIC-NEXT: leaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax +; GENERIC-NEXT: fldt (%rdi,%rax) ; GENERIC-NEXT: retq ; ; ATOM-LABEL: test7: ; ATOM: ## %bb.0: -; ATOM-NEXT: xorl %eax, %eax -; ATOM-NEXT: leaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx -; ATOM-NEXT: testl %edi, %edi -; ATOM-NEXT: setns %al -; ATOM-NEXT: shlq $4, %rax -; ATOM-NEXT: fldt (%rax,%rcx) +; ATOM-NEXT: ## kill: def $edi killed $edi def $rdi +; ATOM-NEXT: leaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax +; ATOM-NEXT: notl %edi +; ATOM-NEXT: shrl $27, %edi +; ATOM-NEXT: andl $-16, %edi +; ATOM-NEXT: fldt (%rdi,%rax) ; ATOM-NEXT: retq ; ; ATHLON-LABEL: test7: diff --git a/llvm/test/CodeGen/X86/select_const.ll b/llvm/test/CodeGen/X86/select_const.ll index dd054348ac014..eba22036701b4 100644 --- 
a/llvm/test/CodeGen/X86/select_const.ll +++ b/llvm/test/CodeGen/X86/select_const.ll @@ -628,7 +628,7 @@ define i64 @select_pow2_diff_neg_invert(i1 zeroext %cond) { ; X64: # %bb.0: ; X64-NEXT: xorb $1, %dil ; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: shlq $7, %rax +; X64-NEXT: shll $7, %eax ; X64-NEXT: addq $-99, %rax ; X64-NEXT: retq %sel = select i1 %cond, i64 -99, i64 29 diff --git a/llvm/test/CodeGen/X86/selectcc-to-shiftand.ll b/llvm/test/CodeGen/X86/selectcc-to-shiftand.ll index 220b7dc46dd94..03f4c0f61cdd1 100644 --- a/llvm/test/CodeGen/X86/selectcc-to-shiftand.ll +++ b/llvm/test/CodeGen/X86/selectcc-to-shiftand.ll @@ -194,7 +194,7 @@ define i64 @sel_shift_bool_i64(i1 %t) { ; ANY: # %bb.0: ; ANY-NEXT: movl %edi, %eax ; ANY-NEXT: andl $1, %eax -; ANY-NEXT: shlq $16, %rax +; ANY-NEXT: shll $16, %eax ; ANY-NEXT: retq %shl = select i1 %t, i64 65536, i64 0 ret i64 %shl diff --git a/llvm/test/CodeGen/X86/setcc.ll b/llvm/test/CodeGen/X86/setcc.ll index c38318d5f6a25..60ac6df3f77af 100644 --- a/llvm/test/CodeGen/X86/setcc.ll +++ b/llvm/test/CodeGen/X86/setcc.ll @@ -64,7 +64,7 @@ define i64 @t3(i64 %x) nounwind readnone ssp { ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq $18, %rdi ; X64-NEXT: setb %al -; X64-NEXT: shlq $6, %rax +; X64-NEXT: shll $6, %eax ; X64-NEXT: retq %t0 = icmp ult i64 %x, 18 %if = select i1 %t0, i64 64, i64 0 diff --git a/llvm/test/CodeGen/X86/shift-combine.ll b/llvm/test/CodeGen/X86/shift-combine.ll index ab504d0a43fef..cf45641fba632 100644 --- a/llvm/test/CodeGen/X86/shift-combine.ll +++ b/llvm/test/CodeGen/X86/shift-combine.ll @@ -15,9 +15,8 @@ define dso_local i32 @test_lshr_and(i32 %x) { ; X64-LABEL: test_lshr_and: ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: shrl $2, %edi -; X64-NEXT: andl $3, %edi -; X64-NEXT: movl array(,%rdi,4), %eax +; X64-NEXT: andl $12, %edi +; X64-NEXT: movl array(%rdi), %eax ; X64-NEXT: retq %tmp2 = lshr i32 %x, 2 %tmp3 = and i32 %tmp2, 3 @@ -104,8 +103,8 @@ define dso_local ptr 
@test_exact4(i32 %a, i32 %b, ptr %x) { ; X64: # %bb.0: ; X64-NEXT: # kill: def $esi killed $esi def $rsi ; X64-NEXT: subl %edi, %esi -; X64-NEXT: shrl $3, %esi -; X64-NEXT: leaq (%rdx,%rsi,4), %rax +; X64-NEXT: shrl %esi +; X64-NEXT: leaq (%rsi,%rdx), %rax ; X64-NEXT: retq %sub = sub i32 %b, %a %shr = lshr exact i32 %sub, 3 @@ -126,8 +125,8 @@ define dso_local ptr @test_exact5(i32 %a, i32 %b, ptr %x) { ; X64: # %bb.0: ; X64-NEXT: # kill: def $esi killed $esi def $rsi ; X64-NEXT: subl %edi, %esi -; X64-NEXT: shrl $3, %esi -; X64-NEXT: leaq (%rdx,%rsi,4), %rax +; X64-NEXT: shrl %esi +; X64-NEXT: leaq (%rsi,%rdx), %rax ; X64-NEXT: retq %sub = sub i32 %b, %a %shr = lshr exact i32 %sub, 3 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll index 3ca0e2121e0d1..ce8d2acd035f6 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll @@ -255,28 +255,28 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1 ; SSE2-NEXT: andl $7, %r8d ; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: andl $7, %r9d -; SSE2-NEXT: movzwl -24(%rsp,%rcx,2), %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movzwl -24(%rsp,%rdx,2), %ecx -; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: movzwl -24(%rsp,%rsi,2), %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: movzwl -24(%rsp,%rdi,2), %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movzwl -24(%rsp,%r9,2), %ecx -; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: movzwl -24(%rsp,%r8,2), %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: movzwl -24(%rsp,%r10,2), %ecx -; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: movzwl -24(%rsp,%r10,2), %r10d +; SSE2-NEXT: movd %r10d, %xmm0 ; SSE2-NEXT: movzwl -24(%rsp,%rax,2), %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: movzwl -24(%rsp,%r9,2), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzwl -24(%rsp,%r8,2), %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: movzwl -24(%rsp,%rcx,2), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzwl -24(%rsp,%rdx,2), %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: movzwl -24(%rsp,%rsi,2), %eax ; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE2-NEXT: movzwl -24(%rsp,%rdi,2), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE2-NEXT: retq ; @@ -299,28 +299,28 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1 ; SSSE3-NEXT: andl $7, %r8d ; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSSE3-NEXT: andl $7, %r9d -; SSSE3-NEXT: movzwl -24(%rsp,%rcx,2), %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movzwl -24(%rsp,%rdx,2), %ecx -; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: movzwl -24(%rsp,%rsi,2), %ecx -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: movzwl -24(%rsp,%rdi,2), %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: movzwl -24(%rsp,%r9,2), %ecx -; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: movzwl -24(%rsp,%r8,2), %ecx -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSSE3-NEXT: movzwl -24(%rsp,%r10,2), %ecx -; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: movzwl -24(%rsp,%r10,2), %r10d +; SSSE3-NEXT: movd %r10d, %xmm0 ; SSSE3-NEXT: movzwl -24(%rsp,%rax,2), %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: movzwl -24(%rsp,%r9,2), %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzwl -24(%rsp,%r8,2), %eax +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSSE3-NEXT: movzwl -24(%rsp,%rcx,2), %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzwl -24(%rsp,%rdx,2), %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: movzwl -24(%rsp,%rsi,2), %eax ; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSSE3-NEXT: movzwl -24(%rsp,%rdi,2), %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSSE3-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll b/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll index f3bafec3399a7..8f78438dedf92 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll @@ -293,52 +293,52 @@ define <16 x i16> @var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16(<16 x i16> %x, ; AVX1-NEXT: # kill: def $edx killed $edx def $rdx ; AVX1-NEXT: # kill: def $esi killed $esi def $rsi ; AVX1-NEXT: # kill: def $edi killed $edi def $rdi -; AVX1-NEXT: andl $15, %edi -; AVX1-NEXT: vmovaps %ymm0, (%rsp) -; AVX1-NEXT: movzwl (%rsp,%rdi,2), %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: andl $15, %esi -; AVX1-NEXT: vpinsrw $1, (%rsp,%rsi,2), %xmm0, %xmm0 -; AVX1-NEXT: andl $15, %edx -; AVX1-NEXT: vpinsrw $2, (%rsp,%rdx,2), %xmm0, %xmm0 -; AVX1-NEXT: andl $15, %ecx -; AVX1-NEXT: vpinsrw $3, (%rsp,%rcx,2), %xmm0, %xmm0 -; AVX1-NEXT: andl $15, %r8d -; AVX1-NEXT: vpinsrw $4, (%rsp,%r8,2), %xmm0, %xmm0 -; AVX1-NEXT: andl $15, %r9d -; AVX1-NEXT: vpinsrw $5, (%rsp,%r9,2), %xmm0, %xmm0 -; AVX1-NEXT: movl 16(%rbp), %eax -; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0 -; AVX1-NEXT: movl 24(%rbp), %eax -; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0 ; AVX1-NEXT: movl 32(%rbp), %eax ; AVX1-NEXT: andl $15, %eax +; AVX1-NEXT: vmovaps %ymm0, (%rsp) ; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax -; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: movl 40(%rbp), %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm1, %xmm1 +; AVX1-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0 ; AVX1-NEXT: movl 48(%rbp), %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrw $2, 
(%rsp,%rax,2), %xmm1, %xmm1 +; AVX1-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0 ; AVX1-NEXT: movl 56(%rbp), %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm1, %xmm1 +; AVX1-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0 ; AVX1-NEXT: movl 64(%rbp), %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm1, %xmm1 +; AVX1-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0 ; AVX1-NEXT: movl 72(%rbp), %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm1, %xmm1 +; AVX1-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0 ; AVX1-NEXT: movl 80(%rbp), %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm1, %xmm1 +; AVX1-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0 ; AVX1-NEXT: movl 88(%rbp), %eax ; AVX1-NEXT: andl $15, %eax +; AVX1-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0 +; AVX1-NEXT: andl $15, %edi +; AVX1-NEXT: movzwl (%rsp,%rdi,2), %eax +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: andl $15, %esi +; AVX1-NEXT: vpinsrw $1, (%rsp,%rsi,2), %xmm1, %xmm1 +; AVX1-NEXT: andl $15, %edx +; AVX1-NEXT: vpinsrw $2, (%rsp,%rdx,2), %xmm1, %xmm1 +; AVX1-NEXT: andl $15, %ecx +; AVX1-NEXT: vpinsrw $3, (%rsp,%rcx,2), %xmm1, %xmm1 +; AVX1-NEXT: andl $15, %r8d +; AVX1-NEXT: vpinsrw $4, (%rsp,%r8,2), %xmm1, %xmm1 +; AVX1-NEXT: andl $15, %r9d +; AVX1-NEXT: vpinsrw $5, (%rsp,%r9,2), %xmm1, %xmm1 +; AVX1-NEXT: movl 16(%rbp), %eax +; AVX1-NEXT: andl $15, %eax +; AVX1-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm1, %xmm1 +; AVX1-NEXT: movl 24(%rbp), %eax +; AVX1-NEXT: andl $15, %eax ; AVX1-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: movq %rbp, %rsp ; AVX1-NEXT: popq %rbp ; AVX1-NEXT: retq @@ -355,52 +355,52 @@ define <16 x i16> @var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16(<16 x i16> %x, ; AVX2-NEXT: # kill: def $edx killed $edx def $rdx ; AVX2-NEXT: # kill: def $esi killed $esi def $rsi ; 
AVX2-NEXT: # kill: def $edi killed $edi def $rdi -; AVX2-NEXT: andl $15, %edi -; AVX2-NEXT: vmovaps %ymm0, (%rsp) -; AVX2-NEXT: movzwl (%rsp,%rdi,2), %eax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: andl $15, %esi -; AVX2-NEXT: vpinsrw $1, (%rsp,%rsi,2), %xmm0, %xmm0 -; AVX2-NEXT: andl $15, %edx -; AVX2-NEXT: vpinsrw $2, (%rsp,%rdx,2), %xmm0, %xmm0 -; AVX2-NEXT: andl $15, %ecx -; AVX2-NEXT: vpinsrw $3, (%rsp,%rcx,2), %xmm0, %xmm0 -; AVX2-NEXT: andl $15, %r8d -; AVX2-NEXT: vpinsrw $4, (%rsp,%r8,2), %xmm0, %xmm0 -; AVX2-NEXT: andl $15, %r9d -; AVX2-NEXT: vpinsrw $5, (%rsp,%r9,2), %xmm0, %xmm0 -; AVX2-NEXT: movl 16(%rbp), %eax -; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0 -; AVX2-NEXT: movl 24(%rbp), %eax -; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0 ; AVX2-NEXT: movl 32(%rbp), %eax ; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vmovaps %ymm0, (%rsp) ; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax -; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vmovd %eax, %xmm0 ; AVX2-NEXT: movl 40(%rbp), %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm1, %xmm1 +; AVX2-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0 ; AVX2-NEXT: movl 48(%rbp), %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm1, %xmm1 +; AVX2-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0 ; AVX2-NEXT: movl 56(%rbp), %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm1, %xmm1 +; AVX2-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0 ; AVX2-NEXT: movl 64(%rbp), %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm1, %xmm1 +; AVX2-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0 ; AVX2-NEXT: movl 72(%rbp), %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm1, %xmm1 +; AVX2-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0 ; AVX2-NEXT: movl 80(%rbp), %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm1, %xmm1 +; 
AVX2-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0 ; AVX2-NEXT: movl 88(%rbp), %eax ; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0 +; AVX2-NEXT: andl $15, %edi +; AVX2-NEXT: movzwl (%rsp,%rdi,2), %eax +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: andl $15, %esi +; AVX2-NEXT: vpinsrw $1, (%rsp,%rsi,2), %xmm1, %xmm1 +; AVX2-NEXT: andl $15, %edx +; AVX2-NEXT: vpinsrw $2, (%rsp,%rdx,2), %xmm1, %xmm1 +; AVX2-NEXT: andl $15, %ecx +; AVX2-NEXT: vpinsrw $3, (%rsp,%rcx,2), %xmm1, %xmm1 +; AVX2-NEXT: andl $15, %r8d +; AVX2-NEXT: vpinsrw $4, (%rsp,%r8,2), %xmm1, %xmm1 +; AVX2-NEXT: andl $15, %r9d +; AVX2-NEXT: vpinsrw $5, (%rsp,%r9,2), %xmm1, %xmm1 +; AVX2-NEXT: movl 16(%rbp), %eax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm1, %xmm1 +; AVX2-NEXT: movl 24(%rbp), %eax +; AVX2-NEXT: andl $15, %eax ; AVX2-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: movq %rbp, %rsp ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq @@ -448,52 +448,52 @@ define <16 x i16> @var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16(<8 x i16> %x, i ; AVX1-NEXT: # kill: def $edx killed $edx def $rdx ; AVX1-NEXT: # kill: def $esi killed $esi def $rsi ; AVX1-NEXT: # kill: def $edi killed $edi def $rdi -; AVX1-NEXT: andl $7, %edi -; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movzwl -24(%rsp,%rdi,2), %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: andl $7, %esi -; AVX1-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm0, %xmm0 -; AVX1-NEXT: andl $7, %edx -; AVX1-NEXT: vpinsrw $2, -24(%rsp,%rdx,2), %xmm0, %xmm0 -; AVX1-NEXT: andl $7, %ecx -; AVX1-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0 -; AVX1-NEXT: andl $7, %r8d -; AVX1-NEXT: vpinsrw $4, -24(%rsp,%r8,2), %xmm0, %xmm0 -; AVX1-NEXT: andl $7, %r9d -; AVX1-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm0, %xmm0 ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax ; AVX1-NEXT: andl $7, %eax -; AVX1-NEXT: 
vpinsrw $6, -24(%rsp,%rax,2), %xmm0, %xmm0 +; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax +; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax ; AVX1-NEXT: andl $7, %eax -; AVX1-NEXT: vpinsrw $7, -24(%rsp,%rax,2), %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $1, -24(%rsp,%rax,2), %xmm0, %xmm0 ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax ; AVX1-NEXT: andl $7, %eax -; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax -; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vpinsrw $2, -24(%rsp,%rax,2), %xmm0, %xmm0 ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax ; AVX1-NEXT: andl $7, %eax -; AVX1-NEXT: vpinsrw $1, -24(%rsp,%rax,2), %xmm1, %xmm1 +; AVX1-NEXT: vpinsrw $3, -24(%rsp,%rax,2), %xmm0, %xmm0 ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax ; AVX1-NEXT: andl $7, %eax -; AVX1-NEXT: vpinsrw $2, -24(%rsp,%rax,2), %xmm1, %xmm1 +; AVX1-NEXT: vpinsrw $4, -24(%rsp,%rax,2), %xmm0, %xmm0 ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax ; AVX1-NEXT: andl $7, %eax -; AVX1-NEXT: vpinsrw $3, -24(%rsp,%rax,2), %xmm1, %xmm1 +; AVX1-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm0, %xmm0 ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax ; AVX1-NEXT: andl $7, %eax -; AVX1-NEXT: vpinsrw $4, -24(%rsp,%rax,2), %xmm1, %xmm1 +; AVX1-NEXT: vpinsrw $6, -24(%rsp,%rax,2), %xmm0, %xmm0 ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax ; AVX1-NEXT: andl $7, %eax -; AVX1-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm1, %xmm1 +; AVX1-NEXT: vpinsrw $7, -24(%rsp,%rax,2), %xmm0, %xmm0 +; AVX1-NEXT: andl $7, %edi +; AVX1-NEXT: movzwl -24(%rsp,%rdi,2), %eax +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: andl $7, %esi +; AVX1-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm1, %xmm1 +; AVX1-NEXT: andl $7, %edx +; AVX1-NEXT: vpinsrw $2, -24(%rsp,%rdx,2), %xmm1, %xmm1 +; AVX1-NEXT: andl $7, %ecx +; AVX1-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm1, %xmm1 +; AVX1-NEXT: andl $7, %r8d +; AVX1-NEXT: vpinsrw $4, -24(%rsp,%r8,2), %xmm1, %xmm1 +; AVX1-NEXT: andl $7, %r9d +; AVX1-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm1, %xmm1 ; AVX1-NEXT: 
movl {{[0-9]+}}(%rsp), %eax ; AVX1-NEXT: andl $7, %eax ; AVX1-NEXT: vpinsrw $6, -24(%rsp,%rax,2), %xmm1, %xmm1 ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax ; AVX1-NEXT: andl $7, %eax ; AVX1-NEXT: vpinsrw $7, -24(%rsp,%rax,2), %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16: @@ -504,52 +504,52 @@ define <16 x i16> @var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16(<8 x i16> %x, i ; AVX2-NEXT: # kill: def $edx killed $edx def $rdx ; AVX2-NEXT: # kill: def $esi killed $esi def $rsi ; AVX2-NEXT: # kill: def $edi killed $edi def $rdi -; AVX2-NEXT: andl $7, %edi -; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movzwl -24(%rsp,%rdi,2), %eax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: andl $7, %esi -; AVX2-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm0, %xmm0 -; AVX2-NEXT: andl $7, %edx -; AVX2-NEXT: vpinsrw $2, -24(%rsp,%rdx,2), %xmm0, %xmm0 -; AVX2-NEXT: andl $7, %ecx -; AVX2-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0 -; AVX2-NEXT: andl $7, %r8d -; AVX2-NEXT: vpinsrw $4, -24(%rsp,%r8,2), %xmm0, %xmm0 -; AVX2-NEXT: andl $7, %r9d -; AVX2-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm0, %xmm0 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax ; AVX2-NEXT: andl $7, %eax -; AVX2-NEXT: vpinsrw $6, -24(%rsp,%rax,2), %xmm0, %xmm0 +; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax +; AVX2-NEXT: vmovd %eax, %xmm0 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax ; AVX2-NEXT: andl $7, %eax -; AVX2-NEXT: vpinsrw $7, -24(%rsp,%rax,2), %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $1, -24(%rsp,%rax,2), %xmm0, %xmm0 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax ; AVX2-NEXT: andl $7, %eax -; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax -; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vpinsrw $2, -24(%rsp,%rax,2), %xmm0, %xmm0 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax ; AVX2-NEXT: andl $7, %eax -; AVX2-NEXT: vpinsrw $1, -24(%rsp,%rax,2), %xmm1, 
%xmm1 +; AVX2-NEXT: vpinsrw $3, -24(%rsp,%rax,2), %xmm0, %xmm0 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax ; AVX2-NEXT: andl $7, %eax -; AVX2-NEXT: vpinsrw $2, -24(%rsp,%rax,2), %xmm1, %xmm1 +; AVX2-NEXT: vpinsrw $4, -24(%rsp,%rax,2), %xmm0, %xmm0 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax ; AVX2-NEXT: andl $7, %eax -; AVX2-NEXT: vpinsrw $3, -24(%rsp,%rax,2), %xmm1, %xmm1 +; AVX2-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm0, %xmm0 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax ; AVX2-NEXT: andl $7, %eax -; AVX2-NEXT: vpinsrw $4, -24(%rsp,%rax,2), %xmm1, %xmm1 +; AVX2-NEXT: vpinsrw $6, -24(%rsp,%rax,2), %xmm0, %xmm0 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax ; AVX2-NEXT: andl $7, %eax -; AVX2-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm1, %xmm1 +; AVX2-NEXT: vpinsrw $7, -24(%rsp,%rax,2), %xmm0, %xmm0 +; AVX2-NEXT: andl $7, %edi +; AVX2-NEXT: movzwl -24(%rsp,%rdi,2), %eax +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: andl $7, %esi +; AVX2-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm1, %xmm1 +; AVX2-NEXT: andl $7, %edx +; AVX2-NEXT: vpinsrw $2, -24(%rsp,%rdx,2), %xmm1, %xmm1 +; AVX2-NEXT: andl $7, %ecx +; AVX2-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm1, %xmm1 +; AVX2-NEXT: andl $7, %r8d +; AVX2-NEXT: vpinsrw $4, -24(%rsp,%r8,2), %xmm1, %xmm1 +; AVX2-NEXT: andl $7, %r9d +; AVX2-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm1, %xmm1 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax ; AVX2-NEXT: andl $7, %eax ; AVX2-NEXT: vpinsrw $6, -24(%rsp,%rax,2), %xmm1, %xmm1 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax ; AVX2-NEXT: andl $7, %eax ; AVX2-NEXT: vpinsrw $7, -24(%rsp,%rax,2), %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: retq %x0 = extractelement <8 x i16> %x, i32 %i0 %x1 = extractelement <8 x i16> %x, i32 %i1 @@ -597,13 +597,13 @@ define <4 x i64> @mem_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, ptr %i) nounwin ; ALL-NEXT: movq %rsp, %rbp ; ALL-NEXT: andq $-32, %rsp ; ALL-NEXT: subq $64, %rsp -; ALL-NEXT: movq (%rdi), %rax -; ALL-NEXT: 
movq 8(%rdi), %rcx +; ALL-NEXT: movl (%rdi), %eax +; ALL-NEXT: movl 8(%rdi), %ecx ; ALL-NEXT: andl $3, %eax ; ALL-NEXT: andl $3, %ecx -; ALL-NEXT: movq 16(%rdi), %rdx +; ALL-NEXT: movl 16(%rdi), %edx ; ALL-NEXT: andl $3, %edx -; ALL-NEXT: movq 24(%rdi), %rsi +; ALL-NEXT: movl 24(%rdi), %esi ; ALL-NEXT: andl $3, %esi ; ALL-NEXT: vmovaps %ymm0, (%rsp) ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero @@ -637,13 +637,13 @@ define <4 x i64> @mem_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, ptr %i) nounwin define <4 x i64> @mem_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, ptr %i) nounwind { ; ALL-LABEL: mem_shuffle_v4i64_v2i64_xxxx_i64: ; ALL: # %bb.0: -; ALL-NEXT: movq (%rdi), %rax -; ALL-NEXT: movq 8(%rdi), %rcx +; ALL-NEXT: movl (%rdi), %eax +; ALL-NEXT: movl 8(%rdi), %ecx ; ALL-NEXT: andl $1, %eax ; ALL-NEXT: andl $1, %ecx -; ALL-NEXT: movq 16(%rdi), %rdx +; ALL-NEXT: movl 16(%rdi), %edx ; ALL-NEXT: andl $1, %edx -; ALL-NEXT: movq 24(%rdi), %rsi +; ALL-NEXT: movl 24(%rdi), %esi ; ALL-NEXT: andl $1, %esi ; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll index 0c57f497aa8aa..61434e942b3c5 100644 --- a/llvm/test/CodeGen/X86/vselect.ll +++ b/llvm/test/CodeGen/X86/vselect.ll @@ -651,18 +651,18 @@ define i64 @vselect_any_extend_vector_inreg_crash(ptr %x) { ; SSE: # %bb.0: ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: andl $1, %eax -; SSE-NEXT: shlq $15, %rax +; SSE-NEXT: shll $15, %eax ; SSE-NEXT: retq ; ; AVX-LABEL: vselect_any_extend_vector_inreg_crash: ; AVX: # %bb.0: ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: andl $1, %eax -; AVX-NEXT: shlq $15, %rax +; AVX-NEXT: shll $15, %eax ; 
AVX-NEXT: retq 0: %1 = load <8 x i8>, ptr %x diff --git a/llvm/test/CodeGen/X86/zext-logicop-shift-load.ll b/llvm/test/CodeGen/X86/zext-logicop-shift-load.ll index 3f64a383abd2c..96e67a52b786a 100644 --- a/llvm/test/CodeGen/X86/zext-logicop-shift-load.ll +++ b/llvm/test/CodeGen/X86/zext-logicop-shift-load.ll @@ -14,7 +14,7 @@ define i64 @test1(ptr %data) { ; ; X64-LABEL: test1: ; X64: # %bb.0: # %entry -; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movzbl (%rdi), %eax ; X64-NEXT: shll $2, %eax ; X64-NEXT: andl $60, %eax ; X64-NEXT: retq @@ -37,7 +37,7 @@ define ptr @test2(ptr %data) { ; ; X64-LABEL: test2: ; X64: # %bb.0: # %entry -; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movzbl (%rdi), %eax ; X64-NEXT: andl $15, %eax ; X64-NEXT: leaq (%rdi,%rax,4), %rax ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/zext-shl.ll b/llvm/test/CodeGen/X86/zext-shl.ll index 8c27e0da6acf7..bc0981781df8f 100644 --- a/llvm/test/CodeGen/X86/zext-shl.ll +++ b/llvm/test/CodeGen/X86/zext-shl.ll @@ -51,7 +51,7 @@ define i64 @i64_zext_shift_i16_zext_i8(i8 %a0) nounwind { ; X64-LABEL: i64_zext_shift_i16_zext_i8: ; X64: # %bb.0: ; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: shlq $5, %rax +; X64-NEXT: shll $5, %eax ; X64-NEXT: retq %t0 = zext i8 %a0 to i16 %t1 = shl i16 %t0, 5 @@ -112,7 +112,7 @@ define i128 @i128_zext_shift_i64_zext_i8(i8 %a0) nounwind { ; X64-LABEL: i128_zext_shift_i64_zext_i8: ; X64: # %bb.0: ; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: shlq $4, %rax +; X64-NEXT: shll $4, %eax ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: retq %t0 = zext i8 %a0 to i64 @@ -136,7 +136,7 @@ define i128 @i128_zext_shift_i64_zext_i16(i16 %a0) nounwind { ; X64-LABEL: i128_zext_shift_i64_zext_i16: ; X64: # %bb.0: ; X64-NEXT: movzwl %di, %eax -; X64-NEXT: shlq $7, %rax +; X64-NEXT: shll $7, %eax ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: retq %t0 = zext i16 %a0 to i64